In [None]:
import os
import re
from collections import Counter
from datetime import datetime, timezone

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# Location of the CSV that holds the articles of all sources.
news_csv_path = '/kaggle/input/news.csv'
news = pd.read_csv(news_csv_path)

In [None]:
# Preview the first rows of the loaded frame.
news.head()

The dataset is built from four news sites: ria.ru, lenta.ru, meduza.io and tjournal.ru.

In [None]:
# Count the articles per source once and reuse the result for the chart and the table.
source_counts = news['source'].value_counts()

# `source` is a categorical (string) column, so a bar chart of the counts is the
# right picture; plt.hist would spread the few labels over its numeric bins.
ax = source_counts.plot.bar(rot=0)
ax.set(title='Articles per source', xlabel='source', ylabel='articles')

# Last expression: show the exact counts below the chart.
source_counts

# ria.ru

In [None]:
# Keep only the rows whose source is 'ria.ru', then preview them.
is_ria = news['source'] == 'ria.ru'
ria = news.loc[is_ria]
ria.head()

## News example

In [None]:
# Show the title and the body of the first ria.ru article.
ria_example = ria.iloc[0]
print(ria_example['title'])
print(ria_example['text'])

## Article length

In [None]:
# Length of every ria.ru article text, computed once for the mean and the plot.
ria_text_lengths = ria['text'].str.len()
print(f'Mean length: {ria_text_lengths.mean()}')

# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is its documented replacement.
fig, ax = plt.subplots(figsize=(20, 5))
sns.histplot(ria_text_lengths.dropna(), kde=True, stat='density', ax=ax)
ax.set(title='ria.ru article length', xlabel='characters', ylabel='density');

## Tag distribution

In [None]:
# Each non-missing `tags` value is a ', '-separated string; flatten all of them
# into one list and count how often every tag occurs.
tags = [
    tag
    for tag_string in ria.tags.dropna()
    for tag in tag_string.split(', ')
]
tag_counter = Counter(tags)

### Tag count

In [None]:
# Number of distinct keys in the tag counter, i.e. how many different tags occur.
len(tag_counter)

### Top 20 tags

In [None]:
# The 20 most frequent tags with their counts, most frequent first.
tag_counter.most_common(20)

# lenta.ru

In [None]:
# Keep only the rows whose source is 'lenta.ru', then preview them.
is_lenta = news['source'] == 'lenta.ru'
lenta = news.loc[is_lenta]
lenta.head()

## News example

In [None]:
# Show the title and the body of the second lenta.ru article.
lenta_example = lenta.iloc[1]
print(lenta_example['title'])
print(lenta_example['text'])

## Article length

In [None]:
# Length of every lenta.ru article text, computed once for the mean and the plot.
lenta_text_lengths = lenta['text'].str.len()
print(f'Mean length: {lenta_text_lengths.mean()}')

# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is its documented replacement.
fig, ax = plt.subplots(figsize=(20, 5))
sns.histplot(lenta_text_lengths.dropna(), kde=True, stat='density', ax=ax)
ax.set(title='lenta.ru article length', xlabel='characters', ylabel='density');

## First and last dates of publication

In [None]:
# Smallest and largest value of the publication_date column for lenta.ru.
lenta_published = lenta['publication_date']
print(f'First date: {lenta_published.min()}')
print(f'Last date: {lenta_published.max()}')

## Distribution of articles by rubrics

In [None]:
# Tally the articles per rubric once; the same Series feeds the chart and the table.
rubric_counts = lenta['rubric'].value_counts()

plt.figure(figsize=(10, 10))
rubric_axes = rubric_counts.plot.barh()
rubric_axes.invert_yaxis()  # flip the axis so the first (most frequent) rubric is on top
rubric_counts

## Distribution of articles by subrubrics

In [None]:
# Tally the articles per subrubric once; the same Series feeds the chart and the table.
subrubric_counts = lenta['subrubric'].value_counts()

plt.figure(figsize=(10, 20))
subrubric_axes = subrubric_counts.plot.barh()
subrubric_axes.invert_yaxis()  # flip the axis so the first (most frequent) subrubric is on top
subrubric_counts

# meduza.io

In [None]:
# Keep only the rows whose source is 'meduza.io', then preview them.
is_meduza = news['source'] == 'meduza.io'
meduza = news.loc[is_meduza]
meduza.head()

## News example

In [None]:
# Show the title and the body of the second meduza.io article.
meduza_example = meduza.iloc[1]
print(meduza_example['title'])
print(meduza_example['text'])

## Article length

In [None]:
# Length of every meduza.io article text, computed once for the mean and the plot.
meduza_text_lengths = meduza['text'].str.len()
print(f'Mean length: {meduza_text_lengths.mean()}')

# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is its documented replacement.
fig, ax = plt.subplots(figsize=(20, 5))
sns.histplot(meduza_text_lengths.dropna(), kde=True, stat='density', ax=ax)
ax.set(title='meduza.io article length', xlabel='characters', ylabel='density');

# tjournal.ru

In [None]:
# Keep only the rows whose source is 'tjournal.ru', then preview them.
is_tjournal = news['source'] == 'tjournal.ru'
tjournal = news.loc[is_tjournal]
tjournal.head()

## News example

In [None]:
# Show the second tjournal.ru article: the title with surrounding whitespace
# trimmed, and the body with every newline removed before trimming.
tjournal_example = tjournal.iloc[1]
example_title = tjournal_example['title'].strip()
example_body = tjournal_example['text'].replace('\n', '').strip()
print(example_title)
print(example_body)

## Article length

In [None]:
# Length of every tjournal.ru article text, computed once for the mean and the plot.
tjournal_text_lengths = tjournal['text'].str.len()
print(f'Mean length: {tjournal_text_lengths.mean()}')

# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is its documented replacement.
fig, ax = plt.subplots(figsize=(20, 5))
sns.histplot(tjournal_text_lengths.dropna(), kde=True, stat='density', ax=ax)
ax.set(title='tjournal.ru article length', xlabel='characters', ylabel='density');

## First and last dates of publication

In [None]:
def convert_from_unix_time(ts):
    """Format a Unix timestamp as a 'YYYY-MM-DD HH:MM:SS' string in UTC.

    Args:
        ts: Seconds since the Unix epoch (int or float).

    Returns:
        The UTC date and time formatted with '%Y-%m-%d %H:%M:%S'.
    """
    # datetime.utcfromtimestamp is deprecated since Python 3.12; build a
    # timezone-aware UTC datetime instead. The formatted text is unchanged
    # because the format string has no timezone field.
    return datetime.fromtimestamp(ts, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')

In [None]:
# publication_date is passed through int() and treated as a Unix timestamp here;
# convert the smallest and the largest value to readable dates.
tjournal_published = tjournal['publication_date']
earliest_ts = int(tjournal_published.min())
latest_ts = int(tjournal_published.max())
print(f'First date: {convert_from_unix_time(earliest_ts)}')
print(f'Last date: {convert_from_unix_time(latest_ts)}')

## Tag distribution

In [None]:
# Collect every '#word' token found in the tjournal.ru article texts and count them.
# dropna() skips rows with a missing text: re.findall raises TypeError on NaN.
tags = [
    hashtag
    for article_text in tjournal.text.dropna()
    for hashtag in re.findall(r'#\w+', article_text)
]
tag_counter = Counter(tags)

### Tag count

In [None]:
# Number of distinct keys in the tag counter, i.e. how many different hashtags occur.
len(tag_counter)

### Top 20 tags

In [None]:
# The 20 most frequent hashtags with their counts, most frequent first.
tag_counter.most_common(20)