In [None]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns
import missingno as msn

from wordcloud import WordCloud

from sklearn.model_selection import train_test_split

import warnings
from tqdm.notebook import tqdm

from src import utils

In [None]:
# Notebook-wide setup.
# Silence every warning for the rest of the session. Be aware that this also
# hides deprecation and chained-assignment warnings raised by later cells.
warnings.simplefilter("ignore")
# Enable tqdm's pandas integration, which provides the `progress_apply`
# method used in the text preprocessing step.
tqdm.pandas()

In [None]:
# Read the news CSV into memory. The `date` column is parsed to datetimes so
# that the `.dt` accessor can be used on it in the EDA section.
raw_data = pd.read_csv(
    "./data/lenta-ru-news.csv",
    parse_dates=["date"],
    low_memory=False,
)
# Preview the first rows.
raw_data.head()

In [None]:
# Display the (rows, columns) dimensions of the loaded frame.
raw_data.shape

In [None]:
# Display the dtype pandas assigned to each column of the loaded frame.
raw_data.dtypes

# EDA

## Count articles per year

In [None]:
# Number of articles published in each calendar year.
#
# The column names are pinned explicitly: "index" holds the year and "year"
# holds the article count, which is exactly what the plotting cell reads.
# A bare `value_counts().reset_index()` only produces those names on
# pandas < 2.0; pandas >= 2.0 would name the columns "year" and "count".
# Working on the `date` column directly also avoids copying the whole frame
# just to add a temporary `year` column.
articles_per_year = (
    raw_data["date"]
    .dt.year
    .value_counts()
    .rename_axis("index")
    .reset_index(name="year")
)

In [None]:
# Bar chart of article counts per year. The plot reads the year from the
# "index" column and the number of articles from the "year" column.
fig, ax = plt.subplots(figsize=(12, 7))
sns.barplot(data=articles_per_year, x="index", y="year", ax=ax)
ax.set_title("Количество статей по годам")
ax.set_xlabel("Год")
ax.set_ylabel("Количество")
ax.tick_params(axis="x", labelrotation=45)
ax.grid(axis="x")

# No file extension is given, so matplotlib falls back to its configured
# default output format.
fig.savefig('./images/articles_per_year', bbox_inches='tight')

## Count articles per topic

In [None]:
# Number of articles in each topic.
#
# The column names are pinned explicitly: "index" holds the topic name and
# "topic" holds the article count, which is exactly what the plotting cell
# reads. A bare `value_counts().reset_index()` only produces those names on
# pandas < 2.0; pandas >= 2.0 would name the columns "topic" and "count".
articles_per_topic = (
    raw_data["topic"]
    .value_counts()
    .rename_axis("index")
    .reset_index(name="topic")
)

In [None]:
# Bar chart of article counts per topic. The plot reads the topic name from
# the "index" column and the number of articles from the "topic" column.
fig, ax = plt.subplots(figsize=(12, 7))
sns.barplot(data=articles_per_topic, x="index", y="topic", ax=ax)
ax.set_title("Количество статей по темам")
ax.set_xlabel("Тема")
ax.set_ylabel("Количество")
ax.tick_params(axis="x", labelrotation=45)
ax.grid(axis="x")
# No file extension is given, so matplotlib falls back to its configured
# default output format.
fig.savefig('./images/articles_per_topic', bbox_inches='tight')

## Count articles per tag

In [None]:
# Map every distinct `tags` value to the number of articles carrying it.
# This dictionary is the frequency table handed to the word cloud below.
tag_counts = raw_data["tags"].value_counts()
articles_per_tag = tag_counts.to_dict()

# Display the full mapping.
articles_per_tag

In [None]:
# Build a word cloud in which each tag is sized by its article count.
cloud_builder = WordCloud(max_font_size=50, background_color="white")
wordcloud = cloud_builder.generate_from_frequencies(articles_per_tag)

# Show the rendered cloud as an image without axes decorations.
fig, ax = plt.subplots(figsize=(8, 7))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis("off")

fig.savefig('./images/articles_per_tag', bbox_inches='tight')

# Preprocessing

## NaN values processing

In [None]:
# Draw missingno's matrix plot for the raw frame, then save whichever figure
# is current after that call.
# NOTE(review): the output name is spelled "missing_valies"; it is kept
# unchanged here because other files may reference that path.
msn.matrix(df=raw_data)
nullity_fig = plt.gcf()
nullity_fig.savefig('./images/missing_valies', bbox_inches='tight')

In [None]:
# Work on an independent (deep) copy so `raw_data` stays untouched by the
# preprocessing steps that follow.
data = raw_data.copy(deep=True)

In [None]:
# Discard every row that has a missing value in any column.
data = data.dropna(axis=0, how="any")

## Topics filter

In [None]:
# Show how many rows each topic has left after the NaN rows were dropped;
# the topic filter below is based on these counts.
data["topic"].value_counts()

In [None]:
# Minimum number of articles a topic needs in order to be kept.
MIN_ARTICLES_PER_TOPIC = 40_000

# Count the topics once and reuse the result (the original one-liner ran
# `value_counts()` twice).
topic_counts = data["topic"].value_counts()
frequent_topics = topic_counts[topic_counts >= MIN_ARTICLES_PER_TOPIC].index

# Keep only rows whose topic is frequent enough. `reset_index()` is called
# without `drop=True`, so the previous row labels are preserved in an extra
# "index" column, exactly as before.
filtered_data = data[data["topic"].isin(frequent_topics)].reset_index()

In [None]:
# Display the (rows, columns) dimensions after the topic filter.
filtered_data.shape

## Class balancing

In [None]:
# Per-topic row counts before balancing; the smallest of these is the
# per-class sample size used in the next step.
filtered_data["topic"].value_counts()

In [None]:
# Down-sample every topic to the size of the rarest one so that all classes
# end up with the same number of rows.
#
# Changes from the previous version:
#   * the smallest group size is computed once instead of inside the lambda
#     for every group;
#   * the draw is seeded, so re-running the notebook selects the same rows;
#   * `DataFrameGroupBy.sample` (pandas >= 1.1) replaces `groupby.apply`,
#     whose handling of the grouping column is deprecated in newer pandas.
BALANCE_SEED = 42
smallest_topic_size = int(filtered_data.groupby("topic").size().min())

data_to_get = (
    filtered_data
    .groupby("topic")
    .sample(n=smallest_topic_size, random_state=BALANCE_SEED)
    .reset_index(drop=True)
)

In [None]:
# Display the (rows, columns) dimensions of the balanced frame.
data_to_get.shape

## 350k rows is too many. Select ~150k

In [None]:
# Keep a stratified 42% subset of the balanced data (the "test" side of the
# split) and discard the rest. Stratifying on `topic` preserves the class
# balance. `random_state` is fixed so the same rows are selected on every
# run; without it the subset changed each time the cell was executed.
_, final_data = train_test_split(
    data_to_get,
    test_size=0.42,
    shuffle=True,
    stratify=data_to_get["topic"],
    random_state=42,
)

## Topic category to int

In [None]:
# Replace the topic names with the integer codes of a pandas categorical.
#
# `assign` builds a new frame instead of writing a column in place into the
# object returned by `train_test_split`; the in-place write can raise pandas'
# SettingWithCopyWarning, which the notebook-wide warnings filter hides.
# NOTE(review): the code -> topic-name mapping is not stored anywhere;
# presumably the categories are the sorted topic names — confirm before
# decoding predictions.
final_data = final_data.assign(
    topic=lambda frame: frame["topic"].astype("category").cat.codes
)

In [None]:
# Row counts per encoded topic in the final subset.
final_data["topic"].value_counts()

## Combine, select columns and preprocess text column

In [None]:
# Build one text field per article in the form "<title>. <text>" and keep
# only that field together with the topic label.
title_and_text = final_data["title"] + ". " + final_data["text"]
with_combined_text = final_data.assign(combined_text=title_and_text)
final_data = with_combined_text[["combined_text", "topic"]]

In [None]:
# Run the project's text preprocessing on every combined text, with a
# progress bar. The function is applied to the `combined_text` column
# directly rather than row-wise over the whole frame (`axis=1`), which built
# a temporary Series for each row only to read this single column.
# `first_n_words=200` is forwarded to `utils.text_processing` unchanged.
final_data["preprocessed_text"] = (
    final_data["combined_text"]
    .progress_apply(lambda text: utils.text_processing(text, first_n_words=200))
)

## Saving data

In [None]:
# Write the final frame to disk; `index=False` leaves the row labels out of
# the file.
final_data.to_csv(
    path_or_buf="./data/preprocessed_data_200.csv",
    index=False,
)