In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from plotly.figure_factory import create_gantt
from sklearn.preprocessing import LabelEncoder
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import matplotlib.pyplot as plt
import os, re, gc 

from io import StringIO
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, name) for name in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


1. Given https://github.com/yutkin/Lenta.Ru-News-Dataset, perform EDA on it focusing on the following:
  - Provide descriptive statistics
  - Anomaly detection

In [None]:
# Location of the news CSV inside the Kaggle input directory.
filename = '/kaggle/input/corpus-of-russian-news-articles-from-lenta/lenta-ru-news.csv'
# Load the file into a DataFrame and preview its first ten rows.
df = pd.read_csv(filepath_or_buffer=filename)
df.head(n=10)

In [None]:
# Print a summary of the frame: columns, non-null counts, dtypes and memory usage.
df.info()

Convert the `date` column from object to datetime64 and round each date down to the day.

In [None]:
# Parse the 'date' column into datetimes and floor each value to the start of its day.
df['date'] = pd.to_datetime(df['date']).dt.floor('D')

We plot the number of news articles for each year.

In [None]:
# Number of articles per calendar year, keyed by the year of the 'date' column.
yearOfArticle = df['date'].dt.year
countNewsPerYear = df.groupby(yearOfArticle).size()

# Bar chart of the yearly counts; note that `fig` holds the Axes returned by pandas.
plt.figure(figsize=(12, 8))
fig = countNewsPerYear.plot.bar()
fig.set_xlabel('YEAR')
fig.set_title('TOTAL NUMBER OF NEWS ARTICLES')
fig.set_ylabel('NUMBER OF NEWS ARTICLES')

We found something interesting in the year 1914. Let's look at those rows and check them by clicking on their links.

In [None]:
# Display every row whose date falls in the year 1914.
df[df['date'].dt.year.eq(1914)]

Those articles were actually written in 2014, so we should correct their dates.

In [None]:
# Select the mis-dated rows by value (year == 1914) instead of by position.
# This keeps the cell correct regardless of row order and makes it safe to
# re-run: once the rows are fixed the mask is empty and nothing else moves.
wrongCentury = df['date'].dt.year == 1914

# DateOffset(year=2014) replaces the year component and keeps month and day.
# Writing through df.loc avoids chained assignment (df['date'][:5] = ...),
# which may modify a temporary copy and is a no-op under pandas copy-on-write.
df.loc[wrongCentury, 'date'] = df.loc[wrongCentury, 'date'] + pd.offsets.DateOffset(year=2014)

# Order the articles chronologically and renumber the rows from zero.
df = df.sort_values(['date'], ascending=True).reset_index(drop=True)

Percentage of NaN values in the topic and text columns

In [None]:
# Count missing values in the columns at positions 2 and 3.
# NOTE(review): these positions are presumably `text` and `topic` - confirm against df.head().
amountOfNans = df.iloc[:, 2:4].isna().sum()
# Largest count first, expressed as a percentage of all rows.
amountOfNans.sort_values(ascending=False).div(df.shape[0]).mul(100)

We must drop the rows with NaN in topic or text because later we will build a model that extracts the topic from the text. Without a text or a topic, a row cannot be used to train the model.

In [None]:
# Keep only rows that have both a topic and a text (a row missing either one
# is dropped), then renumber the rows from zero. reset_index(drop=True) is the
# same idiom used after sorting, replacing the hand-built np.arange index.
df = df.dropna(subset=['topic', 'text']).reset_index(drop=True)

We found that the topic 'Культпросвет ' has an extra trailing space.

In [None]:
# Remove the extra space from 'Культпросвет ' so both spellings count as one topic.
# Series.replace with scalar arguments matches whole cell values only, so every
# other label is left untouched. It is vectorized, so no per-row Python loop or
# temporary list is needed.
df['topic'] = df['topic'].replace('Культпросвет ', 'Культпросвет')

We check when each topic appears in order to understand how important the topic is. The shorter the life span of a topic, the less important it is.

In [None]:
# Unique topic names, in order of first appearance
nameOfTopics = df['topic'].unique()

# First and last publication date of every topic, computed in one grouped pass
# instead of re-filtering the whole frame once per topic. min/max on the 'date'
# column do not depend on the frame being sorted by date, and sort=False keeps
# the topics in order of first appearance.
# NOTE(review): the original read positional column 5 as the date - confirm
# that this is the `date` column.
topicSpans = df.groupby('topic', sort=False)['date'].agg(['min', 'max'])

# One Task/Start/Finish record per topic, the input format of create_gantt
df_dict = [
    dict(Task=topic, Start=start, Finish=finish)
    for topic, start, finish in zip(topicSpans.index, topicSpans['min'], topicSpans['max'])
]

fig = create_gantt(df_dict, title='The date appearance of topics', height=600, bar_width=0.5, width=600)
fig.show()

# Release the intermediate records
del df_dict
gc.collect()

We can see that Медновости, Сочи, ЧМ-2014 and Библиотека are less important than the others.

Now we plot the number of news articles for each topic. The more articles a topic has, the more important the topic is.

In [None]:
# Number of articles per topic, ordered from the most to the least frequent.
topicSizes = df.groupby('topic').size()
countNewsPerTopic = topicSizes.sort_values(ascending=False)

# Bar chart of the per-topic counts; note that `fig` holds the Axes returned by pandas.
plt.figure(figsize=(12, 8))
fig = countNewsPerTopic.plot.bar()
fig.set_xlabel('TOPIC')
fig.set_title('TOTAL NUMBER OF NEWS ARTICLES')
fig.set_ylabel('NUMBER OF NEWS ARTICLES')

As we can see, the dataset is unbalanced and some topics have almost no articles. We need to check what percentage of the entire dataset the rare topics make up. This matters because later we will build a model that extracts the topic from the text: the larger the number of topics, the greater the chance that the model makes mistakes.

In [None]:
# Topics with very few articles
rareTopics = ['Крым','Культпросвет', 'Легпром', 'Библиотека', 'Оружие', 'ЧМ-2014', 'Сочи', 'МедНовости', '69-я параллель'] 

# The mean of a boolean mask is the fraction of True values, so this gives the
# share of rows whose topic is rare. It stays vectorized, whereas the builtin
# sum() walks the Series one element at a time in Python.
percOfRareTopic = df['topic'].isin(rareTopics).mean() * 100

print(f'{percOfRareTopic:.3f}% for rare topics')

In [None]:
# Keep only the rows whose topic is not in the rare-topic list.
df = df[~df['topic'].isin(rareTopics)]

# Reclaim memory held by objects that are no longer referenced.
gc.collect()

In [None]:
# Renumber the rows 0..n-1, then remove the url and date columns from the dataset.
df = df.set_index(np.arange(len(df.index))).drop(columns=['url', 'date'])

In [None]:
# Write the frame to a CSV file at this relative path, without the row index.
df.to_csv(path_or_buf='textFromEDA.csv', index=False)

# Metrics
We should check two metrics: logistic loss and F1 score. These two metrics are well suited to our task.
Logistic loss takes into account the model's confidence in each predicted class. The F1 score is the harmonic mean of precision and recall, $F_1 = \frac{2 \cdot \text{precision} \cdot \text{recall}}{\text{precision} + \text{recall}}$: it equals one only when both precision and recall equal one, and it is close to zero if either of them is close to zero. Accuracy is not a good metric because we have an unbalanced dataset.

In [None]:
# Read the CSV at this Kaggle input path into its own frame.
preprocFile = '/kaggle/input/a-job-project/preprocess_text2.csv'
new_df = pd.read_csv(filepath_or_buffer=preprocFile)