In [1]:
import morfeusz2
import pandas as pd

In [2]:
# Create a single Morfeusz analyser instance; lemmatize_column() reads it as the global `m`.
m = morfeusz2.Morfeusz()

In [3]:
# Load the articles CSV and keep only the id plus the four text fields used below.
# Brackets carry the line continuation instead of backslashes: the original ended
# with a dangling `\`, which silently joins the statement to whatever line follows.
articles = pd.read_csv('labeling/fresh_data.csv')[
    ['id', 'title', 'highlight', 'content', 'media_desc']
]

In [4]:
# Show the first row as a quick check of the selected columns.
articles.head(1)

Unnamed: 0,id,title,highlight,content,media_desc
0,24760624,cyrk zwierzetami gdansku mieszkancy oburzeni b...,mieszkancy osowej chca oknami wystepowal cyrk ...,wtorek gdansku wystapil cyrk zalewski udzialem...,


In [5]:
# Print dtypes and non-null counts per column (reveals which text fields have missing values).
articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1090 entries, 0 to 1089
Data columns (total 5 columns):
id            1090 non-null int64
title         1089 non-null object
highlight     1089 non-null object
content       1090 non-null object
media_desc    976 non-null object
dtypes: int64(1), object(4)
memory usage: 42.7+ KB


In [6]:
from unidecode import unidecode

In [7]:
def lemmatize_column(column):
    """Lemmatize every text record of a pandas Series.

    Each record is split on single spaces. Every token is passed to the
    module-level analyser ``m``; the value at ``[0][2][1]`` of the result
    (used here as the lemma of the first interpretation) is cut at the first
    ``:``, transliterated to ASCII with ``unidecode`` and lower-cased. The
    resulting lemmas are joined back with single spaces.

    Parameters
    ----------
    column : pandas.Series
        Records to lemmatize. Records that are not text (anything without a
        ``split`` method, e.g. NaN for a missing value) are mapped to None.

    Returns
    -------
    pandas.Series
        One entry per input record, carrying the name and index of ``column``
        so the result aligns with other columns of the same frame.
    """
    lemmatized_column = []
    for record in column.values:
        try:
            words = record.split(' ')
        except AttributeError:
            # Non-text record (e.g. float NaN): keep the original best-effort
            # behaviour of emitting None, but without a bare `except` that
            # would also swallow KeyboardInterrupt and genuine bugs.
            lemmatized_column.append(None)
            continue

        lemmas = []
        for word in words:
            interpretations = m.analyse(word)
            if not interpretations:
                # Nothing to lemmatize for this token (presumably the case for
                # the empty tokens produced by consecutive spaces). Skip it
                # instead of discarding the whole record, which is what the
                # failed `[0]` lookup used to cause.
                continue
            lemma = interpretations[0][2][1].split(':')[0]
            lemmas.append(unidecode(lemma).lower())
        lemmatized_column.append(" ".join(lemmas))

    # Reuse the input index; a fresh RangeIndex would misalign in a later
    # pd.concat(axis=1) whenever `column` does not have a default index.
    return pd.Series(lemmatized_column, index=column.index, name=column.name)

In [8]:
# Lemmatize each text column, then carry the article id along unchanged as the last item.
text_columns = [u'content', u'highlight', u'media_desc', u'title']
lem_df_list = [lemmatize_column(articles[name]) for name in text_columns]
lem_df_list.append(articles['id'])

In [9]:
# Put the lemmatized Series and the id Series side by side as columns of one frame.
lem_df = pd.concat(lem_df_list, axis='columns')

In [10]:
# Inspect the first five lemmatized rows.
lem_df.head(5)

Unnamed: 0,content,highlight,media_desc,title,id
0,wtorek gdansku wystapil cyrk zalewski udzialem...,mieszkancy osowa chca okno wystepowal cyrk zwi...,,cyrk zwierzetami gdansku mieszkancy oburzony b...,24760624
1,decyzja rzecznik oznaczac lekarz krzysztof pon...,natalia zaczela rodzic tydzien ciazy porod dzi...,wczesniak inkubator gramowy nadia dany trafic ...,matka obwiniac ginekolog smierc coreczki izba ...,24788214
2,sobotni wybory australijczyk wybierac czlonkow...,wierzylem cud powiedzial ogloszeniu wynikow pr...,premier australia scott morrison,sensacja australia centroprawica dwoch przegry...,24801435
3,milosierdzia zrodlem pojednanie taki haslem ob...,ideologia gender propagowac karty lgbt uderzaj...,swieto milosierdzia bozego sanktuarium lagiewn...,abp jedraszewski pedofilia kosciele musiec pot...,24708858
4,drugi polfinal konkurs piosenka eurowizja dobr...,czwartkowy polfinal eurowizja mogl wydawac mal...,sergey lazarev,drugi polfinal eurowizja wyzszym poziom stado ...,24794052


In [11]:
from collections import Counter

In [30]:
# Replace missing values with empty strings so the string concatenation of the
# text columns below does not turn a whole row into NaN.
lem_df = lem_df.fillna('')

In [31]:
# One string per article: all four lemmatized text fields joined by single spaces.
bag_of_records = lem_df['title'] + ' ' + lem_df['highlight'] + ' ' + lem_df['content'] + ' ' + lem_df['media_desc']
# Flatten into a single token list for the whole corpus. split() with no argument
# drops empty strings; split(' ') emitted a '' token for every blank field
# (fields are '' after fillna), and that '' was then counted as if it were a word.
bag_of_words = [word for record in bag_of_records for word in record.split()]

In [33]:
# Count every token across the corpus and keep those seen more than twice.
# Note: the stored values are raw occurrence counts from Counter; no IDF
# weighting is applied here, despite the variable name.
word_counts = Counter(bag_of_words)
idf_dict = {token: count for token, count in word_counts.items() if count > 2}

In [35]:
from json import dump

In [36]:
# Serialize the filtered token counts as JSON.
with open('labeling/idf_dict', 'w') as idf_file:
    dump(idf_dict, idf_file)

In [37]:
# Save the lemmatized table without the index column. lem_df was filled with ''
# earlier, so missing fields are written as empty strings.
lem_df.to_csv('labeling/lemmatized_articles.csv', index = False)