In [120]:
import ast
from collections import Counter

import numpy as np
import pandas as pd
from unidecode import unidecode

In [79]:
# Read only the first 10000 rows of data/processed_articles.csv.
articles = pd.read_csv('data/processed_articles.csv', nrows = 10000)

In [80]:
articles.head(1)

Unnamed: 0,content,division,highlight,media_desc,title,url
0,najwyzsza izba kontroli zwraca uwage problem z...,zus emerytura,tysiace polakow zanizone emerytury wystapilo z...,osob maja dopisanych skladek emerytalnych czas...,nik alarmuje polacy maja zanizone emerytury,"http://wyborcza.biz/biznes/7,147880,23632593,n..."


In [81]:
len(articles)

10000

In [82]:
# Drop every row that has a missing value in any column. The row index is not
# reset, so `articles.index` keeps gaps where rows were removed.
articles = articles.dropna()

In [83]:
len(articles)

9984

## Prepare input for model

In [116]:
def concat_x_string(columns, frame=None):
    """Join several text columns row by row and split the result into tokens.

    Parameters
    ----------
    columns : iterable of str
        Names of the text columns to concatenate, in order.
    frame : pandas.DataFrame, optional
        Frame to read the columns from. Defaults to the notebook-level
        ``articles`` frame, which keeps existing one-argument calls working.

    Returns
    -------
    numpy.ndarray
        One entry per row of ``frame`` (in row order), each entry being the
        list of tokens of the joined text.
    """
    if frame is None:
        frame = articles

    joined = pd.Series('', index=frame.index)
    for column in columns:
        joined = joined + frame[column] + ' '

    # Splitting on runs of whitespace (instead of on a single ' ') avoids
    # empty-string tokens when the text contains consecutive spaces, and
    # makes the explicit strip of the trailing separator unnecessary.
    return joined.str.split().values

##### Choose columns for model input

In [117]:
# Token lists per article, built from the `title` and `media_desc` columns.
string_columns = concat_x_string(['title', 'media_desc'])

In [164]:
string_columns[1]

['katalonia',
 'premiera',
 'parlament',
 'barcelonie',
 'przeglosowal',
 'kandydature',
 'secesjonisty',
 'quima',
 'torry',
 'parlament',
 'barcelonie',
 'zatwierdzil',
 'quima',
 'torre',
 'nowego',
 'premiera',
 'katalonii',
 'maja',
 'r']

In [165]:
# Total number of occurrences of every word over all token lists.
# NOTE: despite its name, `idf` holds raw corpus counts, not an inverse
# document frequency.
token_counter = Counter()
for content in string_columns:
    token_counter.update(content)
idf = dict(token_counter)

In [166]:
# Keep only words that occur more often than 0.1% of the number of articles.
min_count = len(articles) * 0.001
filtered_idf = {word: freq for word, freq in idf.items() if freq > min_count}

##### Ignore words that appear very rarely - they won't be very useful (and the dataset wouldn't fit into my RAM otherwise)

In [167]:
# Keep only the words that passed the frequency threshold, i.e. the keys of
# `filtered_idf` (more than 0.1% of the number of articles).
def filter_not_frequent(string_columns, filtered_idf):
    """Remove from every record the words that are not keys of ``filtered_idf``.

    Parameters
    ----------
    string_columns : iterable of iterable of str
        Token lists, one per record.
    filtered_idf : dict
        Mapping whose keys are the words to keep.

    Returns
    -------
    list of list of str
        One list per input record, in the same order, with word order and
        repetitions preserved.
    """
    kept_words = filtered_idf.keys()
    return [
        [token for token in tokens if token in kept_words]
        for tokens in string_columns
    ]

In [168]:
# Token lists restricted to the words kept in `filtered_idf`.
filtered = filter_not_frequent(string_columns, filtered_idf)

In [169]:
filtered[1]

['premiera', 'parlament', 'parlament', 'nowego', 'premiera', 'maja', 'r']

In [176]:
# Sort the vocabulary so every word gets the same position on every run.
# Iterating a set of strings gives an arbitrary order that can change between
# interpreter sessions (string hashing is randomised), which would silently
# reshuffle the feature columns. Dict keys are already unique, so no set is
# needed.
unique_words = sorted(filtered_idf)
vocab_size = len(unique_words)

In [177]:
vocab_size

2679

In [178]:
# Two-way lookup between vocabulary words and their positions in `unique_words`.
word2int = {token: position for position, token in enumerate(unique_words)}
int2word = dict(enumerate(unique_words))

##### Change words to vectors

In [181]:
def string_x_to_tf_idf(records, idf, vocab_size, word2int):
    """Turn token lists into a dense ``(n_records, vocab_size)`` float matrix.

    For every word of a record, the cell at column ``word2int[word]`` is set
    to the number of times the word occurs in that record divided by
    ``idf[word]``. All other cells are zero.

    Parameters
    ----------
    records : iterable of iterable of str
        Token lists, one per record. Every token must be a key of both
        ``word2int`` and ``idf``.
    idf : dict
        Mapping word -> divisor applied to the in-record count.
    vocab_size : int
        Number of columns of the result.
    word2int : dict
        Mapping word -> column index in ``range(vocab_size)``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_records, vocab_size)``; for empty input the shape
        is ``(0, vocab_size)``.

    Raises
    ------
    KeyError
        If a token is missing from ``word2int`` or ``idf``.
    """
    records = list(records)
    # Allocate the full matrix once instead of collecting one vector per
    # record and copying them all at the end; this also keeps the result
    # two-dimensional when there are no records.
    dataset = np.zeros((len(records), vocab_size))
    for row, record in zip(dataset, records):
        # `row` is a view into `dataset`, so writes land in the matrix.
        for word, counter in Counter(record).items():
            row[word2int[word]] = counter / idf[word]

    return dataset

In [188]:
# Feature matrix: one row per article, one column per vocabulary word.
x = string_x_to_tf_idf(filtered, filtered_idf, vocab_size, word2int)

In [189]:
len(x)

9984

In [190]:
len(articles['url'])

9984

## Prepare labels

In [196]:
from labeling.emotions import rage, joy, sadness, surprise, fear, all_flag_words

In [237]:
# Stack both comment files and keep only the columns used for labelling.
# ignore_index=True renumbers the rows 0..n-1; without it each file
# contributes its own row labels and the combined index contains duplicates,
# which breaks any later label-based alignment.
comments_raw = pd.concat(
    [
        pd.read_csv('data/processed_comments_1.csv'),
        pd.read_csv('data/processed_comments_2.csv'),
    ],
    ignore_index=True,
)[['url', 'downvotes', 'upvotes', 'words']]

In [238]:
comments_raw.head(1)

Unnamed: 0,url,downvotes,upvotes,words
0,"http://wyborcza.biz/biznes/7,147880,23632593,n...",0,10,"['brak', 'kwartalnych', 'waloryzacji', 'kapita..."


In [239]:
# For every comment, count how many flag words of each emotion it contains
# (each flag word counts once, however often it is repeated in the comment).
# The sixth value is the number of emotions with at least one hit.
emotion_lexicons = [rage, joy, sadness, surprise, fear]

comment_emotions = []
for comment in comments_raw['words'].values:
    # `words` stores the text form of a Python list of strings. The original
    # code used eval(), which would execute arbitrary code found in the CSV;
    # ast.literal_eval only parses literals. A set makes the membership test
    # below constant-time.
    comment_words = set(ast.literal_eval(comment))

    hits = [
        sum(1 for flag_word in lexicon if flag_word in comment_words)
        for lexicon in emotion_lexicons
    ]
    emotions_present = sum(1 for hit in hits if hit != 0)

    comment_emotions.append(hits + [emotions_present])

In [240]:
# One row per comment (same row labels as `comments_raw`): a hit count for
# each emotion plus the number of emotions detected.
emotions_df = pd.DataFrame(
    comment_emotions,
    index=comments_raw.index,
    columns=['rage', 'joy', 'sadness', 'surprise', 'fear', 'emotions_count'],
)

In [241]:
# Put the emotion counts next to each comment's url/votes and drop the raw
# token list, which is no longer needed.
comments = pd.concat([comments_raw, emotions_df], axis = 1).drop('words', axis = 1)

In [242]:
comments[comments['url'] == 'http://bialystok.wyborcza.pl/bialystok/7,35241,23042200,komorowski-budowanie-dumy-z-polskiej-historii-konczy-sie-wstydem.html']

Unnamed: 0,url,downvotes,upvotes,rage,joy,sadness,surprise,fear,emotions_count
221003,"http://bialystok.wyborcza.pl/bialystok/7,35241...",2,14,0,0,0,0,0,0
221004,"http://bialystok.wyborcza.pl/bialystok/7,35241...",9,6,0,0,0,0,0,0
221005,"http://bialystok.wyborcza.pl/bialystok/7,35241...",8,1,0,0,0,0,0,0
221006,"http://bialystok.wyborcza.pl/bialystok/7,35241...",1,4,0,0,0,0,0,0


In [243]:
# Per-URL totals of every remaining column (votes and emotion counts).
sums = comments.groupby('url').sum()

In [244]:
# Number of comments per URL (rows with a non-null `downvotes` value),
# stored in a single column named `counter`.
counts = (
    comments[['url', 'downvotes']]
    .groupby('url')
    .count()
    .rename(columns={'downvotes': 'counter'})
)

In [253]:
# Number of distinct URLs that have at least one comment.
# NOTE(review): this cell used to evaluate len(labels), but `labels` is only
# assigned further down, so it raised NameError on a fresh kernel. The saved
# output presumably came from an earlier, un-reindexed `labels`; confirm after
# re-running.
len(counts)

19469

In [258]:
# Combine comment counts and per-URL sums, then order the rows like `articles`
# so that row k of `labels` belongs to the k-th article. URLs that have no
# entry in counts/sums get a row of NaN from reindex.
labels = pd.concat([counts, sums], axis = 1).reindex(articles['url'])

In [259]:
len(labels)

9984

In [260]:
labels.head(2)

Unnamed: 0_level_0,counter,downvotes,upvotes,rage,joy,sadness,surprise,fear,emotions_count
url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
"http://wyborcza.biz/biznes/7,147880,23632593,nik-alarmuje-polacy-maja-zanizone-emerytury.html",2.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0
"http://wyborcza.pl/7,75399,23398351,katalonia-ma-premiera-parlament-w-barcelonie-przeglosowal-kandydature.html",5.0,5.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
