## Training parameters

In [1]:
# Maximum number of rows read from each input CSV (passed as `nrows`).
rows_limit = 15000
# Feature groups to build; each name enables one preparation step below.
selected_features = ['dates', 'categories', 'tfidf'] # ['dates', 'categories', 'vectors', 'tfidf']
# Percentile of features kept by SelectPercentile in the feature-selection step.
features_percentile = 30
# Label column used as the prediction target (choices are listed at the end of this cell).
selected_label = 'reactions_cat'
# Entries of 'labeling/idf_dict' whose value is below this threshold are dropped.
min_count = 5
# Available values for `selected_label`:
#       ['replies', 'upvotes', 'downvotes', 'rage', 'joy', 'sadness', 'surprise',
#        'fear', 'reactions', 'reactions_sentiment', 'replies_cat',
#        'upvotes_cat', 'downvotes_cat', 'reactions_cat',
#        'reactions_sentiment_cat', 'rage_cat', 'joy_cat', 'sadness_cat',
#        'surprise_cat', 'fear_cat']

## Data reading

In [2]:
import pandas as pd

In [3]:
# Load the articles keyed by 'id' and discard the raw text columns
# (their lemmatized versions are read separately for the tf-idf features).
df = (
    pd.read_csv('labeling/fresh_data.csv', nrows = rows_limit)
    .set_index('id')
    .drop(['title', 'highlight', 'content', 'media_desc'], axis = 1)
)

In [4]:
df.head(3)

Unnamed: 0_level_0,div,date,author,media_type,replies,upvotes,downvotes,rage,joy,sadness,surprise,fear,reactions,reactions_sentiment
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
24162636,Wiadomości z Poznania,2018-11-13 12:31:00,Tomasz Cylka,image,10,123,0,0,1,0,0,0,123,1.0
24573799,brexit,2019-03-22 10:02:00,Tomasz Bielecki,video,98,655,297,0,0,2,0,0,952,0.69
24760624,cyrk,2019-05-08 07:02:00,Anna Dobiegała,image,5,51,9,0,0,0,0,0,60,0.85


## Data preparation

In [5]:
# Engagement columns are the prediction targets; everything else stays in df as features.
labels_columns = ['replies', 'upvotes', 'downvotes', 'rage', 'joy', 'sadness', 'surprise', 
                  'fear', 'reactions', 'reactions_sentiment']
# Take an explicit copy: later cells assign into labels_df with .loc / new columns,
# which must not be ambiguous writes into a slice of df (SettingWithCopyWarning).
labels_df = df[labels_columns].copy()
df = df.drop(labels_columns, axis = 1)

In [6]:
def process_dates(df):
    """Replace the 'date' column with one-hot weekday and hour columns.

    The input frame is left untouched; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'date' column parseable by ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        All columns of ``df`` except 'date', followed by the 'weekday_*'
        dummies and then the 'hour_*' dummies.
    """
    # Parse into a local Series instead of writing back to df['date'],
    # so the caller's frame is not modified as a side effect.
    dates = pd.to_datetime(df['date'])
    return pd.concat([
        df.drop('date', axis = 1),
        pd.get_dummies(dates.dt.weekday, prefix = 'weekday'),
        pd.get_dummies(dates.dt.hour, prefix = 'hour'),
    ], axis = 1)

In [7]:
# Expand 'date' into one-hot weekday/hour columns only when the 'dates' group is enabled.
if 'dates' in selected_features:
    df = process_dates(df)

In [8]:
df.shape

(14412, 34)

In [9]:
def process_author_div_and_media_type(df):
    """One-hot encode the 'author', 'div' and 'media_type' columns.

    Returns a new frame in which the three source columns are removed and
    their dummy columns (prefixed with the source column name) are appended
    in that order.
    """
    categorical = ['author', 'div', 'media_type']
    encoded = [pd.get_dummies(df[name], prefix = name) for name in categorical]
    combined = pd.concat([df] + encoded, axis = 1)
    return combined.drop(categorical, axis = 1)

In [10]:
# One-hot encode author / div / media_type only when the 'categories' group is enabled.
if 'categories' in selected_features:
    df = process_author_div_and_media_type(df)

In [11]:
df.head(3)

Unnamed: 0_level_0,weekday_0.0,weekday_1.0,weekday_2.0,weekday_3.0,weekday_4.0,weekday_5.0,weekday_6.0,hour_0.0,hour_1.0,hour_2.0,...,div_Żywiec,div_żaglowce,div_żołnierze wyklęci,div_żubr,div_żużel,div_żydzi w warszawie,div_żłobki,media_type_image,media_type_multiple_images,media_type_video
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
24162636,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
24573799,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
24760624,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [12]:
df.shape

(14412, 3099)

In [13]:
def add_vectors(columns):
    """Load the precomputed vector file of every given text column.

    For each name in ``columns`` the CSV 'labeling/<name>_vec' is read,
    indexed by 'id', and its columns are prefixed with '<name>_'. The
    per-column frames are then joined side by side.
    """
    def load_prefixed(name):
        vectors = pd.read_csv('labeling/'+name+'_vec').set_index('id')
        return vectors.add_prefix(name+"_")

    return pd.concat([load_prefixed(name) for name in columns], axis = 1)

In [14]:
# Append the precomputed text vectors when the 'vectors' group is enabled.
# pd.concat(axis = 1) aligns on the 'id' index; ids present on only one side yield NaN cells.
if 'vectors' in selected_features:
    df = pd.concat([df, add_vectors(['content', 'highlight', 'title', 'media_desc'])], axis = 1)

In [15]:
df.shape

(14412, 3099)

In [16]:
from json import load
import numpy as np
from collections import Counter

In [17]:
# Read the word -> value dictionary and keep only entries with value >= min_count.
# The encoding is pinned to UTF-8 (the JSON text encoding) so that decoding does not
# depend on the platform's default locale.
with open('labeling/idf_dict', 'r', encoding = 'utf-8') as f:
    idf = {key: value for key, value in load(f).items() if value >= min_count}
    # Vocabulary used as the tf-idf feature set (a live view of idf's keys).
    idf_voc = idf.keys()

In [18]:
lemmas = pd.read_csv('labeling/lemmatized_articles.csv', nrows = rows_limit).set_index('id').fillna('')

In [19]:
lemmas.head(1)

Unnamed: 0_level_0,content,highlight,media_desc,title
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
24162636,rok ten wojewoda wielkopolski zbigniew hoffman...,ostateczny decyzja nazwa ul luty wrocic mape p...,poznanie najwieksze kontrowersja wzbudzila zmi...,poznanie powroci ul luty wojewoda zbigniew hof...


In [20]:
def add_tf_idf(columns):
    """Build dense term-weight features from the lemmatized text columns.

    For every row of the module-level ``lemmas`` frame, the texts of the given
    columns are concatenated (each followed by one space), split on ' ' and
    counted. Every word of ``idf_voc`` becomes one output column holding
    ``count / idf[word]``; words that do not occur in a row get 0.

    Parameters
    ----------
    columns : list of str
        Names of the ``lemmas`` columns to combine.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``lemmas``, with one float column 'tf_idf_<word>' per
        vocabulary word, in the iteration order of ``idf_voc``.
    """
    whole_string = ''
    for column in columns:
        whole_string = whole_string + lemmas[column] + ' '

    vocabulary = list(idf_voc)
    column_of = {word: position for position, word in enumerate(vocabulary)}

    # Fill a preallocated float matrix from the words that actually occur in
    # each record, instead of probing the whole vocabulary for every record.
    weights = np.zeros((len(whole_string), len(vocabulary)))
    for row, record in enumerate(whole_string.values):
        for word, count in Counter(record.split(' ')).items():
            position = column_of.get(word)
            if position is not None:
                weights[row, position] = count / idf[word]

    return pd.DataFrame(
        weights,
        index = lemmas.index,
        columns = ['tf_idf_'+word for word in vocabulary],
    )

In [None]:
# Append the tf-idf columns when the 'tfidf' group is enabled.
# pd.concat(axis = 1) aligns on the 'id' index; ids present on only one side yield NaN cells.
if 'tfidf' in selected_features:
    df = pd.concat([df, add_tf_idf(['content', 'highlight', 'title', 'media_desc'])], axis = 1)

In [None]:
df.shape

In [None]:
labels_df.shape

## Categorize labels

In [None]:
labels_df.head(3)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Cap each count label at its 95th percentile and plot the resulting distribution.
fig, axes = plt.subplots(4)
for i, column in enumerate(['replies', 'upvotes', 'downvotes', 'reactions']):
    threshold = labels_df[column].quantile(0.95)
    print(threshold)
    # Values above the threshold are overwritten in labels_df itself, so re-running
    # this cell recomputes the quantile on already-capped data.
    labels_df.loc[labels_df[column] > threshold, column] = threshold
    labels_df[column].plot(kind = 'hist', bins = 100, figsize= (15,10), title = column, ax = axes[i])

In [None]:
labels_df['reactions_sentiment'] = labels_df['reactions_sentiment'].fillna(0)

In [None]:
categories_columns = ['replies', 'upvotes', 'downvotes', 'reactions', 'reactions_sentiment']
# Split every label into three quantile-based bins labelled -1 / 0 / 1.
categories = [
    pd.qcut(labels_df[column], 3, labels = [-1, 0, 1])
    for column in categories_columns
]
tmp_cat = pd.concat(categories, axis = 1)
tmp_cat.columns = [x+"_cat" for x in categories_columns]
# Drop *_cat columns left over from an earlier run of this cell before appending;
# otherwise a re-run duplicates them and labels_df[[selected_label]] becomes two columns wide.
labels_df = pd.concat(
    [labels_df.drop(columns = tmp_cat.columns, errors = 'ignore'), tmp_cat],
    axis = 1,
)

In [None]:
labels_df.head(3)

In [None]:
for emotion in ['rage', 'joy', 'sadness', 'surprise', 'fear']:
    cat_column = emotion+'_cat'
    is_zero = labels_df[emotion] == 0
    # Binary flag: 0 where the count equals zero, 1 for every other value.
    labels_df.loc[is_zero, cat_column] = 0
    labels_df.loc[~is_zero, cat_column] = 1

In [None]:
# One bar chart per emotion flag showing how many rows fall into each class.
fig, axes = plt.subplots(1, 5, figsize = (20,3))
for i, emotion in enumerate(['rage', 'joy', 'sadness', 'surprise', 'fear']):
    emotion_cat = emotion+'_cat'
    class_counts = labels_df[[emotion_cat]].reset_index().groupby(emotion_cat).count()
    class_counts.plot(kind = 'bar', ax = axes[i])

### Check for NaN

In [None]:
df[df.isna().any(axis=1)]

In [None]:
labels_df[labels_df.isna().any(axis=1)]

## Feature selection

In [None]:
X = df.values

In [None]:
Y = labels_df[[selected_label]].values

In [None]:
X.shape

In [None]:
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_classif

In [None]:
# Y is built from labels_df[[selected_label]] and therefore has shape (n_samples, 1);
# pass it flattened, since the scorer expects a 1-D label vector.
# NOTE(review): the selector is fitted on all rows, including those later held out
# as the test set, so test labels influence which features are kept — consider
# fitting on the training rows only.
clf = SelectPercentile(f_classif, percentile = features_percentile).fit(X, Y.ravel())

In [None]:
selected_columns = df.columns[clf.get_support()].tolist()

In [None]:
X = clf.transform(X)

In [None]:
X.shape

## Normalize labels

## Split X and Y into train and test sets

In [None]:
s = int(len(df)*0.9)

In [None]:
s

In [None]:
train_x = X[:s]
train_y = Y[:s]

In [None]:
test_x = X[s:]
test_y = Y[s:]

## Training

In [None]:
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import Dense
from tensorflow.nn import relu

In [None]:
inputs = Input(shape = (len(selected_columns), ))

In [None]:
# Two ReLU hidden layers whose widths scale with n = len(selected_columns):
# int(n / 100) + 50 units, followed by int(n / 200) + 25 units.
hidden_1 = Dense(int(len(selected_columns)/100)+50, activation = relu)(inputs)
hidden_2 = Dense(int(len(selected_columns)/200)+25, activation = relu)(hidden_1)

In [None]:
outputs = Dense(1)(hidden_2)

In [None]:
model = Model(
    inputs = inputs,
    outputs = outputs
)

In [None]:
from tensorflow.keras.optimizers import RMSprop

In [None]:
optimizer = RMSprop(0.01)

In [None]:
from tensorflow.losses import *

In [None]:
# The loss is referenced by its Keras string identifier, so this cell does not
# depend on a name pulled in by a wildcard import.
model.compile(
    loss='mean_squared_error',
    optimizer=optimizer,
    metrics=['mean_absolute_error']
)

In [None]:
example_batch = test_x[:5]
print(model.predict(example_batch))

In [None]:
EPOCHS = 300

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
early_stop = EarlyStopping(monitor='val_loss', patience=10)

In [None]:
model.summary()

In [None]:
# Train silently for at most EPOCHS epochs with a 15% validation split of the
# training data; the early_stop callback can end training before EPOCHS is reached.
history = model.fit(
    train_x, train_y,
    epochs=EPOCHS,
    validation_split = 0.15, 
    verbose=0,
    callbacks=[early_stop],
)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
def plot_history(history):
    """Plot training and validation mean absolute error against the epoch.

    Parameters
    ----------
    history
        Object returned by ``model.fit``. It must expose ``history`` (a dict
        containing 'mean_absolute_error' and 'val_mean_absolute_error') and
        ``epoch`` (the list of epoch indices).
    """
    hist = pd.DataFrame(history.history)
    hist['epoch'] = history.epoch

    plt.figure()
    plt.xlabel('Epoch')
    # The error is expressed in label units; the former "[MPG]" suffix did not apply here.
    plt.ylabel('Mean Abs Error')
    plt.plot(hist['epoch'], hist['mean_absolute_error'],
           label='Train Error')
    plt.plot(hist['epoch'], hist['val_mean_absolute_error'],
           label = 'Val Error')
    # Leave 10% headroom above the largest error seen on either curve.
    highest = max(hist['mean_absolute_error'].max(), hist['val_mean_absolute_error'].max())
    plt.ylim([0, highest * 1.1])
    plt.legend()

In [None]:
loss, mae = model.evaluate(test_x, test_y, verbose=0)

In [None]:
mae

In [None]:
plot_history(history)

In [None]:
res = pd.concat([pd.DataFrame(model.predict(test_x)), pd.DataFrame(test_y)], axis = 1)
res.columns = ['prediction', 'real']
no_categories = len(res['real'].unique())

In [None]:
# Turn the continuous network output into class labels.
if no_categories == 2:
    # Binary labels are 0 / 1, so the decision boundary is the midpoint 0.5
    # (a cut at 0 assigns almost every prediction to class 1).
    res['prediction_cat'] = np.where(res['prediction'] >= 0.5, 1.0, 0.0)
elif no_categories == 3:
    # NOTE(review): labels are -1 / 0 / 1; nearest-label cut-offs would be +/-0.5 —
    # confirm that the wider +/-0.66 band for class 0 is intended.
    res['prediction_cat'] = np.select(
        [res['prediction'] < -0.66, res['prediction'] > 0.66],
        [-1.0, 1.0],
        default = 0.0,
    )
else:
    # Fail here with a clear message rather than with a missing
    # 'prediction_cat' column in the following cells.
    raise ValueError('expected 2 or 3 label categories in the test set, got %d' % no_categories)

In [None]:
# 1.0 where the predicted class equals the real one, 0.0 otherwise.
# The whole column is recomputed, so re-running after the predictions change
# cannot leave stale 1s behind.
res['if_ok'] = (res['real'] == res['prediction_cat']).astype(float)

In [None]:
res.sample(frac=1).head(6)

In [None]:
# Share of correctly classified test rows, in percent with two decimals.
accuracy_percent = round(res['if_ok'].sum()/len(res)*100, 2)
print('accuracy: ', accuracy_percent, '%')

In [None]:
from sklearn.metrics import confusion_matrix
from seaborn import heatmap

In [None]:
# Class labels used as tick labels of the confusion matrix.
c = [0,1] if no_categories == 2 else [-1,0,1]

In [None]:
# labels = c fixes the row/column order and size of the matrix to the tick labels,
# even when one of the classes never occurs among the real or predicted values.
heatmap(confusion_matrix(res['real'], res['prediction_cat'], labels = c), xticklabels = c, yticklabels = c, annot=True)