In [51]:
import pandas as pd

In [53]:
# Load the raw article table keyed by `id`. The free-text columns are dropped
# here; later cells bring them back as vector and TF-IDF features.
df = (
    pd.read_csv('labeling/fresh_data.csv')
    .set_index('id')
    .drop(columns=['title', 'highlight', 'content', 'media_desc'])
)

In [54]:
# Preview the first three rows of the loaded frame.
df.head(3)

Unnamed: 0_level_0,div,date,author,media_type,replies,upvotes,downvotes,rage,joy,sadness,surprise,fear,reactions,reactions_sentiment
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
24760624,cyrk,2019-05-08 07:02:00,Anna Dobiegała,image,5,51,9,0,0,0,0,0,60,0.85
24788214,porodówka,2019-05-15 16:30:00,mag,image,5,52,27,0,0,0,0,0,79,0.66
24801435,Świat,2019-05-19 08:43:00,Maciej Czarnecki,image,15,82,27,0,0,0,0,0,109,0.75


## Data preparation

In [55]:
# Engagement / reaction columns are split off into `labels_df`, so that `df`
# keeps only the columns that are turned into features below.
labels_columns = [
    'replies', 'upvotes', 'downvotes',
    'rage', 'joy', 'sadness', 'surprise', 'fear',
    'reactions', 'reactions_sentiment',
]
labels_df = df[labels_columns]
df = df.drop(columns=labels_columns)

In [56]:
def process_dates(df):
    """Replace the ``date`` column with one-hot weekday and hour columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``date`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        New frame holding every column of ``df`` except ``date``, followed by
        ``weekday_<n>`` and ``hour_<n>`` indicator columns. Only the weekdays
        and hours that actually occur in the data get a column.

    Notes
    -----
    The input frame is left untouched: the dates are parsed into a local
    Series instead of being written back into ``df``.
    """
    dates = pd.to_datetime(df['date'])
    return pd.concat([
        df.drop('date', axis=1),
        pd.get_dummies(dates.dt.weekday, prefix='weekday'),
        pd.get_dummies(dates.dt.hour, prefix='hour'),
    ], axis=1)

In [57]:
# Swap the `date` column for one-hot weekday / hour indicator columns.
df = process_dates(df)

In [58]:
# Dimensions after the date column has been expanded into indicators.
df.shape

(1090, 32)

In [59]:
def process_author_div_and_media_type(df):
    """One-hot encode the ``author``, ``div`` and ``media_type`` columns.

    Each of the three columns is expanded with ``pd.get_dummies`` using the
    column name as prefix. The indicator columns are appended after the
    existing ones (author first, then div, then media_type) and the three
    source columns are removed from the returned frame.
    """
    categorical = ['author', 'div', 'media_type']
    indicators = [pd.get_dummies(df[name], prefix=name) for name in categorical]
    widened = pd.concat([df] + indicators, axis=1)
    return widened.drop(columns=categorical)

In [60]:
# Swap author / div / media_type for their one-hot indicator columns.
df = process_author_div_and_media_type(df)

In [61]:
# Dimensions after one-hot encoding author, div and media_type.
df.shape

(1090, 697)

In [64]:
def add_vectors(columns):
    """Load the stored vector table of every named text column and join them.

    For each name in ``columns`` the CSV file ``labeling/<name>_vec`` is read
    and indexed by its ``id`` column, and every remaining column label gets
    the prefix ``<name>_``. The per-column tables are then concatenated side
    by side (``axis=1``) and returned as a single frame.
    """
    def _load_prefixed(name):
        # One vector table, with labels made unique across text columns.
        vectors = pd.read_csv('labeling/' + name + '_vec').set_index('id')
        vectors.columns = [name + "_" + label for label in vectors.columns]
        return vectors

    return pd.concat([_load_prefixed(name) for name in columns], axis=1)

In [65]:
# Append the stored vector columns of the four text fields, aligned on `id`.
# NOTE(review): concat along axis=1 keeps the union of both indices, so ids
# present on only one side would surface as NaN rows - confirm the id sets match.
df = pd.concat([df, add_vectors(['content', 'highlight', 'title', 'media_desc'])], axis = 1)

In [66]:
# Dimensions after appending the text vector columns.
df.shape

(1090, 1897)

In [77]:
from json import load
import numpy as np
from collections import Counter

In [75]:
# Load the per-lemma values that add_tf_idf divides term counts by.
# JSON text is UTF-8 by specification, so the encoding is pinned here instead
# of falling back to the platform's locale default.
with open('labeling/idf_dict', 'r', encoding='utf-8') as f:
    idf = load(f)
# Key view over the dictionary: iterating it yields the vocabulary in dict order.
idf_voc = idf.keys()

In [73]:
# Lemmatized text of every article, keyed by `id`. Missing cells become ''
# so that add_tf_idf can concatenate and split every field as a string.
lemmas = pd.read_csv('labeling/lemmatized_articles.csv').set_index('id').fillna('')

In [74]:
# Preview a single row of the lemmatized text.
lemmas.head(1)

Unnamed: 0_level_0,content,highlight,media_desc,title
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
24760624,wtorek gdansku wystapil cyrk zalewski udzialem...,mieszkancy osowa chca okno wystepowal cyrk zwi...,,cyrk zwierzetami gdansku mieszkancy oburzony b...


In [104]:
def add_tf_idf(columns):
    """Build one weighted term-count column per vocabulary word.

    The lemmatized text of the given ``columns`` (taken from the notebook-level
    ``lemmas`` frame) is joined per article with single spaces. For every word
    in ``idf_voc`` the result holds the number of occurrences of that word in
    the joined text divided by ``idf[word]``, or 0 when the word is absent.

    Returns a frame indexed like ``lemmas`` with columns named
    ``tf_idf_<word>``, in the iteration order of ``idf_voc``.

    NOTE(review): the count is divided by the stored value rather than
    multiplied by it - confirm what ``labeling/idf_dict`` actually contains.
    """
    # Starting from '' makes the first `+` broadcast over the Series, so the
    # accumulator becomes a Series of per-article strings.
    joined = ''
    for name in columns:
        joined = joined + lemmas[name] + ' '

    def _weights(text):
        # Weighted counts of one article, ordered like the vocabulary.
        occurrences = Counter(text.split(' '))
        return [
            occurrences[word] / idf[word] if word in occurrences else 0
            for word in idf_voc
        ]

    rows = [_weights(text) for text in joined.values]

    result = pd.DataFrame(rows, index=lemmas.index)
    result.columns = ['tf_idf_' + word for word in idf_voc]
    return result

In [107]:
# Append the weighted term-count columns built from all four text fields.
# NOTE(review): concat along axis=1 keeps the union of both indices, so the
# ids of `lemmas` are assumed to match those of `df` - confirm.
df = pd.concat([df, add_tf_idf(['content', 'highlight', 'title', 'media_desc'])], axis = 1)

In [109]:
# Dimensions after appending the tf_idf_* columns.
df.shape

(1090, 11052)

In [110]:
# Dimensions of the label frame, for comparison with the feature frame's row count.
labels_df.shape

(1090, 10)

## Feature selection

## Training

## Validation