In [102]:
# Build author, division and IDF lookups from the raw comment dumps,
# plus per-article labels such as {comment_id: {"rage": 2, "replies": 3, ...}}.
# Also collect the most frequent words so the word2vec file can be filtered.

# out files:
# - author_dict
# - division_dict
# - idf_dict

# - file with the model inputs (x) and labels (y)

In [103]:
from os import listdir
import pandas as pd
from labeling.emotions import rage, joy, sadness, surprise, fear

In [104]:
# Polish month names (genitive form) mapped to their month numbers 1-12.
months = dict(zip(
    (
        'stycznia', 'lutego', 'marca', 'kwietnia',
        'maja', 'czerwca', 'lipca', 'sierpnia',
        'września', 'października', 'listopada', 'grudnia',
    ),
    range(1, 13),
))

In [105]:
def process_date(months, date):
    """Parse a raw publication date into a pandas Timestamp.

    The text before the first ``|`` is read as ``<day> <month name> <year>``
    and the text after it as the time of day, e.g. ``"8 maja 2019 | 07:02"``.

    Parameters
    ----------
    months : mapping of str to int
        Month name -> month number lookup.
    date : str
        Raw date text.

    Returns
    -------
    pandas.Timestamp or None
        ``None`` when the time section is missing or any part of the date
        cannot be parsed (unknown month name, too few fields, bad numbers).
    """
    if not isinstance(date, str):
        return None

    sections = date.split('|')
    if len(sections) < 2:
        # No "| <time>" section.
        return None

    try:
        # Whitespace split tolerates repeated or leading spaces.
        day, month_name, year = sections[0].split()[:3]
        # Zero-padded ISO layout so the parser cannot mix up day and month.
        stamp = "{:04d}-{:02d}-{:02d} {}".format(
            int(year), int(months[month_name]), int(day), sections[1].strip()
        )
        return pd.to_datetime(stamp)
    except (IndexError, KeyError, TypeError, ValueError, OverflowError):
        # Malformed date: report "no date" instead of aborting the whole load.
        return None
    
    

In [106]:
from re import search, sub
from unidecode import unidecode

In [107]:
# One alternation pattern per emotion, built by joining that emotion's
# vocabulary entries with "|". Entries are joined as-is (not escaped), so
# each one acts as a regex fragment.
rage_regex, joy_regex, sadness_regex, surprise_regex, fear_regex = (
    "|".join(vocabulary)
    for vocabulary in (rage, joy, sadness, surprise, fear)
)

In [108]:
def find_emotions(emotions, body):
    """Bump the counter of every emotion whose pattern occurs in `body`.

    `emotions` is a list of five counters ordered rage, joy, sadness,
    surprise, fear. It is updated in place and the same list is returned.
    Each emotion is counted at most once per call, however many times its
    pattern matches.
    """
    patterns = (rage_regex, joy_regex, sadness_regex, surprise_regex, fear_regex)
    for slot, pattern in enumerate(patterns):
        if search(pattern, body) is not None:
            emotions[slot] += 1
    return emotions

In [109]:
def label(comments):
    """Aggregate engagement and emotion counts over an article's comment threads.

    Parameters
    ----------
    comments : list of dict
        Comment threads. Each thread holds a ``'main_comment'`` dict (with
        ``'upvotes'``, ``'downvotes'`` and ``'body'``) and a ``'sub_comments'``
        list.

    Returns
    -------
    list
        ``[replies, upvotes, downvotes, rage, joy, sadness, surprise, fear]``.
        ``replies`` counts main comments plus every sub-comment whose votes
        could be parsed.
    """
    def normalise(text):
        # Transliterate, lowercase, then drop everything except a-z, '|' and space.
        return sub(r"[^a-z| ]", "", unidecode(text).lower())

    replies = 0
    upvotes = 0
    downvotes = 0
    emotions = [0, 0, 0, 0, 0]

    # Iterate the argument, not a notebook global, so the result depends only
    # on what the caller passed in.
    for comment in comments:
        main = comment['main_comment']

        upvotes += int(main['upvotes'])
        downvotes += int(main['downvotes'])
        replies += 1
        emotions = find_emotions(emotions, normalise(main['body']))

        for subcomment in comment['sub_comments']:
            try:
                # Parse both values before touching the totals so a bad
                # 'downvotes' cannot leave a half-applied update.
                sub_upvotes = int(subcomment['upvotes'])
                sub_downvotes = int(subcomment['downvotes'])
            except (KeyError, TypeError, ValueError):
                print("skipping votes of malformed sub-comment:", subcomment)
            else:
                upvotes += sub_upvotes
                downvotes += sub_downvotes
                replies += 1

            # Score the sub-comment's own text rather than the parent's again.
            # NOTE(review): assumes sub-comments store their text under 'body'
            # like main comments do; entries without it are not scored.
            sub_body = subcomment.get('body') if isinstance(subcomment, dict) else None
            if isinstance(sub_body, str):
                emotions = find_emotions(emotions, normalise(sub_body))

    return [replies, upvotes, downvotes, *emotions]

In [110]:
def remove_specials(body):
    """Transliterate `body` with unidecode, lowercase it, and delete every
    character that is not a-z, '|' or a space."""
    lowered = unidecode(body).lower()
    return sub(r"[^a-z| ]", "", lowered)

In [111]:
# Load the stop-word list by splitting the file on newlines.
# The encoding is explicit so non-ASCII words decode identically on every
# platform instead of depending on the locale default.
with open('data/polish.stopwords.txt', 'r', encoding='utf-8') as f:
    stop_words = f.read().split('\n')

In [112]:
def remove_stop_words(body):
    """Split `body` on single spaces and return the tokens that are longer
    than one character and not listed in `stop_words`."""
    kept = []
    for token in body.split(' '):
        if len(token) <= 1 or token in stop_words:
            continue
        kept.append(token)
    return kept

In [113]:
def clean_words(body):
    """Normalise `body` with remove_specials, drop stop words and short
    tokens, and return the remaining tokens joined by single spaces."""
    normalised = remove_specials(body)
    tokens = remove_stop_words(normalised)
    return " ".join(tokens)

In [114]:
# Build one flat record per article that has at least one comment:
# file name, metadata, cleaned text fields, media type, then the label() values.
all_articles = []
# sorted(): listdir() returns names in arbitrary order; a fixed order keeps the
# row order (and anything derived from it) reproducible between runs.
for article in sorted(listdir('data/comments/')):
    with open('data/comments/' + article, encoding='utf-8') as f:
        # SECURITY: eval() executes whatever the file contains. Only run this on
        # trusted dumps; if the files are plain literals, ast.literal_eval is a
        # safer replacement.
        # The name `a` is kept because other cells may reference this global.
        a = eval(f.read())['row']['comments']

    if a['comments'] == []:
        # Nothing to label for articles without comments.
        continue

    row = [
        article,
        a['division'],
        process_date(months, a['pub_date']),
        a['author'].split(',')[0],  # text before the first comma only
        clean_words(a['title']),
        clean_words(a['highlight']),
        clean_words(a['content']),
        clean_words(a['media_desc']),
        a['media_type'],
    ]
    row.extend(label(a['comments']))
    all_articles.append(row)

In [115]:
# Assemble the per-article records into a frame and derive reaction totals.
df = pd.DataFrame(
    all_articles,
    columns=[
        'id', 'div', 'date', 'author',
        'title', 'highlight', 'content', 'media_desc', 'media_type',
        'replies', 'upvotes', 'downvotes',
        'rage', 'joy', 'sadness', 'surprise', 'fear',
    ],
)
# Total votes cast on the article's comments.
df['reactions'] = df['upvotes'].add(df['downvotes'])
# Share of upvotes among all votes, rounded to two decimals
# (NaN where an article has no votes at all).
df['reactions_sentiment'] = df['upvotes'].div(df['reactions']).round(2)

In [116]:
# Preview the first three assembled rows.
df.head(3)

Unnamed: 0,id,div,date,author,title,highlight,content,media_desc,media_type,replies,upvotes,downvotes,rage,joy,sadness,surprise,fear,reactions,reactions_sentiment
0,24760624,cyrk,2019-05-08 07:02:00,Anna Dobiegała,cyrk zwierzetami gdansku mieszkancy oburzeni b...,mieszkancy osowej chca oknami wystepowal cyrk ...,wtorek gdansku wystapil cyrk zalewski udzialem...,,image,5,51,9,0,0,0,0,0,60,0.85
1,24788214,porodówka,2019-05-15 16:30:00,mag,matka obwinia ginekologa smierc coreczki izba ...,natalia zaczela rodzic tygodniu ciazy porodu d...,decyzja rzecznika oznacza lekarz krzysztof pon...,wczesniak inkubatorze gramowej nadii dane traf...,image,5,52,27,0,0,0,0,0,79,0.66
2,24801435,Świat,2019-05-19 08:43:00,Maciej Czarnecki,sensacja australii centroprawica dwoch przegry...,wierzylem cuda powiedzial ogloszeniu wynikow p...,sobotnich wyborach australijczycy wybierali cz...,premier australii scott morrison,image,15,82,27,0,0,0,0,0,109,0.75


In [117]:
# Check column dtypes and non-null counts of the assembled frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1090 entries, 0 to 1089
Data columns (total 19 columns):
id                     1090 non-null object
div                    1090 non-null object
date                   1089 non-null datetime64[ns]
author                 1090 non-null object
title                  1090 non-null object
highlight              1090 non-null object
content                1090 non-null object
media_desc             1090 non-null object
media_type             1090 non-null object
replies                1090 non-null int64
upvotes                1090 non-null int64
downvotes              1090 non-null int64
rage                   1090 non-null int64
joy                    1090 non-null int64
sadness                1090 non-null int64
surprise               1090 non-null int64
fear                   1090 non-null int64
reactions              1090 non-null int64
reactions_sentiment    1073 non-null float64
dtypes: datetime64[ns](1), float64(1), int64(9), object(8)

In [118]:
def create_dict(unique):
    """Map each item of `unique` to its position in the iteration order.

    If an item occurs more than once, its last position wins.
    """
    positions = {}
    for index, item in enumerate(unique):
        positions[item] = index
    return positions

In [119]:
# Integer id for every distinct value of the 'div' column, in order of first appearance.
division_dict = create_dict(df['div'].unique())

In [120]:
# Integer id for every distinct value of the 'author' column, in order of first appearance.
author_dict = create_dict(df['author'].unique())

In [121]:
# Integer id for every distinct value of the 'media_type' column, in order of first appearance.
media_dict = create_dict(df['media_type'].unique())

In [125]:
from json import dump

In [126]:
# Persist the division -> id mapping as JSON.
with open('labeling/division_dict', 'w') as division_file:
    dump(division_dict, fp=division_file)

In [127]:
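# Left disabled: with this line commented out, df keeps the raw division names
# and only the mapping file above is written.
# df['div'] = df['div'].map(division_dict)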

In [128]:
# Persist the author -> id mapping as JSON.
with open('labeling/author_dict', 'w') as author_file:
    dump(author_dict, fp=author_file)

In [129]:
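# Left disabled: with this line commented out, df keeps the raw author names
# and only the mapping file above is written.
# df['author'] = df['author'].map(author_dict)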

In [131]:
# Persist the media type -> id mapping as JSON.
with open('labeling/media_dict', 'w') as media_file:
    dump(media_dict, fp=media_file)

In [132]:
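# Left disabled: with this line commented out, df keeps the raw media type labels
# and only the mapping file above is written.
# df['media_type'] = df['media_type'].map(media_dict)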

In [133]:
# Preview the frame as it will be exported; the categorical columns still hold raw labels.
df.head(3)

Unnamed: 0,id,div,date,author,title,highlight,content,media_desc,media_type,replies,upvotes,downvotes,rage,joy,sadness,surprise,fear,reactions,reactions_sentiment
0,24760624,cyrk,2019-05-08 07:02:00,Anna Dobiegała,cyrk zwierzetami gdansku mieszkancy oburzeni b...,mieszkancy osowej chca oknami wystepowal cyrk ...,wtorek gdansku wystapil cyrk zalewski udzialem...,,image,5,51,9,0,0,0,0,0,60,0.85
1,24788214,porodówka,2019-05-15 16:30:00,mag,matka obwinia ginekologa smierc coreczki izba ...,natalia zaczela rodzic tygodniu ciazy porodu d...,decyzja rzecznika oznacza lekarz krzysztof pon...,wczesniak inkubatorze gramowej nadii dane traf...,image,5,52,27,0,0,0,0,0,79,0.66
2,24801435,Świat,2019-05-19 08:43:00,Maciej Czarnecki,sensacja australii centroprawica dwoch przegry...,wierzylem cuda powiedzial ogloszeniu wynikow p...,sobotnich wyborach australijczycy wybierali cz...,premier australii scott morrison,image,15,82,27,0,0,0,0,0,109,0.75


In [134]:
# Write the labelled frame to CSV without the positional index column.
df.to_csv(path_or_buf='labeling/fresh_data.csv', index=False)