In [1]:
# author, division, idf from raw comments
# and labels like {comment_id: {"rage": 2, "replies": 3 ...}}
# get the most frequent words as well to be able to filter word2vec file

# out files:
# - author_dict
# - division_dict
# - idf_dict

# - file with x and y 

In [21]:
from os import listdir
import pandas as pd
from labeling.emotions import rage, joy, sadness, surprise, fear

In [3]:
months = {
    'stycznia': 1,
    'lutego': 2,
    'marca': 3,
    'kwietnia': 4,
    'maja': 5,
    'czerwca': 6,
    'lipca': 7,
    'sierpnia': 8,
    'września': 9,
    'października': 10,
    'listopada': 11,
    'grudnia': 12
}

In [4]:
def process_date(months, date):
    year = date.split('|')[0].split(' ')[2].strip()
    month = str(months[date.split('|')[0].split(' ')[1].strip()])
    day = date.split('|')[0].split(' ')[0].strip()
    
    hour = date.split('|')[1].strip()
    
    return pd.to_datetime(" ".join([year, month, day, hour]))

In [None]:
from re import search, sub
from unidecode import unidecode

In [35]:
rage_regex = "|".join(rage)
joy_regex = "|".join(joy)
sadness_regex = "|".join(sadness)
surprise_regex = "|".join(surprise)
fear_regex = "|".join(fear)

In [34]:
def find_emotions(emotions, body):
    for i, emotion_regex in enumerate([rage_regex, joy_regex, sadness_regex, surprise_regex, fear_regex]):
        if search(emotion_regex, body):
            emotions[i] += 1
    return emotions

In [63]:
def label(comments):
    replies = 0

    upvotes = 0
    downvotes = 0

    emotions = [0,0,0,0,0]
    for comment in a['comments']:
        main = comment['main_comment']

        upvotes += int(main['upvotes'])
        downvotes += int(main['downvotes'])
        replies += 1

        emotions = find_emotions(emotions, sub(r"[^a-z| ]", "", unidecode(main['body']).lower()))
        for subcomment in comment['sub_comments']:
            upvotes += int(subcomment['upvotes'])
            downvotes += int(subcomment['downvotes'])
            replies += 1
            emotions = find_emotions(emotions, sub(r"[^a-z| ]", "", unidecode(main['body']).lower()))
    return [replies, upvotes, downvotes] + [emotion for emotion in emotions]

In [105]:
def remove_specials(body):
    return sub(r"[^a-z| ]", "", unidecode(body).lower())

In [106]:
with open('data/polish.stopwords.txt', 'r') as f:
    stop_words = f.read().split('\n')

In [107]:
def remove_stop_words(body):
    return [word for word in body.split(' ') if word not in stop_words]

In [117]:
def clean_words(body):
    return " ".join(remove_stop_words(remove_specials(body)))

In [118]:
all_articles =[]
for article in listdir('data/comments/'):
    with open('data/comments/'+article) as f:
        articles = []
        a = eval(f.read())['row']['comments']
        articles.append(article)
        articles.append(a['division'])
        articles.append(process_date(months, a['pub_date']))
        articles.append(a['author'].split(',')[0])
        
        articles.append(clean_words(a['title']))
        articles.append(clean_words(a['highlight']))
        articles.append(clean_words(a['content']))
        articles.append(clean_words(a['media_desc']))
        
        articles.append(a['media_type'])
        for return_label in label(a['comments']):
            articles.append(return_label)
        all_articles.append(articles)

In [119]:
df = pd.DataFrame(all_articles, columns = [
    'id', 'div', 'date', 'author', 'title', 
    'highlight', 'content', 'media_desc', 'media_type',
    'replies', 'upvotes', 'downvotes', 
    'rage', 'joy', 'sadness', 'surprise', 'fear'
])
df['reactions'] = df['upvotes'] + df['downvotes']
df['reactions_sentiment'] = round(df['upvotes'] / df['reactions'], 2)

In [120]:
df.head(3)

Unnamed: 0,id,div,date,author,title,highlight,content,media_desc,media_type,replies,upvotes,downvotes,rage,joy,sadness,surprise,fear,reactions,reactions_sentiment
0,24811329,Kraj,2019-05-22 05:16:00,Agata Kondzińska,teoria spisku pis podejrzewaja ziobro mogl ins...,ujawnieniu wyborcza sprawy dzialek morawieckie...,rozmowcy pis zwracaja uwage publikacja wyborcz...,premier mateusz morawiecki minister sprawiedli...,image,116,2990,106,0,1,0,2,8,3096,0.97
1,24811352,Kraj,2019-05-22 05:21:00,Krzysztof Pacewicz,sondaz drugiej turze wyborow duda wygralby tus...,sondazu prezydenckim andrzej duda pokonuje don...,wynika badan kantaru wyborczej gazetapl tok fm...,marsz europy przeszedl warszawe,video,145,1630,103,0,1,1,1,0,1733,0.94
2,24810476,Świat,2019-05-22 05:25:00,Maria Kruczkowska,legalizacja malzenstw osob jednej plci tajwani...,choc pekin pochwalil nowa ustawe tajpej podejr...,legalizacji malzenstw homoseksualnych pisal pi...,demonstracja srodowisk lgbt tajpej grudnia r,image,7,34,9,0,0,0,0,0,43,0.79


In [125]:
def create_dict(unique):
    return { key: value for key, value in enumerate(unique) }

In [159]:
division_dict = create_dict(df['div'].unique())

In [160]:
author_dict = create_dict(df['author'].unique())

In [133]:
from collections import Counter

In [153]:
bag_of_records = df['title'] + ' ' + df['highlight'] + ' ' + df['content'] + ' ' + df['media_desc']
bag_of_words = [x for x in [row for row in bag_of_records] for x in x.split(' ')]

In [161]:
idf_dict = {word:freq for word, freq in dict(Counter(bag_of_words)).items() if freq > 3}

In [162]:
from json import dump

In [164]:
with open('labeling/division_dict', 'w') as f:
    dump(division_dict, f)

In [165]:
with open('labeling/author_dict', 'w') as f:
    dump(author_dict, f)

In [166]:
with open('labeling/idf_dict', 'w') as f:
    dump(idf_dict, f)

In [167]:
df.to_csv('labeling/fresh_data.csv')