In [1]:
# author, division, idf from raw comments
# and labels like {comment_id: {"rage": 2, "replies": 3 ...}}
# get the most frequent words as well to be able to filter word2vec file

# out files:
# - author_dict
# - division_dict
# - idf_dict

# - file with x and y 

In [2]:
from os import listdir
import pandas as pd
from labeling.emotions import rage, joy, sadness, surprise, fear

In [3]:
months = {
    'stycznia': 1,
    'lutego': 2,
    'marca': 3,
    'kwietnia': 4,
    'maja': 5,
    'czerwca': 6,
    'lipca': 7,
    'sierpnia': 8,
    'września': 9,
    'października': 10,
    'listopada': 11,
    'grudnia': 12
}

In [4]:
def process_date(months, date):
    year = date.split('|')[0].split(' ')[2].strip()
    month = str(months[date.split('|')[0].split(' ')[1].strip()])
    day = date.split('|')[0].split(' ')[0].strip()
    
    try:
        hour = date.split('|')[1].strip()
        return pd.to_datetime(" ".join([year, month, day, hour]))
    except:
        return None
    
    

In [5]:
from re import search, sub
from unidecode import unidecode

In [7]:
more_emotion_keywords = pd.read_csv('labeling/nawl-analysis.csv')

In [None]:
# H - joy
# A - rage
# S - sadness
# D - surprise
# F - fear

In [21]:
Hapiness = more_emotion_keywords[more_emotion_keywords['category'] == 'H']['word'].values.tolist()
Anger = more_emotion_keywords[more_emotion_keywords['category'] == 'A']['word'].values.tolist()
Sadness = more_emotion_keywords[more_emotion_keywords['category'] == 'S']['word'].values.tolist()
Disgust = more_emotion_keywords[more_emotion_keywords['category'] == 'D']['word'].values.tolist()
Fear = more_emotion_keywords[more_emotion_keywords['category'] == 'F']['word'].values.tolist()

In [22]:
rage_regex = "|".join(rage + Anger)
joy_regex = "|".join(joy + Hapiness)
sadness_regex = "|".join(sadness + Sadness)
surprise_regex = "|".join(surprise + Disgust)
fear_regex = "|".join(fear + Fear)

In [23]:
def find_emotions(emotions, body):
    for i, emotion_regex in enumerate([rage_regex, joy_regex, sadness_regex, surprise_regex, fear_regex]):
        if search(emotion_regex, body):
            emotions[i] += 1
    return emotions

In [24]:
def label(comments):
    replies = 0

    upvotes = 0
    downvotes = 0

    emotions = [0,0,0,0,0]
    for comment in a['comments']:
        main = comment['main_comment']

        upvotes += int(main['upvotes'])
        downvotes += int(main['downvotes'])
        replies += 1

        emotions = find_emotions(emotions, sub(r"[^a-z| ]", "", unidecode(main['body']).lower()))
        for subcomment in comment['sub_comments']:
            try:
                upvotes += int(subcomment['upvotes'])
                downvotes += int(subcomment['downvotes'])
                replies += 1
            except:
                return None
            emotions = find_emotions(emotions, sub(r"[^a-z| ]", "", unidecode(main['body']).lower()))
    return [replies, upvotes, downvotes] + [emotion for emotion in emotions]

In [25]:
def remove_specials(body):
    return sub(r"[^a-z| ]", "", unidecode(body).lower())

In [26]:
with open('data/polish.stopwords.txt', 'r') as f:
    stop_words = f.read().split('\n')

In [27]:
def remove_stop_words(body):
    return [word for word in body.split(' ') if word not in stop_words and len(word) > 1]

In [28]:
def clean_words(body):
    return " ".join(remove_stop_words(remove_specials(body)))

In [29]:
all_articles =[]
for article in listdir('data/comments/'):
    with open('data/comments/'+article) as f:
        articles = []
        a = eval(f.read())['row']['comments']
        if a['comments'] != []:
        
            articles.append(article)
            articles.append(a['division'])
            articles.append(process_date(months, a['pub_date']))
            articles.append(a['author'].split(',')[0])

            articles.append(clean_words(a['title']))
            articles.append(clean_words(a['highlight']))
            articles.append(clean_words(a['content']))
            articles.append(clean_words(a['media_desc']))

            articles.append(a['media_type'])
            labels = label(a['comments'])
            if labels:
                for return_label in labels:
                    articles.append(return_label)
                all_articles.append(articles)

In [30]:
df = pd.DataFrame(all_articles, columns = [
    'id', 'div', 'date', 'author', 'title', 
    'highlight', 'content', 'media_desc', 'media_type',
    'replies', 'upvotes', 'downvotes', 
    'rage', 'joy', 'sadness', 'surprise', 'fear'
])
df['reactions'] = df['upvotes'] + df['downvotes']
df['reactions_sentiment'] = round(df['upvotes'] / df['reactions'], 2)

In [31]:
len(df)

20274

In [32]:
df.head(3)

Unnamed: 0,id,div,date,author,title,highlight,content,media_desc,media_type,replies,upvotes,downvotes,rage,joy,sadness,surprise,fear,reactions,reactions_sentiment
0,23632593,ZUS i emerytura,2018-07-05 06:02:00,Leszek Kostrzewski,nik alarmuje polacy maja zanizone emerytury,tysiace polakow zanizone emerytury wystapilo z...,najwyzsza izba kontroli zwraca uwage problem z...,osob maja dopisanych skladek emerytalnych czas...,image,2,11,0,0,0,0,0,1,11,1.0
1,23398351,Świat,2018-05-14 15:45:00,Maciej Stasiński,katalonia premiera parlament barcelonie przegl...,nowy premier obiecuje dazyc oderwania kataloni...,kryzys katalonski zazegnany parlament barcelon...,parlament barcelonie zatwierdzil quima torre n...,image,5,2,5,0,4,0,4,0,7,0.29
2,23602572,Nauka,2018-06-27 15:37:00,Adam Wajrak,wajrak straszcie wilkami,pojawily pierwsze informacje pogryzieniu wilka...,mysliwi zwykle trafili kula plotwiemy przynajm...,bieszczady wilk zaatakowal dzieci mial kontakt...,image,9,210,13,0,0,0,0,2,223,0.94


In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20274 entries, 0 to 20273
Data columns (total 19 columns):
id                     20274 non-null object
div                    20274 non-null object
date                   20272 non-null datetime64[ns]
author                 20274 non-null object
title                  20274 non-null object
highlight              20274 non-null object
content                20274 non-null object
media_desc             20274 non-null object
media_type             20274 non-null object
replies                20274 non-null int64
upvotes                20274 non-null int64
downvotes              20274 non-null int64
rage                   20274 non-null int64
joy                    20274 non-null int64
sadness                20274 non-null int64
surprise               20274 non-null int64
fear                   20274 non-null int64
reactions              20274 non-null int64
reactions_sentiment    19797 non-null float64
dtypes: datetime64[ns](1), float64(1

In [35]:
def create_dict(unique):
    return { key: value for value, key in enumerate(unique) }

In [36]:
division_dict = create_dict(df['div'].unique())

In [37]:
author_dict = create_dict(df['author'].unique())

In [38]:
media_dict = create_dict(df['media_type'].unique())

In [39]:
from json import dump

In [40]:
with open('labeling/division_dict', 'w') as f:
    dump(division_dict, f)

In [41]:
# df['div'] = df['div'].map(division_dict)

In [42]:
with open('labeling/author_dict', 'w') as f:
    dump(author_dict, f)

In [43]:
# df['author'] = df['author'].map(author_dict)

In [44]:
with open('labeling/media_dict', 'w') as f:
    dump(media_dict, f)

In [45]:
# df['media_type'] = df['media_type'].map(media_dict)

In [46]:
df.head(3)

Unnamed: 0,id,div,date,author,title,highlight,content,media_desc,media_type,replies,upvotes,downvotes,rage,joy,sadness,surprise,fear,reactions,reactions_sentiment
0,23632593,ZUS i emerytura,2018-07-05 06:02:00,Leszek Kostrzewski,nik alarmuje polacy maja zanizone emerytury,tysiace polakow zanizone emerytury wystapilo z...,najwyzsza izba kontroli zwraca uwage problem z...,osob maja dopisanych skladek emerytalnych czas...,image,2,11,0,0,0,0,0,1,11,1.0
1,23398351,Świat,2018-05-14 15:45:00,Maciej Stasiński,katalonia premiera parlament barcelonie przegl...,nowy premier obiecuje dazyc oderwania kataloni...,kryzys katalonski zazegnany parlament barcelon...,parlament barcelonie zatwierdzil quima torre n...,image,5,2,5,0,4,0,4,0,7,0.29
2,23602572,Nauka,2018-06-27 15:37:00,Adam Wajrak,wajrak straszcie wilkami,pojawily pierwsze informacje pogryzieniu wilka...,mysliwi zwykle trafili kula plotwiemy przynajm...,bieszczady wilk zaatakowal dzieci mial kontakt...,image,9,210,13,0,0,0,0,2,223,0.94


In [47]:
df.to_csv('labeling/fresh_data.csv', index = False)