In [1]:
from os import listdir
import pandas as pd
from labeling.emotions import rage, joy, sadness, surprise, fear

In [2]:
months = {
    'stycznia': 1,
    'lutego': 2,
    'marca': 3,
    'kwietnia': 4,
    'maja': 5,
    'czerwca': 6,
    'lipca': 7,
    'sierpnia': 8,
    'września': 9,
    'października': 10,
    'listopada': 11,
    'grudnia': 12
}

In [3]:
def process_date(months, date):
    year = date.split('|')[0].split(' ')[2].strip()
    month = str(months[date.split('|')[0].split(' ')[1].strip()])
    day = date.split('|')[0].split(' ')[0].strip()
    
    try:
        hour = date.split('|')[1].strip()
        return pd.to_datetime(" ".join([year, month, day, hour]))
    except:
        return None
    
    

In [4]:
from re import search, sub
from unidecode import unidecode

In [5]:
# http://exp.lobi.nencki.gov.pl/nawl-analysis
more_emotion_keywords = pd.read_csv('labeling/nawl-analysis.csv')

In [6]:
# H - joy
# A - rage
# S - sadness
# D - surprise
# F - fear

In [7]:
Hapiness = more_emotion_keywords[more_emotion_keywords['category'] == 'H']['word'].values.tolist()
Anger = more_emotion_keywords[more_emotion_keywords['category'] == 'A']['word'].values.tolist()
Sadness = more_emotion_keywords[more_emotion_keywords['category'] == 'S']['word'].values.tolist()
Disgust = more_emotion_keywords[more_emotion_keywords['category'] == 'D']['word'].values.tolist()
Fear = more_emotion_keywords[more_emotion_keywords['category'] == 'F']['word'].values.tolist()

In [8]:
rage_regex = "|".join(rage + Anger)
joy_regex = "|".join(joy + Hapiness)
sadness_regex = "|".join(sadness + Sadness)
surprise_regex = "|".join(surprise + Disgust)
fear_regex = "|".join(fear + Fear)

In [9]:
def find_emotions(emotions, body):
    for i, emotion_regex in enumerate([rage_regex, joy_regex, sadness_regex, surprise_regex, fear_regex]):
        if search(emotion_regex, body):
            emotions[i] += 1
    return emotions

In [10]:
def label(comments):
    replies = 0

    upvotes = 0
    downvotes = 0

    emotions = [0,0,0,0,0]
    for comment in a['comments']:
        main = comment['main_comment']

        upvotes += int(main['upvotes'])
        downvotes += int(main['downvotes'])
        replies += 1

        emotions = find_emotions(emotions, sub(r"[^a-z| ]", "", unidecode(main['body']).lower()))
        for subcomment in comment['sub_comments']:
            try:
                upvotes += int(subcomment['upvotes'])
                downvotes += int(subcomment['downvotes'])
                replies += 1
            except:
                return None
            emotions = find_emotions(emotions, sub(r"[^a-z| ]", "", unidecode(main['body']).lower()))
    return [replies, upvotes, downvotes] + [emotion for emotion in emotions]

In [11]:
def remove_specials(body):
    return sub(r"[^a-z| ]", "", unidecode(body).lower())

In [12]:
with open('data/polish.stopwords.txt', 'r') as f:
    stop_words = f.read().split('\n')

In [13]:
def remove_stop_words(body):
    return [word for word in body.split(' ') if word not in stop_words and len(word) > 1]

In [14]:
def clean_words(body):
    return " ".join(remove_stop_words(remove_specials(body)))

In [15]:
all_articles =[]
for article in listdir('data/comments/'):
    with open('data/comments/'+article) as f:
        articles = []
        a = eval(f.read())['row']['comments']
        if a['comments'] != []:
        
            articles.append(article)
            articles.append(a['division'])
            articles.append(process_date(months, a['pub_date']))
            articles.append(a['author'].split(',')[0])

            articles.append(clean_words(a['title']))
            articles.append(clean_words(a['highlight']))
            articles.append(clean_words(a['content']))
            articles.append(clean_words(a['media_desc']))

            articles.append(a['media_type'])
            labels = label(a['comments'])
            if labels:
                for return_label in labels:
                    articles.append(return_label)
                all_articles.append(articles)

In [16]:
df = pd.DataFrame(all_articles, columns = [
    'id', 'div', 'date', 'author', 'title', 
    'highlight', 'content', 'media_desc', 'media_type',
    'replies', 'upvotes', 'downvotes', 
    'rage', 'joy', 'sadness', 'surprise', 'fear'
])
df['reactions'] = df['upvotes'] + df['downvotes']
df['reactions_sentiment'] = round(df['upvotes'] / df['reactions'], 2)

In [17]:
len(df)

35233

In [18]:
df.head(3)

Unnamed: 0,id,div,date,author,title,highlight,content,media_desc,media_type,replies,upvotes,downvotes,rage,joy,sadness,surprise,fear,reactions,reactions_sentiment
0,24247530,dekomunizacja ulic,2018-12-06 13:00:00,Jarosław Osowski,czas wyczekiwany final dekomunizacji ulic piat...,naczelny sad administracyjny rozny finalizuje ...,piatkowej wokandzie nsa figuruje nazw stoleczn...,marzec zakladanie nowych tablic dawnej al armi...,image,7,44,1,0,2,0,0,2,45,0.98
1,24593465,CBA,2019-03-28 13:40:00,mch,cba zatrzymalo pieciu podkarpackich przedsiebi...,rzeszowska delegatura centralnego biura antyko...,mezczyzni dzialali zorganizowanej grupie przes...,cba,image,3,9,0,0,0,0,0,0,9,1.0
2,24376781,Kraj,2019-01-18 14:17:00,Antonio Tajani,swieca pawla wyprzec ciemnosc,stanie pogodzic mowa nienawisci rozprzestrzeni...,kazde morderstwo tragedia pawel adamowicz wied...,prezydent miasta pawel adamowicz zyje,image,28,567,10,0,3,0,0,8,577,0.98


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35233 entries, 0 to 35232
Data columns (total 19 columns):
id                     35233 non-null object
div                    35233 non-null object
date                   35231 non-null datetime64[ns]
author                 35233 non-null object
title                  35233 non-null object
highlight              35233 non-null object
content                35233 non-null object
media_desc             35233 non-null object
media_type             35233 non-null object
replies                35233 non-null int64
upvotes                35233 non-null int64
downvotes              35233 non-null int64
rage                   35233 non-null int64
joy                    35233 non-null int64
sadness                35233 non-null int64
surprise               35233 non-null int64
fear                   35233 non-null int64
reactions              35233 non-null int64
reactions_sentiment    29853 non-null float64
dtypes: datetime64[ns](1), float64(1

In [20]:
def create_dict(unique):
    return { key: value for value, key in enumerate(unique) }

In [21]:
division_dict = create_dict(df['div'].unique())

In [22]:
author_dict = create_dict(df['author'].unique())

In [23]:
media_dict = create_dict(df['media_type'].unique())

In [24]:
from json import dump

In [25]:
with open('labeling/division_dict', 'w') as f:
    dump(division_dict, f)

In [26]:
# df['div'] = df['div'].map(division_dict)

In [27]:
with open('labeling/author_dict', 'w') as f:
    dump(author_dict, f)

In [28]:
# df['author'] = df['author'].map(author_dict)

In [29]:
with open('labeling/media_dict', 'w') as f:
    dump(media_dict, f)

In [30]:
# df['media_type'] = df['media_type'].map(media_dict)

In [31]:
df.head(3)

Unnamed: 0,id,div,date,author,title,highlight,content,media_desc,media_type,replies,upvotes,downvotes,rage,joy,sadness,surprise,fear,reactions,reactions_sentiment
0,24247530,dekomunizacja ulic,2018-12-06 13:00:00,Jarosław Osowski,czas wyczekiwany final dekomunizacji ulic piat...,naczelny sad administracyjny rozny finalizuje ...,piatkowej wokandzie nsa figuruje nazw stoleczn...,marzec zakladanie nowych tablic dawnej al armi...,image,7,44,1,0,2,0,0,2,45,0.98
1,24593465,CBA,2019-03-28 13:40:00,mch,cba zatrzymalo pieciu podkarpackich przedsiebi...,rzeszowska delegatura centralnego biura antyko...,mezczyzni dzialali zorganizowanej grupie przes...,cba,image,3,9,0,0,0,0,0,0,9,1.0
2,24376781,Kraj,2019-01-18 14:17:00,Antonio Tajani,swieca pawla wyprzec ciemnosc,stanie pogodzic mowa nienawisci rozprzestrzeni...,kazde morderstwo tragedia pawel adamowicz wied...,prezydent miasta pawel adamowicz zyje,image,28,567,10,0,3,0,0,8,577,0.98


In [32]:
df.to_csv('labeling/fresh_data.csv', index = False)