In [1]:
# author, division, idf from raw comments
# and labels like {comment_id: {"rage": 2, "replies": 3 ...}}
# get the most frequent words as well to be able to filter word2vec file

# out files:
# - author_dict
# - division_dict
# - idf_dict

# - file with x and y 

In [2]:
from os import listdir
import pandas as pd
from labeling.emotions import rage, joy, sadness, surprise, fear

In [3]:
months = {
    'stycznia': 1,
    'lutego': 2,
    'marca': 3,
    'kwietnia': 4,
    'maja': 5,
    'czerwca': 6,
    'lipca': 7,
    'sierpnia': 8,
    'września': 9,
    'października': 10,
    'listopada': 11,
    'grudnia': 12
}

In [4]:
def process_date(months, date):
    year = date.split('|')[0].split(' ')[2].strip()
    month = str(months[date.split('|')[0].split(' ')[1].strip()])
    day = date.split('|')[0].split(' ')[0].strip()
    
    try:
        hour = date.split('|')[1].strip()
        return pd.to_datetime(" ".join([year, month, day, hour]))
    except:
        return None
    
    

In [5]:
from re import search, sub
from unidecode import unidecode

In [6]:
rage_regex = "|".join(rage)
joy_regex = "|".join(joy)
sadness_regex = "|".join(sadness)
surprise_regex = "|".join(surprise)
fear_regex = "|".join(fear)

In [7]:
def find_emotions(emotions, body):
    for i, emotion_regex in enumerate([rage_regex, joy_regex, sadness_regex, surprise_regex, fear_regex]):
        if search(emotion_regex, body):
            emotions[i] += 1
    return emotions

In [8]:
def label(comments):
    replies = 0

    upvotes = 0
    downvotes = 0

    emotions = [0,0,0,0,0]
    for comment in a['comments']:
        main = comment['main_comment']

        upvotes += int(main['upvotes'])
        downvotes += int(main['downvotes'])
        replies += 1

        emotions = find_emotions(emotions, sub(r"[^a-z| ]", "", unidecode(main['body']).lower()))
        for subcomment in comment['sub_comments']:
            try:
                upvotes += int(subcomment['upvotes'])
                downvotes += int(subcomment['downvotes'])
                replies += 1
            except:
                return None
            emotions = find_emotions(emotions, sub(r"[^a-z| ]", "", unidecode(main['body']).lower()))
    return [replies, upvotes, downvotes] + [emotion for emotion in emotions]

In [9]:
def remove_specials(body):
    return sub(r"[^a-z| ]", "", unidecode(body).lower())

In [10]:
with open('data/polish.stopwords.txt', 'r') as f:
    stop_words = f.read().split('\n')

In [11]:
def remove_stop_words(body):
    return [word for word in body.split(' ') if word not in stop_words and len(word) > 1]

In [12]:
def clean_words(body):
    return " ".join(remove_stop_words(remove_specials(body)))

In [13]:
all_articles =[]
for article in listdir('data/comments/'):
    with open('data/comments/'+article) as f:
        articles = []
        a = eval(f.read())['row']['comments']
        if a['comments'] != []:
        
            articles.append(article)
            articles.append(a['division'])
            articles.append(process_date(months, a['pub_date']))
            articles.append(a['author'].split(',')[0])

            articles.append(clean_words(a['title']))
            articles.append(clean_words(a['highlight']))
            articles.append(clean_words(a['content']))
            articles.append(clean_words(a['media_desc']))

            articles.append(a['media_type'])
            labels = label(a['comments'])
            if labels:
                for return_label in labels:
                    articles.append(return_label)
                all_articles.append(articles)

In [14]:
df = pd.DataFrame(all_articles, columns = [
    'id', 'div', 'date', 'author', 'title', 
    'highlight', 'content', 'media_desc', 'media_type',
    'replies', 'upvotes', 'downvotes', 
    'rage', 'joy', 'sadness', 'surprise', 'fear'
])
df['reactions'] = df['upvotes'] + df['downvotes']
df['reactions_sentiment'] = round(df['upvotes'] / df['reactions'], 2)

In [15]:
len(df)

14412

In [16]:
df.head(3)

Unnamed: 0,id,div,date,author,title,highlight,content,media_desc,media_type,replies,upvotes,downvotes,rage,joy,sadness,surprise,fear,reactions,reactions_sentiment
0,24162636,Wiadomości z Poznania,2018-11-13 12:31:00,Tomasz Cylka,poznaniu powroci ul lutego wojewoda zbigniew h...,ostateczna decyzja nazwa ul lutego wrocic mape...,rok temu wojewoda wielkopolski zbigniew hoffma...,poznaniu najwieksze kontrowersje wzbudzila zmi...,image,10,123,0,0,1,0,0,0,123,1.0
1,24573799,brexit,2019-03-22 10:02:00,Tomasz Bielecki,merkel macrona chcesz zebysmy przeszli histori...,ue odsunie brexit maja izba gmin poprze umowe ...,szef rady europejskiej donald tusk oglosil dec...,brexitowy uber jada brytyjczycy,video,98,655,297,0,0,2,0,0,952,0.69
2,24760624,cyrk,2019-05-08 07:02:00,Anna Dobiegała,cyrk zwierzetami gdansku mieszkancy oburzeni b...,mieszkancy osowej chca oknami wystepowal cyrk ...,wtorek gdansku wystapil cyrk zalewski udzialem...,,image,5,51,9,0,0,0,0,0,60,0.85


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14412 entries, 0 to 14411
Data columns (total 19 columns):
id                     14412 non-null object
div                    14412 non-null object
date                   14411 non-null datetime64[ns]
author                 14412 non-null object
title                  14412 non-null object
highlight              14412 non-null object
content                14412 non-null object
media_desc             14412 non-null object
media_type             14412 non-null object
replies                14412 non-null int64
upvotes                14412 non-null int64
downvotes              14412 non-null int64
rage                   14412 non-null int64
joy                    14412 non-null int64
sadness                14412 non-null int64
surprise               14412 non-null int64
fear                   14412 non-null int64
reactions              14412 non-null int64
reactions_sentiment    14131 non-null float64
dtypes: datetime64[ns](1), float64(1

In [18]:
def create_dict(unique):
    return { key: value for value, key in enumerate(unique) }

In [19]:
division_dict = create_dict(df['div'].unique())

In [20]:
author_dict = create_dict(df['author'].unique())

In [21]:
media_dict = create_dict(df['media_type'].unique())

In [22]:
from json import dump

In [23]:
with open('labeling/division_dict', 'w') as f:
    dump(division_dict, f)

In [24]:
# df['div'] = df['div'].map(division_dict)

In [25]:
with open('labeling/author_dict', 'w') as f:
    dump(author_dict, f)

In [26]:
# df['author'] = df['author'].map(author_dict)

In [27]:
with open('labeling/media_dict', 'w') as f:
    dump(media_dict, f)

In [28]:
# df['media_type'] = df['media_type'].map(media_dict)

In [29]:
df.head(3)

Unnamed: 0,id,div,date,author,title,highlight,content,media_desc,media_type,replies,upvotes,downvotes,rage,joy,sadness,surprise,fear,reactions,reactions_sentiment
0,24162636,Wiadomości z Poznania,2018-11-13 12:31:00,Tomasz Cylka,poznaniu powroci ul lutego wojewoda zbigniew h...,ostateczna decyzja nazwa ul lutego wrocic mape...,rok temu wojewoda wielkopolski zbigniew hoffma...,poznaniu najwieksze kontrowersje wzbudzila zmi...,image,10,123,0,0,1,0,0,0,123,1.0
1,24573799,brexit,2019-03-22 10:02:00,Tomasz Bielecki,merkel macrona chcesz zebysmy przeszli histori...,ue odsunie brexit maja izba gmin poprze umowe ...,szef rady europejskiej donald tusk oglosil dec...,brexitowy uber jada brytyjczycy,video,98,655,297,0,0,2,0,0,952,0.69
2,24760624,cyrk,2019-05-08 07:02:00,Anna Dobiegała,cyrk zwierzetami gdansku mieszkancy oburzeni b...,mieszkancy osowej chca oknami wystepowal cyrk ...,wtorek gdansku wystapil cyrk zalewski udzialem...,,image,5,51,9,0,0,0,0,0,60,0.85


In [30]:
df.to_csv('labeling/fresh_data.csv', index = False)