In [13]:
import csv
import pandas as pd
from rutermextract import TermExtractor
from pymystem3 import Mystem
from dostoevsky.tokenization import RegexTokenizer
from dostoevsky.models import FastTextSocialNetworkModel
import fasttext

# Shared NLP helpers, created once and reused by the cells below:
# `mystem` is used for lemmatization, `term_extractor` for term extraction.
mystem = Mystem()
term_extractor = TermExtractor()

# Path of the ';'-delimited comments file passed to read_and_prepare().
all_c = 'comments_all.csv'

In [14]:
def read_and_prepare(file, encoding=None):
    """Read a ';'-delimited CSV file and return its non-blank cells as a flat list.

    Rows are read in file order and their cells are concatenated left to
    right. Cells that are exactly the empty string or a single space are
    dropped; every other cell is kept unchanged.

    Args:
        file: Path of the CSV file to read.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default encoding, as before.

    Returns:
        list[str]: All remaining cells of the file, in reading order.

    Raises:
        OSError: If the file cannot be opened.
    """
    # `with` closes the file even if parsing fails part-way through;
    # newline='' is what the csv module requires so that newlines inside
    # quoted fields are parsed correctly.
    with open(file, newline='', encoding=encoding) as comments_file:
        return [
            cell
            for row in csv.reader(comments_file, delimiter=';')
            for cell in row
            if cell not in ('', ' ')
        ]

In [15]:
def trans_num_into_sent(number, threshold=2):
    """Translate a numeric sentiment score into a sentiment label.

    Args:
        number: The score; anything ``float()`` accepts (number or numeric
            string).
        threshold: Half-width of the neutral band. Scores inside
            ``[-threshold, threshold]`` (inclusive) are neutral. Defaults to
            2, the value that used to be hard-coded.

    Returns:
        str: ``'neutral'`` inside the band, ``'positive'`` above it,
        ``'negative'`` otherwise.

    Raises:
        ValueError, TypeError: If ``number`` cannot be converted by ``float()``.
    """
    score = float(number)  # convert once instead of once per comparison
    if -threshold <= score <= threshold:
        return 'neutral'
    if score > threshold:
        return 'positive'
    # Everything else lands here, including NaN (it fails both comparisons
    # above), exactly as in the previous implementation.
    return 'negative'

In [16]:
# Flat list of all non-blank cells from the comments CSV; each entry is
# treated as one comment by the cells below.
user_comments = read_and_prepare(all_c)

## Keyword and key-phrase extraction for the whole corpus

In [17]:
# Merge every comment into one '. '-separated text, lemmatize it, and glue
# the pieces returned by the lemmatizer back into a single string.
comments_as_text = '. '.join(user_comments)
comments_as_text_lemm = ''.join(mystem.lemmatize(comments_as_text))

In [18]:
# Extract terms from the lemmatized corpus in a single pass, keeping each
# term's normalized form and its count.
term_pairs = [
    (str(term.normalized), int(term.count))
    for term in term_extractor(comments_as_text_lemm, nested=True)
]
terms_lemmas = [lemma for lemma, _ in term_pairs]
terms_count = [count for _, count in term_pairs]

# Placeholder columns: scores start at 0.0 and labels at None; later cells
# fill them in.
terms_sentiments_score = [0.0] * len(terms_lemmas)
terms_sentiments = [None] * len(terms_lemmas)

df = pd.DataFrame(
    data={
        'terms': terms_lemmas,
        'terms_count': terms_count,
        'terms_sentiments_score': terms_sentiments_score,
        'terms_sentiments': terms_sentiments,
    }
)

## Calculation of sentiments for each keyword using Dostoevsky

In [19]:
# Replace fasttext's `eprint` with a no-op so that it prints nothing
# (presumably to silence a warning emitted when the model loads — TODO confirm).
fasttext.FastText.eprint = lambda x: None
tokenizer = RegexTokenizer()
model = FastTextSocialNetworkModel(tokenizer=tokenizer)

# One prediction per comment. The scoring loop reads each result as a mapping
# and uses only its first label/score pair (k=1 presumably limits the result
# to the single top label — verify against the dostoevsky docs).
sentiments = model.predict(user_comments, k=1)

In [20]:
# Accumulate a signed sentiment score per term: each comment predicted as
# 'positive' adds its prediction score to every known term it contains, and
# each 'negative' comment subtracts it. Other labels leave scores untouched.

# Start from zero so re-running this cell does not add onto earlier results.
term_scores = [0.0] * len(df)
# One-time term -> row position lookup; replaces two full scans of the
# 'terms' column for every term of every comment.
term_position = {term: pos for pos, term in enumerate(df['terms'])}

for comment, prediction in zip(user_comments, sentiments):
    # Only the first label/score pair of the prediction is used.
    label, probability = next(iter(prediction.items()), (None, None))

    if label == 'positive':
        signed_score = float(probability)
    elif label == 'negative':
        signed_score = -float(probability)
    else:
        # 'neutral', 'skip', 'speech' (or anything else) never changes a
        # score, so skip lemmatization and term extraction for this comment.
        continue

    comment_terms = ''.join(mystem.lemmatize(comment))
    comment_terms = term_extractor(comment_terms, nested=True)

    for c_term in comment_terms:
        pos = term_position.get(str(c_term.normalized))
        if pos is not None:
            # Round after every step (not once at the end) so the results
            # match the original step-by-step accumulation.
            term_scores[pos] = round(term_scores[pos] + signed_score, 3)

df['terms_sentiments_score'] = term_scores

In [None]:
# Convert each accumulated score into its sentiment label, store the labels
# in the 'terms_sentiments' column, then print the resulting table.
score_labels = df['terms_sentiments_score'].apply(trans_num_into_sent)
df['terms_sentiments'] = score_labels
print(df)

In [22]:
# Write the term table to a UTF-8 CSV in the working directory; index=False
# leaves the DataFrame index out of the file.
df.to_csv('terms_all_c_dostoevsky.csv', encoding='utf-8', index=False)