In [5]:
import json
import numpy as np
from pprint import pprint
from collections import namedtuple
from random import shuffle
with open('./data.json', encoding='utf-8') as f:
    content = f.readlines()
data = [json.loads(line, object_hook=lambda d: namedtuple('X', d.keys())(*d.values())) for line in content]
shuffle(data)

In [6]:
extracted_data = [(int(x.grade),x.text) for x in data]
divider = int(round(0.7 * len(extracted_data)))
learn_data = extracted_data[:divider]
test_data = extracted_data[divider:]

In [7]:
def convert_grade(grade):
    if grade == 3 : return 0
    if grade > 3 : return 1
    if grade < 3 : return -1

def process_data(data):
    texts = []
    grades = []
    for grade,text in data:
        texts.append(text)
        grades.append(convert_grade(grade))
    return texts, grades

texts,grades = process_data(learn_data)
test_texts,test_grades = process_data(test_data)

In [8]:
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report


In [9]:
cv = CountVectorizer()
X_data = cv.fit_transform(texts).toarray()
cNB = GaussianNB().fit(X_data,grades)
X_test_data = cv.transform(test_texts).toarray()
predicted_n = cNB.predict(X_test_data)
print (classification_report(test_grades, predicted_n))

             precision    recall  f1-score   support

         -1       0.09      0.07      0.08        27
          0       0.07      0.05      0.06        56
          1       0.87      0.90      0.89       575

avg / total       0.77      0.79      0.78       658



In [10]:
import pandas as pd
vocab = pd.read_csv("https://raw.githubusercontent.com/lang-uk/tone-dict-uk/master/tone-dict-uk-manual.tsv",
                    header=None, usecols=[0,1], encoding='utf-8', sep='\t')
vocab = set([(x,abs(y)) for x,y in zip(vocab[0].tolist(),vocab[1].tolist()) if y != 0])
sent_dict = dict(vocab)

In [19]:
count_vect = CountVectorizer()
X_words = count_vect.fit_transform(texts).toarray()

In [20]:
features = count_vect.get_feature_names()

In [24]:
def update_features(words_count_array, features, sent_dict):
    for row in words_count_array:
        for ind in range(0,len(features)):
            sent_coeff = sent_dict.get(features[ind], None)
            if sent_coeff != None:
                row[ind] = row[ind] * sent_coeff * 100

update_features(X_words, features, sent_dict)

In [25]:
clf = GaussianNB().fit(X_words, grades)

In [26]:
X_test_words = count_vect.transform(test_texts).toarray()
update_features(X_test_words, features, sent_dict)
predicted_3 = clf.predict(X_test_words)
print (classification_report(test_grades, predicted_3))

             precision    recall  f1-score   support

         -1       0.07      0.07      0.07        27
          0       0.08      0.75      0.15        56
          1       0.91      0.18      0.31       575

avg / total       0.81      0.23      0.28       658



In [27]:
from sklearn.svm import LinearSVC
linearSVC_pipeline = Pipeline([('vectorizer', CountVectorizer()), ('classifier', LinearSVC())])
linearSVC_pipeline.fit(texts, grades)
predicted_linear = linearSVC_pipeline.predict(test_texts)
print (classification_report(test_grades, predicted_linear))

             precision    recall  f1-score   support

         -1       0.33      0.22      0.27        27
          0       0.35      0.11      0.16        56
          1       0.90      0.97      0.93       575

avg / total       0.83      0.87      0.84       658



In [39]:
from sklearn.svm import LinearSVC
linearSVC_alt_pipeline = Pipeline([('vectorizer', CountVectorizer(ngram_range=(1, 3), analyzer='word')), ('classifier', LinearSVC())])
linearSVC_alt_pipeline.fit(texts, grades)
predicted_alt_linear = linearSVC_alt_pipeline.predict(test_texts)
print (classification_report(test_grades, predicted_alt_linear))

             precision    recall  f1-score   support

         -1       0.14      0.04      0.06        27
          0       0.29      0.04      0.06        56
          1       0.88      0.99      0.93       575

avg / total       0.80      0.87      0.82       658



In [53]:
def process_data_balanced(data):
    texts = []
    grades = []
    for grade,text in data:
        converted_g = convert_grade(grade)
        texts.append(text)
        grades.append(converted_g)
        if converted_g < 1:
            for i in range(10):
                texts.append(text)
                grades.append(converted_g)
    return texts, grades
texts_b, grades_b = process_data_balanced(learn_data)

In [57]:
linearSVC_pipeline.fit(texts_b, grades_b)
predicted_linear = linearSVC_pipeline.predict(test_texts)
print (classification_report(test_grades, predicted_linear))

             precision    recall  f1-score   support

         -1       0.28      0.19      0.22        27
          0       0.29      0.12      0.18        56
          1       0.90      0.97      0.93       575

avg / total       0.82      0.86      0.84       658

