In [150]:
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
import math
import random


In [108]:
data_set = pd.read_csv('./collectdata/comments_ua.txt', sep = ':::::', names = ["score", "comment"])



  """Entry point for launching an IPython kernel.


In [110]:
data_set.groupby("score").count()

Unnamed: 0_level_0,comment
score,Unnamed: 1_level_1
1,70
2,84
3,171
4,605
5,2196


In [114]:
msk = np.random.rand(len(data_set)) < 0.8
train = data_set[msk]
test = data_set[~msk]

In [172]:
def train_model(**kwargs):
    
    def calc_probabilities_in_class(data):
        bag_of_words = {}
        for index, row in data.iterrows():
            words = nltk.word_tokenize(row['comment'])
            for word in words:
                word_l = word.lower()
                if not word_l in bag_of_words:
                    bag_of_words[word_l] = 1
                else:
                    bag_of_words[word_l] = bag_of_words[word_l] + 1
        return bag_of_words
    
    probabilities = {}
    for key in kwargs:
        probabilities[key] = calc_probabilities_in_class(kwargs[key])
    return probabilities


def predict_model_comment(model, comment):
    
    def comment_log_probabitlity(words, class_model, total_words, total_unique_words):
        denominator = sum(class_model.values()) + total_unique_words
        log_prob = math.log(sum(class_model.values()) / total_words)
        for word in words:
            n = class_model[word] if word in class_model else 0 
            log_prob += math.log((n + 1)/denominator)
        return log_prob
        
    log_probablitites = {}
    words = nltk.word_tokenize(comment)
    
    total_words = 0
    total_unique_words = 0
    for key in model:
        total_words += sum(model[key].values())
        total_unique_words += len(model[key])
        
    for key in model:
        log_probablitites[key] = comment_log_probabitlity(words, model[key], total_words, total_unique_words)
    return log_probablitites

def print_report(true_positive, true_negative, false_positive, false_negative):
    print("True Positive:", true_positive, "; True negative:", 
          true_negative, "; False positive:", false_positive, "; False negative:", false_negative)
    print("Presision:", true_positive/(true_positive + false_positive),
          "; Recall:", true_positive/(true_positive + false_negative))

def predict_on_set(predict_comment_lambda, test_set, labeled_negative):
    
    true_positive = 0
    true_negative = 0
    false_positive = 0
    false_negative = 0

    for index, row in test_set.iterrows():
        max_prob = float('-inf')
        max_key = ""
        probs = predict_comment_lambda(row["comment"])
        for key_class in probs:
            if probs[key_class] > max_prob:
                max_prob = probs[key_class]
                max_key = key_class
        is_negative_eval = max_key == 'negative'
        is_negative_label = labeled_negative(row["score"]) 
        #print("is_negative_eval:", is_negative_eval, " is_neg_label:", is_negative_label)
      
        if is_negative_eval and is_negative_label:
            true_negative += 1
            
        if not is_negative_eval and not is_negative_label:
            true_positive += 1
            
        if not is_negative_eval and is_negative_label:
            false_positive += 1
            
        if is_negative_eval and not is_negative_label:
            false_negative += 1
        
    return print_report(true_positive, true_negative, false_positive, false_negative)
        
def predict_random_comment(train_data, comment):
    total_positive = train_data[train_data["score"] > 3]["comment"].count()
    positive_prob = total_positive / train_data["comment"].count()

    if random.random() > positive_prob:
        return {"positive": 0, "negative": 1}
    return {"positive": 1, "negative": 0}

Check pure model (no rules or sentiment dictionary) and compare it with random

In [173]:
model = train_model(positive = train[train["score"] > 3], negative = train[train["score"] <= 3])
print("For pure model:")
predict_on_set(lambda comment: predict_model_comment(model, comment), test, lambda x: x <= 3)
print("For random:")
predict_on_set(lambda comment: predict_random_comment(train, comment), test, lambda x: x <= 3)

For pure model:
True Positive: 506 ; True negative: 16 ; False positive: 48 ; False negative: 48
Presision: 0.9133574007220217 ; Recall: 0.9133574007220217
For random:
True Positive: 495 ; True negative: 8 ; False positive: 56 ; False negative: 59
Presision: 0.8983666061705989 ; Recall: 0.8935018050541517


In [None]:
# check classifier from nltk
all_words = set(word.lower() for index,row in train.iterrows() for word in nltk.word_tokenize(row["comment"]))
train_as_words = [({word: (word in nltk.word_tokenize(row["comment"])) for word in all_words},
                   "negative" if row["score"] <= 3 else "positive")  for index,row in train.iterrows()]
print(train_as_words[0:10])
classifier = nltk.NaiveBayesClassifier.train(train_as_words)
classifier.show_most_informative_features()
    
def prdic_by_nltk(comment):
    global all_words
    global classifier
    test_sent_features = {word.lower(): (word in nltk.word_tokenize(comment.lower())) for word in all_words}
    label = classifier.classify(test_sent_features)
    return {"positive": (1 if label == "positive" else 0), "negative": (1 if label == "negative" else 0)}

predict_on_set(prdic_by_nltk, test, lambda x: x <= 3)

In [181]:
tonal_dict = pd.read_csv('../../../../sources/tone-dict-uk.tsv', sep = '\t', names = ["word", "sentiment"])

def predict_comment_with_tonal(model, comment):
    
    def sentiment_score():
        positive = 0
        negative = 0
        for index, row in tonal_dict.iterrows():
            if row["word"] in comment:
                if int(row["sentiment"]) > 0:
                    positive += int(row["sentiment"])
                else:
                    negative += int(row["sentiment"])
        return positive if positive > -negative else negative
            
    score = sentiment_score()
   
    if score > 0:
        return {"positive": 1, "negative":0}
    if score < 0:
        return {"negative": 0, "positive":1}
    
    return predict_model_comment(model, comment)
    
predict_on_set(lambda comment: predict_comment_with_tonal(model, comment), test, lambda x: x <= 3)


True Positive: 520 ; True negative: 9 ; False positive: 55 ; False negative: 34
Presision: 0.9043478260869565 ; Recall: 0.9386281588447654
365


In [None]:
stop_words = ['і', 'на', '.', ',', 'в', '(', ')', 'а', 'за', 'у']
for key in model['negative']:
    print(key)