In [198]:
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
import math
import random
from ukr_stemmer3 import UkrainianStemmer

In [25]:
data_set = pd.read_csv('./collectdata/comments_ua.txt', sep = ':::::', names = ["score", "comment"])



  """Entry point for launching an IPython kernel.


In [None]:
data_set.groupby("score").count()
data_set.count()

In [4]:
msk = np.random.rand(len(data_set)) < 0.8
train = data_set[msk]
test = data_set[~msk]

In [196]:
def to_base_form(use_lemma, word):
    if (use_lemma):
        return UkrainianStemmer(word.lower()).stem_word()
    return word.lower()

def train_model(stop_words=[], use_lemma=False, **kwargs):
    
    def calc_probabilities_in_class(data):
        bag_of_words = {}
        for index, row in data.iterrows():
            words = nltk.word_tokenize(row['comment'])
            for word in words:
                base_form = to_base_form(use_lemma, word)
                if not base_form in stop_words:
                    if not base_form in bag_of_words:
                        bag_of_words[base_form] = 1
                    else:
                        bag_of_words[base_form] = bag_of_words[base_form] + 1
        return bag_of_words
    
    probabilities = {}
    for key in kwargs:
        probabilities[key] = calc_probabilities_in_class(kwargs[key])
    return probabilities


def predict_model_comment(model, comment, stop_words = [], use_lemma = False):
    
    def comment_log_probabitlity(words, class_model, total_words, total_unique_words):
        denominator = sum(class_model.values()) + total_unique_words
        log_prob = math.log(sum(class_model.values()) / total_words)
        for word in words:
            n = class_model[word] if word in class_model else 0 
            log_prob += math.log((n + 1)/denominator)
        return log_prob
        
    log_probablitites = {}
    words = [to_base_form(use_lemma, word) for word in nltk.word_tokenize(comment) if word.lower not in stop_words]
    
    total_words = 0
    total_unique_words = 0
    for key in model:
        total_words += sum(model[key].values())
        total_unique_words += len(model[key])
        
    for key in model:
        log_probablitites[key] = comment_log_probabitlity(words, model[key], total_words, total_unique_words)
    return log_probablitites

def print_report(true_positive, true_negative, false_positive, false_negative):
    print("True Positive:", true_positive, "; True negative:", 
          true_negative, "; False positive:", false_positive, "; False negative:", false_negative)
    print("Preсision:", true_positive/(true_positive + false_positive),
          "; Recall:", true_positive/(true_positive + false_negative), "\n")

def predict_on_set(predict_comment_lambda, test_set, labeled_negative):
    
    true_positive = 0
    true_negative = 0
    false_positive = 0
    false_negative = 0

    for index, row in test_set.iterrows():
        max_prob = float('-inf')
        max_key = ""
        probs = predict_comment_lambda(row["comment"])
        for key_class in probs:
            if probs[key_class] > max_prob:
                max_prob = probs[key_class]
                max_key = key_class
        is_negative_eval = max_key == 'negative'
        is_negative_label = labeled_negative(row["score"]) 
        #print("is_negative_eval:", is_negative_eval, " is_neg_label:", is_negative_label)
      
        if is_negative_eval and is_negative_label:
            true_negative += 1
            
        if not is_negative_eval and not is_negative_label:
            true_positive += 1
            
        if not is_negative_eval and is_negative_label:
            false_positive += 1
            
        if is_negative_eval and not is_negative_label:
            false_negative += 1
        
    return print_report(true_positive, true_negative, false_positive, false_negative)
        
def predict_random_comment(train_data, comment):
    total_positive = train_data[train_data["score"] > 3]["comment"].count()
    positive_prob = total_positive / train_data["comment"].count()

    if random.random() > positive_prob:
        return {"positive": 0, "negative": 1}
    return {"positive": 1, "negative": 0}

Check pure model (no rules or sentiment dictionary) and compare it with random

In [193]:
model = train_model(positive = train[train["score"] > 3], negative = train[train["score"] <= 3])
print("For pure model on test(just to check that algorythm is corect):")
predict_on_set(lambda comment: predict_model_comment(model, comment), train, lambda x: x <= 3)
print("For pure model on test:")
predict_on_set(lambda comment: predict_model_comment(model, comment), test, lambda x: x <= 3)
print("For random:")
predict_on_set(lambda comment: predict_random_comment(train, comment), test, lambda x: x <= 3)

For pure model on test(just to check that algorythm is corect):
True Positive: 2202 ; True negative: 156 ; False positive: 105 ; False negative: 45
Presision: 0.9544863459037711 ; Recall: 0.9799732977303071
For pure model on test:
True Positive: 506 ; True negative: 16 ; False positive: 48 ; False negative: 48
Presision: 0.9133574007220217 ; Recall: 0.9133574007220217
For random:
True Positive: 488 ; True negative: 2 ; False positive: 62 ; False negative: 66
Presision: 0.8872727272727273 ; Recall: 0.8808664259927798


In [229]:
# check classifier from nltk
all_words = set(word.lower() for index,row in train.iterrows() for word in nltk.word_tokenize(row["comment"]))
train_as_words = [({word: (word in nltk.word_tokenize(row["comment"])) for word in all_words},
                   "negative" if row["score"] <= 3 else "positive")  for index,row in train.iterrows()]

classifier = nltk.NaiveBayesClassifier.train(train_as_words)
classifier.show_most_informative_features()
    
def prdic_by_nltk(comment):
    global all_words
    global classifier
    test_sent_features = {word.lower(): (word in nltk.word_tokenize(comment.lower())) for word in all_words}
    label = classifier.classify(test_sent_features)
    return {"positive": (1 if label == "positive" else 0), "negative": (1 if label == "negative" else 0)}

predict_on_set(prdic_by_nltk, test, lambda x: x <= 3)

KeyboardInterrupt: 

In [199]:
tonal_dict = pd.read_csv('../../../../sources/tone-dict-uk.tsv', sep = '\t', names = ["word", "sentiment"])

def predict_comment_with_tonal(model, comment, skip_contradict = False):
    
    def sentiment_score():
        positive = 0
        negative = 0
        for index, row in tonal_dict.iterrows():
            if to_base_form(True, row["word"]) in comment:
                if int(row["sentiment"]) > 0:
                    positive += int(row["sentiment"])
                else:
                    negative += int(row["sentiment"])
        #use only if there is no contradictions
        if skip_contradict and positive != 0 and negative != 0:
            return 0
        return positive if positive > -negative else negative
            
    score = sentiment_score()
   
    if score > 0:
        return {"positive": 1, "negative":0}
    if score < 0:
        return {"negative": 0, "positive":1}
    
    return predict_model_comment(model, comment)

print("Test for use sentiment dictionary (if both + and - select lareger module)")  
predict_on_set(lambda comment: predict_comment_with_tonal(model, comment), test, lambda x: x <= 3)

print("Test for use sentiment dictionary (if both + and - skip)")  
predict_on_set(lambda comment: predict_comment_with_tonal(model, comment, True), test, lambda x: x <= 3)


Test for use sentiment dictionary (if both + and - select lareger module)
True Positive: 549 ; True negative: 2 ; False positive: 62 ; False negative: 5
Presision: 0.8985270049099836 ; Recall: 0.9909747292418772 

Test for use sentiment dictionary (if both + and - skip)
True Positive: 540 ; True negative: 3 ; False positive: 61 ; False negative: 14
Presision: 0.8985024958402662 ; Recall: 0.9747292418772563 



In [217]:
def get_stop_words(model, tonal_dict):
    def is_tonal(word):
        for index, row in tonal_dict.iterrows():
            if to_base_form(True, row["word"]) in word:
                return True
        return False
                
    stop_words = []
    for word, rank in sorted(model["positive"].items(), key=lambda kv: kv[1], reverse=True):
        if rank > 100 and not is_tonal(word):
            stop_words.append(word)
    return stop_words
           

In [224]:
stop_words = get_stop_words(model, tonal_dict)
print(stop_words)

['.', ',', 'на', 'не', 'і', '!', 'в', ')', 'з', 'для', 'за', '-', 'що', 'як', 'але', 'все', '(', 'та', 'у', 'до', '?', 'це', 'а', 'я', 'від', 'можна', '%', 'по', ':', '2', 'рекомендую', 'так', 'ще', 'є', 'вже', 'то']


In [225]:
print(to_base_form(True, 'рекомендувати'))
stop_words.remove('рекомендую')
print(stop_words)

рекомендув
['.', ',', 'на', 'не', 'і', '!', 'в', ')', 'з', 'для', 'за', '-', 'що', 'як', 'але', 'все', '(', 'та', 'у', 'до', '?', 'це', 'а', 'я', 'від', 'можна', '%', 'по', ':', '2', 'так', 'ще', 'є', 'вже', 'то']


In [228]:

model_ext = train_model(stop_words, True, positive = train[train["score"] > 3], negative = train[train["score"] <= 3])
print("For train with stop words and stemming:")
predict_on_set(lambda comment: predict_model_comment(model_ext, comment, stop_words, True), test, lambda x: x <= 3)


For train with stop words and stemming:
True Positive: 323 ; True negative: 43 ; False positive: 21 ; False negative: 231
Presision: 0.938953488372093 ; Recall: 0.5830324909747292 

