# Multinomial Naive Bayes model

## 1. Preprocessing

In [26]:
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
import numpy as np

import io

# Integer class ids and the emotion names they stand for.
label2emotion = {
    0: "others",
    1: "happy",
    2: "sad",
    3: "angry",
}
# Reverse lookup (emotion name -> class id), obtained by inverting the mapping above.
emotion2label = {emotion: label for label, emotion in label2emotion.items()}

# Extra token replacements handed to the text pre-processor next to the stock
# ekphrasis emoticon dictionary: emoticon variants mapped to sentiment tags,
# plus the misspelling 'angru' mapped to 'angry'.
# NOTE(review): two keys contain the Unicode replacement character, which
# suggests the original emoticons were lost to an encoding error - confirm
# against the upstream source of this table.
emoticons_additional = {
    '(^・^)': '<happy>',
    ':‑c': '<sad>',
    '=‑d': '<happy>',
    ":'‑)": '<happy>',
    ':‑d': '<laugh>',
    ':‑(': '<sad>',
    ';‑)': '<happy>',
    ':‑)': '<happy>',
    ':\\/': '<sad>',
    'd=<': '<annoyed>',
    ':‑/': '<annoyed>',
    ';‑]': '<happy>',
    '(^�^)': '<happy>',
    'angru': 'angry',
    "d‑':": '<annoyed>',
    ":'‑(": '<sad>',
    ":‑[": '<annoyed>',
    '(�?�)': '<happy>',
    'x‑d': '<laugh>',
}

# Tokenizer handed to the pre-processor; configured to lower-case its output.
_social_tokenizer = SocialTokenizer(lowercase=True)

# Shared ekphrasis pre-processing pipeline used by tokenize().
text_processor = TextPreProcessor(
    # Token classes to normalize ('url' is listed twice, exactly as in the
    # original configuration).
    normalize=[
        'url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'url', 'date', 'number',
    ],
    # Token classes to annotate.
    annotate={
        "hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored',
    },
    fix_html=True,
    # Segmentation and spell correction both use the "twitter" resources.
    segmenter="twitter",
    corrector="twitter",
    unpack_hashtags=True,
    unpack_contractions=True,
    spell_correct_elong=True,
    tokenizer=_social_tokenizer.tokenize,
    # Replacement dictionaries: the stock ekphrasis emoticons first, then the
    # notebook's own additions.
    dicts=[emoticons, emoticons_additional],
)


def tokenize(text):
    """Pre-process ``text`` and return the resulting tokens as one string.

    The token sequence produced by ``text_processor.pre_process_doc`` is
    joined with single spaces.
    """
    tokens = text_processor.pre_process_doc(text)
    return " ".join(tokens)


def preprocessData(dataFilePath, mode):
    """Load a tab-separated conversation file and tokenize its three turns.

    The first line of the file is treated as a header and skipped. Each
    following row is expected to hold an id column, three conversation turns
    and, when ``mode == "train"``, an emotion name in the fifth column.
    Blank lines are ignored.

    Args:
        dataFilePath: Path of the UTF-8 encoded data file.
        mode: ``"train"`` to also read the label column; any other value
            returns the conversations only.

    Returns:
        ``(conversations, labels)`` as NumPy arrays when ``mode == "train"``,
        otherwise only the ``conversations`` array. Each conversation is the
        three tokenized turns joined by single spaces; each label is the
        integer looked up in ``emotion2label``.

    Raises:
        IndexError: If a non-blank row has fewer columns than required.
        KeyError: If a label is not a key of ``emotion2label``.
    """
    with_labels = mode == "train"
    conversations = []
    labels = []
    with io.open(dataFilePath, encoding="utf8") as finput:
        finput.readline()  # discard the header row
        for line in finput:
            if not line.strip():
                # A blank line carries no record; indexing it would fail.
                continue
            # Remove only the line terminator before splitting. A plain
            # strip() would also eat leading/trailing tabs, shifting or
            # dropping columns whenever the first or last field is empty.
            fields = line.rstrip('\r\n').split('\t')
            # Trailing spaces on the final column are still discarded, as
            # they were when the whole line was stripped.
            fields[-1] = fields[-1].rstrip()
            turns = [tokenize(fields[i]) for i in range(1, 4)]
            if with_labels:
                labels.append(emotion2label[fields[4].strip()])
            conversations.append(' '.join(turns))
    if with_labels:
        return np.array(conversations), np.array(labels)
    return np.array(conversations)

Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


In [27]:
# Load and tokenize the train, dev and test splits. Forward slashes are used
# in the paths so the notebook runs on any platform; backslash-separated
# paths only resolve on Windows.
# Every split is read with mode="train", so each file is expected to carry a
# label column and yields (texts, labels).
texts_train, labels_train = preprocessData('../data/train.txt', mode="train")
texts_dev, labels_dev = preprocessData('../data/dev.txt', mode="train")
texts_test, labels_test = preprocessData('../data/test.txt', mode="train")

## 2. Vectorizing

In [28]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: the vocabulary is learned from the training texts
# only and then reused unchanged to encode the test texts.
# NOTE(review): CountVectorizer runs with its default settings, so it applies
# its own tokenization on top of the ekphrasis output - confirm the defaults
# keep the annotation tokens that matter here.
vectorizer = CountVectorizer()
x_train = vectorizer.fit_transform(texts_train)
x_test = vectorizer.transform(texts_test)

## 3. Naive Bayes classification

In [29]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial naive Bayes classifier, with default hyperparameters, on
# the training count features and their integer emotion labels.
clf = MultinomialNB()
y_train = labels_train
clf.fit(x_train, y_train)

## 4. Metrics

In [30]:
from sklearn.metrics import accuracy_score, f1_score

# Score the fitted classifier on the test split.
y_pred = clf.predict(x_test)
y_true = labels_test
accuracy = accuracy_score(y_true, y_pred)
# average='weighted' combines the per-class F1 scores into a single number.
f1 = f1_score(y_true, y_pred, average='weighted')
print("Accuracy:", accuracy)
print("f1 weighted:", f1)

Accuracy: 0.7879833000544564
f1 weighted: 0.8094027727100904
