In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plttokenize
import spacy
import nltk
from spacy.tokenizer import Tokenizer
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn import metrics
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

In [2]:
# Read the ISEAR CSV. Column names are supplied explicitly through `names`,
# and the "index" column is used as the DataFrame index.
data = pd.read_csv(
    "../ml/data/ISEAR.csv",
    index_col="index",
    names=["index", "emotion", "sentence"],
)

In [3]:
# Peek at the first five rows to sanity-check the parsed columns.
data.head(n=5)

Unnamed: 0_level_0,emotion,sentence
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,joy,On days when I feel close to my partner and ot...
1,fear,Every time I imagine that someone I love or I ...
2,anger,When I had been obviously unjustly treated and...
3,sadness,When I think about the short time that we live...
4,disgust,At a gathering I found myself involuntarily si...


In [4]:
# Number of sentences per emotion label (class-balance check).
data["emotion"].value_counts()

joy        1082
sadness    1074
anger      1069
fear       1063
shame      1059
disgust    1059
guilt      1040
Name: emotion, dtype: int64

In [6]:
# Fit a label encoder on the distinct emotion names, then encode the whole
# emotion column as integer ids.
classes = data["emotion"].unique()
le = preprocessing.LabelEncoder().fit(classes)
labels = le.transform(data["emotion"])

In [None]:
def get_emotion(label):
    """Return the original emotion value for one encoded `label`.

    The lookup goes through the module-level encoder `le`, so `label` must be
    an id that `le` produced.
    """
    (emotion,) = le.inverse_transform([label])
    return emotion

In [None]:
# Load the "en_core_web_sm" spaCy pipeline; its vocab and default
# prefix/suffix rules are used to build the custom tokenizer in the next cell.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Custom tokenizer: keep spaCy's default prefix and suffix rules, but split
# inside a token only on '-' or '~'. No token_match exceptions are used.
infix_re = re.compile(r'''[-~]''')
prefix_re = spacy.util.compile_prefix_regex(nlp.Defaults.prefixes)
suffix_re = spacy.util.compile_suffix_regex(nlp.Defaults.suffixes)

tokenizer = Tokenizer(
    nlp.vocab,
    prefix_search=prefix_re.search,
    suffix_search=suffix_re.search,
    infix_finditer=infix_re.finditer,
    token_match=None,
)

In [None]:
# Run the custom tokenizer over every sentence and store the result in a new
# `tokens` column, then preview the first rows.
data["tokens"] = data["sentence"].apply(tokenizer)
data.head(n=5)

In [None]:
def return_list(tokens):
    """Return the `.text` of every item in `tokens` as a list, in order."""
    texts = []
    for token in tokens:
        texts.append(token.text)
    return texts

In [None]:
# Store each sentence as a plain list of token strings. The column is derived
# from the raw `sentence` text rather than from `tokens` itself, so the cell
# is idempotent: re-running it does not call `.text` on plain strings.
data['tokens'] = data['sentence'].apply(tokenizer).apply(return_list)

In [None]:
def create_vocab(sentence):
    """Return the set of distinct tokens found across all token lists.

    Parameters
    ----------
    sentence : iterable of iterables
        One token sequence per document, e.g. the `tokens` column of the
        DataFrame. Values are iterated directly, so a pandas Series works
        whatever its index looks like (filtered, shuffled, non-integer).

    Returns
    -------
    set
        Every token that occurs at least once; empty for empty input.
    """
    vocab = set()
    for tokens in sentence:
        # Add tokens straight into the set instead of building a big list.
        vocab.update(tokens)
    return vocab
    
# Vocabulary of the whole corpus: every distinct token in the `tokens` column.
vocab = create_vocab(data["tokens"])

In [None]:
from nltk.classify import NaiveBayesClassifier

In [None]:
from nltk import ngrams, everygrams
def create_ngram_features(words, n=2):
    """Build a presence-feature dict from all 1- to n-grams of `words`.

    Each key is an n-gram produced by `everygrams(words, 1, n)` and each
    value is True.
    """
    return dict.fromkeys(everygrams(words, 1, n), True)

# Train and evaluate an NLTK Naive Bayes classifier for n = 1..5. For each n,
# a sentence is represented by the presence of all of its 1- to n-grams, so
# the printed "n-gram" label means "up to n-grams".
N_TRAIN = 5000  # rows used for training; all remaining rows form the test set

for n in range(1, 6):
    # Pair each sentence's feature dict with its emotion label. Iterating the
    # columns directly avoids label-based `data.tokens[i]` lookups, which only
    # work when the index is exactly 0..len(data)-1.
    pos_data = [
        (create_ngram_features(words, n), emotion)
        for words, emotion in zip(data.tokens, data.emotion)
    ]

    # Split once per n, after every row has been featurised.
    # NOTE(review): the split follows row order with no shuffling; confirm the
    # CSV is not sorted by emotion.
    train_set = pos_data[:N_TRAIN]
    test_set = pos_data[N_TRAIN:]

    classifier = NaiveBayesClassifier.train(train_set)

    accuracy = nltk.classify.util.accuracy(classifier, test_set)
    print(str(n)+'-gram accuracy:', accuracy)