# Emotion Classification NLP

In [None]:
from pprint import pprint
!pip install emoji

# Setup
import os
import re
import time

import emoji
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as ss
import sklearn
from keras_preprocessing.sequence import pad_sequences
from keras_preprocessing.text import Tokenizer
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.utils.validation import check_is_fitted

%matplotlib inline
# Seaborn look-and-feel shared by every plot in the notebook.
sns.set_style('white')
sns.set_context('notebook')

# Seed handed to the estimators below that accept a random_state.
random_state = 143

## Frame the problem and look at the big picture
Given several pieces of text, determine the **emotion** that each of them conveys.

* How will your solution be used?
It will be used to categorise tweets and other trending content about a specific
topic in order to determine the emotion behind them.

* What are the current solutions/workarounds (if any)?
Current solutions use natural language processing by creating, or using readily available, embeddings
of the words. These embeddings are vectors that represent several *dimensions* of the word and allow them
to be compared with each other. Words with similar embeddings tend to have a similar meaning or to be used
in similar contexts. (Similar work)

* How should you frame this problem (supervised/unsupervised, online/offline, etc.)
The data is labelled, so a supervised algorithm is better suited. Embeddings can be tested using
words within the data or using other databases in English.

* How should performance be measured?
Performance is measured by accuracy, though ideally the model should also be able to output the probability of
a text belonging to each of several classes (e.g. happy and angry).

## Data loading

In [None]:
def load_emotion_data(dataset, base_path='../input/emotion-classification-nlp/'):
    """
    Loads one split of the emotion dataset from its CSV file.

    The file read is ``<base_path>/emotion-labels-<dataset>.csv``; it must
    contain a ``text`` column and a ``label`` column.

    :param dataset: split name used in the file name (e.g. 'train', 'val', 'test')
    :param base_path: directory that holds the CSV files; defaults to the
        notebook's input folder
    :return: tuple ``(texts, labels)`` holding the values of the ``text`` and
        ``label`` columns
    """
    dataset_name = 'emotion-labels-%s.csv' % dataset

    data = pd.read_csv(os.path.join(base_path, dataset_name))
    return data['text'].values, data['label'].values

# Load the three splits; each call returns a (texts, labels) pair of arrays.
train_X, train_y = load_emotion_data('train')
val_X, val_y = load_emotion_data('val')
test_X, test_y = load_emotion_data('test')



# Quick look at the raw training texts.
print(train_X)

## Data Exploration

Columns in the dataset

| Column name | Description |
|-|-|
| text | Piece of text |
| label | Assigned emotion to the text |

In [None]:
# Labels
# Distribution plot of the training labels.
# NOTE(review): the title says "Ratio", but no normalisation is requested in
# the displot call - confirm whether counts or proportions are intended.
sns.displot(train_y, height=5, aspect=2)
plt.title('Ratio of different emotion labels in text')
plt.show()


In [None]:
vectorizer = CountVectorizer()
# Document-term matrix of the training texts: one column per vocabulary word.
term_matrix = vectorizer.fit_transform(train_X)

# ``vocabulary_`` maps each word to its COLUMN INDEX in the matrix, not to a
# frequency, so the real corpus frequency is read from the column sums.
column_totals = np.asarray(term_matrix.sum(axis=0)).ravel()
vocabulary = pd.DataFrame(
    [(word, column_totals[index]) for word, index in vectorizer.vocabulary_.items()],
    columns=['word', 'count'],
)
# Number of characters of each word.
vocabulary['length'] = vocabulary['word'].str.len()


print(vocabulary.head())

# Word length against how often the word appears in the training corpus.
plt.figure(figsize=(10, 10))
sns.scatterplot(data=vocabulary, x='length', y='count')
plt.show()

## Baseline
Do a simple ML approach to see how much accuracy we can get.

In [None]:
def test_classifiers(classifiers, X, y, output=True):
    best_classifier = None
    best_score = 0
    for _clf in classifiers:
        try:
            clf = sklearn.clone(_clf)
            start = time.time()
            scores = cross_val_score(clf, X, y)

            if output:
                print('Using classifier:', clf)
                print('\ttime:', time.time() - start)
                print('\tscores:', scores)
                print('\tbest:', np.max(scores))
                print('\taverage:', np.average(scores))

            max_score = np.average(scores)
            if max_score > best_score:
                best_score = max_score
                best_classifier = clf
        except Exception as e:
            print('** FAILED ** classifier', _clf)
            print(e)

    print('Fitting best classifier')
    best_classifier.fit(X, y)
    y_pred = best_classifier.predict(X)

    conf_matrix = confusion_matrix(y, y_pred, normalize='true')

    plt.figure(figsize=(10, 5))
    sns.heatmap(conf_matrix, annot=True, fmt='.4f')
    plt.show()

    return best_classifier, best_score


# Candidate baseline models; the ones that take a seed share random_state.
classifiers = [
    KNeighborsClassifier(),
    LinearSVC(max_iter=10000, random_state=random_state),
    RandomForestClassifier(random_state=random_state, n_jobs=8),
    # MLPClassifier(random_state=random_state), # Takes a long time
]
# Bag-of-words features only: the raw texts go straight into a CountVectorizer.
prep_pipeline = Pipeline([
    ('vectorizer', CountVectorizer())
])

train_X_transformed = prep_pipeline.fit_transform(train_X)
# Compare the first raw text with its vectorised row.
print(train_X[0])
print(train_X_transformed[0])

# Cross-validate every candidate on the vectorised training data.
best_clf, best_score = test_classifiers(classifiers, train_X_transformed, train_y)
print('Best classifier is:', best_clf, best_score)

## Text tokenizer comparison

The CountVectorizer creates one vector per phrase with n columns, where n is the number of different words
found in the text corpus. The Keras Tokenizer, on the other hand, converts each sentence into the sequence of
numbers that represent its words in the dictionary, maintaining the order of the words.

In [None]:
# Test using Keras tokenizer
# '<OOV>' is the token configured for out-of-vocabulary words.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(train_X)

# NOTE: this rebinds train_X_transformed, which previously held the
# CountVectorizer output.
train_X_transformed = tokenizer.texts_to_sequences(train_X)
# Padded version of the sequences (default pad_sequences settings).
train_X_padded = pad_sequences(train_X_transformed)
print(train_X_padded)

# Compare the first raw text with its sequence of word indices.
print(train_X[0])
print(train_X_transformed[0])

# Same classifier comparison as the baseline, now on the padded sequences.
best_clf, best_score = test_classifiers(classifiers, train_X_padded, train_y)
print('Best classifier is:', best_clf, best_score)

## Preprocessing

While it didn't give better results, it would be interesting to check with other classifiers
or go directly with an RNN. Before that I'll be trying several preprocessing techniques
to see if filtering the text might help. Among these are:
- Removing handles (@name) as I don't want the algorithm to associate a person with sentiment.
- Remove coded values (e.g. &amp;).
- Convert emojis to tokens (as those are a good representation of sentiment yet are filtered out by the
tokenizers).
- Review whether there is other extraneous input, such as URLs.

### Remove handles

In [None]:
class RemoveHandles(TransformerMixin, BaseEstimator):
    r"""
    Stateless transformer that strips @handles out of every text.

    Every match of ``@\w+`` is replaced by an empty string.
    """

    def fit(self, X, y=None):
        """Nothing is learnt; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return the values of X with the handles removed from each text."""
        texts = pd.Series(X)
        without_handles = texts.replace(to_replace=r'@\w+', value='', regex=True)
        return without_handles.values

remove_handles = RemoveHandles()
print(remove_handles.fit_transform(train_X[:10]))

### Remove coded value

In [None]:
class RemoveCodedValues(TransformerMixin, BaseEstimator):
    """
    Stateless transformer that removes HTML character references from texts.

    Both named references (``&amp;``) and numeric ones (``&#39;``,
    ``&#x27;``) are replaced by an empty string.
    """

    def fit(self, X, y=None):
        """Nothing is learnt; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return the values of X with the character references removed."""
        values = pd.Series(X)
        # The optional '#' makes numeric references match too; '#' is not a
        # word character, so a plain \\w+ would leave them in the text.
        return values.replace(r'&#?\w+;', '', regex=True).values

remove_coded = RemoveCodedValues()

# Sanity check: print any coded value that is still present after the
# transformation (no output means everything was removed).
pat = re.compile(r'&\w+;')
for f in remove_coded.fit_transform(train_X):
    found = pat.findall(f)
    # Reuse the matches already found instead of scanning the text twice.
    if found:
        print(found)

### Convert emojis to tokens

In [None]:
class TokenizeEmoji(TransformerMixin, BaseEstimator):
    """
    Stateless transformer that passes every text through ``emoji.demojize``.
    """

    def fit(self, X, y=None):
        """Nothing is learnt; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return a numpy array holding the demojized version of each text."""
        demojized = list(map(emoji.demojize, X))
        return np.array(demojized)

tokenize_emoji = TokenizeEmoji()
# Demojize a small sample of the training texts.
test = tokenize_emoji.fit_transform(train_X[:10])

# Refit the shared vectorizer on that sample to inspect the tokens it produces.
vectorizer.fit(test)
print(vectorizer.vocabulary_)


### Remove URLs or other input

In [None]:
class Emojifier(TransformerMixin, BaseEstimator):
    """
    Converts characters like :) :( :/ to a unique value

    ``fit`` collects every distinct match of ``emoji_pattern`` in the corpus
    and assigns each one a placeholder ``<EMOJI_n>``; ``transform`` replaces
    every learnt emoticon, literally, with its placeholder.

    :param emoji_pattern: regular expression matching a single emoticon
    """
    # NOTE(review): the default pattern also matches ':/' inside 'http://' and
    # 'x)' at the end of a word such as 'relax)' - confirm this is acceptable.
    def __init__(self, emoji_pattern=r'[:;Xx][)(\/D]|[)(\/D][:;]'):
        self.emoji_pattern = emoji_pattern

    def fit(self, X, y=None):
        """
        Learns the emoticons present in X.

        :param X: iterable of texts
        :param y: ignored
        :return: self, with ``found_emojis_`` mapping emoticon -> placeholder
        """
        pattern = re.compile(self.emoji_pattern)

        found = set()
        for line in X:
            found.update(pattern.findall(line))

        # Number the emoticons in sorted order: the iteration order of a set of
        # strings changes between interpreter runs, which would give the same
        # emoticon a different placeholder on every run.
        self.found_emojis_ = {
            emoticon: '<EMOJI_%d>' % i
            for i, emoticon in enumerate(sorted(found))
        }

        return self

    def transform(self, X):
        """
        Replaces every learnt emoticon in X with its placeholder.

        :param X: iterable of texts
        :return: array with the converted texts
        """
        # Validate
        check_is_fitted(self, ['found_emojis_'])

        # Transform
        data = pd.Series(X)
        for emoticon, name in self.found_emojis_.items():
            # Literal replacement: emoticons are full of regex metacharacters.
            data = data.str.replace(emoticon, name, regex=False)

        return data.values

emojifier = Emojifier()
# Learn the emoticons present in the training texts and show the placeholder
# assigned to each of them.
emojifier.fit(train_X)
print(emojifier.found_emojis_)

# Training texts with the learnt emoticons replaced by their placeholders.
emojified_X = emojifier.transform(train_X)

In [None]:
# Show, for the first training texts that contain an emoticon, the text before
# and after the conversion. The emojifier's own pattern is reused so that the
# matches listed here are exactly the ones the transformer looks for.
emoticon_pattern = re.compile(emojifier.emoji_pattern)
for i, val in enumerate(train_X[:100]):
    m = emoticon_pattern.findall(val)
    if m:
        print(set(m))
        print(i, val, train_y[i])
        print(i, emojified_X[i])


## Test full pipeline with current changes

In [None]:
# Full preprocessing chain: every cleaning step defined above, followed by the
# bag-of-words vectorizer.
pipeline = Pipeline([
    ('handles', RemoveHandles()),
    ('coded', RemoveCodedValues()),
    ('emoji', TokenizeEmoji()),
    ('emojify', Emojifier()),
    ('counts', CountVectorizer())
])

prepared_train_X = pipeline.fit_transform(train_X)

# Re-run the classifier comparison on the cleaned features; best_clf is rebound
# to the winner of this run.
best_clf, score = test_classifiers(classifiers, prepared_train_X, train_y)

In [None]:
# Try best classifier in the validation dataset
val_X_transformed = pipeline.transform(val_X)
val_y_predicted = best_clf.predict(val_X_transformed)

accuracy = accuracy_score(val_y, val_y_predicted)
print('Accuracy on validation:', accuracy)

# Pin the row/column order to the classifier's classes so that the matrix
# always lines up with the labels attached below, even if a class is missing
# from the validation split.
conf_matrix = confusion_matrix(val_y, val_y_predicted,
                               labels=best_clf.classes_, normalize='true')
conf_matrix = pd.DataFrame(conf_matrix, columns=best_clf.classes_, index=best_clf.classes_)


plt.figure(figsize=(10, 5))
sns.heatmap(conf_matrix, annot=True, fmt='.5f')
plt.show()

## Error analysis

Look at the errors that the classifier is making to find out where the problem
could be arising.

In [None]:
# Plot the previous heatmap, but with emphasis in the errors

# Zero the diagonal on a writable copy: ``DataFrame.values`` can be a read-only
# view (pandas copy-on-write), in which case filling it in place fails. Working
# on a copy also leaves the original conf_matrix untouched.
error_rates = conf_matrix.to_numpy(copy=True)
np.fill_diagonal(error_rates, 0)
error_matrix = pd.DataFrame(error_rates, index=conf_matrix.index,
                            columns=conf_matrix.columns)

plt.figure(figsize=(10, 5))
sns.heatmap(error_matrix, annot=True, fmt='.4f')
plt.show()

print(classification_report(val_y, val_y_predicted))
print(best_clf.classes_)

In [None]:
# Check some examples of errors

errors = val_y != val_y_predicted
errors_to_show = 10

# Positions of the first misclassified validation samples. Slicing replaces the
# manual countdown, so errors_to_show is left unchanged and a value of 0 shows
# nothing instead of everything.
error_indices = np.flatnonzero(errors)[:errors_to_show]

for idx in error_indices:
    # true label -> predicted label, followed by the text itself
    print('%s -> %s\n\t%s' % (val_y[idx], val_y_predicted[idx], val_X[idx]))

## Hyper-parameter tweaking

Try to get the most out of the classifiers by tweaking their hyperparameters.

In [None]:
# Same preprocessing chain as before, now ending in the classifier so that the
# preprocessing steps and the model can be tuned together.
pipeline = Pipeline([
    ('handles', RemoveHandles()),
    ('coded', RemoveCodedValues()),
    ('emoji', TokenizeEmoji()),
    ('emojify', Emojifier()),
    ('counts', CountVectorizer()),
    ('clf', RandomForestClassifier(random_state=random_state))
])

# Search space: each listed preprocessing step is either kept or replaced by
# 'passthrough', combined with the random forest depth and bootstrap settings.
grid_params = [{
    # 'handles': ['passthrough', RemoveHandles()], # Eliminate bias per author or mentioned people
    'coded': ['passthrough', RemoveCodedValues()],
    'emoji': ['passthrough', TokenizeEmoji()],
    'emojify': ['passthrough', Emojifier()],
    # 'clf__n_estimators': [int(x) for x in np.linspace(start = 200, stop = 2000, num = 5)],
    # 'clf__max_features': ['auto', 'sqrt'],
    # Four evenly spaced depths between 10 and 100, plus None.
    'clf__max_depth': [int(x) for x in np.linspace(10, 100, num=4)] + [None,],
    'clf__bootstrap': [True, False]
}]

# Exhaustive search scored by accuracy with 5-fold cross-validation;
# refit=True makes the best combination available as best_estimator_.
grid_search = GridSearchCV(pipeline, grid_params, scoring='accuracy',
                           refit=True, cv=5, n_jobs=16,
                           verbose=2)
grid_search.fit(train_X, train_y)

print(grid_search.best_score_, grid_search.best_estimator_)