## Emotion analysis and classification of short comments using machine learning techniques
+ Code developed by: Douglas Maia dos Santos
+ GitHub access: https://github.com/m-dougl/emotion-analysis

##### Importing libraries for proper code functioning
The purpose of this code is to test the impact of the oversampling function on the models that were implemented in "main.ipynb".

In [None]:
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import VotingClassifier, BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier 
from sklearn.ensemble import StackingClassifier
                              
from nltk.tokenize import RegexpTokenizer
from collections import Counter
from unicodedata import normalize
from imblearn.over_sampling import SMOTE
import pandas as pd
import nltk
import numpy as np
import string
import emotion_analysis
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

# NLTK resources fetched up front.
# NOTE(review): presumably consumed by emotion_analysis.stemming (RSLP) and
# emotion_analysis.remove_stopwords — confirm against that module.
nltk.download('rslp')
nltk.download('stopwords')

# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and 3.8 removed the old names, so plain 'seaborn' raises OSError on current
# releases. Try the new name first and fall back for older installations.
try:
    plt.style.use('seaborn-v0_8')
except OSError:
    plt.style.use('seaborn')

### Pre processing part

In [None]:
# Load the labelled comments through the project helper, then lower-case the
# emotion labels so differently-cased spellings collapse into one class.
# NOTE(review): assumes open_dataset returns a pandas DataFrame — confirm.
df = emotion_analysis.open_dataset('dataset.xlsx', 'xlsx')
df['Emoção'] = df['Emoção'].str.lower()

In [None]:
# Text-cleaning pipeline: each helper is applied to every comment, in order,
# and its output feeds the next step.
cleaning_steps = (
    emotion_analysis.remove_characters,
    emotion_analysis.remove_accents,
    emotion_analysis.tokenize,
    emotion_analysis.remove_stopwords,
    emotion_analysis.untokenize,
    emotion_analysis.stemming,
)
for cleaning_step in cleaning_steps:
    df['Comentarios'] = df['Comentarios'].apply(cleaning_step)

In [None]:
# Target: the emotion label. Features: the cleaned comments passed through the
# project vectorizer in 'tfidf' mode.
y = df['Emoção']
X = emotion_analysis.vectorizer(df['Comentarios'], 'tfidf')

In [None]:
# Split features and labels through the project helper.
# NOTE(review): presumably 0.8 is the fraction kept for training — confirm
# against emotion_analysis.dataset_split.
train_size = 0.8
split_parts = emotion_analysis.dataset_split(X, y, train_size)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Train the three classical models on the training split through the project
# helper `emotion_analysis.cv_train`.
# NOTE(review): presumably cv_train runs an n_fold-fold cross-validated search
# and returns the fitted model — confirm against emotion_analysis.
n_fold = 10
NB = emotion_analysis.cv_train(classifier_name='NB', X=X_train, y=y_train, n_fold=n_fold)
SVM = emotion_analysis.cv_train(classifier_name='SVM', X=X_train, y=y_train, n_fold=n_fold)
KNN = emotion_analysis.cv_train(classifier_name='KNN', X=X_train, y=y_train, n_fold=n_fold)
# No test-set predictions are made in this cell. The disabled predict() calls
# that used to sit here inside a bare string literal were dropped: the string
# was an evaluated expression (echoed as the cell output), not a comment.

In [None]:
# Re-create the three base learners with fixed hyper-parameters and fit them
# on the training split; this replaces the objects returned by cv_train.
# NOTE(review): presumably these are the best settings found by cv_train — confirm.
SVM = SVC(kernel='rbf', C=1000, gamma=1e-3)
NB = MultinomialNB(alpha=1, fit_prior=False)
KNN = KNeighborsClassifier(n_neighbors=5, algorithm='auto')
for base_learner in (SVM, NB, KNN):
    base_learner.fit(X_train, y_train)

# (name, model) pairs in the format the scikit-learn ensemble classes expect.
estimators = [('svm', SVM), ('nb', NB), ('knn', KNN)]

In [None]:
# Same project helper as for the classical models, now for the three boosting
# learners (AdaBoost, XGBoost, CatBoost).
n_fold = 10
ADA, XGB, CAT = (
    emotion_analysis.cv_train(classifier_name=boosting_key, X=X_train, y=y_train, n_fold=n_fold)
    for boosting_key in ('ADA', 'XGB', 'CAT')
)

In [None]:
# Re-create the three boosting learners with fixed hyper-parameters and fit
# them on the training split; this replaces the objects returned by cv_train.
# NOTE(review): presumably these are the best settings found by cv_train — confirm.
ADA = AdaBoostClassifier(learning_rate=0.5, algorithm='SAMME')
XGB = XGBClassifier(max_depth=2, learning_rate=0.1, eval_metric='mlogloss')
CAT = CatBoostClassifier(iterations=5, depth=4, learning_rate=0.01)
for boosting_learner in (ADA, XGB, CAT):
    boosting_learner.fit(X_train, y_train)

# (name, model) pairs in the format the scikit-learn ensemble classes expect.
estimators2 = [('ada', ADA), ('xgb', XGB), ('cat', CAT)]

### Ensemble: Voting Classifier

In [None]:
# Majority-vote ('hard') ensemble over the SVM / NB / KNN base learners.
# verbose=True makes scikit-learn report fitting progress.
hard_voter = VotingClassifier(estimators=estimators, voting='hard', verbose=True)
voting = hard_voter.fit(X_train, y_train)  # fit() returns the fitted ensemble itself

In [None]:
# Held-out predictions of the hard-voting ensemble; scored in the next cell.
pred_voting = voting.predict(X=X_test)

In [None]:
# Score the voting predictions against the held-out labels. Precision, recall
# and F1 are macro-averaged, i.e. every class weighs the same.
accuracy_voting = accuracy_score(y_test, pred_voting)
precision_voting, recall_voting, fscore_voting = (
    scorer(y_test, pred_voting, average='macro')
    for scorer in (precision_score, recall_score, f1_score)
)

rule = '=' * 20
print(rule)
print('Voting Classifier Metrics:')
for caption, score in (('Accuracy', accuracy_voting),
                       ('Precision', precision_voting),
                       ('Recall', recall_voting),
                       ('F1-Score', fscore_voting)):
    print(f'{caption}: {score}')
print(rule)

### Ensemble: Bagging Classifier

In [None]:
# svm
# Bagging ensemble of 500 bootstrap-trained clones of the SVM.
# The base learner is passed positionally on purpose: its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name
# was removed in 1.4, while the first positional slot works on every version.
bagging_svm = BaggingClassifier(SVM,
                                n_estimators=500,
                                bootstrap=True,
                                verbose=True).fit(X_train, y_train)

In [None]:
# naive bayes
# Bagging ensemble of 500 bootstrap-trained clones of the Naive Bayes model.
# The base learner is passed positionally on purpose: its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name
# was removed in 1.4, while the first positional slot works on every version.
bagging_nb = BaggingClassifier(NB,
                               n_estimators=500,
                               bootstrap=True,
                               verbose=True).fit(X_train, y_train)

In [None]:
# knn
# Bagging ensemble of 500 bootstrap-trained clones of the KNN model.
# The base learner is passed positionally on purpose: its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name
# was removed in 1.4, while the first positional slot works on every version.
bagging_knn = BaggingClassifier(KNN,
                                n_estimators=500,
                                bootstrap=True,
                                verbose=True).fit(X_train, y_train)

In [None]:
# Held-out predictions of each bagging ensemble, in SVM, NB, KNN order
# (the order the reporting cell's labels rely on).
bagging = [
    bagged_model.predict(X_test)
    for bagged_model in (bagging_svm, bagging_nb, bagging_knn)
]

In [None]:
# One score list per metric, aligned position-by-position with `bagging`
# and `labels`. Precision, recall and F1 are macro-averaged.
labels = ['bg_SVM', 'bg_NB', 'bg_KNN']
bg_accuracy = [accuracy_score(y_test, predicted) for predicted in bagging]
bg_precision = [precision_score(y_test, predicted, average='macro') for predicted in bagging]
bg_recall = [recall_score(y_test, predicted, average='macro') for predicted in bagging]
bg_fscore = [f1_score(y_test, predicted, average='macro') for predicted in bagging]

# Report metric by metric, with a dashed rule between consecutive sections.
report_sections = (('accuracy', bg_accuracy),
                   ('Precision', bg_precision),
                   ('Recall', bg_recall),
                   ('F1-Score', bg_fscore))
print('Bagging Classifiers Metrics:')
print('=' * 40)
for section_index, (caption, scores) in enumerate(report_sections):
    if section_index > 0:
        print('-' * 40)
    for label, score in zip(labels, scores):
        print(f'{label} {caption}: {score}')
print('=' * 40)

### Ensemble: RandomForestClassifier

In [None]:
# Random forest with scikit-learn's default settings: fit on the training
# split, predict the held-out split, then report macro-averaged scores.
random_forest = RandomForestClassifier().fit(X_train, y_train)
rf_predict = random_forest.predict(X_test)

accuracy_rf = accuracy_score(y_test, rf_predict)
precision_rf, recall_rf, fscore_rf = (
    scorer(y_test, rf_predict, average='macro')
    for scorer in (precision_score, recall_score, f1_score)
)

rule = '=' * 20
print(rule)
print('Random Forest Classifier Metrics:')
for caption, score in (('Accuracy', accuracy_rf),
                       ('Precision', precision_rf),
                       ('Recall', recall_rf),
                       ('F1-Score', fscore_rf)):
    print(f'{caption}: {score}')
print(rule)

### Ensemble: Gradient Tree boosting

In [None]:
# Gradient tree boosting with scikit-learn's default settings: fit on the
# training split, predict the held-out split, then report macro-averaged scores.
gradient = GradientBoostingClassifier().fit(X_train, y_train)
gradient_predict = gradient.predict(X_test)

accuracy_gradient = accuracy_score(y_test, gradient_predict)
precision_gradient, recall_gradient, fscore_gradient = (
    scorer(y_test, gradient_predict, average='macro')
    for scorer in (precision_score, recall_score, f1_score)
)

rule = '=' * 20
print(rule)
print('Gradient Boosting Classifier Metrics:')
for caption, score in (('Accuracy', accuracy_gradient),
                       ('Precision', precision_gradient),
                       ('Recall', recall_gradient),
                       ('F1-Score', fscore_gradient)):
    print(f'{caption}: {score}')
print(rule)

### Ensemble: Stacking Classifier

In [None]:
# Fresh SVM / NB / KNN base learners (same fixed hyper-parameters as earlier
# in the notebook), fitted on the training split for the stacking ensemble.
SVM = SVC(kernel='rbf', C=1000, gamma=1e-3)
NB = MultinomialNB(alpha=1, fit_prior=False)
KNN = KNeighborsClassifier(n_neighbors=5, algorithm='auto')
for base_learner in (SVM, NB, KNN):
    base_learner.fit(X_train, y_train)

# (name, model) pairs in the format StackingClassifier expects.
estimators = [('svm', SVM), ('nb', NB), ('knn', KNN)]

In [None]:
# Stacking ensemble: the SVM / NB / KNN base learners in `estimators` feed a
# gradient-boosting meta-learner. Fit on the training split, then predict the
# held-out split.
stacking = StackingClassifier(estimators=estimators,
                              final_estimator=GradientBoostingClassifier(),
                              verbose=True).fit(X_train, y_train)

pred_stacking = stacking.predict(X_test)

# Macro-averaged scores of the stacking predictions.
accuracy_stacking = accuracy_score(y_test, pred_stacking)
precision_stacking = precision_score(y_test, pred_stacking, average='macro')
recall_stacking = recall_score(y_test, pred_stacking, average='macro')
# Fixed: F1 was computed from `pred_voting`, i.e. it reported the voting
# ensemble's score under the stacking heading.
fscore_stacking = f1_score(y_test, pred_stacking, average='macro')
print('='*20)
print('Stacking Classifier Metrics:')
print(f'Accuracy: {accuracy_stacking}')
print(f'Precision: {precision_stacking}')
print(f'Recall: {recall_stacking}')
print(f'F1-Score: {fscore_stacking}')
print('='*20)

### Neural Network Test

In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with two hidden layers (500 and 5 units), trained
# with Adam in mini-batches of 10 samples.
# NOTE(review): per the scikit-learn docs `learning_rate` is only used with
# solver='sgd', so 'adaptive' likely has no effect here — confirm.
mlp_settings = {
    'hidden_layer_sizes': (500, 5),
    'activation': 'relu',
    'solver': 'adam',
    'alpha': 1e-4,
    'batch_size': 10,
    'learning_rate': 'adaptive',
    'verbose': True,
}
nn = MLPClassifier(**mlp_settings)
nn.fit(X_train, y_train)

In [None]:
# Held-out predictions of the neural network; scored in the next cell.
pred_nn = nn.predict(X=X_test)

In [None]:
# Macro-averaged scores of the neural-network predictions on the held-out split.
accuracy_nn = accuracy_score(y_test, pred_nn)
precision_nn = precision_score(y_test, pred_nn, average='macro')
recall_nn = recall_score(y_test, pred_nn, average='macro')
# Fixed: F1 was computed from `pred_voting`, i.e. it reported the voting
# ensemble's score under the neural-network heading.
fscore_nn = f1_score(y_test, pred_nn, average='macro')
print('='*20)
print('nn Classifier Metrics:')
print(f'Accuracy: {accuracy_nn}')
print(f'Precision: {precision_nn}')
print(f'Recall: {recall_nn}')
print(f'F1-Score: {fscore_nn}')
print('='*20)

### Ensemble: Voting Classifier (AdaBoost, CatBoost and XGBoost)

In [None]:
# Probability-averaging ('soft') ensemble over the AdaBoost / XGBoost /
# CatBoost learners in `estimators2`. Rebinds `voting`, replacing the earlier
# hard-voting ensemble.
soft_voter = VotingClassifier(estimators=estimators2, voting='soft', verbose=True)
voting = soft_voter.fit(X_train, y_train)  # fit() returns the fitted ensemble itself

In [None]:
# Fixed: refresh the predictions from the current `voting` model before
# scoring. Without this line `pred_voting` still holds the output of the
# earlier hard-voting ensemble, so this cell re-printed the old scores.
pred_voting = voting.predict(X_test)

# Macro-averaged scores of the voting predictions on the held-out split.
accuracy_voting = accuracy_score(y_test, pred_voting)
precision_voting = precision_score(y_test, pred_voting, average='macro')
recall_voting = recall_score(y_test, pred_voting, average='macro')
fscore_voting = f1_score(y_test, pred_voting, average='macro')
print('='*20)
print('Voting Classifier Metrics:')
print(f'Accuracy: {accuracy_voting}')
print(f'Precision: {precision_voting}')
print(f'Recall: {recall_voting}')
print(f'F1-Score: {fscore_voting}')
print('='*20)

### Ensemble: Bagging Classifier (AdaBoost, CatBoost and XGBoost)

In [None]:
# adaboost
# Bagging ensemble of 500 bootstrap-trained clones of the AdaBoost model.
# The base learner is passed positionally on purpose: its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name
# was removed in 1.4, while the first positional slot works on every version.
bagging_ada = BaggingClassifier(ADA,
                                n_estimators=500,
                                bootstrap=True,
                                verbose=True).fit(X_train, y_train)

In [None]:
# xgboost
# Bagging ensemble of 500 bootstrap-trained clones of the XGBoost model.
# The base learner is passed positionally on purpose: its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name
# was removed in 1.4, while the first positional slot works on every version.
bagging_xgb = BaggingClassifier(XGB,
                                n_estimators=500,
                                bootstrap=True,
                                verbose=True).fit(X_train, y_train)

In [None]:
# catboost
# Bagging ensemble of only 5 bootstrap-trained clones of the CatBoost model
# (the other bagging ensembles in this notebook use 500).
# The base learner is passed positionally on purpose: its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name
# was removed in 1.4, while the first positional slot works on every version.
bagging_cat = BaggingClassifier(CAT,
                                n_estimators=5,
                                bootstrap=True,
                                verbose=True).fit(X_train, y_train)

In [None]:
# Held-out predictions of each bagging ensemble, in ADA, XGB, CAT order
# (the order the reporting cell's labels rely on).
bagging = [
    bagged_model.predict(X_test)
    for bagged_model in (bagging_ada, bagging_xgb, bagging_cat)
]

In [None]:
# One score list per metric, aligned position-by-position with `bagging`
# and `labels`. Precision, recall and F1 are macro-averaged.
labels = ['bg_ADA', 'bg_XGB', 'bg_CAT']
bg_accuracy = [accuracy_score(y_test, predicted) for predicted in bagging]
bg_precision = [precision_score(y_test, predicted, average='macro') for predicted in bagging]
bg_recall = [recall_score(y_test, predicted, average='macro') for predicted in bagging]
bg_fscore = [f1_score(y_test, predicted, average='macro') for predicted in bagging]

# Report metric by metric, with a dashed rule between consecutive sections.
report_sections = (('accuracy', bg_accuracy),
                   ('Precision', bg_precision),
                   ('Recall', bg_recall),
                   ('F1-Score', bg_fscore))
print('Bagging Classifiers Metrics:')
print('=' * 40)
for section_index, (caption, scores) in enumerate(report_sections):
    if section_index > 0:
        print('-' * 40)
    for label, score in zip(labels, scores):
        print(f'{label} {caption}: {score}')
print('=' * 40)

### Ensemble: Stacking Classifier (AdaBoost, CatBoost and XGBoost)

In [None]:
# Stacking ensemble: the AdaBoost / XGBoost / CatBoost learners in
# `estimators2` feed an XGBoost meta-learner. Fit on the training split, then
# predict the held-out split.
stacking = StackingClassifier(estimators=estimators2,
                              final_estimator=XGBClassifier(eval_metric='mlogloss'),
                              verbose=True).fit(X_train, y_train)

pred_stacking = stacking.predict(X_test)

# Macro-averaged scores of the stacking predictions.
accuracy_stacking = accuracy_score(y_test, pred_stacking)
precision_stacking = precision_score(y_test, pred_stacking, average='macro')
recall_stacking = recall_score(y_test, pred_stacking, average='macro')
# Fixed: F1 was computed from `pred_voting`, i.e. it reported the voting
# ensemble's score under the stacking heading.
fscore_stacking = f1_score(y_test, pred_stacking, average='macro')
print('='*20)
print('Stacking Classifier Metrics:')
print(f'Accuracy: {accuracy_stacking}')
print(f'Precision: {precision_stacking}')
print(f'Recall: {recall_stacking}')
print(f'F1-Score: {fscore_stacking}')
print('='*20)