# Test results for balanced dataset
Evaluate the previously trained models on the **balanced dataset** and collect their performance scores in a single table.

## Import useful packages

In [None]:
# Generic packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import copy
import pickle

In [None]:
# Scikit-learn for vectorizers and performance metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import roc_curve, auc, f1_score, make_scorer, precision_recall_curve, matthews_corrcoef

In [None]:
# Keras preprocessing
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.models import load_model

In [None]:
# NLTK for natural language processing
import nltk
# Fetch the NLTK resources; per the log below, a call only reports
# "already up-to-date" when the package is present.
# NOTE(review): presumably these resources are needed by the helper module
# `utils` (e.g. its text preprocessing) - confirm.
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
# NOTE(review): PorterStemmer, WordNetLemmatizer and stopwords are not
# referenced in any visible cell of this notebook - confirm they are still needed.
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
# Custom helper-functions script (supplied to Colab manually)
import utils as uu

## Load data and assess performance

In [None]:
# Input CSV files: the training set and the test set
# NOTE(review): both file names carry an "imb_4" tag although this notebook
# reports results for the balanced dataset - confirm these are the intended files.
train_set_file, test_set_file = "train_set_imb_4.csv", "test_set_imb_4.csv"

In [None]:
# Model-specific load and presentation parameters.
# Each row of the table below describes one model; the columns follow
# `_MODEL_FIELDS`, and every row is turned into a dict keyed by those names.
_MODEL_FIELDS = (
    "name", "linecolor", "linestyle", "marker", "model_file",
    "vectorizer", "is_dl_model", "is_lex_model", "max_features",
)

_MODEL_ROWS = [
    # name       linecolor      style  marker  model_file              vectorizer  DL?    lex?   max_features
    ("CNN",     'red',         '--',  '.',    "model_cnn_imb_4.h5",   "KERAS",    True,  False, 100000),
    ("RF",      'orange',      '-',   'v',    "model_rf_imb_4.pckl",  "COUNT",    False, False, 89403),
    ("LR",      'green',       '-',   '^',    "model_lr_imb_4.pckl",  "TFIDF",    False, False, 212435),
    ("FCNN",    'deepskyblue', '--',  'o',    "model_fcnn_imb_4.h5",  "KERAS",    True,  False, 100000),
    ("SVM",     'magenta',     '-',   '+',    "model_svm_imb_4.pckl", "TFIDF",    False, False, 2710),
    ("LSTM",    'purple',      '--',  'None', "model_lstm_imb_4.h5",  "KERAS",    True,  False, 100000),
    ("NB",      'lime',        '-',   'x',    "model_nb_imb_4.pckl",  "COUNT",    False, False, 235808),
    ("KNN",     'blue',        '-',   'None', "model_knn_imb_4.pckl", "TFIDF",    False, False, 100000),
    # The lexicon baseline has no model file and no vectorizer
    ("Lexicon", 'black',       '-.',  'd',    "NA",                   "NONE",     False, True,  100000),
]

models = [dict(zip(_MODEL_FIELDS, row)) for row in _MODEL_ROWS]

In [None]:
# Tokenization parameters: `max_words` is passed to the Keras Tokenizer as
# `num_words`, `max_len` is the length sequences are padded/truncated to
max_words, max_len = 5000, 55

In [None]:
# UL lexicon for label "1": the keywords the lexicon-based model looks for
dict1 = [
    "climate",
    "climatechange",
    "globalwarming",
    "agw",
    "climaterealists",
]

In [None]:
# Compute performance of every configured model on the test set


def _load_trained_model(spec):
  """Load the trained model described by `spec`.

  Deep-learning models are read with Keras `load_model`; every other
  non-lexicon model is unpickled from `spec["model_file"]`.

  Returns:
    The loaded model, or `None` for the lexicon model (it has no model file).
  """
  if spec["is_dl_model"]:
    return load_model(spec["model_file"])
  if spec["is_lex_model"]:
    return None
  # SECURITY: unpickling can execute arbitrary code - only load model files you trust.
  # The `with` block guarantees the file handle is closed after loading.
  with open(spec["model_file"], 'rb') as model_fh:
    return pickle.load(model_fh)


def _build_test_features(spec, texts_train, texts_test):
  """Fit the vectorizer named by `spec["vectorizer"]` and transform the test texts.

  The vectorizer/tokenizer is always fitted on the training texts only.

  Returns:
    The test features, or `None` for the "NONE" vectorizer (lexicon model,
    which works on the raw texts).

  Raises:
    ValueError: if the vectorizer name is not KERAS, TFIDF, COUNT or NONE.
  """
  kind = spec["vectorizer"]
  if kind == "KERAS":
    # Tokenize, then pad/truncate every sequence at the end to `max_len`
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(texts_train)
    sequences_test = tokenizer.texts_to_sequences(texts_test)
    return sequence.pad_sequences(sequences_test, maxlen=max_len, padding='post', truncating='post')
  if kind == "TFIDF":
    vectorizer = TfidfVectorizer(max_features=spec["max_features"], lowercase=True, analyzer='word', dtype=np.float32)
    return vectorizer.fit(texts_train).transform(texts_test)
  if kind == "COUNT":
    # NOTE(review): `max_features` is not applied to the count vectorizer - confirm this matches training.
    vectorizer = CountVectorizer()
    return vectorizer.fit(texts_train).transform(texts_test)
  if kind == "NONE":
    return None
  raise ValueError("Unknown vectorizer '" + str(kind) + "' for model '" + str(spec["name"]) + "'")


# Load datasets
train_set = pd.read_csv(train_set_file)
test_set = pd.read_csv(test_set_file)

# Preprocess texts (every entry is cast to str before preprocessing)
train_set['text'] = train_set['text'].apply(str)
train_set['text'] = train_set['text'].apply(uu.preprocess_text)
test_set['text'] = test_set['text'].apply(str)
test_set['text'] = test_set['text'].apply(uu.preprocess_text)

# Get features and labels
texts_train = copy.deepcopy(train_set['text'])
labels_train = copy.deepcopy(train_set['is_about_cc'])
texts_test = copy.deepcopy(test_set['text'])
labels_test = copy.deepcopy(test_set['is_about_cc'])

# Rows of the performance table, one per model
perf = []

# Test features per vectorizer configuration. Models sharing a configuration
# (e.g. all KERAS models) reuse the same features instead of refitting an
# identical vectorizer on the training texts for each of them.
features_cache = {}

for model_idx, spec in enumerate(models):

  print("model_idx = " + str(model_idx))

  # Load model
  model = _load_trained_model(spec)

  # Vectorize the test texts; only TFIDF depends on the model's `max_features`
  cache_key = (spec["vectorizer"], spec["max_features"] if spec["vectorizer"] == "TFIDF" else None)
  if cache_key not in features_cache:
    features_cache[cache_key] = _build_test_features(spec, texts_train, texts_test)
  features_test = features_cache[cache_key]

  # Compute performance metrics

  # Predicted labels
  if spec["is_lex_model"]:
    # Label 1 as soon as any lexicon word occurs among the whitespace-separated tokens
    pred_labels = [1 if any(word in text.split() for word in dict1) else 0 for text in texts_test]
  else:
    pred_labels = (model.predict(features_test) > 0.5).astype(int)

  # Accuracy, precision, recall and F1 score
  acc, prec, rec, f1 = uu.compute_perf_metrics(labels_test, pred_labels)

  # Prediction scores (continuous scores used for the ROC and PR curves)
  name = spec["name"]
  if name in ("LR", "SVM"):
    pred_scores = model.decision_function(features_test)
  elif name in ("RF", "NB", "KNN"):
    pred_scores = model.predict_proba(features_test)[:, 1]
  elif name == "Lexicon":
    pred_scores = [uu.occurrence_counter(text.split(), dict1) for text in texts_test]
  elif name in ("CNN", "FCNN", "LSTM"):
    pred_scores = model.predict(features_test).ravel()
  else:
    # Fail loudly instead of silently reusing the previous model's scores
    raise ValueError("No prediction-score rule defined for model '" + str(name) + "'")

  # Area under ROC curve
  fpr, tpr = uu.compute_roc(labels_test, pred_scores)
  roc_auc = auc(fpr, tpr)

  # Area under PR curve
  precs, recs = uu.compute_pr(labels_test, pred_scores)
  pr_auc = auc(recs, precs)

  # Matthews correlation coefficient
  mcc = matthews_corrcoef(labels_test, pred_labels)

  # Gather all metrics
  perf.append([name, acc, prec, rec, f1, roc_auc, pr_auc, mcc])

# Combine into a dataframe
df_perf = pd.DataFrame(perf, columns=['Method', 'Accuracy', 'Precision', 'Recall', 'F1 score', 'AUC ROC', 'AUC PR', 'MCC'])

model_idx = 0
model_idx = 1
model_idx = 2
model_idx = 3
model_idx = 4
model_idx = 5
model_idx = 6
model_idx = 7
model_idx = 8


In [None]:
# Performance comparison: one row per model, in the order the models were evaluated
df_perf

Unnamed: 0,Method,Accuracy,Precision,Recall,F1 score,AUC ROC,AUC PR,MCC
0,CNN,0.970551,0.977128,0.963659,0.970347,0.990883,0.992459,0.941192
1,RF,0.970551,0.992136,0.948622,0.969891,0.99077,0.993153,0.942009
2,LR,0.969298,0.980745,0.957393,0.968928,0.990146,0.992391,0.938863
3,FCNN,0.968045,0.977011,0.958647,0.967742,0.989988,0.991918,0.936256
4,SVM,0.968045,0.979461,0.95614,0.96766,0.988149,0.991014,0.936356
5,LSTM,0.961153,0.956576,0.966165,0.961347,0.986301,0.987585,0.922352
6,NB,0.960526,0.95539,0.966165,0.960748,0.989931,0.992488,0.921111
7,KNN,0.895363,0.846323,0.966165,0.902282,0.972934,0.974146,0.798776
8,Lexicon,0.875313,0.995041,0.754386,0.858161,0.875384,0.93621,0.773593


In [None]:
# Save the performance table into a CSV file (index=False: the row index is not written)
df_perf.to_csv("tab_bal_metrics.csv", index=False)