# Classification task

In [None]:
# installation of packages
!pip install nltk

In [None]:
# installation of NLTK data
import nltk

# punkt / punkt_tab: sentence and word tokenizer models used by word_tokenize
#   (recent NLTK releases look for 'punkt_tab', older ones for 'punkt', so both are fetched)
# wordnet: dictionary used by WordNetLemmatizer
# stopwords: stop word lists
for resource in ('punkt', 'punkt_tab', 'wordnet', 'stopwords'):
    nltk.download(resource)

## Data pre-processing

In [None]:
# loading data
import ast
import pandas as pd

# reading the dataset
df = pd.read_csv('../data/dataset_filtered_labels.csv')

# getting the list of genres
# every entry of the 'genres' column is the text form of a Python list, hence ast.literal_eval;
# the labels are collected in one pass and sorted so that the genre <-> id mappings are the same
# on every run (iteration order of a set of strings can differ between interpreter sessions)
# NOTE(review): sorting assumes all genre labels are mutually comparable (e.g. all strings) - confirm
genres = sorted({genre for raw in df['genres'].values for genre in ast.literal_eval(raw)})

# creating the mappings from genres to id and vice versa
genre2id = {genre: idx for idx, genre in enumerate(genres)}
id2genre = dict(enumerate(genres))

df.head()

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

X = df['summary'].to_numpy() # corpus - predictor variables
Y = np.zeros((X.shape[0], len(genres)), dtype=int) # genres - target variables (one column per genre)

# populating Y

# genres assigned to works, parsed into a fresh list.
# Nothing is written back into `df['genres'].to_numpy()`: that array may share memory with the
# DataFrame, and replacing its strings with lists would make this cell (and the genre-listing
# cell) fail with a parsing error when run a second time.
genre_data = [ast.literal_eval(raw) for raw in df['genres']]
for idx, work_genres in enumerate(genre_data):
    for g in work_genres:
        Y[idx, genre2id[g]] = 1


In [None]:
%%script false --no-raise-error
# Search of optimal value for min_df
#
# NOTE: the `%%script false` magic on the first line means this cell is skipped when the notebook runs.
# If it is re-enabled it has to be executed after the cells that bring TfidfVectorizer,
# LemmaTokenizer, ComplementNB and OneVsRestClassifier into scope.

# the split uses a fixed random_state, so it is the same for every candidate value: compute it once
opt_X_train_raw, opt_X_test_raw, opt_Y_train, opt_Y_test = train_test_split(X, Y, random_state=2023)

for val in [0.00375, 0.00625, 0.0075, 0.00875]:
    # names are prefixed with `opt_` so that the search does not overwrite the notebook-level
    # `vectorizer` / `clf_twcnb` objects
    opt_vectorizer = TfidfVectorizer(tokenizer=LemmaTokenizer(), min_df=val)
    opt_clf = OneVsRestClassifier(ComplementNB())
    opt_X_train = opt_vectorizer.fit_transform(opt_X_train_raw)
    opt_X_test = opt_vectorizer.transform(opt_X_test_raw)
    opt_clf.fit(opt_X_train, opt_Y_train)
    print(f'{val * 100: .4f}% -> {opt_clf.score(opt_X_test, opt_Y_test)}')

# Recorded results (min_df as % of documents -> score); they cover more values than the list
# above, so presumably they were gathered over several runs of this cell
# 0.000% -> 0.060659186535764374
# 0.125% -> 0.3159186535764376 < OPTIMAL >
# 0.250% -> 0.28856942496493687
# 0.375% -> 0.258765778401122
# 0.500% -> 0.22300140252454417
# 0.625% -> 0.20196353436185133
# 0.750% -> 0.21608598962194217
# 0.875% -> 0.20570793180133431
# 1.000% -> 0.15252454417952313

In [None]:
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

# defining tokenizer which performs lemmatization and skips stop or/and non-alphabetic words
class LemmaTokenizer:
    """Callable tokenizer that lemmatizes tokens and drops stop words and non-alphabetic tokens.

    An instance can be passed as the `tokenizer` argument of a scikit-learn text vectorizer.
    """

    def __init__(self):
        # English stop words kept in a set for fast membership tests
        self.stops = set(stopwords.words('english'))
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the list of lower-cased lemmas of the alphabetic, non-stop-word tokens of `doc`."""
        lemmas = []
        for token in word_tokenize(doc):
            if not token.isalpha():
                continue
            # lower-case BEFORE lemmatizing and before the stop word test: the stop word list is
            # lower-case, so a capitalised stop word ("The") would otherwise slip through the filter
            lemma = self.wnl.lemmatize(token.lower())
            if lemma not in self.stops:
                lemmas.append(lemma)
        return lemmas
    
# defining TF-IDF vectorizer
# we put a threshold of 0.125% (min_df=0.00125, i.e. a fraction of the summaries) for term presence
# so we can filter out the least common terms, which can cause overfitting of the classifier
# (this choice is backed by Zipf's law); 0.125% is the value marked as optimal in the min_df search notes
vectorizer = TfidfVectorizer(tokenizer=LemmaTokenizer(), min_df=0.00125)

## Classifiers

In [None]:
from sklearn.naive_bayes import ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline

from sklearn.model_selection import RandomizedSearchCV, cross_validate, KFold

from sklearn.metrics import make_scorer, accuracy_score, hamming_loss

from datetime import datetime
from scipy.stats import randint
import pickle
import os

In [None]:
# report performance
def report_performance_on_test_sets(scores):
    """Print mean and standard deviation of the test metrics stored in `scores`.

    Parameters
    ----------
    scores : mapping
        Must provide the keys 'test_F1 micro', 'test_Accuracy' and 'test_Hamming loss',
        each holding a sequence of numbers (one value per evaluation).

    Prints one line per metric; returns nothing.
    """
    import numpy as np

    # (label shown in the report, key looked up in `scores`)
    reported_metrics = (
        ('F1 micro score', 'test_F1 micro'),
        ('Accuracy', 'test_Accuracy'),
        ('Hamming loss', 'test_Hamming loss'),
    )
    for label, key in reported_metrics:
        fold_values = scores[key]
        print(f'{label}: mean = {np.mean(fold_values):.3f}, standard deviation = {np.std(fold_values):.3f}')
    


### TWCNB

In [None]:
st = datetime.now()

# one Complement Naive Bayes classifier per genre (one-vs-rest) on top of TF-IDF features
base_twcnb = ComplementNB()
clf_twcnb = OneVsRestClassifier(base_twcnb)

pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(tokenizer=LemmaTokenizer(), min_df=0.00125)),
    ('model', clf_twcnb),
])

# 5-fold cross-validation with a fixed seed so the folds are reproducible
cv = KFold(n_splits=5, shuffle=True, random_state=2023)

scoring = {"F1 micro": "f1_micro", "Accuracy": make_scorer(accuracy_score),
           "Hamming loss": make_scorer(hamming_loss)}

# stored under `twcnb_scores`, the name used by the cells that save / load the TWCNB scores
twcnb_scores = cross_validate(pipeline, X=X, y=Y, cv=cv, scoring=scoring, return_estimator=True)
# backward-compatible alias for code that refers to these scores by their former name
mnb_scores = twcnb_scores

print("Time taken to perform multiple model evaluation: ", datetime.now() - st)
report_performance_on_test_sets(twcnb_scores)

In [None]:
#with open('../data/scores/redgenres/TWCNB_scores_red.pkl', 'wb') as f:
#    pickle.dump(twcnb_scores, f)

In [None]:
# Read the pickled TWCNB scores from disk into `twcnb_scores`.
# NOTE(review): unpickling can execute code stored in the file - only load files you trust.
with open('../data/scores/allgenres/TWCNB_scores.pkl', 'rb') as scores_file:
    twcnb_scores = pickle.load(scores_file)

### Logistic regression

In [None]:
def tune_and_evaluate_model(model, param_space, X, Y, n_iter=3, random_state=2023):
    """Tune and evaluate `model` on TF-IDF features with nested cross-validation.

    The model is placed after a TF-IDF vectorizer in a pipeline. An inner 2-fold randomized
    search picks the hyperparameters (best micro F1), an outer 5-fold cross-validation
    evaluates the tuned pipeline. A summary of the outer test scores is printed.

    Parameters
    ----------
    model : estimator
        Classifier used as the 'model' step of the pipeline.
    param_space : dict
        Search space handed to RandomizedSearchCV; keys address pipeline steps
        (e.g. 'model__<parameter>').
    X : array-like
        Documents fed to the TF-IDF vectorizer.
    Y : array-like
        Targets passed to cross_validate as `y`.
    n_iter : int, default 3
        Number of parameter settings sampled by the randomized search.
    random_state : int, default 2023
        Seed for both fold splitters and for the randomized search, so that a run can be repeated
        with the same folds and the same sampled candidates.

    Returns
    -------
    dict
        The result of `cross_validate` (fitted search objects included, as return_estimator=True).
    """
    # define the pipeline
    pipeline = Pipeline(steps=[
        ('tfidf', TfidfVectorizer(tokenizer=LemmaTokenizer(), min_df=0.00125)),
        ('model', model),
    ])

    inner_cv = KFold(n_splits=2, shuffle=True, random_state=random_state) # for hyperparameter tuning
    outer_cv = KFold(n_splits=5, shuffle=True, random_state=random_state) # for model evaluation

    # defining scoring for k fold cross validation
    scoring = {"F1 micro": "f1_micro", "Accuracy": make_scorer(accuracy_score),
               "Hamming loss": make_scorer(hamming_loss)}

    # Nested CV with parameter optimization
    # for hyperparameter tuning the F1 score is used to find the best parameters for refitting the
    # estimator at the end. It is specified with the parameter refit = 'F1 micro'.
    # random_state makes the sampled candidates reproducible, like the seeded fold splitters.
    clf = RandomizedSearchCV(pipeline, param_space, n_iter=n_iter, scoring=scoring, cv=inner_cv,
                             refit='F1 micro', random_state=random_state)

    scores = cross_validate(clf, X=X, y=Y, cv=outer_cv, return_estimator=True, scoring=scoring)

    # report performance
    report_performance_on_test_sets(scores)
    return scores

In [None]:
st = datetime.now()

# one logistic regression per genre (one-vs-rest)
clf_lr = OneVsRestClassifier(LogisticRegression(max_iter=200))

# candidate values of the parameter C of the wrapped LogisticRegression
param_space = {'model__estimator__C': [0.01, 0.1, 1, 5, 10]}

lr_scores = tune_and_evaluate_model(clf_lr, param_space, X, Y)

elapsed = datetime.now() - st
print(f"Time taken to perform hyperparameter tuning and multiple model evaluation:  {elapsed}")

In [None]:
#with open('../data/scores/redgenres/LR_scores_red.pkl', 'wb') as g:
#    pickle.dump(lr_scores, g)

In [None]:
# Read the pickled LR scores from disk; this replaces any `lr_scores` already in the kernel.
# NOTE(review): unpickling can execute code stored in the file - only load files you trust.
with open('../data/scores/allgenres/LR_scores.pkl', 'rb') as scores_file:
    lr_scores = pickle.load(scores_file)

### Random forest classifier
Inherently supports multilabel classification

In [None]:
st = datetime.now()

# random_state fixes the randomness of the forest itself, so repeated runs give the same scores
# (the cross-validation folds and the XGBoost model are seeded with 2023 as well)
rf = RandomForestClassifier(n_estimators=2, random_state=2023)

# candidate tree depths: 10, 60, 110, 160, 210
param_space = {
              'model__max_depth': list(np.arange(10, 250, step=50))
              }
rf_scores = tune_and_evaluate_model(rf, param_space, X, Y)
print("Time taken to perform hyperparameter tuning and multiple model evaluation: ", datetime.now() - st)


In [None]:
#with open('../data/scores/redgenres/RF_scores_red.pkl', 'wb') as h:
#    pickle.dump(rf_scores, h)

In [None]:
# Read the pickled RF scores from disk; this replaces any `rf_scores` already in the kernel.
# NOTE(review): unpickling can execute code stored in the file - only load files you trust.
with open('../data/scores/allgenres/RF_scores.pkl', 'rb') as scores_file:
    rf_scores = pickle.load(scores_file)

### XGBoost

In [None]:
# define the classifier
st = datetime.now()

# NOTE(review): the model is wrapped in OneVsRestClassifier, yet it is configured with the
# 'multi:softmax' objective and num_class=2 - confirm this is intended rather than a binary objective
xgboost = xgb.XGBClassifier(
    objective='multi:softmax',
    seed=2023,
    num_class=2,
    gamma=0.1,
    learning_rate=0.5,
    n_estimators=200,
)
one_vs_rest_xgboost = OneVsRestClassifier(xgboost)

# candidate depths of the trees of the wrapped XGBClassifier
param_space = {'model__estimator__max_depth': [2, 5]}

xgboost_scores = tune_and_evaluate_model(one_vs_rest_xgboost, param_space, X, Y)

elapsed = datetime.now() - st
print(f"Time taken to perform hyperparameter tuning and multiple model evaluation:  {elapsed}")

In [None]:
#with open('../data/scores/redgenres/XGBoost_scores_red.pkl', 'wb') as i:
#    pickle.dump(xgboost_scores, i)

In [None]:
# Read the pickled XGBoost scores from disk; this replaces any `xgboost_scores` already in the kernel.
# NOTE(review): unpickling can execute code stored in the file - only load files you trust.
with open('../data/scores/allgenres/XGBoost_scores.pkl', 'rb') as scores_file:
    xgboost_scores = pickle.load(scores_file)

## Evaluation of the models 

In [None]:
import matplotlib.pyplot as plt

# apply the two style sheets shipped next to the notebook (paths are relative to the working directory);
# when several sheets are given, later ones override settings of earlier ones
plt.style.use([
    './mplstyles/science.mplstyle',
    './mplstyles/nature.mplstyle',
])

In [None]:
# plotting box plots for scores

# cross-validation results per model; TWCNB is read from `twcnb_scores` (the variable filled by the
# TWCNB score-loading cell) so that all four models are taken from their `*_scores` variables
all_model_scores = {'TWCNB': twcnb_scores,
                    'LR': lr_scores,
                    'RF': rf_scores,
                    'XGBoost': xgboost_scores}

# one dict per metric: model name -> array of per-fold test scores
models_scores_f1 = {name: res['test_F1 micro'] for name, res in all_model_scores.items()}
models_scores_accuracy = {name: res['test_Accuracy'] for name, res in all_model_scores.items()}
models_scores_hamming = {name: res['test_Hamming loss'] for name, res in all_model_scores.items()}

fig, (ax1, ax2, ax3) = plt.subplots(1, 3)
fig.subplots_adjust(right=1.5, wspace=0.5)
fig.suptitle("Models' performances on 15 genres", y=0.95, x=0.75)

# one panel per metric: (axes, scores per model, y-axis label)
panels = [
    (ax1, models_scores_f1, "F1 (micro) score"),
    (ax2, models_scores_accuracy, "Accuracy"),
    (ax3, models_scores_hamming, "Hamming loss"),
]
for panel_ax, panel_scores, panel_label in panels:
    # dashed-style mean line is drawn in addition to the median (showmeans + meanline)
    panel_ax.boxplot(list(panel_scores.values()), showmeans=True, meanline=True)
    panel_ax.set_xticklabels(list(panel_scores.keys()))
    panel_ax.set_xlabel("Model")
    panel_ax.set_ylabel(panel_label)

In [None]:
# plotting F1 scores
from math import sqrt

# model names and their per-fold F1 scores, in matching order
model_names = list(models_scores_f1.keys())
f1_per_model = list(models_scores_f1.values())

xbar = np.arange(len(model_names))
ybar = [fold_scores.mean() for fold_scores in f1_per_model]   # bar height: mean over folds
stdev = [np.std(fold_scores) for fold_scores in f1_per_model] # error bar: standard deviation over folds

fig, ax = plt.subplots()
ax.bar(xbar, ybar, yerr=stdev, align='center', alpha=0.5, ecolor='black', capsize=10)
ax.set_ylabel('F1 score')
ax.set_xticks(xbar)
ax.set_xticklabels(model_names)
ax.yaxis.grid(True)

plt.tight_layout()
plt.show()
fig.savefig('../gfx/allgenres_f1.jpg', dpi=300)

# standard deviation divided by the square root of the number of scores of the first model
n_scores = len(f1_per_model[0])
print('Standard errors:', np.array(stdev) / sqrt(n_scores))

In [None]:
# obtaining standard errors

# standard error per model = standard deviation of its scores / sqrt(number of scores);
# np.sqrt is used so that this cell does not rely on an import made in another cell
standard_errors = {}
for metric_label, metric_scores in (('F1', models_scores_f1),
                                    ('Acc', models_scores_accuracy),
                                    ('HL', models_scores_hamming)):
    per_model = list(metric_scores.values())
    standard_errors[metric_label] = np.array([np.std(s) for s in per_model]) / np.sqrt(len(per_model[0]))

f1s = standard_errors['F1']
accs = standard_errors['Acc']
hls = standard_errors['HL']

print('Standard errors')
for i, model_name in enumerate(models_scores_f1.keys()):
    # name the model so that each group of three numbers can be attributed
    print(model_name)
    print('F1:', round(f1s[i], 4))
    print('Acc:', round(accs[i], 4))
    print('HL:', round(hls[i], 4))
    print()

In [None]:
# plotting fitting times

# NOTE: this cell deliberately does not assign to `X`; `X` is the corpus used by the training cells
# and must stay intact so those cells can be re-run after plotting.
models = ['TWCNB', 'LR', 'RF', 'XGBoost']
# per-fold fit times, in the same order as `models`; TWCNB is read from `twcnb_scores`
# (the variable filled by the TWCNB score-loading cell), like the other `*_scores` variables
fit_times = [twcnb_scores['fit_time'], lr_scores['fit_time'],
             rf_scores['fit_time'], xgboost_scores['fit_time']]

fig, ax = plt.subplots()
xbar = np.arange(len(models))
ybar = [np.mean(times) for times in fit_times]  # bar height: mean fit time over folds
stdev = [np.std(times) for times in fit_times]  # error bar: standard deviation over folds
ax.bar(xbar, ybar, yerr=stdev, align='center', alpha=0.5, ecolor='black', capsize=10)
ax.set_xlabel('Model')
ax.set_ylabel('Mean fit time [s]')
ax.set_xticks(xbar)
ax.set_xticklabels(models)
ax.yaxis.grid(True)

plt.tight_layout()
plt.show()
fig.savefig('../gfx/allgenrestimes.jpg', dpi=300)

# standard deviation divided by the square root of the number of fit times of the first model
print('Standard errors:', np.array(stdev) / np.sqrt(len(fit_times[0])))