## Multilabel Multiclass classification

This notebook tries to model the dataset using Multilabel Multiclass classification.
- All the labels except `primary_label` are used as the multi-label input (note: `primary_label` is already taken into account in this set).
- in this notebook, we first try different classifiers (with default parameters) and compare their accuracy performance. We then evaluate the performance of the best performing classifier using GridSearchCV.
- Before classification, we preprocess the data using the `Classifiers` class. Features are represented using the tf-idf method, and the chi2 method is used for feature selection.


#### import statements

In [None]:
from classes.Classifiers.Classifiers import Classifiers
from classes.Classifiers.ClassBalancer import ClassBalancer
from classes.Classifiers.FeatureSelector import FeatureSelector
from classes.Classifiers.ModelComparison import ModelComparison
from classes.Utils.PandasUtils import PandasUtils

In [None]:
#numerical
import numpy as np
import pandas as pd
import math
from glob import glob 

#os related operations
import os 
from os import listdir
from os.path import isfile, join

#data structures
from collections import Counter
from itertools import chain
from itertools import islice

#visualisation
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.patches as mpatches
sns.set(style="darkgrid")

import warnings
warnings.filterwarnings('ignore')

In [None]:
#sklearn
from sklearn.model_selection import PredefinedSplit
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.feature_selection import SelectKBest, chi2,f_classif 

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import accuracy_score,f1_score
from sklearn.metrics import classification_report, confusion_matrix, roc_curve  

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC,SVC #multiclass
from sklearn.linear_model import LogisticRegression #multiclass
from sklearn.ensemble import RandomForestClassifier #multilabel, multioutput
from sklearn.tree import DecisionTreeClassifier  #multilabel, multioutput
from sklearn.ensemble import GradientBoostingClassifier #multiclass

#### set path

In [None]:
# All directories are resolved relative to the current working directory.
path = os.getcwd()
# Both paths end with a path separator so that file names can be appended
# with plain string concatenation.
results_path = os.path.join(path, 'results', 'multi-label-classification') + os.sep
features_path = os.path.join(path, 'features') + os.sep

In [None]:
# Create the results directory together with any missing parent folder
# (os.mkdir fails when 'results' itself does not exist yet).
# exist_ok=True keeps the cell safe to re-run.
os.makedirs(results_path, exist_ok=True)

In [None]:
# Result stores for the default-parameter runs, one dictionary per metric.
subset_acc_default, hamming_loss_default = {}, {}
f1score_default, jaccard_default = {}, {}

In [None]:
# 'setting' is the suffix appended to every model name when results are stored.
content = 'code'
setting = '_'.join(['', content, 'scaled'])
settings = [prefix + setting for prefix in ('rf', 'dt', 'gb', 'lsvc', 'svc', 'log')]

# Give every model an empty entry in each metric store.
for model_name in settings:
    for store in (jaccard_default, f1score_default, subset_acc_default, hamming_loss_default):
        store[model_name] = {}

#### load data

In [None]:
# Load the pickled feature tables for the three splits.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
test_features = pd.read_pickle(features_path + 'test_features.pkl')
train_features = pd.read_pickle(features_path + 'train_features.pkl')
validation_features = pd.read_pickle(features_path + 'validation_features.pkl')

# Renumber the rows of each split 0..n-1 and replace missing values with 0.
train_features = train_features.reset_index(drop=True).fillna(0)
validation_features = validation_features.reset_index(drop=True).fillna(0)
test_features = test_features.reset_index(drop=True).fillna(0)

print(train_features.shape)
print(validation_features.shape)
print(test_features.shape)

#### labels

In [None]:
# Names of the label columns, in the order used throughout the notebook.
labels = [
    'helper_functions',
    'load_data',
    'data_preprocessing',
    'data_exploration',
    'modelling',
    'evaluation',
    'prediction',
    'result_visualization',
    'save_results',
    'comment_only',
]

#### training + validation dataset!

In [None]:
# Stack the training rows followed by the validation rows into one frame.
# pd.concat is needed here: DataFrame.add performs element-wise arithmetic
# and does not append rows. The training-first order matters because the
# cross-validation fold list is built in that same order.
df = pd.concat([train_features, validation_features], ignore_index=True)
df.shape

In [None]:
# Count the distinct notebooks (file names) in each part of the data.
n_trainval_notebooks = len(set(df['filename'].values))
n_test_notebooks = len(set(test_features['filename'].values))
print("Total notebook in training+validation: ", n_trainval_notebooks)
print("Total notebook in test: ", n_test_notebooks)

### Classifier Setup and Feature Engineering

### Data Preprocessing

#### Categorisation of features 
*if necessary

In [None]:
def categorize(df, categories=None):
    """Encode the ``primary_label`` and ``filename`` columns of ``df`` as integers.

    ``df`` is modified in place and is also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``primary_label`` and ``filename``.
    categories : list, optional
        Ordered label names. Each label is encoded as its position in this
        list; values that are not in the list are encoded as -1. Defaults to
        the notebook-level ``labels`` list.

    Returns
    -------
    tuple
        ``(df, files, cat)``: the modified frame, the integer codes of
        ``filename`` (numbered in order of first appearance), and the
        ``pandas.Categorical`` built from the original file names.
    """
    if categories is None:
        categories = labels

    label_cat = pd.Categorical(df['primary_label'].values, categories=categories)
    # Use the category codes directly. pd.factorize would number the labels by
    # order of first appearance in the data and ignore the order of `categories`,
    # so the same label could get a different code on a different dataset.
    df['primary_label'] = np.asarray(label_cat.codes, dtype=np.intp)

    cat = pd.Categorical(df['filename'].values)
    files, _ = pd.factorize(cat)
    df['filename'] = np.asarray(files)

    return df, files, cat

In [None]:
# Encode train+validation and test in one frame so that both parts share the
# same integer codes. pd.concat stacks the rows; DataFrame.add would perform
# element-wise arithmetic instead.
temp_df = pd.concat([df, test_features], ignore_index=True)
# categorize returns (frame, filename codes, filename Categorical), in that order.
temp_df, files, cat = categorize(temp_df)

# The first df.shape[0] rows are train+validation; the remaining rows are test.
n_trainval = df.shape[0]

df1 = temp_df.iloc[:n_trainval].copy().reset_index(drop=True)
print(df1.primary_label.unique())
print(df1.shape)

df2 = temp_df.iloc[n_trainval:].copy().reset_index(drop=True)
print(df2.primary_label.unique())
print(df2.shape)

Build a pre-defined split of the validation set, to be used for cross-validation.

In [None]:
# One fold id per training row. In sklearn's PredefinedSplit, -1 keeps a
# sample in the training part of every split; any value >= 0 is a real test
# fold. Using 0 here would turn the whole training set into a test fold of
# its own (with a model fitted on the validation rows only), so use -1.
test_fold = [-1] * train_features.shape[0]

In [None]:
# Map each validation notebook (file name) to a fold id, FILES_PER_FOLD
# notebooks per fold, numbering the folds from 1.
# pd.unique keeps the order of first appearance, which makes the assignment
# reproducible; iterating over a set of strings can change between kernels.
FILES_PER_FOLD = 10
folds = {}
for position, each in enumerate(pd.unique(validation_features.filename.values)):
    folds[each] = 1 + position // FILES_PER_FOLD

In [None]:
# Append the fold id of every validation row, in row order, after the entries
# already in test_fold. Note: re-running this cell appends the ids again.
test_fold.extend(folds[name] for name in validation_features['filename'])

set the split

In [None]:
# Build the cross-validation splitter from the per-row fold ids.
# NOTE(review): test_fold holds one entry per row of train_features followed by
# validation_features. Confirm that the matrix later passed to GridSearchCV has
# the same number of rows in the same order (rows are filtered to code cells
# further down).
ps = PredefinedSplit(test_fold=test_fold)
# Show the distinct fold ids in use.
print(set(test_fold))

#### Set up the classifier and indicate conditions to restrict the dataframe we will be working on

In [None]:
# Hand the training+validation frame and the label names to the project's
# Classifiers helper.
model = Classifiers(df1, labels)

# Restrict the data to code cells; the helper creates a restricted dataframe
# from this boolean condition.
conditions = df1['cell_type'] == 'code'
model.apply_conditions_to_dataframe(conditions)

# Provide the test set and receive the train/test frames and their indices.
train, test, indices_train, indices_test = model.test_train_data_set(df2)

### Feature Engineering (Representation and Selection)

In [None]:
# Lexical feature columns that can be used: 'text', 'comment'.
# Possible extensions: 'code_line_before', 'code_line_after', 'markdown_heading', 'packages_info'.
features = ['text']
# Returns the train and test frames after the helper has processed `features`.
train,test = model.set_lexical_features(features)
#same df is modified and contains our new feature column *new_text*

#### apply preprocessing to the text feature
Many custom preprocessing functions are available in the class Preprocessing; have a look at it.

here, we will use code_text_processing function to process our lexical feature *new_text* we created previously

processed text will be available in the feature column *text_processed*

In [None]:
# Run the helper's text preprocessing on the *new_text* column.
# NOTE(review): the notebook text says the result is stored in the column
# *text_processed* - confirm against the Classifiers class.
train,test = model.preprocessing('new_text')

##### tfidf and chi2

In [None]:
# Vectorise the text features with tf-idf: 1- to 3-grams, English stop words
# removed, ignoring terms that occur in more than 20% of the documents or in
# fewer than 2 documents.
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 3),
    min_df=2,
    max_df=0.2,
    use_idf=True,
)
X_train, X_test, tfidf = model.vectorization(tfidf)
print(X_train.shape)

# Feature selection (if necessary): retain the k best lexical features
# according to the chi2 score against the label columns.
k = 2000
X_train_features, X_test_features, selector = model.feature_selection(chi2, k, train[labels])

In [None]:
# Names of all tf-idf terms, kept as a plain list so that it can be
# concatenated with other feature-name lists.
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back to the old name on older versions.
if hasattr(tfidf, 'get_feature_names_out'):
    tfidf_features = list(tfidf.get_feature_names_out())
else:
    tfidf_features = list(tfidf.get_feature_names())
print("tfidf features: ",len(tfidf_features))

# Boolean mask with one entry per tf-idf term; True marks a retained term.
selected_features = selector.get_support()
print("before feature selection: ",len(selected_features))

# Names of the terms that survived feature selection.
text_features = [name for name, keep in zip(tfidf_features, selected_features) if keep]
print("after feature selection: ",len(text_features))

In [None]:
# The following features are represented in numerical form.
# Metric features: ['linesofcomment','linesofcode','variable_count','function_count']
# Extended features: ['filename','cell_number','execution_count','text/plain' , 'image/png', 'text/html', 'execute_result', 'display_data', 'stream', 'error']
stat_features = [
    'linesofcomment',
    'linesofcode',
    'variable_count',
    'function_count',
]
# Pass the statistical columns together with the selected text features to
# the helper, which returns the train and test feature matrices.
X_train_features_, X_test_features_ = model.set_statistical_features(
    stat_features, X_train_features, X_test_features
)

#### Together, text_features and stat_features form the original feature vector of our model

In [None]:
# Names of the features in the model's feature vector: the text features kept
# by feature selection plus the statistical features. The full vocabulary in
# tfidf_features also contains the terms that feature selection dropped, so it
# must not be used here.
# NOTE(review): assumes set_statistical_features places the statistical
# columns after the text columns - confirm.
feature_vector = text_features + stat_features

#### Scale the features (min-max)
since our statistical features are on a different scale from our text vectors

In [None]:

from sklearn.preprocessing import MinMaxScaler

# Rescale every feature with a min-max scaler. The scaler is fitted on the
# training matrix only and then applied unchanged to the test matrix.
ss = MinMaxScaler().fit(X_train_features_)
X_train_features_ = ss.transform(X_train_features_)
X_test_features_ = ss.transform(X_test_features_)


## Classifier Models

## 1. Compare classifiers
*default params

In [None]:
# Compare the classifiers with their default hyper-parameters.
# The feature matrices were already reduced to the k best features above, so
# k is only used here as the key under which the scores are stored.
print(setting)
model_pipelines = [('rf',Pipeline([('clf', RandomForestClassifier(random_state=500))])),
                   ('dt',Pipeline([('clf', DecisionTreeClassifier(random_state=500))])),
                   ('gb',Pipeline([('clf', GradientBoostingClassifier(random_state=500))])),
                   ('lsvc',Pipeline([('clf', LinearSVC(random_state=500))])),
                   ('svc',Pipeline([('clf', SVC(random_state=500))])),
                   ('log',Pipeline([('clf', LogisticRegression(random_state=500))])),
                   ]

for base_name, pipeline in model_pipelines:
    print('model: ',base_name, k)
    model_name = base_name+setting

    # Multilabel classifier: one binary classifier per label column.
    # A separate variable name is used so that the Classifiers helper stored
    # in `model` is not overwritten and earlier cells stay re-runnable.
    ovr_model = OneVsRestClassifier(pipeline).fit(X_train_features_,train[labels])
    prediction = ovr_model.predict(X_test_features_)

    subset_acc = accuracy_score(test[labels], prediction)
    hamming = metrics.hamming_loss(test[labels], prediction)
    jaccard = metrics.jaccard_score(test[labels], prediction,average="weighted")
    f1score = metrics.f1_score(test[labels], prediction, average = 'weighted')

    # Store the scores; the summary cells further down read these dictionaries.
    subset_acc_default.setdefault(model_name, {})[k] = subset_acc
    hamming_loss_default.setdefault(model_name, {})[k] = hamming
    jaccard_default.setdefault(model_name, {})[k] = jaccard
    f1score_default.setdefault(model_name, {})[k] = f1score

    print(' subset acc: ',subset_acc)
    print(' Hamming: ',hamming,' jaccard: ', jaccard, 'f1score: ', f1score)


In [None]:
# Saving the default-parameter results is disabled. To enable it, uncomment
# the lines below (each dictionary is written to the file of the same name).
#pd.DataFrame.from_dict(subset_acc_default).to_pickle(results_path+'subset_acc_default.pkl')
#pd.DataFrame.from_dict(hamming_loss_default).to_pickle(results_path+'hamming_loss_default.pkl')
#pd.DataFrame.from_dict(f1score_default).to_pickle(results_path+'f1score_default.pkl')
#pd.DataFrame.from_dict(jaccard_default).to_pickle(results_path+'jaccard_default.pkl')

In [None]:
# Highest weighted F1 score over all models and feature counts. It is
# computed once instead of rebuilding the frame on every loop iteration.
best_f1 = pd.DataFrame.from_dict(f1score_default).max().max()

# Print every (model, feature count, score) entry that reaches the best score.
# The loop variables are named so that the notebook-level `k` is not overwritten.
for name, scores_by_k in f1score_default.items():
    for n_features, score in scores_by_k.items():
        if score >= best_f1:
            print(name, n_features, score)

In [None]:
# One frame per metric: columns are model names, rows are feature counts.
f1f = pd.DataFrame.from_dict(f1score_default)
saf = pd.DataFrame.from_dict(subset_acc_default)
hlf = pd.DataFrame.from_dict(hamming_loss_default)
jcf = pd.DataFrame.from_dict(jaccard_default)

# Row to summarise: the number of selected features the models were run with.
# It must be a feature count that was actually stored (k = 2000); looking up
# 1000 raises a KeyError because no run used that value.
summary_k = 2000

# Collect the rounded scores of every '_scaled' model.
f1score_dict, sacc_dict, hamming_dict, jaccard_dict = {},{},{},{}
for col in f1f.columns:
    if '_scaled' in col:
        f1score_dict[col] = round(f1f.loc[summary_k, col],3)
        sacc_dict[col] = round(saf.loc[summary_k, col],3)
        hamming_dict[col] = round(hlf.loc[summary_k, col],3)
        jaccard_dict[col] = round(jcf.loc[summary_k, col],3)

In [None]:
# Summary table: one row per model, one column per metric. The dictionary
# views are materialised as lists before the frame is built.
summary_columns = {
    'model': list(f1score_dict.keys()),
    'subset accuracy': list(sacc_dict.values()),
    'hamming loss': list(hamming_dict.values()),
    'jaccard score': list(jaccard_dict.values()),
    'f1 score': list(f1score_dict.values()),
}
slabel = pd.DataFrame(summary_columns)
#slabel.index = slabel['model']
#slabel.to_pickle(results_path+'multi_label_results.pkl')
slabel

## 2. Compare Classifiers: Best parameters using GridSearchCV

In [None]:
# Containers for the grid-search runs: one dictionary per metric plus a list
# that collects the fitted search objects.
results_all, subset_accuracy_all, hamming_loss_all = {}, {}, {}
jaccard_sim_all, f1_score_all = {}, {}
estimators_all = []

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Candidate random-forest settings.
# NOTE: these six lists are not part of the grid searched below; only
# `parameters` is passed to GridSearchCV.
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)  # lists have append(), not add()
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Grid that is actually searched. The key prefix 'estimator__rf__' addresses
# the 'rf' pipeline step inside the OneVsRestClassifier wrapper.
parameters = {
    'rf': {
        'estimator__rf__criterion': ['gini','entropy'],
        'estimator__rf__n_estimators': [int(x) for x in np.linspace(start = 10, stop = 100, num = 10)],
        'estimator__rf__class_weight': ['balanced', 'balanced_subsample'],
    }
}

k = 2000
print("******",k,"*******")
model_pipelines = [('rf',Pipeline([('rf', RandomForestClassifier(random_state=500))]))]

for pipe in model_pipelines:
    model_name = pipe[0]+'['+content+']'
    pipeline = pipe[1]

    print('model: ',model_name)
    results_all[model_name] = {}

    # Multilabel classifier, tuned with the pre-defined validation split.
    CV = GridSearchCV(OneVsRestClassifier(pipeline), parameters[pipe[0]], scoring = 'accuracy', n_jobs= 1,cv=ps)
    CV.fit(X_train_features_,train[labels])
    print(CV.best_estimator_.classes_)
    estimators_all.append(CV)  # keep the fitted search object for later inspection
    prediction_multi = CV.predict(X_test_features_)

    # Evaluate on the test set.
    subset_acc = accuracy_score(test[labels], prediction_multi)
    hamming = metrics.hamming_loss(test[labels], prediction_multi)
    jaccard = metrics.jaccard_score(test[labels], prediction_multi, average = 'weighted')
    f1score = metrics.f1_score(test[labels], prediction_multi, average = 'weighted')
    print(' subset acc: ',subset_acc)
    print(' Hamming: ',hamming,' jaccard: ', jaccard)
    print(' f1score: ',f1score)

    # Store the scores under "<model name><k>".
    result_key = model_name+str(k)
    subset_accuracy_all[result_key] = subset_acc
    hamming_loss_all[result_key] = hamming
    jaccard_sim_all[result_key] = jaccard
    f1_score_all[result_key] = f1score


In [None]:
# Per-label report for the grid-searched model's predictions on the test set.
print(classification_report(test[labels], prediction_multi, target_names=labels))

In [None]:
# Number of labels per cell (row sum over the label columns), and how many
# cells have each number of labels, for the true labels ...
true = test[labels].sum(axis=1).value_counts()
print(true)
# ... and for the predicted labels.
rowsums = pd.DataFrame(prediction_multi).sum(axis=1)
pred = rowsums.value_counts()
print(pred)

In [None]:
# Build a long-format table for plotting: for every "number of labels per
# cell" value, one row with the predicted count and one row with the true count.
# Both distributions are covered, so a value that occurs in only one of them
# is reported with a count of 0 for the other.
idxs, types, values = [], [], []
for idx in sorted(set(pred.index) | set(true.index)):
    for kind, counts in (('pred', pred), ('true', true)):
        idxs.append(idx)
        types.append(kind)
        # Look the count up by label (not by position) and fall back to 0
        # when this number of labels does not occur.
        values.append(counts.get(idx, 0))
plot = pd.DataFrame({'no of labels':idxs,'% distribution':values,'type':types})

In [None]:
# Grouped bar chart of the number of labels per cell, predicted vs. true.
sns.set(rc={'figure.figsize':(9,3)})
g = sns.catplot(y = '% distribution', x ='no of labels', hue='type', data=plot, kind='bar',legend=False)    
# NOTE(review): `title` is assigned but never applied to the figure.
title = "Labels per cell"
# Write the bar height above every bar.
for ax in g.axes.ravel():
    for p in ax.patches:
        ax.annotate(format(p.get_height(), '.0f'), 
                   (p.get_x() + p.get_width()-0.2,p.get_y()+p.get_height()), 
                   ha = 'center', va = 'center', 
                   xytext = (0, 9), 
                   textcoords = 'offset points',
                   fontsize=10)
# From here on, `ax` is the last axes object left over from the loop above.
#ax.xaxis.grid(False)
# NOTE(review): the x axis plots 'no of labels', but the label text below
# reads "% of code cells in the dataset" - confirm which axis it belongs to.
ax.set_xlabel("% of code cells in the dataset",fontsize=12,fontweight='bold')
ax.set_ylabel("",fontsize=12,fontweight='bold')
ax.tick_params(axis='both', which='major', labelsize=12)
ax.tick_params(axis='both', which='minor', labelsize=12)
#axes=plt.gca()
# Fixed y range; bars taller than 1750 would be cut off.
ax.set(ylim=(0, 1750))
plt.legend(loc='upper right')
plt.tight_layout()
# Save the figure as EPS in the results directory.
plt.savefig(results_path+'labeldistribution.eps', format='eps')

In [None]:
# Store the predicted label matrix, one column per label, with the file name
# code of each row, and pickle it to the results directory.
# NOTE(review): `filename` is aligned to df2 by row index. Confirm that the
# rows of prediction_multi are in the same order as df2 (the test rows were
# produced by the Classifiers helper).
dfpred = pd.DataFrame(prediction_multi, columns=labels).assign(filename=df2.filename)
dfpred.to_pickle(results_path + 'prediction_multi.pkl')