In [1]:
# loading libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

# Reading Dataset

In [9]:
# Read the annotated dataset into `df`.
# The path is relative, so the CSV is looked up in the kernel's current working directory.
df = pd.read_csv('cre_root_detection.csv')

# Exploratory Data Analysis

In [10]:
# 1- Checking missing values
# Per-column number and percentage of nulls. The percentage is computed from the
# frame itself (mean of the null mask) rather than from a hard-coded row count,
# so it stays correct whenever the CSV gains or loses rows.
null_mask = df.isnull()
missing_data = pd.DataFrame({'total_missing': null_mask.sum(), 'perc_missing': null_mask.mean() * 100})
missing_data

In [12]:
# Handling missing values ('freq note', 'freq weighted acc'):
# replace nulls with the most frequent value (mode) of the same column.
# The result is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: that form mutates a temporary Series and
# leaves `df` untouched when pandas copy-on-write is active.
for column in ('freq note', 'freq weighted acc'):
    df[column] = df[column].fillna(df[column].mode()[0])

# Confirm that no nulls are left (remaining null count per column).
df.isnull().sum()

In [13]:
# Explore the 'expert assigned' variable: number of rows per distinct value.
df['expert assigned'].value_counts()

In [16]:
# Rows labelled 5 in 'expert assigned' are removed because that class has too few entries.
rows_labelled_5 = df.index[df['expert assigned'] == 5]
df.drop(index=rows_labelled_5, inplace=True)

# Cohen Kappa Score computation

In [19]:
# 'title', 'certainty' and 'root' are left out of the agreement analysis.
newDf = df.drop(columns=['title', 'certainty', 'root'])

In [21]:
from sklearn.metrics import cohen_kappa_score

# Pairwise Cohen's kappa between every pair of columns of `newDf`:
# one row of scores per column, in column order.
arr = [
    [cohen_kappa_score(newDf[row_name], newDf[col_name]) for col_name in newDf]
    for row_name in newDf
]

# Square score matrix whose columns carry the names of the compared columns.
mydf = pd.DataFrame(arr, columns=newDf.columns)


# Plotting Cohen Kappa Correlation

In [22]:
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt

# Annotated heat map of the pairwise kappa matrix, saved as a PDF and shown inline.
kappa_matrix = mydf.values
column_labels = [str(name) for name in mydf]

fig = plt.figure(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')
plt.clf()
ax = fig.add_subplot(111)
ax.set_aspect(1)
res = sns.heatmap(kappa_matrix, annot=True, fmt='.2f', cmap="YlGnBu", vmin=0.0, vmax=1.0)
plt.title('Cohen Kappa Score', fontsize=12)

# Ticks are placed at i + 0.5 and labelled with the column names.
plt.xticks([i + 0.5 for i in range(kappa_matrix.shape[0])], column_labels, rotation=90)
plt.yticks([i + 0.5 for i in range(kappa_matrix.shape[1])], column_labels, rotation=0)

plt.savefig("Cohen Kappa Correlation.pdf", bbox_inches='tight', dpi=100)
plt.show()

# GLOBAL SETTINGS

In [34]:
# FEATURE VECTOR SETTINGS
# One flag per candidate feature column: 1 keeps the column in the feature
# matrix, any other value drops it when X is built in the next cell.
Krumhansl_Shmuckler = 1
simple_weights = 1
Aarden_Essen = 1
Bellman_Budge = 1
Temperly_Kostka_Payne = 1
as_transcribed = 1
final_note = 1
freq_note = 1
freq_weighted_acc = 1
certainty = 0 # removed due to low correlation
root = 0 # removed on Danny's suggestion; NOTE(review): the meaning of this column is not documented here - confirm

# Defining X (features) and Y (target Class)

In [35]:
# (flag, column) pairs: a column is removed from the features unless its flag equals 1.
feature_switches = [
    (Krumhansl_Shmuckler, 'Krumhansl-Shmuckler'),
    (simple_weights, 'simple weights'),
    (Aarden_Essen, 'Aarden Essen'),
    (Bellman_Budge, 'Bellman Budge'),
    (Temperly_Kostka_Payne, 'Temperly Kostka Payne'),
    (as_transcribed, 'as transcribed'),
    (final_note, 'final_note'),
    (freq_note, 'freq note'),
    (freq_weighted_acc, 'freq weighted acc'),
    (certainty, 'certainty'),
    (root, 'root'),
]
switched_off = [column for flag, column in feature_switches if flag != 1]

# Features: everything except the target and the title, minus the switched-off columns.
X = df.drop(['expert assigned', 'title'], axis=1)
X = X.drop(switched_off, axis=1)

print("List of features considered: ", X.columns)

# Target class.
y = df['expert assigned']

# Holding out 10% of the original data to evaluate the performance of the developed model

In [36]:
# Split the data into a model (training) set and a held-out evaluation set.
# test_size=0.1 reserves 10% of the rows for evaluation; the fixed random_state
# makes the shuffle, and therefore the split, repeatable.
from sklearn.model_selection import train_test_split
ModelDataset_X, EvalationDataset_X, ModelDataset_y, EvalationDataset_y = train_test_split(X, y, test_size = 0.1, random_state = 30)

In [37]:
# Class distribution of the held-out evaluation labels.
print(EvalationDataset_y.value_counts())

# Preparing Datasets for experimentations

In [38]:
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler

# Both resamplers are stochastic. Seeding them makes the resampled datasets,
# and every score computed from them, identical across "Restart & Run All".
RESAMPLING_SEED = 30

# --- Dataset for model creation (the training data) ---
# Step 1: random oversampling of the minority class only.
# NOTE(review): presumably done so that the rarest class has enough samples
# for SMOTE's nearest-neighbour step - confirm.
oversample = RandomOverSampler(sampling_strategy='minority', random_state=RESAMPLING_SEED)
ModelDataset_X_oversample, ModelDataset_y_oversample = oversample.fit_resample(ModelDataset_X, ModelDataset_y)

# Step 2: SMOTE on top of the randomly oversampled data.
smt = SMOTE(random_state=RESAMPLING_SEED)
ModelDataset_X_smote, ModelDataset_y_smote = smt.fit_resample(ModelDataset_X_oversample, ModelDataset_y_oversample)


# --- Dataset for model evaluation (the test data) ---
# Same two steps, applied to the held-out evaluation split.
# NOTE(review): the resulting evaluation set contains duplicated and synthetic
# rows, so scores on it are not scores on real unseen data.
oversample = RandomOverSampler(sampling_strategy='minority', random_state=RESAMPLING_SEED)
EvalationDataset_X_oversample, EvalationDataset_y_oversample = oversample.fit_resample(EvalationDataset_X, EvalationDataset_y)

# The seeded SMOTE instance is reused; fit_resample refits it on the data it is given.
EvalationDataset_X_smote, EvalationDataset_y_smote = smt.fit_resample(EvalationDataset_X_oversample, EvalationDataset_y_oversample)


# At this point we have the following datasets

(ModelDataset_X, ModelDataset_y)
(EvalationDataset_X, EvalationDataset_y)

(ModelDataset_X_smote, ModelDataset_y_smote)
(EvalationDataset_X_smote, EvalationDataset_y_smote)

In [39]:

def _counts_frame(counts):
    """Turn a value_counts() Series into a two-column frame: 'note' (class label) and 'count'.

    `convert_dtypes()` is called without arguments: its first positional
    parameter is the boolean `infer_objects`, not a target dtype, so passing
    `int` there did not request an integer conversion.
    """
    return pd.DataFrame({'note': counts.index, 'count': counts.values}).convert_dtypes()


# Number of rows per class in each of the four datasets.
ModelDataset_y_s = ModelDataset_y.value_counts()
ModelDataset_y_smote_s = ModelDataset_y_smote.value_counts()
EvalationDataset_y_s = EvalationDataset_y.value_counts()
EvalationDataset_y_smote_s = EvalationDataset_y_smote.value_counts()

# One ('note', 'count') table per dataset, in the same order as above.
mydf1 = _counts_frame(ModelDataset_y_s)
mydf2 = _counts_frame(ModelDataset_y_smote_s)
mydf3 = _counts_frame(EvalationDataset_y_s)
mydf4 = _counts_frame(EvalationDataset_y_smote_s)

# Distribution of model and evaluation datasets

In [40]:
# Place the four class-count tables side by side (axis=1); `keys` adds a
# top-level column label naming the dataset each table came from.
mdf = pd.concat([mydf1, mydf2, mydf3, mydf4], axis=1,  keys=('ModelDataset_y','ModelDataset_y_smote', "EvalationDataset_y", "EvalationDataset_y_smote"))
mdf

# Classification report of state-of-the-art models for root note detection 

In [41]:

# Each existing root-detection method is itself a column of predictions, so it
# can be scored directly against the expert labels of the evaluation split.
# (printed heading, column holding that method's predictions)
baseline_methods = [
    ("Classification Report - Krumhansl-Shmuckler", "Krumhansl-Shmuckler"),
    ("Classification Report - simple weights", "simple weights"),
    ("Classification Report - Aarden Essen", "Aarden Essen"),
    ("Classification Report - Bellman Budge", "Bellman Budge"),
    ("Classification Report - Temperly Kostka Payne", "Temperly Kostka Payne"),
    ("Classification Report - as transcribed", "as transcribed"),
    ("Classification Report - 'final_note'", "final_note"),
    ("Classification Report - 'freq note'", "freq note"),
    ("Classification Report - 'freq weighted acc'", "freq weighted acc"),
]

# The predictions are read from `df` (restricted to the evaluation rows) rather
# than from EvalationDataset_X: a column whose feature flag is switched off is
# dropped from X, and looking it up there would raise KeyError.
evaluation_rows = EvalationDataset_y.index
for heading, column in baseline_methods:
    print(heading)
    print(classification_report(EvalationDataset_y, df.loc[evaluation_rows, column], labels=[0,2,4,7,9]))


# Factorial Design Experimental Setup
**Grid-based hyperparameter tuning for developing the optimized model**

In [42]:
# Decision tree classifier
from sklearn.tree import DecisionTreeClassifier
# Random forest classifier
from sklearn.ensemble import RandomForestClassifier
# Gaussian Naive Bayes classifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm

# Hyper-parameter grids explored by the grid search, one per model family.

# Grid for svm.SVC (currently unused: the 'SVM' entry below is commented out).
svm_param_grid = {
             'C': [0.1, 0.5, 1.0],
             'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
             'degree': [2, 3, 5],
             'gamma': ['auto', 'scale'],
             'tol': [1e-5, 1e-3, 1e-2],
             'max_iter': [-1, 5, 10]
}

# 'auto' is intentionally absent from max_features: for classifiers it was an
# alias of 'sqrt', and current scikit-learn releases reject it, which would
# make every candidate using it fail to fit.
RandomForest_param_grid = {
    'max_depth': [2, 3, 4, 5, 9, 10, 11,12, 15, 20, 22, 23, 30, 60],
    'max_features': ['sqrt', 'log2'],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 25],
    'criterion': ['gini', 'entropy'],
}


# Same note on max_features as for the random forest grid.
DecisionTree_param_grid = {
    'max_depth': [2, 3, 4, 5, 10, 15, 30, 60],
    'max_features': ['sqrt', 'log2'],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'criterion': ['gini', 'entropy'],
}

# 100 log-spaced smoothing values from 1 down to 1e-9.
NB_param_grid = {
                'var_smoothing': np.logspace(0,-9, num=100)
                }


# Registry consumed by the grid-search helper: name -> prototype estimator and
# its grid. The insertion order (DecisionTree, RandomForest, NB) is the order
# of the results list that the helper returns.
model_param = {
  'DecisionTree':{
      'model': DecisionTreeClassifier(),
      'params': DecisionTree_param_grid,
  },

  'RandomForest': {
          'model': RandomForestClassifier(),
          'params': RandomForest_param_grid,
    },

  'NB': {
          'model': GaussianNB(),
          'params': NB_param_grid,
    },

  #'SVM': {
  #         'model':   svm.SVC(),
  #         'params': svm_param_grid,
  # }
}

In [None]:
# function to be called for evaluating each model defined above
def getBestModel(X, y, cv=10, scoring='f1_macro'):
    """Grid-search every model registered in `model_param` and report the winners.

    Parameters
    ----------
    X : feature matrix passed to GridSearchCV.fit.
    y : target labels; flattened to a 1-D array before fitting.
    cv : number of cross-validation folds (default 10).
    scoring : scikit-learn scoring name used to rank candidates (default 'f1_macro').

    Returns
    -------
    list of dict, one per entry of `model_param`, in its insertion order:
        'model'       - the best estimator found by the search, i.e. an
                        independent clone configured with `best_params` and
                        refit on (X, y). Previously the untuned shared
                        prototype from `model_param` was returned, so the
                        "best" models were default-hyperparameter models and
                        two calls handed back the very same objects.
        'best_score'  - mean cross-validated score of that estimator.
        'best_params' - the winning parameter combination.
    """
    # Imported here so the function does not depend on a later notebook cell
    # having been executed first.
    from sklearn.model_selection import GridSearchCV

    # np.ravel accepts Series and arrays alike (Series.ravel is deprecated).
    targets = np.ravel(y)

    results = []
    for spec in model_param.values():
        search = GridSearchCV(spec['model'], spec['params'], cv=cv, scoring=scoring)
        search.fit(X, targets)
        results.append(
            {
                'model': search.best_estimator_,
                'best_score': search.best_score_,
                'best_params': search.best_params_,
            }
        )
    return results

# The following experimentation should be executed twice

1. **with the `as transcribed` feature**
1. **without the `as transcribed` feature**

# Factorial Design Experiment - GridSearch - Implementation
# finding best models on original dataset (ModelDataset_X, ModelDataset_y)

In [None]:
from sklearn.model_selection import GridSearchCV
# Datasets available at this point:
#   ModelDataset_X, ModelDataset_y
#   ModelDataset_X_smote, ModelDataset_y_smote
#   EvalationDataset_X, EvalationDataset_y
#   EvalationDataset_X_smote, EvalationDataset_y_smote
# Run the grid search on the model dataset without resampling.
results = getBestModel(ModelDataset_X, ModelDataset_y)

print(results)

# loading best models of DecisionTree, RandomForest, and GaussianNB

In [None]:
# Tabulate the grid-search results, best cross-validated score first.
result1 = pd.DataFrame(results).sort_values(by='best_score', ascending=False)
result1

In [None]:
# Fixed positions in the `results` list. The list is built by iterating
# `model_param`, so its order is DecisionTree, RandomForest, NB; it is not
# affected by the sorting of the `result1` table above.
bestModel_DecisionTreeClassifier = results[0]['model']
bestModel_RandomForestClassifier = results[1]['model']
bestModel_GaussianNB = results[2]['model']

# Fitting the best models
**printing their classification reports for further analysis**

In [None]:
# Fit each selected model on the (non-resampled) model dataset.
for classifier in (
    bestModel_DecisionTreeClassifier,
    bestModel_RandomForestClassifier,
    bestModel_GaussianNB,
):
    classifier.fit(ModelDataset_X, ModelDataset_y.ravel())

# Show the configuration of each fitted model.
for classifier in (
    bestModel_RandomForestClassifier,
    bestModel_DecisionTreeClassifier,
    bestModel_GaussianNB,
):
    print(classifier)

# Best models evaluation on unseen dataset (EvalationDataset_X and EvalationDataset_X_smote)

In [None]:
# Apply every fitted model to both evaluation sets (as split, then resampled)
# and print one classification report per (model, dataset) pair.
evaluation_sets = (
    (EvalationDataset_X, EvalationDataset_y),
    (EvalationDataset_X_smote, EvalationDataset_y_smote),
)
for classifier in (
    bestModel_RandomForestClassifier,
    bestModel_DecisionTreeClassifier,
    bestModel_GaussianNB,
):
    for eval_features, eval_labels in evaluation_sets:
        EvalationDataset_y_pred = classifier.predict(eval_features)
        print(classification_report(eval_labels, EvalationDataset_y_pred, labels=[0,2,4,7,9]))

In [None]:
# First entry of `results`, i.e. the entry produced for the first model
# registered in `model_param` (the decision tree).
bestModel = results[0]['model']
print(bestModel)

In [None]:
# Class distribution of the evaluation labels, before and after resampling.
for heading, labels in (
    ("count unique values of each class - EvalationDataset_y", EvalationDataset_y),
    ("count unique values of each class - EvalationDataset_y_smote", EvalationDataset_y_smote),
):
    print(heading)
    print(labels.value_counts())


# Factorial Design Experiment - GridSearch - Implementation
**finding best models on the resampled (random oversampling + SMOTE) dataset (ModelDataset_X_smote, ModelDataset_y_smote)**

In [None]:
# Repeat the grid search, this time on the resampled (random oversampling + SMOTE) model dataset.
my_results = getBestModel(ModelDataset_X_smote, ModelDataset_y_smote)

In [None]:
# Tabulate the grid-search results for the resampled data, best score first.
my_results1 = pd.DataFrame(my_results).sort_values(by='best_score', ascending=False)
my_results1

# Best models based on previous results

In [None]:
# Fixed positions in the `my_results` list, which follows the iteration order
# of `model_param`: DecisionTree, RandomForest, NB.
bestModel_s_DecisionTreeClassifier = my_results[0]['model']
bestModel_s_RandomForestClassifier = my_results[1]['model']
bestModel_s_GaussianNB = my_results[2]['model']

In [None]:
# Fit each selected model on the resampled model dataset, then show its configuration.
smote_classifiers = (
    bestModel_s_DecisionTreeClassifier,
    bestModel_s_RandomForestClassifier,
    bestModel_s_GaussianNB,
)
for classifier in smote_classifiers:
    classifier.fit(ModelDataset_X_smote, ModelDataset_y_smote.ravel())

for classifier in smote_classifiers:
    print(classifier)

In [None]:
# First entry of `my_results` (the decision-tree entry), fitted on the resampled model dataset.
bestModel2 = my_results[0]['model']
bestModel2.fit(ModelDataset_X_smote, ModelDataset_y_smote.ravel())

# The sorted table gets its own name. Rebinding `my_results` from a list to a
# DataFrame made `my_results[0]` raise KeyError when this cell, or the earlier
# cell that unpacks the list, was run again.
my_results_sorted = pd.DataFrame(my_results).sort_values(by='best_score', ascending=False)
print("Results with smote ")
my_results_sorted

# Best models evaluation on unseen dataset (EvalationDataset_X and EvalationDataset_X_smote)

In [None]:
# For every model fitted on the resampled data: print its heading, then one
# classification report per evaluation set (as split, then resampled).
evaluation_sets = (
    (EvalationDataset_X, EvalationDataset_y),
    (EvalationDataset_X_smote, EvalationDataset_y_smote),
)
labelled_classifiers = (
    ("Randomforest - Classifier", bestModel_s_RandomForestClassifier),
    ("Decision Tree - Classifier", bestModel_s_DecisionTreeClassifier),
    ("Naive Bayes - Classifier", bestModel_s_GaussianNB),
)
for heading, classifier in labelled_classifiers:
    print(heading)
    for eval_features, eval_labels in evaluation_sets:
        EvalationDataset_y_pred = classifier.predict(eval_features)
        print(classification_report(eval_labels, EvalationDataset_y_pred, labels=[0,2,4,7,9]))