# Test ML Models for Pairwise Classification

### Imports

In [6]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV

from sklearn.neural_network import MLPClassifier

import seaborn as sns
from sklearn.metrics import classification_report
# Apply seaborn's "darkgrid" style so the figures drawn later in the notebook share one look.
sns.set(style="darkgrid")

In [2]:
# Load the processed pairwise trimer CSV; its first column becomes the row index.
df_withLabels = pd.read_csv(
    '../03_SourceFiles/03_ProcessedFiles/trimer-pair-wise-df.csv',
    index_col=0,
    low_memory=False,
    engine='c',
)

In [3]:
# Show the loaded frame as the cell output to eyeball its columns and values.
df_withLabels

Unnamed: 0,l_sampleName,l_TYA,l_LDV,l_AWL,l_WLD,l_YAW,l_HQH,l_SGQ,l_DHQ,l_GQD,...,r_WMC,r_CFW,r_EMC,r_FCM,r_CWW,r_WWC,r_CWV,r_naiveLibrary,sibling,c_sampleName
0,20171128-71NYsaVH-VG-3__R1F2_RN2RP2,372.0,250.0,248.0,258.0,250.0,56.0,45.0,43.0,154.0,...,0.0,0.0,0.0,233.0,0.0,0.0,0.0,0,0,20171128-71NYsaVH-VG-3__R1F2_RN2RP2_^_20150922...
1,20161105-13OOicXZ-JW-3__R3F2_RN2RP1,17.0,11.0,0.0,4.0,0.0,11.0,12.0,9.0,21.0,...,0.0,0.0,0.0,70.0,0.0,0.0,0.0,0,0,20161105-13OOicXZ-JW-3__R3F2_RN2RP1_^_20160419...
2,20170228-22OOooNA-HD-3__R2F3_RN1RP4,15.0,14.0,8.0,43.0,28.0,86.0,36.0,6.0,6.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,20170228-22OOooNA-HD-3__R2F3_RN1RP4_^_20170829...
3,20150819-07OObwUD-OO-3__R5F18_RN2RP2,52.0,1.0,7.0,14.0,0.0,2.0,6.0,1.0,27.0,...,2.0,2.0,1.0,2.0,1.0,0.0,1.0,0,0,20150819-07OObwUD-OO-3__R5F18_RN2RP2_^_2017082...
4,20170808-90OOooNA-HD-3__R3F8_RN1RP3,33.0,3.0,8.0,10.0,1.0,25.0,3.0,15.0,2.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,20170808-90OOooNA-HD-3__R3F8_RN1RP3_^_20170601...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22583,20170202-71MZdcEB-DF-3__R7F1_RN1RP1,617.0,491.0,317.0,307.0,275.0,228.0,313.0,196.0,228.0,...,3.0,3.0,7.0,11.0,3.0,3.0,4.0,0,1,20170202-71MZdcEB-DF-3__R7F1_RN1RP1_^_20170202...
22584,20170202-71MZdcEB-DF-3__R7F2_RN1RP2,569.0,456.0,323.0,282.0,260.0,229.0,312.0,197.0,206.0,...,3.0,4.0,11.0,8.0,1.0,3.0,5.0,0,1,20170202-71MZdcEB-DF-3__R7F2_RN1RP2_^_20170202...
22585,20170202-71MZdcEB-DF-3__R7F2_RN1RP2,569.0,456.0,323.0,282.0,260.0,229.0,312.0,197.0,206.0,...,3.0,3.0,7.0,11.0,3.0,3.0,4.0,0,1,20170202-71MZdcEB-DF-3__R7F2_RN1RP2_^_20170202...
22586,20170202-71MZdcEB-DF-3__R7F3_RN1RP3,517.0,402.0,238.0,213.0,205.0,250.0,339.0,211.0,214.0,...,3.0,4.0,11.0,8.0,1.0,3.0,5.0,0,1,20170202-71MZdcEB-DF-3__R7F3_RN1RP3_^_20170202...


In [4]:
# Separate the features from the target:
#   X - every column except the three sample-name columns and the label
#   y - the 'sibling' label on its own (copied so it is independent of the frame)
non_feature_cols = ['l_sampleName', 'r_sampleName', 'c_sampleName', 'sibling']
X = df_withLabels.drop(columns=non_feature_cols)
y = df_withLabels["sibling"].copy()

In [11]:
# Baseline: k-nearest-neighbours with default settings on the unscaled features.
# cross_val_predict yields one out-of-fold prediction per row, using 3 folds.
kneigh = KNeighborsClassifier()
pred_kneigh = cross_val_predict(
    kneigh,
    X,
    y,
    cv=3,
    n_jobs=-1,
    verbose=1,
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed: 16.8min finished


In [13]:
# Per-class precision / recall / F1 for the out-of-fold KNN predictions.
report_kneigh = classification_report(y, pred_kneigh)
print(report_kneigh)

precision    recall  f1-score   support

           0       0.64      0.84      0.72     12000
           1       0.71      0.46      0.56     10588

    accuracy                           0.66     22588
   macro avg       0.67      0.65      0.64     22588
weighted avg       0.67      0.66      0.64     22588



In [None]:
# Min-max-scaled KNN scored with 3-fold cross-validation.
# NOTE(review): despite the "kn2" in the variable names, n_neighbors is left at the
# estimator default here; the model-comparison cell further down rebuilds this pipeline
# with n_neighbors=2 and overwrites both kneigh2_scd and score_kn2_scd.
kneigh2_scd = make_pipeline(MinMaxScaler(), KNeighborsClassifier(n_jobs=-1))
score_kn2_scd = cross_val_score(kneigh2_scd, X, y, n_jobs=-1, cv=3)
print('KNeighbors Coompleted')

In [9]:
# Build every candidate model, then score each one with 5-fold cross-validation.
# The "_scd" pipelines min-max scale the features before the estimator; because the
# scaler lives inside the pipeline it is re-fit on the training part of every fold.

# Fixed seed so the stochastic estimators return the same scores on every re-run.
SEED = 42

kneigh2_scd = make_pipeline(MinMaxScaler(),
                            KNeighborsClassifier(n_neighbors=2, n_jobs=-1))

# `multi_class` is intentionally not passed: it is deprecated in recent scikit-learn
# releases, and the default already resolves to one-vs-rest for the liblinear solver.
lreg_scd = make_pipeline(MinMaxScaler(),
                         LogisticRegression(max_iter=300,
                                            solver='liblinear',
                                            random_state=SEED))

# random_state fixes the initial network weights.
nn_scd = make_pipeline(MinMaxScaler(),
                       MLPClassifier(alpha=1,
                                     max_iter=1000,
                                     solver='lbfgs',
                                     random_state=SEED))

# The decision tree is evaluated both without and with scaling; random_state makes
# tie-breaking between equally good splits repeatable.
treeClf = DecisionTreeClassifier(criterion='entropy', random_state=SEED)
treeClf_scd = make_pipeline(MinMaxScaler(),
                            DecisionTreeClassifier(criterion='entropy',
                                                   random_state=SEED))

svc_scd = make_pipeline(MinMaxScaler(),
                        SVC(kernel='linear', decision_function_shape='ovr'))

# ###########################
# Each call returns an array with one score per fold.

score_kn2_scd = cross_val_score(kneigh2_scd, X, y, cv=5, n_jobs=-1)
print('KNeighbors Completed')

score_lr_scd = cross_val_score(lreg_scd, X, y, cv=5, n_jobs=-1)
print('Logistic Regression Completed')

score_nn_scd = cross_val_score(nn_scd, X, y, cv=5, n_jobs=-1)
print('NN Completed')

score_treeClf = cross_val_score(treeClf, X, y, cv=5, n_jobs=-1)
score_treeClf_scd = cross_val_score(treeClf_scd, X, y, cv=5, n_jobs=-1)
print('TreeCLF Completed')

score_svc_scd = cross_val_score(svc_scd, X, y, cv=5, n_jobs=-1)
print('Support Vector Machine Completed')

In [None]:
# Collect the per-fold score arrays under the labels that become the box-plot columns.
score_dict = dict([
    ('KN-2-Scaled', score_kn2_scd),
    ('LR-Scaled', score_lr_scd),
    ('NN-MLP-Scaled', score_nn_scd),
    ('TC-NS', score_treeClf),
    ('TC-Scaled', score_treeClf_scd),
    ('SVM-Scaled', score_svc_scd),
])

In [None]:
# One column per model, one row per cross-validation fold.
score_dict_df = pd.DataFrame(data=score_dict)

In [None]:
# Box plot of the per-fold scores, one box per model.
# The title names the task this notebook evaluates (pairwise 'sibling' classification).
ax = score_dict_df.plot(kind='box', figsize=(25, 15),
                        title="Model Accuracy Scores for Pairwise (Sibling) Classification - Trimers")
ax.set_ylabel('Accuracy')
# Save through the Axes' own figure so the file always holds this plot, not whichever
# figure pyplot currently considers active.
ax.get_figure().savefig('trimer-pairwise-scores-august-7.png', bbox_inches='tight', format='png')

In [None]:
# Build every candidate model, then collect 5-fold out-of-fold predictions for each.
# The "_scd" pipelines min-max scale the features before the estimator; because the
# scaler lives inside the pipeline it is re-fit on the training part of every fold.

# Fixed seed so the stochastic estimators return the same predictions on every re-run.
SEED = 42

kneigh2_scd = make_pipeline(MinMaxScaler(),
                            KNeighborsClassifier(n_neighbors=2, n_jobs=-1))

# `multi_class` is intentionally not passed: it is deprecated in recent scikit-learn
# releases, and the default already resolves to one-vs-rest for the liblinear solver.
lreg_scd = make_pipeline(MinMaxScaler(),
                         LogisticRegression(max_iter=300,
                                            solver='liblinear',
                                            random_state=SEED))

# random_state fixes the initial network weights.
nn_scd = make_pipeline(MinMaxScaler(),
                       MLPClassifier(alpha=1,
                                     max_iter=1000,
                                     solver='lbfgs',
                                     random_state=SEED))

# The decision tree is evaluated both without and with scaling; random_state makes
# tie-breaking between equally good splits repeatable.
treeClf = DecisionTreeClassifier(criterion='entropy', random_state=SEED)
treeClf_scd = make_pipeline(MinMaxScaler(),
                            DecisionTreeClassifier(criterion='entropy',
                                                   random_state=SEED))

svc_scd = make_pipeline(MinMaxScaler(),
                        SVC(kernel='linear', decision_function_shape='ovr'))

# ###########################
# Each call returns one prediction per row of X, made by a model that did not see
# that row during fitting.

pred_kn2_scd = cross_val_predict(kneigh2_scd, X, y, cv=5, n_jobs=-1)
print('KNeighbors Completed')

pred_lr_scd = cross_val_predict(lreg_scd, X, y, cv=5, n_jobs=-1)
print('Logistic Regression Completed')

pred_nn_scd = cross_val_predict(nn_scd, X, y, cv=5, n_jobs=-1)
print('NN Completed')

pred_treeClf = cross_val_predict(treeClf, X, y, cv=5, n_jobs=-1)
pred_treeClf_scd = cross_val_predict(treeClf_scd, X, y, cv=5, n_jobs=-1)
print('TreeCLF Completed')

pred_svc_scd = cross_val_predict(svc_scd, X, y, cv=5, n_jobs=-1)
print('Support Vector Machine Completed')

In [14]:
# Re-run only the min-max-scaled logistic-regression pipeline, this time with 3 folds,
# and keep its out-of-fold predictions for the classification report.

# Fixed seed so the fit is repeatable across re-runs.
SEED = 42

# `multi_class` is intentionally not passed: it is deprecated in recent scikit-learn
# releases, and the default already resolves to one-vs-rest for the liblinear solver.
lreg_scd = make_pipeline(
    MinMaxScaler(),
    LogisticRegression(max_iter=300, solver='liblinear', random_state=SEED),
)

pred_lr_scd = cross_val_predict(lreg_scd, X, y, cv=3, n_jobs=-1)
# The message text is left untouched so it still matches this cell's recorded output,
# even though the model is logistic (not linear) regression.
print('Linear Regression Completed')

Linear Regression Completed


In [16]:
# Per-class precision / recall / F1 for the out-of-fold logistic-regression predictions.
# NOTE(review): the recorded accuracy (0.42) is below chance for a near-balanced binary
# label, which more often points to an evaluation artefact than to a merely weak model.
# The frame preview shows sibling=0 rows first and sibling=1 rows last, and an integer
# `cv` presumably splits the rows in file order without shuffling - TODO confirm before
# comparing models on these numbers.
report_lr_scd = classification_report(y, pred_lr_scd)
print(report_lr_scd)

precision    recall  f1-score   support

           0       0.47      0.70      0.56     12000
           1       0.23      0.10      0.14     10588

    accuracy                           0.42     22588
   macro avg       0.35      0.40      0.35     22588
weighted avg       0.36      0.42      0.36     22588

