In [1]:
## Utils
import re
import numpy as np
import pandas as pd
import itertools

## Classical Learner
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_validate

In [2]:
"""Dataset Parameter Setting"""
"""Load Dataset"""
cdcl2_b1_dset = pd.read_csv("C:/Users/sypark/Desktop/Projects/w_MinSeok/1SERSNet/2data/_preprocessed/sersnet_cdcl2_b1_bn_bl_corrected.csv")
cdcl2_b2_dset = pd.read_csv("C:/Users/sypark/Desktop/Projects/w_MinSeok/1SERSNet/2data/_preprocessed/sersnet_cdcl2_b2_bn_bl_corrected.csv")

pbcl2_b1_dset = pd.read_csv("C:/Users/sypark/Desktop/Projects/w_MinSeok/1SERSNet/2data/_preprocessed/sersnet_pbcl2_b1_bn_bl_corrected.csv")
pbcl2_b2_dset = pd.read_csv("C:/Users/sypark/Desktop/Projects/w_MinSeok/1SERSNet/2data/_preprocessed/sersnet_pbcl2_b2_bn_bl_corrected.csv")

pbno32_b1_dset = pd.read_csv("C:/Users/sypark/Desktop/Projects/w_MinSeok/1SERSNet/2data/_preprocessed/sersnet_pbno32_b1_bn_bl_corrected.csv")
pbno32_b2_dset = pd.read_csv("C:/Users/sypark/Desktop/Projects/w_MinSeok/1SERSNet/2data/_preprocessed/sersnet_pbno32_b2_bn_bl_corrected.csv")


"""Set Output Path"""
fileout = "C:/Users/sypark/Desktop/Projects/w_MinSeok/1SERSNet/3results/HM_Classification/raw_data/baseline_binary_hm_model_output_b2_to_b1_bn_bl_corrected.csv"

In [3]:
cdcl2_b1_dset = pd.concat([pd.DataFrame(cdcl2_b1_dset.iloc[:, 3]).rename(columns={'Concentration_uM': 'label'}), 
                           cdcl2_b1_dset.iloc[:, 5:]], axis=1)
cdcl2_b2_dset = pd.concat([pd.DataFrame(cdcl2_b2_dset.iloc[:, 3]).rename(columns={'Concentration_uM': 'label'}), 
                           cdcl2_b2_dset.iloc[:, 5:]], axis=1)
cdcl2_all_dset = pd.concat([cdcl2_b1_dset, cdcl2_b2_dset], axis=0).reset_index(drop=True)

pbcl2_b1_dset = pd.concat([pd.DataFrame(pbcl2_b1_dset.iloc[:, 3]).rename(columns={'Concentration_uM': 'label'}), 
                           pbcl2_b1_dset.iloc[:, 5:]], axis=1)
pbcl2_b2_dset = pd.concat([pd.DataFrame(pbcl2_b2_dset.iloc[:, 3]).rename(columns={'Concentration_uM': 'label'}), 
                           pbcl2_b2_dset.iloc[:, 5:]], axis=1)
pbcl2_all_dset = pd.concat([pbcl2_b1_dset, pbcl2_b2_dset], axis=0).reset_index(drop=True)

pbno32_b1_dset = pd.concat([pd.DataFrame(pbno32_b1_dset.iloc[:, 3]).rename(columns={'Concentration_uM': 'label'}), 
                            pbno32_b1_dset.iloc[:, 5:]], axis=1)
pbno32_b2_dset = pd.concat([pd.DataFrame(pbno32_b2_dset.iloc[:, 3]).rename(columns={'Concentration_uM': 'label'}),
                            pbno32_b2_dset.iloc[:, 5:]], axis=1)
pbno32_all_dset = pd.concat([pbno32_b1_dset, pbno32_b2_dset], axis=0).reset_index(drop=True)

In [4]:
## CdCl2 Dataset
tmp1 = np.array(cdcl2_b1_dset.iloc[:,0] <=1e-2, dtype='int64')
tmp2 = np.array(cdcl2_b2_dset.iloc[:,0] <=1e-2, dtype='int64')

cdcl2_b1_dset = pd.concat([pd.DataFrame(tmp1).rename(columns={0:'label'}), cdcl2_b1_dset.iloc[:,1:]], axis=1)
cdcl2_b2_dset = pd.concat([pd.DataFrame(tmp2).rename(columns={0:'label'}), cdcl2_b2_dset.iloc[:,1:]], axis=1)

In [5]:
## Pbcl2 Dataset
tmp1 = np.array(pbcl2_b1_dset.iloc[:,0] <=1e-2, dtype='int64')
tmp2 = np.array(pbcl2_b2_dset.iloc[:,0] <=1e-2, dtype='int64')

pbcl2_b1_dset = pd.concat([pd.DataFrame(tmp1).rename(columns={0:'label'}), pbcl2_b1_dset.iloc[:,1:]], axis=1)
pbcl2_b2_dset = pd.concat([pd.DataFrame(tmp2).rename(columns={0:'label'}), pbcl2_b2_dset.iloc[:,1:]], axis=1)

In [6]:
## Pb(NO3)2 Dataset
tmp1 = np.array(pbno32_b1_dset.iloc[:,0] <=1e-2, dtype='int64')
tmp2 = np.array(pbno32_b2_dset.iloc[:,0] <=1e-2, dtype='int64')

pbno32_b1_dset = pd.concat([pd.DataFrame(tmp1).rename(columns={0:'label'}), pbno32_b1_dset.iloc[:,1:]], axis=1)
pbno32_b2_dset = pd.concat([pd.DataFrame(tmp2).rename(columns={0:'label'}), pbno32_b2_dset.iloc[:,1:]], axis=1)

In [7]:
cdcl2_train_set = cdcl2_b2_dset.iloc[:,:-1]
cdcl2_test_set = cdcl2_b1_dset.iloc[:,:-1]
pbcl2_train_set = pbcl2_b2_dset.iloc[:,:-1]
pbcl2_test_set = pbcl2_b1_dset.iloc[:,:-1]
pbno32_train_set = pbno32_b2_dset.iloc[:,:-1]
pbno32_test_set = pbno32_b1_dset.iloc[:,:-1]

In [8]:
tmp_dt = pbcl2_train_set[pbcl2_train_set.label==1]
pbcl2_test_set = pd.concat([pbcl2_test_set, tmp_dt.iloc[0:1,:]], axis=0).reset_index(drop=True)

tmp_dt = pbno32_train_set[pbno32_train_set.label==1]
pbno32_test_set = pd.concat([pbno32_test_set, tmp_dt.iloc[0:1,:]], axis=0).reset_index(drop=True)

In [9]:
X_cdcl2_dset = cdcl2_train_set.iloc[:, 1:].to_numpy(dtype='float32')
X_cdcl2_tset = cdcl2_test_set.iloc[:, 1:].to_numpy(dtype='float32')
y_cdcl2_dset = cdcl2_train_set.iloc[:,0].to_numpy(dtype='int64') 
y_cdcl2_tset = cdcl2_test_set.iloc[:,0].to_numpy(dtype='int64') 


X_pbcl2_dset = pbcl2_train_set.iloc[:, 1:].to_numpy(dtype='float32')
X_pbcl2_tset = pbcl2_test_set.iloc[:, 1:].to_numpy(dtype='float32')
y_pbcl2_dset = pbcl2_train_set.iloc[:,0].to_numpy(dtype='int64') 
y_pbcl2_tset = pbcl2_test_set.iloc[:,0].to_numpy(dtype='int64') 

X_pbno32_dset = pbno32_train_set.iloc[:, 1:].to_numpy(dtype='float32')
X_pbno32_tset = pbno32_test_set.iloc[:, 1:].to_numpy(dtype='float32')
y_pbno32_dset = pbno32_train_set.iloc[:,0].to_numpy(dtype='int64') 
y_pbno32_tset = pbno32_test_set.iloc[:,0].to_numpy(dtype='int64') 

## 2. Train Test Split

In [10]:
## 4.4 train and test split for Classical Learner
X_cdcl2_train, X_cdcl2_test, y_cdcl2_train, y_cdcl2_test = train_test_split(X_cdcl2_dset, y_cdcl2_dset, test_size = 0.2, 
                                                    random_state=123)

X_pbcl2_train, X_pbcl2_test, y_pbcl2_train, y_pbcl2_test = train_test_split(X_pbcl2_dset, y_pbcl2_dset, test_size = 0.2, 
                                                    random_state=123)

X_pbno32_train, X_pbno32_test, y_pbno32_train, y_pbno32_test = train_test_split(X_pbno32_dset, y_pbno32_dset, test_size = 0.2, 
                                                    random_state=123)

## 3. Classical Learner

### 3.1 Naive Bayes

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import BernoulliNB

In [12]:
## Learn Classifier
scaler = StandardScaler()
clf = BernoulliNB()
scaler.fit(X_cdcl2_train)
X_train_sds = scaler.transform(X_cdcl2_train)
X_test_sds = scaler.transform(X_cdcl2_test)
y_train = y_cdcl2_train
y_test = y_cdcl2_test
X_tset = X_cdcl2_tset
y_tset = y_cdcl2_tset
clf.fit(X_train_sds, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [13]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[266  30]
 [ 17  87]]
[1mMetrics[0m
ACC: 0.882
BACC: 0.868
F1: 0.787
AUROC: 0.937
AP: 0.747
MCC: 0.709
Precision: 0.744
Recall: 0.837


In [14]:
NB_cdcl2_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [15]:
### Independent Test Result
X_tset_sds = scaler.transform(X_tset)
yp_test = clf.predict(X_tset_sds)
ys_test = clf.predict_proba(X_tset_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_tset, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_tset, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_tset, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_tset, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_tset, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_tset, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_tset, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_tset, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_tset, yp_test),3)))

[1mConfusion Matrix[0m
[[349 651]
 [482  18]]
[1mMetrics[0m
ACC: 0.245
BACC: 0.192
F1: 0.031
AUROC: 0.254
AP: 0.253
MCC: -0.583
Precision: 0.027
Recall: 0.036


In [16]:
NB_cdcl2_tset_res = [round(accuracy_score(y_tset, yp_test), 3), round(balanced_accuracy_score(y_tset, yp_test), 3),
                 round(f1_score(y_tset, yp_test),3), round(roc_auc_score(y_tset, ys_test),3),
                 round(average_precision_score(y_tset, ys_test),3), round(matthews_corrcoef(y_tset, yp_test),3),
                 round(precision_score(y_tset, yp_test),3), round(recall_score(y_tset, yp_test),3)]

In [17]:
## Learn Classifier
scaler = StandardScaler()
clf = BernoulliNB()
scaler.fit(X_pbcl2_train)
X_train_sds = scaler.transform(X_pbcl2_train)
X_test_sds = scaler.transform(X_pbcl2_test)
y_train = y_pbcl2_train
y_test = y_pbcl2_test
X_tset = X_pbcl2_tset
y_tset = y_pbcl2_tset
clf.fit(X_train_sds, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [18]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[294  17]
 [  0  89]]
[1mMetrics[0m
ACC: 0.958
BACC: 0.973
F1: 0.913
AUROC: 0.981
AP: 0.881
MCC: 0.891
Precision: 0.84
Recall: 1.0


In [19]:
NB_pbcl2_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [20]:
### Independent Test Result
X_tset_sds = scaler.transform(X_tset)
yp_test = clf.predict(X_tset_sds)
ys_test = clf.predict_proba(X_tset_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_tset, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_tset, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_tset, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_tset, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_tset, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_tset, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_tset, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_tset, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_tset, yp_test),3)))

[1mConfusion Matrix[0m
[[1106  394]
 [   0    1]]
[1mMetrics[0m
ACC: 0.738
BACC: 0.869
F1: 0.005
AUROC: 0.87
AP: 0.003
MCC: 0.043
Precision: 0.003
Recall: 1.0


In [21]:
NB_pbcl2_tset_res = [round(accuracy_score(y_tset, yp_test), 3), round(balanced_accuracy_score(y_tset, yp_test), 3),
                 round(f1_score(y_tset, yp_test),3), round(roc_auc_score(y_tset, ys_test),3),
                 round(average_precision_score(y_tset, ys_test),3), round(matthews_corrcoef(y_tset, yp_test),3),
                 round(precision_score(y_tset, yp_test),3), round(recall_score(y_tset, yp_test),3)]

In [22]:
## Learn Classifier
scaler = StandardScaler()
clf = BernoulliNB()
scaler.fit(X_pbno32_train)
X_train_sds = scaler.transform(X_pbno32_train)
X_test_sds = scaler.transform(X_pbno32_test)
y_train = y_pbno32_train
y_test = y_pbno32_test
X_tset = X_pbno32_tset
y_tset = y_pbno32_tset
clf.fit(X_train_sds, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [23]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[291  20]
 [  8  81]]
[1mMetrics[0m
ACC: 0.93
BACC: 0.923
F1: 0.853
AUROC: 0.958
AP: 0.789
MCC: 0.81
Precision: 0.802
Recall: 0.91


In [24]:
NB_pbno32_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [25]:
### Independent Test Result
X_tset_sds = scaler.transform(X_tset)
yp_test = clf.predict(X_tset_sds)
ys_test = clf.predict_proba(X_tset_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_tset, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_tset, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_tset, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_tset, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_tset, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_tset, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_tset, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_tset, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_tset, yp_test),3)))

[1mConfusion Matrix[0m
[[1061  439]
 [   0    1]]
[1mMetrics[0m
ACC: 0.708
BACC: 0.854
F1: 0.005
AUROC: 0.854
AP: 0.002
MCC: 0.04
Precision: 0.002
Recall: 1.0


In [26]:
NB_pbno32_tset_res = [round(accuracy_score(y_tset, yp_test), 3), round(balanced_accuracy_score(y_tset, yp_test), 3),
                 round(f1_score(y_tset, yp_test),3), round(roc_auc_score(y_tset, ys_test),3),
                 round(average_precision_score(y_tset, ys_test),3), round(matthews_corrcoef(y_tset, yp_test),3),
                 round(precision_score(y_tset, yp_test),3), round(recall_score(y_tset, yp_test),3)]

In [27]:
outF = open(fileout, "w")
outF.write("Naive_Bayes, ")
outF.write("ACC, BACC, F1, AUROC, Average_Precision, MCC, Precision, Recall\n")
outF.write('Cdcl2 DevSet, ')
outF.write(', '.join(map(str, NB_cdcl2_devset_res)))
outF.write('\n')
outF.write('Cdcl2 IndSet, ')
outF.write(', '.join(map(str, NB_cdcl2_tset_res)))
outF.write('\n')
outF.write('Pbcl2 DevSet, ')
outF.write(', '.join(map(str, NB_pbcl2_devset_res)))
outF.write('\n')
outF.write('Pbcl2 IndSet, ')
outF.write(', '.join(map(str, NB_pbcl2_tset_res)))
outF.write('\n')
outF.write('Pb(No3)2 DevSet, ')
outF.write(', '.join(map(str, NB_pbno32_devset_res)))
outF.write('\n')
outF.write('Pb(No3)2 IndSet, ')
outF.write(', '.join(map(str, NB_pbno32_tset_res)))
outF.write('\n')
outF.close()

### 3.2 Decision Tree

In [28]:
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [29]:
## Cdcl2 Classifier
scaler = StandardScaler()
clf = DecisionTreeClassifier()
scaler.fit(X_cdcl2_train)
X_train_sds = scaler.transform(X_cdcl2_train)
X_test_sds = scaler.transform(X_cdcl2_test)
y_train = y_cdcl2_train
y_test = y_cdcl2_test
X_tset = X_cdcl2_tset
y_tset = y_cdcl2_tset
clf.fit(X_train_sds, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [30]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[295   1]
 [  0 104]]
[1mMetrics[0m
ACC: 0.998
BACC: 0.998
F1: 0.995
AUROC: 0.998
AP: 0.99
MCC: 0.994
Precision: 0.99
Recall: 1.0


In [31]:
DT_cdcl2_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [32]:
### Independent Test Result
X_tset_sds = scaler.transform(X_tset)
yp_test = clf.predict(X_tset_sds)
ys_test = clf.predict_proba(X_tset_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_tset, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_tset, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_tset, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_tset, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_tset, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_tset, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_tset, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_tset, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_tset, yp_test),3)))

[1mConfusion Matrix[0m
[[870 130]
 [337 163]]
[1mMetrics[0m
ACC: 0.689
BACC: 0.598
F1: 0.411
AUROC: 0.598
AP: 0.406
MCC: 0.233
Precision: 0.556
Recall: 0.326


In [33]:
DT_cdcl2_tset_res = [round(accuracy_score(y_tset, yp_test), 3), round(balanced_accuracy_score(y_tset, yp_test), 3),
                 round(f1_score(y_tset, yp_test),3), round(roc_auc_score(y_tset, ys_test),3),
                 round(average_precision_score(y_tset, ys_test),3), round(matthews_corrcoef(y_tset, yp_test),3),
                 round(precision_score(y_tset, yp_test),3), round(recall_score(y_tset, yp_test),3)]

In [34]:
## Pbcl2 Classifier
scaler = StandardScaler()
clf = DecisionTreeClassifier()
scaler.fit(X_cdcl2_train)
X_train_sds = scaler.transform(X_pbcl2_train)
X_test_sds = scaler.transform(X_pbcl2_test)
y_train = y_pbcl2_train
y_test = y_pbcl2_test
X_tset = X_pbcl2_tset
y_tset = y_pbcl2_tset
clf.fit(X_train_sds, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [35]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[311   0]
 [  0  89]]
[1mMetrics[0m
ACC: 1.0
BACC: 1.0
F1: 1.0
AUROC: 1.0
AP: 1.0
MCC: 1.0
Precision: 1.0
Recall: 1.0


In [36]:
DT_pbcl2_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [37]:
### Independent Test Result
X_tset_sds = scaler.transform(X_tset)
yp_test = clf.predict(X_tset_sds)
ys_test = clf.predict_proba(X_tset_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_tset, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_tset, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_tset, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_tset, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_tset, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_tset, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_tset, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_tset, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_tset, yp_test),3)))

[1mConfusion Matrix[0m
[[1097  403]
 [   0    1]]
[1mMetrics[0m
ACC: 0.732
BACC: 0.866
F1: 0.005
AUROC: 0.866
AP: 0.002
MCC: 0.043
Precision: 0.002
Recall: 1.0


In [38]:
DT_pbcl2_tset_res = [round(accuracy_score(y_tset, yp_test), 3), round(balanced_accuracy_score(y_tset, yp_test), 3),
                 round(f1_score(y_tset, yp_test),3), round(roc_auc_score(y_tset, ys_test),3),
                 round(average_precision_score(y_tset, ys_test),3), round(matthews_corrcoef(y_tset, yp_test),3),
                 round(precision_score(y_tset, yp_test),3), round(recall_score(y_tset, yp_test),3)]

In [39]:
## Learn Classifier
scaler = StandardScaler()
clf = DecisionTreeClassifier()
scaler.fit(X_pbno32_train)
X_train_sds = scaler.transform(X_pbno32_train)
X_test_sds = scaler.transform(X_pbno32_test)
y_train = y_pbno32_train
y_test = y_pbno32_test
X_tset = X_pbno32_tset
y_tset = y_pbno32_tset
clf.fit(X_train_sds, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [40]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[311   0]
 [  1  88]]
[1mMetrics[0m
ACC: 0.998
BACC: 0.994
F1: 0.994
AUROC: 0.994
AP: 0.991
MCC: 0.993
Precision: 1.0
Recall: 0.989


In [41]:
DT_pbno32_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [42]:
### Independent Test Result
X_tset_sds = scaler.transform(X_tset)
yp_test = clf.predict(X_tset_sds)
ys_test = clf.predict_proba(X_tset_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_tset, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_tset, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_tset, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_tset, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_tset, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_tset, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_tset, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_tset, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_tset, yp_test),3)))

[1mConfusion Matrix[0m
[[1204  296]
 [   0    1]]
[1mMetrics[0m
ACC: 0.803
BACC: 0.901
F1: 0.007
AUROC: 0.901
AP: 0.003
MCC: 0.052
Precision: 0.003
Recall: 1.0


In [43]:
DT_pbno32_tset_res = [round(accuracy_score(y_tset, yp_test), 3), round(balanced_accuracy_score(y_tset, yp_test), 3),
                 round(f1_score(y_tset, yp_test),3), round(roc_auc_score(y_tset, ys_test),3),
                 round(average_precision_score(y_tset, ys_test),3), round(matthews_corrcoef(y_tset, yp_test),3),
                 round(precision_score(y_tset, yp_test),3), round(recall_score(y_tset, yp_test),3)]

In [44]:
outF = open(fileout, "a")
outF.write("Decision_Tree, ")
outF.write("ACC, BACC, F1, AUROC, Average_Precision, MCC, Precision, Recall\n")
outF.write('Cdcl2 DevSet, ')
outF.write(', '.join(map(str, DT_cdcl2_devset_res)))
outF.write('\n')
outF.write('Cdcl2 IndSet, ')
outF.write(', '.join(map(str, DT_cdcl2_tset_res)))
outF.write('\n')
outF.write('Pbcl2 DevSet, ')
outF.write(', '.join(map(str, DT_pbcl2_devset_res)))
outF.write('\n')
outF.write('Pbcl2 IndSet, ')
outF.write(', '.join(map(str, DT_pbcl2_tset_res)))
outF.write('\n')
outF.write('Pb(No3)2 DevSet, ')
outF.write(', '.join(map(str, DT_pbno32_devset_res)))
outF.write('\n')
outF.write('Pb(No3)2 IndSet, ')
outF.write(', '.join(map(str, DT_pbno32_tset_res)))
outF.write('\n')
outF.close()

### 3.2 Logistic Regression

In [45]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [46]:
## Learn Classifier
scaler = StandardScaler()
clf = LogisticRegression()
scaler.fit(X_cdcl2_train)
X_train_sds = scaler.transform(X_cdcl2_train)
X_test_sds = scaler.transform(X_cdcl2_test)
y_train = y_cdcl2_train
y_test = y_cdcl2_test
X_tset = X_cdcl2_tset
y_tset = y_cdcl2_tset
clf.fit(X_train_sds, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [47]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[296   0]
 [  0 104]]
[1mMetrics[0m
ACC: 1.0
BACC: 1.0
F1: 1.0
AUROC: 1.0
AP: 1.0
MCC: 1.0
Precision: 1.0
Recall: 1.0


In [48]:
LR_cdcl2_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [49]:
### Independent Test Result
X_tset_sds = scaler.transform(X_tset)
yp_test = clf.predict(X_tset_sds)
ys_test = clf.predict_proba(X_tset_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_tset, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_tset, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_tset, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_tset, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_tset, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_tset, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_tset, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_tset, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_tset, yp_test),3)))

[1mConfusion Matrix[0m
[[830 170]
 [418  82]]
[1mMetrics[0m
ACC: 0.608
BACC: 0.497
F1: 0.218
AUROC: 0.261
AP: 0.33
MCC: -0.008
Precision: 0.325
Recall: 0.164


In [50]:
LR_cdcl2_tset_res = [round(accuracy_score(y_tset, yp_test), 3), round(balanced_accuracy_score(y_tset, yp_test), 3),
                 round(f1_score(y_tset, yp_test),3), round(roc_auc_score(y_tset, ys_test),3),
                 round(average_precision_score(y_tset, ys_test),3), round(matthews_corrcoef(y_tset, yp_test),3),
                 round(precision_score(y_tset, yp_test),3), round(recall_score(y_tset, yp_test),3)]

In [51]:
## Pbcl2 Classifier
scaler = StandardScaler()
clf = LogisticRegression()
scaler.fit(X_cdcl2_train)
X_train_sds = scaler.transform(X_pbcl2_train)
X_test_sds = scaler.transform(X_pbcl2_test)
y_train = y_pbcl2_train
y_test = y_pbcl2_test
X_tset = X_pbcl2_tset
y_tset = y_pbcl2_tset
clf.fit(X_train_sds, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [52]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[311   0]
 [  0  89]]
[1mMetrics[0m
ACC: 1.0
BACC: 1.0
F1: 1.0
AUROC: 1.0
AP: 1.0
MCC: 1.0
Precision: 1.0
Recall: 1.0


In [53]:
LR_pbcl2_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [54]:
### Independent Test Result
X_tset_sds = scaler.transform(X_tset)
yp_test = clf.predict(X_tset_sds)
ys_test = clf.predict_proba(X_tset_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_tset, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_tset, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_tset, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_tset, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_tset, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_tset, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_tset, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_tset, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_tset, yp_test),3)))

[1mConfusion Matrix[0m
[[1173  327]
 [   0    1]]
[1mMetrics[0m
ACC: 0.782
BACC: 0.891
F1: 0.006
AUROC: 1.0
AP: 1.0
MCC: 0.049
Precision: 0.003
Recall: 1.0


In [55]:
LR_pbcl2_tset_res = [round(accuracy_score(y_tset, yp_test), 3), round(balanced_accuracy_score(y_tset, yp_test), 3),
                 round(f1_score(y_tset, yp_test),3), round(roc_auc_score(y_tset, ys_test),3),
                 round(average_precision_score(y_tset, ys_test),3), round(matthews_corrcoef(y_tset, yp_test),3),
                 round(precision_score(y_tset, yp_test),3), round(recall_score(y_tset, yp_test),3)]

In [56]:
## Learn Classifier
scaler = StandardScaler()
clf = LogisticRegression()
scaler.fit(X_pbno32_train)
X_train_sds = scaler.transform(X_pbno32_train)
X_test_sds = scaler.transform(X_pbno32_test)
y_train = y_pbno32_train
y_test = y_pbno32_test
X_tset = X_pbno32_tset
y_tset = y_pbno32_tset
clf.fit(X_train_sds, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [57]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[309   2]
 [  0  89]]
[1mMetrics[0m
ACC: 0.995
BACC: 0.997
F1: 0.989
AUROC: 0.997
AP: 0.987
MCC: 0.986
Precision: 0.978
Recall: 1.0


In [58]:
LR_pbno32_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [59]:
### Independent Test Result
X_tset_sds = scaler.transform(X_tset)
yp_test = clf.predict(X_tset_sds)
ys_test = clf.predict_proba(X_tset_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_tset, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_tset, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_tset, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_tset, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_tset, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_tset, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_tset, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_tset, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_tset, yp_test),3)))

[1mConfusion Matrix[0m
[[1031  469]
 [   0    1]]
[1mMetrics[0m
ACC: 0.688
BACC: 0.844
F1: 0.004
AUROC: 0.815
AP: 0.004
MCC: 0.038
Precision: 0.002
Recall: 1.0


In [60]:
LR_pbno32_tset_res = [round(accuracy_score(y_tset, yp_test), 3), round(balanced_accuracy_score(y_tset, yp_test), 3),
                 round(f1_score(y_tset, yp_test),3), round(roc_auc_score(y_tset, ys_test),3),
                 round(average_precision_score(y_tset, ys_test),3), round(matthews_corrcoef(y_tset, yp_test),3),
                 round(precision_score(y_tset, yp_test),3), round(recall_score(y_tset, yp_test),3)]

In [61]:
outF = open(fileout, "a")
outF.write("Logistic_Regression, ")
outF.write("ACC, BACC, F1, AUROC, Average_Precision, MCC, Precision, Recall\n")
outF.write('Cdcl2 DevSet, ')
outF.write(', '.join(map(str, LR_cdcl2_devset_res)))
outF.write('\n')
outF.write('Cdcl2 IndSet, ')
outF.write(', '.join(map(str, LR_cdcl2_tset_res)))
outF.write('\n')
outF.write('Pbcl2 DevSet, ')
outF.write(', '.join(map(str, LR_pbcl2_devset_res)))
outF.write('\n')
outF.write('Pbcl2 IndSet, ')
outF.write(', '.join(map(str, LR_pbcl2_tset_res)))
outF.write('\n')
outF.write('Pb(No3)2 DevSet, ')
outF.write(', '.join(map(str, LR_pbno32_devset_res)))
outF.write('\n')
outF.write('Pb(No3)2 IndSet, ')
outF.write(', '.join(map(str, LR_pbno32_tset_res)))
outF.write('\n')
outF.close()

### 3.3 Logistic Regression with CV

In [62]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV

In [63]:
## Learn Classifier
scaler = StandardScaler()
clf = LogisticRegressionCV()
scaler.fit(X_cdcl2_train)
X_train_sds = scaler.transform(X_cdcl2_train)
X_test_sds = scaler.transform(X_cdcl2_test)
y_train = y_cdcl2_train
y_test = y_cdcl2_test
X_tset = X_cdcl2_tset
y_tset = y_cdcl2_tset
clf.fit(X_train_sds, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=100, multi_class='auto', n_jobs=None,
                     penalty='l2', random_state=None, refit=True, scoring=None,
                     solver='lbfgs', tol=0.0001, verbose=0)

In [64]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[296   0]
 [  0 104]]
[1mMetrics[0m
ACC: 1.0
BACC: 1.0
F1: 1.0
AUROC: 1.0
AP: 1.0
MCC: 1.0
Precision: 1.0
Recall: 1.0


In [65]:
LR_CV_cdcl2_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [66]:
### Independent Test Result
X_tset_sds = scaler.transform(X_tset)
yp_test = clf.predict(X_tset_sds)
ys_test = clf.predict_proba(X_tset_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_tset, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_tset, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_tset, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_tset, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_tset, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_tset, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_tset, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_tset, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_tset, yp_test),3)))

[1mConfusion Matrix[0m
[[835 165]
 [418  82]]
[1mMetrics[0m
ACC: 0.611
BACC: 0.5
F1: 0.22
AUROC: 0.264
AP: 0.334
MCC: -0.001
Precision: 0.332
Recall: 0.164


In [67]:
LR_CV_cdcl2_tset_res = [round(accuracy_score(y_tset, yp_test), 3), round(balanced_accuracy_score(y_tset, yp_test), 3),
                 round(f1_score(y_tset, yp_test),3), round(roc_auc_score(y_tset, ys_test),3),
                 round(average_precision_score(y_tset, ys_test),3), round(matthews_corrcoef(y_tset, yp_test),3),
                 round(precision_score(y_tset, yp_test),3), round(recall_score(y_tset, yp_test),3)]

In [68]:
## Pbcl2 Classifier
scaler = StandardScaler()
clf = LogisticRegressionCV()
scaler.fit(X_cdcl2_train)
X_train_sds = scaler.transform(X_pbcl2_train)
X_test_sds = scaler.transform(X_pbcl2_test)
y_train = y_pbcl2_train
y_test = y_pbcl2_test
X_tset = X_pbcl2_tset
y_tset = y_pbcl2_tset
clf.fit(X_train_sds, y_train)

LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=100, multi_class='auto', n_jobs=None,
                     penalty='l2', random_state=None, refit=True, scoring=None,
                     solver='lbfgs', tol=0.0001, verbose=0)

In [69]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[311   0]
 [  0  89]]
[1mMetrics[0m
ACC: 1.0
BACC: 1.0
F1: 1.0
AUROC: 1.0
AP: 1.0
MCC: 1.0
Precision: 1.0
Recall: 1.0


In [70]:
LR_CV_pbcl2_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [71]:
### Independent Test Result
X_tset_sds = scaler.transform(X_tset)
yp_test = clf.predict(X_tset_sds)
ys_test = clf.predict_proba(X_tset_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_tset, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_tset, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_tset, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_tset, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_tset, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_tset, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_tset, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_tset, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_tset, yp_test),3)))

[1mConfusion Matrix[0m
[[1204  296]
 [   0    1]]
[1mMetrics[0m
ACC: 0.803
BACC: 0.901
F1: 0.007
AUROC: 1.0
AP: 1.0
MCC: 0.052
Precision: 0.003
Recall: 1.0


In [72]:
LR_CV_pbcl2_tset_res = [round(accuracy_score(y_tset, yp_test), 3), round(balanced_accuracy_score(y_tset, yp_test), 3),
                 round(f1_score(y_tset, yp_test),3), round(roc_auc_score(y_tset, ys_test),3),
                 round(average_precision_score(y_tset, ys_test),3), round(matthews_corrcoef(y_tset, yp_test),3),
                 round(precision_score(y_tset, yp_test),3), round(recall_score(y_tset, yp_test),3)]

In [73]:
## Learn Classifier
scaler = StandardScaler()
clf = LogisticRegressionCV()
scaler.fit(X_pbno32_train)
X_train_sds = scaler.transform(X_pbno32_train)
X_test_sds = scaler.transform(X_pbno32_test)
y_train = y_pbno32_train
y_test = y_pbno32_test
X_tset = X_pbno32_tset
y_tset = y_pbno32_tset
clf.fit(X_train_sds, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=100, multi_class='auto', n_jobs=None,
                     penalty='l2', random_state=None, refit=True, scoring=None,
                     solver='lbfgs', tol=0.0001, verbose=0)

In [74]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[309   2]
 [  0  89]]
[1mMetrics[0m
ACC: 0.995
BACC: 0.997
F1: 0.989
AUROC: 0.997
AP: 0.988
MCC: 0.986
Precision: 0.978
Recall: 1.0


In [75]:
LR_CV_pbno32_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [76]:
### Independent Test Result
X_tset_sds = scaler.transform(X_tset)
yp_test = clf.predict(X_tset_sds)
ys_test = clf.predict_proba(X_tset_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_tset, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_tset, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_tset, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_tset, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_tset, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_tset, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_tset, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_tset, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_tset, yp_test),3)))

[1mConfusion Matrix[0m
[[1071  429]
 [   0    1]]
[1mMetrics[0m
ACC: 0.714
BACC: 0.857
F1: 0.005
AUROC: 0.898
AP: 0.006
MCC: 0.041
Precision: 0.002
Recall: 1.0


In [77]:
LR_CV_pbno32_tset_res = [round(accuracy_score(y_tset, yp_test), 3), round(balanced_accuracy_score(y_tset, yp_test), 3),
                 round(f1_score(y_tset, yp_test),3), round(roc_auc_score(y_tset, ys_test),3),
                 round(average_precision_score(y_tset, ys_test),3), round(matthews_corrcoef(y_tset, yp_test),3),
                 round(precision_score(y_tset, yp_test),3), round(recall_score(y_tset, yp_test),3)]

In [78]:
outF = open(fileout, "a")
outF.write("Logistic_Regression_CV, ")
outF.write("ACC, BACC, F1, AUROC, Average_Precision, MCC, Precision, Recall\n")
outF.write('Cdcl2 DevSet, ')
outF.write(', '.join(map(str, LR_CV_cdcl2_devset_res)))
outF.write('\n')
outF.write('Cdcl2 IndSet, ')
outF.write(', '.join(map(str, LR_CV_cdcl2_tset_res)))
outF.write('\n')
outF.write('Pbcl2 DevSet, ')
outF.write(', '.join(map(str, LR_CV_pbcl2_devset_res)))
outF.write('\n')
outF.write('Pbcl2 IndSet, ')
outF.write(', '.join(map(str, LR_CV_pbcl2_tset_res)))
outF.write('\n')
outF.write('Pb(No3)2 DevSet, ')
outF.write(', '.join(map(str, LR_CV_pbno32_devset_res)))
outF.write('\n')
outF.write('Pb(No3)2 IndSet, ')
outF.write(', '.join(map(str, LR_CV_pbno32_tset_res)))
outF.write('\n')
outF.close()

### 3.4 MLP

In [79]:
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

In [80]:
## Learn Classifier
scaler = StandardScaler()
clf = MLPClassifier()
scaler.fit(X_cdcl2_train)
X_train_sds = scaler.transform(X_cdcl2_train)
X_test_sds = scaler.transform(X_cdcl2_test)
y_train = y_cdcl2_train
y_test = y_cdcl2_test
X_tset = X_cdcl2_tset
y_tset = y_cdcl2_tset
clf.fit(X_train_sds, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [81]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[296   0]
 [  0 104]]
[1mMetrics[0m
ACC: 1.0
BACC: 1.0
F1: 1.0
AUROC: 1.0
AP: 1.0
MCC: 1.0
Precision: 1.0
Recall: 1.0


In [82]:
MLP_cdcl2_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [83]:
### Independent Test Result
X_tset_sds = scaler.transform(X_tset)
yp_test = clf.predict(X_tset_sds)
ys_test = clf.predict_proba(X_tset_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_tset, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_tset, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_tset, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_tset, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_tset, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_tset, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_tset, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_tset, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_tset, yp_test),3)))

[1mConfusion Matrix[0m
[[998   2]
 [456  44]]
[1mMetrics[0m
ACC: 0.695
BACC: 0.543
F1: 0.161
AUROC: 0.568
AP: 0.496
MCC: 0.235
Precision: 0.957
Recall: 0.088


In [84]:
MLP_cdcl2_tset_res = [round(accuracy_score(y_tset, yp_test), 3), round(balanced_accuracy_score(y_tset, yp_test), 3),
                 round(f1_score(y_tset, yp_test),3), round(roc_auc_score(y_tset, ys_test),3),
                 round(average_precision_score(y_tset, ys_test),3), round(matthews_corrcoef(y_tset, yp_test),3),
                 round(precision_score(y_tset, yp_test),3), round(recall_score(y_tset, yp_test),3)]

In [85]:
## Pbcl2 Classifier
scaler = StandardScaler()
clf = MLPClassifier()
scaler.fit(X_cdcl2_train)
X_train_sds = scaler.transform(X_pbcl2_train)
X_test_sds = scaler.transform(X_pbcl2_test)
y_train = y_pbcl2_train
y_test = y_pbcl2_test
X_tset = X_pbcl2_tset
y_tset = y_pbcl2_tset
clf.fit(X_train_sds, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [86]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[311   0]
 [  0  89]]
[1mMetrics[0m
ACC: 1.0
BACC: 1.0
F1: 1.0
AUROC: 1.0
AP: 1.0
MCC: 1.0
Precision: 1.0
Recall: 1.0


In [87]:
MLP_pbcl2_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [88]:
### Independent Test Result
X_tset_sds = scaler.transform(X_tset)
yp_test = clf.predict(X_tset_sds)
ys_test = clf.predict_proba(X_tset_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_tset, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_tset, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_tset, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_tset, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_tset, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_tset, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_tset, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_tset, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_tset, yp_test),3)))

[1mConfusion Matrix[0m
[[1153  347]
 [   0    1]]
[1mMetrics[0m
ACC: 0.769
BACC: 0.884
F1: 0.006
AUROC: 1.0
AP: 1.0
MCC: 0.047
Precision: 0.003
Recall: 1.0


In [89]:
MLP_pbcl2_tset_res = [round(accuracy_score(y_tset, yp_test), 3), round(balanced_accuracy_score(y_tset, yp_test), 3),
                 round(f1_score(y_tset, yp_test),3), round(roc_auc_score(y_tset, ys_test),3),
                 round(average_precision_score(y_tset, ys_test),3), round(matthews_corrcoef(y_tset, yp_test),3),
                 round(precision_score(y_tset, yp_test),3), round(recall_score(y_tset, yp_test),3)]

In [90]:
## Learn Classifier
scaler = StandardScaler()
clf = MLPClassifier()
scaler.fit(X_pbno32_train)
X_train_sds = scaler.transform(X_pbno32_train)
X_test_sds = scaler.transform(X_pbno32_test)
y_train = y_pbno32_train
y_test = y_pbno32_test
X_tset = X_pbno32_tset
y_tset = y_pbno32_tset
clf.fit(X_train_sds, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [91]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[310   1]
 [  0  89]]
[1mMetrics[0m
ACC: 0.998
BACC: 0.998
F1: 0.994
AUROC: 1.0
AP: 1.0
MCC: 0.993
Precision: 0.989
Recall: 1.0


In [92]:
MLP_pbno32_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [93]:
### Independent Test Result
X_tset_sds = scaler.transform(X_tset)
yp_test = clf.predict(X_tset_sds)
ys_test = clf.predict_proba(X_tset_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_tset, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_tset, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_tset, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_tset, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_tset, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_tset, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_tset, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_tset, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_tset, yp_test),3)))

[1mConfusion Matrix[0m
[[1075  425]
 [   0    1]]
[1mMetrics[0m
ACC: 0.717
BACC: 0.858
F1: 0.005
AUROC: 0.999
AP: 0.333
MCC: 0.041
Precision: 0.002
Recall: 1.0


In [94]:
MLP_pbno32_tset_res = [round(accuracy_score(y_tset, yp_test), 3), round(balanced_accuracy_score(y_tset, yp_test), 3),
                 round(f1_score(y_tset, yp_test),3), round(roc_auc_score(y_tset, ys_test),3),
                 round(average_precision_score(y_tset, ys_test),3), round(matthews_corrcoef(y_tset, yp_test),3),
                 round(precision_score(y_tset, yp_test),3), round(recall_score(y_tset, yp_test),3)]

In [95]:
outF = open(fileout, "a")
outF.write("MLP, ")
outF.write("ACC, BACC, F1, AUROC, Average_Precision, MCC, Precision, Recall\n")
outF.write('Cdcl2 DevSet, ')
outF.write(', '.join(map(str, MLP_cdcl2_devset_res)))
outF.write('\n')
outF.write('Cdcl2 IndSet, ')
outF.write(', '.join(map(str, MLP_cdcl2_tset_res)))
outF.write('\n')
outF.write('Pbcl2 DevSet, ')
outF.write(', '.join(map(str, MLP_pbcl2_devset_res)))
outF.write('\n')
outF.write('Pbcl2 IndSet, ')
outF.write(', '.join(map(str, MLP_pbcl2_tset_res)))
outF.write('\n')
outF.write('Pb(No3)2 DevSet, ')
outF.write(', '.join(map(str, MLP_pbno32_devset_res)))
outF.write('\n')
outF.write('Pb(No3)2 IndSet, ')
outF.write(', '.join(map(str, MLP_pbno32_tset_res)))
outF.write('\n')
outF.close()

### 3.5 Random Forest

In [96]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

In [97]:
## Learn Classifier
scaler = StandardScaler()
clf = RandomForestClassifier()
scaler.fit(X_cdcl2_train)
X_train_sds = scaler.transform(X_cdcl2_train)
X_test_sds = scaler.transform(X_cdcl2_test)
y_train = y_cdcl2_train
y_test = y_cdcl2_test
X_tset = X_cdcl2_tset
y_tset = y_cdcl2_tset
clf.fit(X_train_sds, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [98]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[296   0]
 [  0 104]]
[1mMetrics[0m
ACC: 1.0
BACC: 1.0
F1: 1.0
AUROC: 1.0
AP: 1.0
MCC: 1.0
Precision: 1.0
Recall: 1.0


In [99]:
RF_cdcl2_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [100]:
### Independent Test Result
X_tset_sds = scaler.transform(X_tset)
yp_test = clf.predict(X_tset_sds)
ys_test = clf.predict_proba(X_tset_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_tset, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_tset, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_tset, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_tset, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_tset, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_tset, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_tset, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_tset, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_tset, yp_test),3)))

[1mConfusion Matrix[0m
[[982  18]
 [489  11]]
[1mMetrics[0m
ACC: 0.662
BACC: 0.502
F1: 0.042
AUROC: 0.392
AP: 0.286
MCC: 0.014
Precision: 0.379
Recall: 0.022


In [101]:
RF_cdcl2_tset_res = [round(accuracy_score(y_tset, yp_test), 3), round(balanced_accuracy_score(y_tset, yp_test), 3),
                 round(f1_score(y_tset, yp_test),3), round(roc_auc_score(y_tset, ys_test),3),
                 round(average_precision_score(y_tset, ys_test),3), round(matthews_corrcoef(y_tset, yp_test),3),
                 round(precision_score(y_tset, yp_test),3), round(recall_score(y_tset, yp_test),3)]

In [102]:
## Pbcl2 Classifier
scaler = StandardScaler()
clf = RandomForestClassifier()
scaler.fit(X_cdcl2_train)
X_train_sds = scaler.transform(X_pbcl2_train)
X_test_sds = scaler.transform(X_pbcl2_test)
y_train = y_pbcl2_train
y_test = y_pbcl2_test
X_tset = X_pbcl2_tset
y_tset = y_pbcl2_tset
clf.fit(X_train_sds, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [103]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[311   0]
 [  0  89]]
[1mMetrics[0m
ACC: 1.0
BACC: 1.0
F1: 1.0
AUROC: 1.0
AP: 1.0
MCC: 1.0
Precision: 1.0
Recall: 1.0


In [104]:
RF_pbcl2_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [105]:
### Independent Test Result
X_tset_sds = scaler.transform(X_tset)
yp_test = clf.predict(X_tset_sds)
ys_test = clf.predict_proba(X_tset_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_tset, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_tset, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_tset, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_tset, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_tset, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_tset, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_tset, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_tset, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_tset, yp_test),3)))

[1mConfusion Matrix[0m
[[1149  351]
 [   0    1]]
[1mMetrics[0m
ACC: 0.766
BACC: 0.883
F1: 0.006
AUROC: 1.0
AP: 1.0
MCC: 0.047
Precision: 0.003
Recall: 1.0


In [106]:
RF_pbcl2_tset_res = [round(accuracy_score(y_tset, yp_test), 3), round(balanced_accuracy_score(y_tset, yp_test), 3),
                 round(f1_score(y_tset, yp_test),3), round(roc_auc_score(y_tset, ys_test),3),
                 round(average_precision_score(y_tset, ys_test),3), round(matthews_corrcoef(y_tset, yp_test),3),
                 round(precision_score(y_tset, yp_test),3), round(recall_score(y_tset, yp_test),3)]

In [107]:
## Learn Classifier
scaler = StandardScaler()
clf = RandomForestClassifier()
scaler.fit(X_pbno32_train)
X_train_sds = scaler.transform(X_pbno32_train)
X_test_sds = scaler.transform(X_pbno32_test)
y_train = y_pbno32_train
y_test = y_pbno32_test
X_tset = X_pbno32_tset
y_tset = y_pbno32_tset
clf.fit(X_train_sds, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [108]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[310   1]
 [  0  89]]
[1mMetrics[0m
ACC: 0.998
BACC: 0.998
F1: 0.994
AUROC: 1.0
AP: 1.0
MCC: 0.993
Precision: 0.989
Recall: 1.0


In [109]:
RF_pbno32_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [110]:
### Independent Test Result
X_tset_sds = scaler.transform(X_tset)
yp_test = clf.predict(X_tset_sds)
ys_test = clf.predict_proba(X_tset_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_tset, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_tset, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_tset, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_tset, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_tset, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_tset, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_tset, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_tset, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_tset, yp_test),3)))

[1mConfusion Matrix[0m
[[1224  276]
 [   0    1]]
[1mMetrics[0m
ACC: 0.816
BACC: 0.908
F1: 0.007
AUROC: 1.0
AP: 1.0
MCC: 0.054
Precision: 0.004
Recall: 1.0


In [111]:
RF_pbno32_tset_res = [round(accuracy_score(y_tset, yp_test), 3), round(balanced_accuracy_score(y_tset, yp_test), 3),
                 round(f1_score(y_tset, yp_test),3), round(roc_auc_score(y_tset, ys_test),3),
                 round(average_precision_score(y_tset, ys_test),3), round(matthews_corrcoef(y_tset, yp_test),3),
                 round(precision_score(y_tset, yp_test),3), round(recall_score(y_tset, yp_test),3)]

In [112]:
outF = open(fileout, "a")
outF.write("Random_Forest, ")
outF.write("ACC, BACC, F1, AUROC, Average_Precision, MCC, Precision, Recall\n")
outF.write('Cdcl2 DevSet, ')
outF.write(', '.join(map(str, RF_cdcl2_devset_res)))
outF.write('\n')
outF.write('Cdcl2 IndSet, ')
outF.write(', '.join(map(str, RF_cdcl2_tset_res)))
outF.write('\n')
outF.write('Pbcl2 DevSet, ')
outF.write(', '.join(map(str, RF_pbcl2_devset_res)))
outF.write('\n')
outF.write('Pbcl2 IndSet, ')
outF.write(', '.join(map(str, RF_pbcl2_tset_res)))
outF.write('\n')
outF.write('Pb(No3)2 DevSet, ')
outF.write(', '.join(map(str, RF_pbno32_devset_res)))
outF.write('\n')
outF.write('Pb(No3)2 IndSet, ')
outF.write(', '.join(map(str, RF_pbno32_tset_res)))
outF.write('\n')
outF.close()

### 3.6 Linear SVM

In [113]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [114]:
## Learn Classifier
scaler = StandardScaler()
clf = SVC(kernel = 'linear', probability=True)
scaler.fit(X_cdcl2_train)
X_train_sds = scaler.transform(X_cdcl2_train)
X_test_sds = scaler.transform(X_cdcl2_test)
y_train = y_cdcl2_train
y_test = y_cdcl2_test
X_tset = X_cdcl2_tset
y_tset = y_cdcl2_tset
clf.fit(X_train_sds, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [115]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[296   0]
 [  0 104]]
[1mMetrics[0m
ACC: 1.0
BACC: 1.0
F1: 1.0
AUROC: 1.0
AP: 1.0
MCC: 1.0
Precision: 1.0
Recall: 1.0


In [116]:
LinSVM_cdcl2_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [117]:
### Independent Test Result
X_tset_sds = scaler.transform(X_tset)
yp_test = clf.predict(X_tset_sds)
ys_test = clf.predict_proba(X_tset_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_tset, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_tset, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_tset, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_tset, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_tset, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_tset, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_tset, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_tset, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_tset, yp_test),3)))

[1mConfusion Matrix[0m
[[859 141]
 [418  82]]
[1mMetrics[0m
ACC: 0.627
BACC: 0.511
F1: 0.227
AUROC: 0.277
AP: 0.358
MCC: 0.03
Precision: 0.368
Recall: 0.164


In [118]:
LinSVM_cdcl2_tset_res = [round(accuracy_score(y_tset, yp_test), 3), round(balanced_accuracy_score(y_tset, yp_test), 3),
                 round(f1_score(y_tset, yp_test),3), round(roc_auc_score(y_tset, ys_test),3),
                 round(average_precision_score(y_tset, ys_test),3), round(matthews_corrcoef(y_tset, yp_test),3),
                 round(precision_score(y_tset, yp_test),3), round(recall_score(y_tset, yp_test),3)]

In [119]:
## Pbcl2 Classifier
scaler = StandardScaler()
clf = SVC(kernel = 'linear', probability=True)
scaler.fit(X_cdcl2_train)
X_train_sds = scaler.transform(X_pbcl2_train)
X_test_sds = scaler.transform(X_pbcl2_test)
y_train = y_pbcl2_train
y_test = y_pbcl2_test
X_tset = X_pbcl2_tset
y_tset = y_pbcl2_tset
clf.fit(X_train_sds, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [120]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[311   0]
 [  0  89]]
[1mMetrics[0m
ACC: 1.0
BACC: 1.0
F1: 1.0
AUROC: 1.0
AP: 1.0
MCC: 1.0
Precision: 1.0
Recall: 1.0


In [121]:
LinSVM_pbcl2_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [122]:
### Independent Test Result
X_tset_sds = scaler.transform(X_tset)
yp_test = clf.predict(X_tset_sds)
ys_test = clf.predict_proba(X_tset_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_tset, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_tset, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_tset, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_tset, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_tset, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_tset, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_tset, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_tset, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_tset, yp_test),3)))

[1mConfusion Matrix[0m
[[1185  315]
 [   0    1]]
[1mMetrics[0m
ACC: 0.79
BACC: 0.895
F1: 0.006
AUROC: 1.0
AP: 1.0
MCC: 0.05
Precision: 0.003
Recall: 1.0


In [123]:
LinSVM_pbcl2_tset_res = [round(accuracy_score(y_tset, yp_test), 3), round(balanced_accuracy_score(y_tset, yp_test), 3),
                 round(f1_score(y_tset, yp_test),3), round(roc_auc_score(y_tset, ys_test),3),
                 round(average_precision_score(y_tset, ys_test),3), round(matthews_corrcoef(y_tset, yp_test),3),
                 round(precision_score(y_tset, yp_test),3), round(recall_score(y_tset, yp_test),3)]

In [124]:
## Learn Classifier
scaler = StandardScaler()
clf = SVC(kernel = 'linear', probability=True)
scaler.fit(X_pbno32_train)
X_train_sds = scaler.transform(X_pbno32_train)
X_test_sds = scaler.transform(X_pbno32_test)
y_train = y_pbno32_train
y_test = y_pbno32_test
X_tset = X_pbno32_tset
y_tset = y_pbno32_tset
clf.fit(X_train_sds, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [125]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[309   2]
 [  0  89]]
[1mMetrics[0m
ACC: 0.995
BACC: 0.997
F1: 0.989
AUROC: 0.999
AP: 0.996
MCC: 0.986
Precision: 0.978
Recall: 1.0


In [126]:
LinSVM_pbno32_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [127]:
### Independent Test Result
X_tset_sds = scaler.transform(X_tset)
yp_test = clf.predict(X_tset_sds)
ys_test = clf.predict_proba(X_tset_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_tset, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_tset, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_tset, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_tset, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_tset, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_tset, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_tset, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_tset, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_tset, yp_test),3)))

[1mConfusion Matrix[0m
[[1094  406]
 [   0    1]]
[1mMetrics[0m
ACC: 0.73
BACC: 0.865
F1: 0.005
AUROC: 0.915
AP: 0.008
MCC: 0.042
Precision: 0.002
Recall: 1.0


In [128]:
LinSVM_pbno32_tset_res = [round(accuracy_score(y_tset, yp_test), 3), round(balanced_accuracy_score(y_tset, yp_test), 3),
                 round(f1_score(y_tset, yp_test),3), round(roc_auc_score(y_tset, ys_test),3),
                 round(average_precision_score(y_tset, ys_test),3), round(matthews_corrcoef(y_tset, yp_test),3),
                 round(precision_score(y_tset, yp_test),3), round(recall_score(y_tset, yp_test),3)]

In [129]:
outF = open(fileout, "a")
outF.write("Linear_SVM, ")
outF.write("ACC, BACC, F1, AUROC, Average_Precision, MCC, Precision, Recall\n")
outF.write('Cdcl2 DevSet, ')
outF.write(', '.join(map(str, LinSVM_cdcl2_devset_res)))
outF.write('\n')
outF.write('Cdcl2 IndSet, ')
outF.write(', '.join(map(str, LinSVM_cdcl2_tset_res)))
outF.write('\n')
outF.write('Pbcl2 DevSet, ')
outF.write(', '.join(map(str, LinSVM_pbcl2_devset_res)))
outF.write('\n')
outF.write('Pbcl2 IndSet, ')
outF.write(', '.join(map(str, LinSVM_pbcl2_tset_res)))
outF.write('\n')
outF.write('Pb(No3)2 DevSet, ')
outF.write(', '.join(map(str, LinSVM_pbno32_devset_res)))
outF.write('\n')
outF.write('Pb(No3)2 IndSet, ')
outF.write(', '.join(map(str, LinSVM_pbno32_tset_res)))
outF.write('\n')
outF.close()

### 3.7 RBF SVM

In [130]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [131]:
## Learn Classifier
scaler = StandardScaler()
clf = SVC(kernel = 'rbf', probability=True)
scaler.fit(X_cdcl2_train)
X_train_sds = scaler.transform(X_cdcl2_train)
X_test_sds = scaler.transform(X_cdcl2_test)
y_train = y_cdcl2_train
y_test = y_cdcl2_test
X_tset = X_cdcl2_tset
y_tset = y_cdcl2_tset
clf.fit(X_train_sds, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [132]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[295   1]
 [  0 104]]
[1mMetrics[0m
ACC: 0.998
BACC: 0.998
F1: 0.995
AUROC: 1.0
AP: 1.0
MCC: 0.994
Precision: 0.99
Recall: 1.0


In [133]:
RBFSVM_cdcl2_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [134]:
### Independent Test Result
X_tset_sds = scaler.transform(X_tset)
yp_test = clf.predict(X_tset_sds)
ys_test = clf.predict_proba(X_tset_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_tset, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_tset, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_tset, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_tset, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_tset, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_tset, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_tset, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_tset, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_tset, yp_test),3)))

[1mConfusion Matrix[0m
[[993   7]
 [463  37]]
[1mMetrics[0m
ACC: 0.687
BACC: 0.534
F1: 0.136
AUROC: 0.303
AP: 0.344
MCC: 0.187
Precision: 0.841
Recall: 0.074


In [135]:
RBFSVM_cdcl2_tset_res = [round(accuracy_score(y_tset, yp_test), 3), round(balanced_accuracy_score(y_tset, yp_test), 3),
                 round(f1_score(y_tset, yp_test),3), round(roc_auc_score(y_tset, ys_test),3),
                 round(average_precision_score(y_tset, ys_test),3), round(matthews_corrcoef(y_tset, yp_test),3),
                 round(precision_score(y_tset, yp_test),3), round(recall_score(y_tset, yp_test),3)]

In [136]:
## Pbcl2 Classifier
scaler = StandardScaler()
clf = SVC(kernel = 'rbf', probability=True)
scaler.fit(X_cdcl2_train)
X_train_sds = scaler.transform(X_pbcl2_train)
X_test_sds = scaler.transform(X_pbcl2_test)
y_train = y_pbcl2_train
y_test = y_pbcl2_test
X_tset = X_pbcl2_tset
y_tset = y_pbcl2_tset
clf.fit(X_train_sds, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [137]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[311   0]
 [  0  89]]
[1mMetrics[0m
ACC: 1.0
BACC: 1.0
F1: 1.0
AUROC: 1.0
AP: 1.0
MCC: 1.0
Precision: 1.0
Recall: 1.0


In [138]:
RBFSVM_pbcl2_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [139]:
### Independent Test Result
X_tset_sds = scaler.transform(X_tset)
yp_test = clf.predict(X_tset_sds)
ys_test = clf.predict_proba(X_tset_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_tset, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_tset, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_tset, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_tset, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_tset, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_tset, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_tset, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_tset, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_tset, yp_test),3)))

[1mConfusion Matrix[0m
[[1158  342]
 [   0    1]]
[1mMetrics[0m
ACC: 0.772
BACC: 0.886
F1: 0.006
AUROC: 1.0
AP: 1.0
MCC: 0.047
Precision: 0.003
Recall: 1.0


In [140]:
RBFSVM_pbcl2_tset_res = [round(accuracy_score(y_tset, yp_test), 3), round(balanced_accuracy_score(y_tset, yp_test), 3),
                 round(f1_score(y_tset, yp_test),3), round(roc_auc_score(y_tset, ys_test),3),
                 round(average_precision_score(y_tset, ys_test),3), round(matthews_corrcoef(y_tset, yp_test),3),
                 round(precision_score(y_tset, yp_test),3), round(recall_score(y_tset, yp_test),3)]

In [141]:
## Learn Classifier
scaler = StandardScaler()
clf = SVC(kernel = 'rbf', probability=True)
scaler.fit(X_pbno32_train)
X_train_sds = scaler.transform(X_pbno32_train)
X_test_sds = scaler.transform(X_pbno32_test)
y_train = y_pbno32_train
y_test = y_pbno32_test
X_tset = X_pbno32_tset
y_tset = y_pbno32_tset
clf.fit(X_train_sds, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [142]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[310   1]
 [  0  89]]
[1mMetrics[0m
ACC: 0.998
BACC: 0.998
F1: 0.994
AUROC: 1.0
AP: 1.0
MCC: 0.993
Precision: 0.989
Recall: 1.0


In [143]:
RBFSVM_pbno32_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [144]:
### Independent Test Result
X_tset_sds = scaler.transform(X_tset)
yp_test = clf.predict(X_tset_sds)
ys_test = clf.predict_proba(X_tset_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_tset, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_tset, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_tset, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_tset, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_tset, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_tset, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_tset, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_tset, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_tset, yp_test),3)))

[1mConfusion Matrix[0m
[[1223  277]
 [   0    1]]
[1mMetrics[0m
ACC: 0.815
BACC: 0.908
F1: 0.007
AUROC: 1.0
AP: 1.0
MCC: 0.054
Precision: 0.004
Recall: 1.0


In [145]:
RBFSVM_pbno32_tset_res = [round(accuracy_score(y_tset, yp_test), 3), round(balanced_accuracy_score(y_tset, yp_test), 3),
                 round(f1_score(y_tset, yp_test),3), round(roc_auc_score(y_tset, ys_test),3),
                 round(average_precision_score(y_tset, ys_test),3), round(matthews_corrcoef(y_tset, yp_test),3),
                 round(precision_score(y_tset, yp_test),3), round(recall_score(y_tset, yp_test),3)]

In [146]:
outF = open(fileout, "a")
outF.write("RBF_SVM, ")
outF.write("ACC, BACC, F1, AUROC, Average_Precision, MCC, Precision, Recall\n")
outF.write('Cdcl2 DevSet, ')
outF.write(', '.join(map(str, RBFSVM_cdcl2_devset_res)))
outF.write('\n')
outF.write('Cdcl2 IndSet, ')
outF.write(', '.join(map(str, RBFSVM_cdcl2_tset_res)))
outF.write('\n')
outF.write('Pbcl2 DevSet, ')
outF.write(', '.join(map(str, RBFSVM_pbcl2_devset_res)))
outF.write('\n')
outF.write('Pbcl2 IndSet, ')
outF.write(', '.join(map(str, RBFSVM_pbcl2_tset_res)))
outF.write('\n')
outF.write('Pb(No3)2 DevSet, ')
outF.write(', '.join(map(str, RBFSVM_pbno32_devset_res)))
outF.write('\n')
outF.write('Pb(No3)2 IndSet, ')
outF.write(', '.join(map(str, RBFSVM_pbno32_tset_res)))
outF.write('\n')
outF.close()