In [1]:
## Utils
import re
import numpy as np
import pandas as pd
import itertools

## Classical Learner
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_validate

In [2]:
"""Dataset Parameter Setting"""
"""Load Dataset"""
cdcl2_b1_dset = pd.read_csv("C:/Users/sypark/Desktop/Projects/w_MinSeok/1SERSNet/2data/_preprocessed/sersnet_cdcl2_b1_bn_bl_corrected.csv")
cdcl2_b2_dset = pd.read_csv("C:/Users/sypark/Desktop/Projects/w_MinSeok/1SERSNet/2data/_preprocessed/sersnet_cdcl2_b2_bn_bl_corrected.csv")

pbcl2_b1_dset = pd.read_csv("C:/Users/sypark/Desktop/Projects/w_MinSeok/1SERSNet/2data/_preprocessed/sersnet_pbcl2_b1_bn_bl_corrected.csv")
pbcl2_b2_dset = pd.read_csv("C:/Users/sypark/Desktop/Projects/w_MinSeok/1SERSNet/2data/_preprocessed/sersnet_pbcl2_b2_bn_bl_corrected.csv")

pbno32_b1_dset = pd.read_csv("C:/Users/sypark/Desktop/Projects/w_MinSeok/1SERSNet/2data/_preprocessed/sersnet_pbno32_b1_bn_bl_corrected.csv")
pbno32_b2_dset = pd.read_csv("C:/Users/sypark/Desktop/Projects/w_MinSeok/1SERSNet/2data/_preprocessed/sersnet_pbno32_b2_bn_bl_corrected.csv")


"""Set Output Path"""
fileout = "C:/Users/sypark/Desktop/Projects/w_MinSeok/1SERSNet/3results/HM_Classification/raw_data/baseline_binary_hm_model_output_b2_to_b1_bn_bl_corrected_batch_combine.csv"

In [3]:
cdcl2_b1_dset = pd.concat([pd.DataFrame(cdcl2_b1_dset.iloc[:, 3]).rename(columns={'Concentration_uM': 'label'}), 
                           cdcl2_b1_dset.iloc[:, 5:]], axis=1)
cdcl2_b2_dset = pd.concat([pd.DataFrame(cdcl2_b2_dset.iloc[:, 3]).rename(columns={'Concentration_uM': 'label'}), 
                           cdcl2_b2_dset.iloc[:, 5:]], axis=1)
cdcl2_all_dset = pd.concat([cdcl2_b1_dset, cdcl2_b2_dset], axis=0).reset_index(drop=True)

pbcl2_b1_dset = pd.concat([pd.DataFrame(pbcl2_b1_dset.iloc[:, 3]).rename(columns={'Concentration_uM': 'label'}), 
                           pbcl2_b1_dset.iloc[:, 5:]], axis=1)
pbcl2_b2_dset = pd.concat([pd.DataFrame(pbcl2_b2_dset.iloc[:, 3]).rename(columns={'Concentration_uM': 'label'}), 
                           pbcl2_b2_dset.iloc[:, 5:]], axis=1)
pbcl2_all_dset = pd.concat([pbcl2_b1_dset, pbcl2_b2_dset], axis=0).reset_index(drop=True)

pbno32_b1_dset = pd.concat([pd.DataFrame(pbno32_b1_dset.iloc[:, 3]).rename(columns={'Concentration_uM': 'label'}), 
                            pbno32_b1_dset.iloc[:, 5:]], axis=1)
pbno32_b2_dset = pd.concat([pd.DataFrame(pbno32_b2_dset.iloc[:, 3]).rename(columns={'Concentration_uM': 'label'}),
                            pbno32_b2_dset.iloc[:, 5:]], axis=1)
pbno32_all_dset = pd.concat([pbno32_b1_dset, pbno32_b2_dset], axis=0).reset_index(drop=True)

In [4]:
## CdCl2 Dataset
tmp1 = np.array(cdcl2_b1_dset.iloc[:,0] <=1e-2, dtype='int64')
tmp2 = np.array(cdcl2_b2_dset.iloc[:,0] <=1e-2, dtype='int64')

cdcl2_b1_dset = pd.concat([pd.DataFrame(tmp1).rename(columns={0:'label'}), cdcl2_b1_dset.iloc[:,1:]], axis=1)
cdcl2_b2_dset = pd.concat([pd.DataFrame(tmp2).rename(columns={0:'label'}), cdcl2_b2_dset.iloc[:,1:]], axis=1)
cdcl2_dset = pd.concat([cdcl2_b1_dset, cdcl2_b2_dset], axis=0).reset_index(drop=True)

In [5]:
## Pbcl2 Dataset
tmp1 = np.array(pbcl2_b1_dset.iloc[:,0] <=1e-2, dtype='int64')
tmp2 = np.array(pbcl2_b2_dset.iloc[:,0] <=1e-2, dtype='int64')

pbcl2_b1_dset = pd.concat([pd.DataFrame(tmp1).rename(columns={0:'label'}), pbcl2_b1_dset.iloc[:,1:]], axis=1)
pbcl2_b2_dset = pd.concat([pd.DataFrame(tmp2).rename(columns={0:'label'}), pbcl2_b2_dset.iloc[:,1:]], axis=1)
pbcl2_dset = pd.concat([pbcl2_b1_dset, pbcl2_b2_dset], axis=0).reset_index(drop=True)

In [6]:
## Pb(NO3)2 Dataset
tmp1 = np.array(pbno32_b1_dset.iloc[:,0] <=1e-2, dtype='int64')
tmp2 = np.array(pbno32_b2_dset.iloc[:,0] <=1e-2, dtype='int64')

pbno32_b1_dset = pd.concat([pd.DataFrame(tmp1).rename(columns={0:'label'}), pbno32_b1_dset.iloc[:,1:]], axis=1)
pbno32_b2_dset = pd.concat([pd.DataFrame(tmp2).rename(columns={0:'label'}), pbno32_b2_dset.iloc[:,1:]], axis=1)
pbno32_dset = pd.concat([pbno32_b1_dset, pbno32_b2_dset], axis=0).reset_index(drop=True)

In [7]:
cdcl2_dset = cdcl2_dset.iloc[:,:-1]
pbcl2_dset = pbcl2_dset.iloc[:,:-1]
pbno32_dset = pbno32_dset.iloc[:,:-1]

In [8]:
X_cdcl2_dset = cdcl2_dset.iloc[:, 1:].to_numpy(dtype='float32')
y_cdcl2_dset = cdcl2_dset.iloc[:,0].to_numpy(dtype='int64') 

X_pbcl2_dset = pbcl2_dset.iloc[:, 1:].to_numpy(dtype='float32')
y_pbcl2_dset = pbcl2_dset.iloc[:,0].to_numpy(dtype='int64') 

X_pbno32_dset = pbno32_dset.iloc[:, 1:].to_numpy(dtype='float32')
y_pbno32_dset = pbno32_dset.iloc[:,0].to_numpy(dtype='int64') 

## 2. Train Test Split

In [9]:
## 4.4 train and test split for Classical Learner
X_cdcl2_train, X_cdcl2_test, y_cdcl2_train, y_cdcl2_test = train_test_split(X_cdcl2_dset, y_cdcl2_dset, test_size = 0.2, 
                                                    random_state=123)

X_pbcl2_train, X_pbcl2_test, y_pbcl2_train, y_pbcl2_test = train_test_split(X_pbcl2_dset, y_pbcl2_dset, test_size = 0.2, 
                                                    random_state=123)

X_pbno32_train, X_pbno32_test, y_pbno32_train, y_pbno32_test = train_test_split(X_pbno32_dset, y_pbno32_dset, test_size = 0.2, 
                                                    random_state=123)

## 3. Classical Learner

### 3.1 Naive Bayes

In [10]:
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import BernoulliNB

In [11]:
## Learn Classifier
scaler = StandardScaler()
clf = BernoulliNB()
scaler.fit(X_cdcl2_train)
X_train_sds = scaler.transform(X_cdcl2_train)
X_test_sds = scaler.transform(X_cdcl2_test)
y_train = y_cdcl2_train
y_test = y_cdcl2_test
clf.fit(X_train_sds, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [12]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[361 149]
 [ 68 122]]
[1mMetrics[0m
ACC: 0.69
BACC: 0.675
F1: 0.529
AUROC: 0.751
AP: 0.441
MCC: 0.319
Precision: 0.45
Recall: 0.642


In [13]:
NB_cdcl2_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [14]:
## Learn Classifier
scaler = StandardScaler()
clf = BernoulliNB()
scaler.fit(X_pbcl2_train)
X_train_sds = scaler.transform(X_pbcl2_train)
X_test_sds = scaler.transform(X_pbcl2_test)
y_train = y_pbcl2_train
y_test = y_pbcl2_test
clf.fit(X_train_sds, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [15]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[524  68]
 [  0 108]]
[1mMetrics[0m
ACC: 0.903
BACC: 0.943
F1: 0.761
AUROC: 0.944
AP: 0.621
MCC: 0.737
Precision: 0.614
Recall: 1.0


In [16]:
NB_pbcl2_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [17]:
## Learn Classifier
scaler = StandardScaler()
clf = BernoulliNB()
scaler.fit(X_pbno32_train)
X_train_sds = scaler.transform(X_pbno32_train)
X_test_sds = scaler.transform(X_pbno32_test)
y_train = y_pbno32_train
y_test = y_pbno32_test
clf.fit(X_train_sds, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [18]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[490 102]
 [  5 103]]
[1mMetrics[0m
ACC: 0.847
BACC: 0.891
F1: 0.658
AUROC: 0.91
AP: 0.507
MCC: 0.62
Precision: 0.502
Recall: 0.954


In [19]:
NB_pbno32_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [20]:
outF = open(fileout, "w")
outF.write("Naive_Bayes, ")
outF.write("ACC, BACC, F1, AUROC, Average_Precision, MCC, Precision, Recall\n")
outF.write('Cdcl2 DevSet, ')
outF.write(', '.join(map(str, NB_cdcl2_devset_res)))
outF.write('\n')
outF.write('Pbcl2 DevSet, ')
outF.write(', '.join(map(str, NB_pbcl2_devset_res)))
outF.write('\n')
outF.write('Pb(No3)2 DevSet, ')
outF.write(', '.join(map(str, NB_pbno32_devset_res)))
outF.write('\n')
outF.close()

### 3.2 Decision Tree

In [21]:
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [22]:
## Cdcl2 Classifier
scaler = StandardScaler()
clf = DecisionTreeClassifier()
scaler.fit(X_cdcl2_train)
X_train_sds = scaler.transform(X_cdcl2_train)
X_test_sds = scaler.transform(X_cdcl2_test)
y_train = y_cdcl2_train
y_test = y_cdcl2_test
clf.fit(X_train_sds, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [23]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[497  13]
 [ 10 180]]
[1mMetrics[0m
ACC: 0.967
BACC: 0.961
F1: 0.94
AUROC: 0.961
AP: 0.898
MCC: 0.917
Precision: 0.933
Recall: 0.947


In [24]:
DT_cdcl2_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [25]:
## Pbcl2 Classifier
scaler = StandardScaler()
clf = DecisionTreeClassifier()
scaler.fit(X_cdcl2_train)
X_train_sds = scaler.transform(X_pbcl2_train)
X_test_sds = scaler.transform(X_pbcl2_test)
y_train = y_pbcl2_train
y_test = y_pbcl2_test
clf.fit(X_train_sds, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [26]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[592   0]
 [  0 108]]
[1mMetrics[0m
ACC: 1.0
BACC: 1.0
F1: 1.0
AUROC: 1.0
AP: 1.0
MCC: 1.0
Precision: 1.0
Recall: 1.0


In [27]:
DT_pbcl2_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [28]:
## Learn Classifier
scaler = StandardScaler()
clf = DecisionTreeClassifier()
scaler.fit(X_pbno32_train)
X_train_sds = scaler.transform(X_pbno32_train)
X_test_sds = scaler.transform(X_pbno32_test)
y_train = y_pbno32_train
y_test = y_pbno32_test
clf.fit(X_train_sds, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [29]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[589   3]
 [  4 104]]
[1mMetrics[0m
ACC: 0.99
BACC: 0.979
F1: 0.967
AUROC: 0.979
AP: 0.942
MCC: 0.962
Precision: 0.972
Recall: 0.963


In [30]:
DT_pbno32_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [31]:
outF = open(fileout, "a")
outF.write("Decision_Tree, ")
outF.write("ACC, BACC, F1, AUROC, Average_Precision, MCC, Precision, Recall\n")
outF.write('Cdcl2 DevSet, ')
outF.write(', '.join(map(str, DT_cdcl2_devset_res)))
outF.write('\n')
outF.write('Pbcl2 DevSet, ')
outF.write(', '.join(map(str, DT_pbcl2_devset_res)))
outF.write('\n')
outF.write('Pb(No3)2 DevSet, ')
outF.write(', '.join(map(str, DT_pbno32_devset_res)))
outF.write('\n')
outF.close()

### 3.2 Logistic Regression

In [32]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [33]:
## Learn Classifier
scaler = StandardScaler()
clf = LogisticRegression()
scaler.fit(X_cdcl2_train)
X_train_sds = scaler.transform(X_cdcl2_train)
X_test_sds = scaler.transform(X_cdcl2_test)
y_train = y_cdcl2_train
y_test = y_cdcl2_test
clf.fit(X_train_sds, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [34]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[504   6]
 [  0 190]]
[1mMetrics[0m
ACC: 0.991
BACC: 0.994
F1: 0.984
AUROC: 1.0
AP: 0.999
MCC: 0.979
Precision: 0.969
Recall: 1.0


In [35]:
LR_cdcl2_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [36]:
## Pbcl2 Classifier
scaler = StandardScaler()
clf = LogisticRegression()
scaler.fit(X_cdcl2_train)
X_train_sds = scaler.transform(X_pbcl2_train)
X_test_sds = scaler.transform(X_pbcl2_test)
y_train = y_pbcl2_train
y_test = y_pbcl2_test
clf.fit(X_train_sds, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [37]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[592   0]
 [  0 108]]
[1mMetrics[0m
ACC: 1.0
BACC: 1.0
F1: 1.0
AUROC: 1.0
AP: 1.0
MCC: 1.0
Precision: 1.0
Recall: 1.0


In [38]:
LR_pbcl2_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [39]:
## Learn Classifier
scaler = StandardScaler()
clf = LogisticRegression()
scaler.fit(X_pbno32_train)
X_train_sds = scaler.transform(X_pbno32_train)
X_test_sds = scaler.transform(X_pbno32_test)
y_train = y_pbno32_train
y_test = y_pbno32_test
clf.fit(X_train_sds, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [40]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[592   0]
 [  0 108]]
[1mMetrics[0m
ACC: 1.0
BACC: 1.0
F1: 1.0
AUROC: 1.0
AP: 1.0
MCC: 1.0
Precision: 1.0
Recall: 1.0


In [41]:
LR_pbno32_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [42]:
outF = open(fileout, "a")
outF.write("Logistic_Regression, ")
outF.write("ACC, BACC, F1, AUROC, Average_Precision, MCC, Precision, Recall\n")
outF.write('Cdcl2 DevSet, ')
outF.write(', '.join(map(str, LR_cdcl2_devset_res)))
outF.write('\n')
outF.write('Pbcl2 DevSet, ')
outF.write(', '.join(map(str, LR_pbcl2_devset_res)))
outF.write('\n')
outF.write('Pb(No3)2 DevSet, ')
outF.write(', '.join(map(str, LR_pbno32_devset_res)))
outF.write('\n')
outF.close()

### 3.3 Logistic Regression with CV

In [43]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV

In [44]:
## Learn Classifier
scaler = StandardScaler()
clf = LogisticRegressionCV()
scaler.fit(X_cdcl2_train)
X_train_sds = scaler.transform(X_cdcl2_train)
X_test_sds = scaler.transform(X_cdcl2_test)
y_train = y_cdcl2_train
y_test = y_cdcl2_test
clf.fit(X_train_sds, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=100, multi_class='auto', n_jobs=None,
                     penalty='l2', random_state=None, refit=True, scoring=None,
                     solver='lbfgs', tol=0.0001, verbose=0)

In [45]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[504   6]
 [  0 190]]
[1mMetrics[0m
ACC: 0.991
BACC: 0.994
F1: 0.984
AUROC: 1.0
AP: 0.999
MCC: 0.979
Precision: 0.969
Recall: 1.0


In [46]:
LR_CV_cdcl2_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [47]:
## Pbcl2 Classifier
scaler = StandardScaler()
clf = LogisticRegressionCV()
scaler.fit(X_cdcl2_train)
X_train_sds = scaler.transform(X_pbcl2_train)
X_test_sds = scaler.transform(X_pbcl2_test)
y_train = y_pbcl2_train
y_test = y_pbcl2_test
clf.fit(X_train_sds, y_train)

LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=100, multi_class='auto', n_jobs=None,
                     penalty='l2', random_state=None, refit=True, scoring=None,
                     solver='lbfgs', tol=0.0001, verbose=0)

In [48]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[592   0]
 [  0 108]]
[1mMetrics[0m
ACC: 1.0
BACC: 1.0
F1: 1.0
AUROC: 1.0
AP: 1.0
MCC: 1.0
Precision: 1.0
Recall: 1.0


In [49]:
LR_CV_pbcl2_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [50]:
## Learn Classifier
scaler = StandardScaler()
clf = LogisticRegressionCV()
scaler.fit(X_pbno32_train)
X_train_sds = scaler.transform(X_pbno32_train)
X_test_sds = scaler.transform(X_pbno32_test)
y_train = y_pbno32_train
y_test = y_pbno32_test
clf.fit(X_train_sds, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=100, multi_class='auto', n_jobs=None,
                     penalty='l2', random_state=None, refit=True, scoring=None,
                     solver='lbfgs', tol=0.0001, verbose=0)

In [51]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[592   0]
 [  0 108]]
[1mMetrics[0m
ACC: 1.0
BACC: 1.0
F1: 1.0
AUROC: 1.0
AP: 1.0
MCC: 1.0
Precision: 1.0
Recall: 1.0


In [52]:
LR_CV_pbno32_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [53]:
outF = open(fileout, "a")
outF.write("Logistic_Regression_CV, ")
outF.write("ACC, BACC, F1, AUROC, Average_Precision, MCC, Precision, Recall\n")
outF.write('Cdcl2 DevSet, ')
outF.write(', '.join(map(str, LR_CV_cdcl2_devset_res)))
outF.write('\n')
outF.write('Pbcl2 DevSet, ')
outF.write(', '.join(map(str, LR_CV_pbcl2_devset_res)))
outF.write('\n')
outF.write('Pb(No3)2 DevSet, ')
outF.write(', '.join(map(str, LR_CV_pbno32_devset_res)))
outF.write('\n')
outF.close()

### 3.4 MLP

In [54]:
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

In [55]:
## Learn Classifier
scaler = StandardScaler()
clf = MLPClassifier()
scaler.fit(X_cdcl2_train)
X_train_sds = scaler.transform(X_cdcl2_train)
X_test_sds = scaler.transform(X_cdcl2_test)
y_train = y_cdcl2_train
y_test = y_cdcl2_test
clf.fit(X_train_sds, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [56]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[507   3]
 [  0 190]]
[1mMetrics[0m
ACC: 0.996
BACC: 0.997
F1: 0.992
AUROC: 1.0
AP: 1.0
MCC: 0.989
Precision: 0.984
Recall: 1.0


In [57]:
MLP_cdcl2_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [58]:
## Pbcl2 Classifier
scaler = StandardScaler()
clf = MLPClassifier()
scaler.fit(X_cdcl2_train)
X_train_sds = scaler.transform(X_pbcl2_train)
X_test_sds = scaler.transform(X_pbcl2_test)
y_train = y_pbcl2_train
y_test = y_pbcl2_test
clf.fit(X_train_sds, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [59]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[592   0]
 [  0 108]]
[1mMetrics[0m
ACC: 1.0
BACC: 1.0
F1: 1.0
AUROC: 1.0
AP: 1.0
MCC: 1.0
Precision: 1.0
Recall: 1.0


In [60]:
MLP_pbcl2_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [61]:
## Learn Classifier
scaler = StandardScaler()
clf = MLPClassifier()
scaler.fit(X_pbno32_train)
X_train_sds = scaler.transform(X_pbno32_train)
X_test_sds = scaler.transform(X_pbno32_test)
y_train = y_pbno32_train
y_test = y_pbno32_test
clf.fit(X_train_sds, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [62]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[592   0]
 [  0 108]]
[1mMetrics[0m
ACC: 1.0
BACC: 1.0
F1: 1.0
AUROC: 1.0
AP: 1.0
MCC: 1.0
Precision: 1.0
Recall: 1.0


In [63]:
MLP_pbno32_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [64]:
outF = open(fileout, "a")
outF.write("MLP, ")
outF.write("ACC, BACC, F1, AUROC, Average_Precision, MCC, Precision, Recall\n")
outF.write('Cdcl2 DevSet, ')
outF.write(', '.join(map(str, MLP_cdcl2_devset_res)))
outF.write('\n')
outF.write('Pbcl2 DevSet, ')
outF.write(', '.join(map(str, MLP_pbcl2_devset_res)))
outF.write('\n')
outF.write('Pb(No3)2 DevSet, ')
outF.write(', '.join(map(str, MLP_pbno32_devset_res)))
outF.write('\n')
outF.close()

### 3.5 Random Forest

In [65]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

In [66]:
## Learn Classifier
scaler = StandardScaler()
clf = RandomForestClassifier()
scaler.fit(X_cdcl2_train)
X_train_sds = scaler.transform(X_cdcl2_train)
X_test_sds = scaler.transform(X_cdcl2_test)
y_train = y_cdcl2_train
y_test = y_cdcl2_test
clf.fit(X_train_sds, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [67]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[507   3]
 [  0 190]]
[1mMetrics[0m
ACC: 0.996
BACC: 0.997
F1: 0.992
AUROC: 1.0
AP: 1.0
MCC: 0.989
Precision: 0.984
Recall: 1.0


In [68]:
RF_cdcl2_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [69]:
## Pbcl2 Classifier
scaler = StandardScaler()
clf = RandomForestClassifier()
scaler.fit(X_cdcl2_train)
X_train_sds = scaler.transform(X_pbcl2_train)
X_test_sds = scaler.transform(X_pbcl2_test)
y_train = y_pbcl2_train
y_test = y_pbcl2_test
clf.fit(X_train_sds, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [70]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[592   0]
 [  1 107]]
[1mMetrics[0m
ACC: 0.999
BACC: 0.995
F1: 0.995
AUROC: 1.0
AP: 1.0
MCC: 0.995
Precision: 1.0
Recall: 0.991


In [71]:
RF_pbcl2_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [72]:
## Learn Classifier
scaler = StandardScaler()
clf = RandomForestClassifier()
scaler.fit(X_pbno32_train)
X_train_sds = scaler.transform(X_pbno32_train)
X_test_sds = scaler.transform(X_pbno32_test)
y_train = y_pbno32_train
y_test = y_pbno32_test
clf.fit(X_train_sds, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [73]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[591   1]
 [  0 108]]
[1mMetrics[0m
ACC: 0.999
BACC: 0.999
F1: 0.995
AUROC: 1.0
AP: 1.0
MCC: 0.995
Precision: 0.991
Recall: 1.0


In [74]:
RF_pbno32_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [75]:
outF = open(fileout, "a")
outF.write("Random_Forest, ")
outF.write("ACC, BACC, F1, AUROC, Average_Precision, MCC, Precision, Recall\n")
outF.write('Cdcl2 DevSet, ')
outF.write(', '.join(map(str, RF_cdcl2_devset_res)))
outF.write('\n')
outF.write('Pbcl2 DevSet, ')
outF.write(', '.join(map(str, RF_pbcl2_devset_res)))
outF.write('\n')
outF.write('Pb(No3)2 DevSet, ')
outF.write(', '.join(map(str, RF_pbno32_devset_res)))
outF.write('\n')
outF.close()

### 3.6 Linear SVM

In [76]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [77]:
## Learn Classifier
scaler = StandardScaler()
clf = SVC(kernel = 'linear', probability=True)
scaler.fit(X_cdcl2_train)
X_train_sds = scaler.transform(X_cdcl2_train)
X_test_sds = scaler.transform(X_cdcl2_test)
y_train = y_cdcl2_train
y_test = y_cdcl2_test
clf.fit(X_train_sds, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [78]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[507   3]
 [  1 189]]
[1mMetrics[0m
ACC: 0.994
BACC: 0.994
F1: 0.99
AUROC: 0.999
AP: 0.998
MCC: 0.986
Precision: 0.984
Recall: 0.995


In [79]:
LinSVM_cdcl2_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [80]:
## Pbcl2 Classifier
scaler = StandardScaler()
clf = SVC(kernel = 'linear', probability=True)
scaler.fit(X_cdcl2_train)
X_train_sds = scaler.transform(X_pbcl2_train)
X_test_sds = scaler.transform(X_pbcl2_test)
y_train = y_pbcl2_train
y_test = y_pbcl2_test
clf.fit(X_train_sds, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [81]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[592   0]
 [  0 108]]
[1mMetrics[0m
ACC: 1.0
BACC: 1.0
F1: 1.0
AUROC: 1.0
AP: 1.0
MCC: 1.0
Precision: 1.0
Recall: 1.0


In [82]:
LinSVM_pbcl2_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [83]:
## Learn Classifier
scaler = StandardScaler()
clf = SVC(kernel = 'linear', probability=True)
scaler.fit(X_pbno32_train)
X_train_sds = scaler.transform(X_pbno32_train)
X_test_sds = scaler.transform(X_pbno32_test)
y_train = y_pbno32_train
y_test = y_pbno32_test
clf.fit(X_train_sds, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [84]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[592   0]
 [  0 108]]
[1mMetrics[0m
ACC: 1.0
BACC: 1.0
F1: 1.0
AUROC: 1.0
AP: 1.0
MCC: 1.0
Precision: 1.0
Recall: 1.0


In [85]:
LinSVM_pbno32_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [86]:
outF = open(fileout, "a")
outF.write("Linear_SVM, ")
outF.write("ACC, BACC, F1, AUROC, Average_Precision, MCC, Precision, Recall\n")
outF.write('Cdcl2 DevSet, ')
outF.write(', '.join(map(str, LinSVM_cdcl2_devset_res)))
outF.write('\n')
outF.write('Pbcl2 DevSet, ')
outF.write(', '.join(map(str, LinSVM_pbcl2_devset_res)))
outF.write('\n')
outF.write('Pb(No3)2 DevSet, ')
outF.write(', '.join(map(str, LinSVM_pbno32_devset_res)))
outF.write('\n')
outF.close()

### 3.7 RBF SVM

In [87]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [88]:
## Learn Classifier
scaler = StandardScaler()
clf = SVC(kernel = 'rbf', probability=True)
scaler.fit(X_cdcl2_train)
X_train_sds = scaler.transform(X_cdcl2_train)
X_test_sds = scaler.transform(X_cdcl2_test)
y_train = y_cdcl2_train
y_test = y_cdcl2_test
clf.fit(X_train_sds, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [89]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[506   4]
 [  0 190]]
[1mMetrics[0m
ACC: 0.994
BACC: 0.996
F1: 0.99
AUROC: 1.0
AP: 0.999
MCC: 0.986
Precision: 0.979
Recall: 1.0


In [90]:
RBFSVM_cdcl2_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [91]:
## Pbcl2 Classifier
scaler = StandardScaler()
clf = SVC(kernel = 'rbf', probability=True)
scaler.fit(X_cdcl2_train)
X_train_sds = scaler.transform(X_pbcl2_train)
X_test_sds = scaler.transform(X_pbcl2_test)
y_train = y_pbcl2_train
y_test = y_pbcl2_test
clf.fit(X_train_sds, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [92]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[592   0]
 [  1 107]]
[1mMetrics[0m
ACC: 0.999
BACC: 0.995
F1: 0.995
AUROC: 1.0
AP: 1.0
MCC: 0.995
Precision: 1.0
Recall: 0.991


In [93]:
RBFSVM_pbcl2_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [94]:
## Learn Classifier
scaler = StandardScaler()
clf = SVC(kernel = 'rbf', probability=True)
scaler.fit(X_pbno32_train)
X_train_sds = scaler.transform(X_pbno32_train)
X_test_sds = scaler.transform(X_pbno32_test)
y_train = y_pbno32_train
y_test = y_pbno32_test
clf.fit(X_train_sds, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [95]:
### Test within batch
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, f1_score 
from sklearn.metrics import roc_auc_score, precision_score, recall_score, matthews_corrcoef, average_precision_score

yp_test = clf.predict(X_test_sds)
ys_test = clf.predict_proba(X_test_sds)
ys_test = ys_test[:,1]

print('\033[1m' + 'Confusion Matrix' + '\033[0m')
print(confusion_matrix(y_test, yp_test))

print('\033[1m' + 'Metrics' + '\033[0m')
print('ACC: {}'.format(round(accuracy_score(y_test, yp_test), 3)))
print('BACC: {}'.format(round(balanced_accuracy_score(y_test, yp_test), 3)))
print('F1: {}'.format(round(f1_score(y_test, yp_test),3)))
print('AUROC: {}'.format(round(roc_auc_score(y_test, ys_test),3)))
print('AP: {}'.format(round(average_precision_score(y_test, ys_test),3)))
print('MCC: {}'.format(round(matthews_corrcoef(y_test, yp_test),3)))
print('Precision: {}'.format(round(precision_score(y_test, yp_test),3)))
print('Recall: {}'.format(round(recall_score(y_test, yp_test),3)))

[1mConfusion Matrix[0m
[[592   0]
 [  0 108]]
[1mMetrics[0m
ACC: 1.0
BACC: 1.0
F1: 1.0
AUROC: 1.0
AP: 1.0
MCC: 1.0
Precision: 1.0
Recall: 1.0


In [96]:
RBFSVM_pbno32_devset_res = [round(accuracy_score(y_test, yp_test), 3), round(balanced_accuracy_score(y_test, yp_test), 3),
                 round(f1_score(y_test, yp_test),3), round(roc_auc_score(y_test, ys_test),3),
                 round(average_precision_score(y_test, ys_test),3), round(matthews_corrcoef(y_test, yp_test),3),
                 round(precision_score(y_test, yp_test),3), round(recall_score(y_test, yp_test),3)]

In [97]:
outF = open(fileout, "a")
outF.write("RBF_SVM, ")
outF.write("ACC, BACC, F1, AUROC, Average_Precision, MCC, Precision, Recall\n")
outF.write('Cdcl2 DevSet, ')
outF.write(', '.join(map(str, RBFSVM_cdcl2_devset_res)))
outF.write('\n')
outF.write('Pbcl2 DevSet, ')
outF.write(', '.join(map(str, RBFSVM_pbcl2_devset_res)))
outF.write('\n')
outF.write('Pb(No3)2 DevSet, ')
outF.write(', '.join(map(str, RBFSVM_pbno32_devset_res)))
outF.write('\n')
outF.close()