# Data initialization

## Install necessary libraries

In [9]:
!pip install deepchem==2.6.1
!pip install rdkit-pypi

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Import all libraries

In [23]:
import deepchem as dc
import scipy
import numpy as np 
import pandas as pd
import seaborn as sn 
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score,KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split, ParameterGrid
from rdkit import Chem
from rdkit.Chem import Descriptors, Draw, PandasTools, AllChem
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator as MolDCal
import matplotlib.colors
import matplotlib.pyplot as plt

In [11]:
a = np.array([34,36,38,47])
print(np.var(a))

24.6875


## Load dataset

In [12]:
# load your data
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving AID_2302_datatable_all.csv to AID_2302_datatable_all.csv
User uploaded file "AID_2302_datatable_all.csv" with length 1417796 bytes


In [13]:
df = pd.read_csv('AID_2302_datatable_all.csv')
df.head(10)

Unnamed: 0,PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_EXT_DATASOURCE_SMILES,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,PCT_INHIB_DD2
0,RESULT_TYPE,,,,,,,,FLOAT
1,RESULT_DESCR,,,,,,,,Percent inhibition of P. falciparum Dd2 growth.
2,RESULT_UNIT,,,,,,,,PERCENT
3,RESULT_ATTR_CONC_MICROMOL,,,,,,,,2
4,1,85857918.0,11957475.0,C1CN(CCN1CC(C(C2=CC=CC=C2)C3=CC=CC=C3)O)C4=CC(...,Inactive,,,,4
5,2,85857919.0,6618861.0,C1CN(CC=C1C2=CNC3=CC=CC=C32)CC4=CC=CC=C4,Active,,,,80
6,3,85857920.0,21924871.0,C1=CC(=CC=C1C2=CC3=C(N2)C=C(C=C3)C(=N)N)C(=N)N.Cl,Active,,,,99
7,4,85857921.0,1365597.0,C1=CC(=CC=C1C2=NC3=C(N2)C=C(C=C3)C4=CC5=C(C=C4...,Active,,,,96
8,5,85857922.0,10115148.0,C1=CC(=CC=C1C2=NC3=C(N2)C=C(C=C3)C4=CC5=C(C=C4...,Active,,,,99
9,6,85857923.0,2540886.0,C1=CC=NC(=C1)CCNC2=C3C=CC(=CC3=NC=C2)Cl,Active,,,,72


# Data preprocessing

## Clean and wash dataset

In [14]:
cleaned_df = df.loc[4:, ['PUBCHEM_EXT_DATASOURCE_SMILES', 'PUBCHEM_ACTIVITY_OUTCOME']].reset_index(drop=True)
cleaned_df.head()

Unnamed: 0,PUBCHEM_EXT_DATASOURCE_SMILES,PUBCHEM_ACTIVITY_OUTCOME
0,C1CN(CCN1CC(C(C2=CC=CC=C2)C3=CC=CC=C3)O)C4=CC(...,Inactive
1,C1CN(CC=C1C2=CNC3=CC=CC=C32)CC4=CC=CC=C4,Active
2,C1=CC(=CC=C1C2=CC3=C(N2)C=C(C=C3)C(=N)N)C(=N)N.Cl,Active
3,C1=CC(=CC=C1C2=NC3=C(N2)C=C(C=C3)C4=CC5=C(C=C4...,Active
4,C1=CC(=CC=C1C2=NC3=C(N2)C=C(C=C3)C4=CC5=C(C=C4...,Active


In [15]:
cleaned_df.drop_duplicates('PUBCHEM_EXT_DATASOURCE_SMILES', inplace=True)
cleaned_df.dropna(inplace=True)
cleaned_df.shape

(13456, 2)

In [16]:
smiles_lst = cleaned_df['PUBCHEM_EXT_DATASOURCE_SMILES'].tolist()
activity = cleaned_df['PUBCHEM_ACTIVITY_OUTCOME'].tolist()

cleaned_smiles = [sorted(smile.split('.'), key=lambda e: len(e), reverse=True)[0] for smile in smiles_lst] # washing SMILES
labels = [1 if a=='Active' else 0 for a in activity]  # transfroms activity to binary

In [17]:
cleaned_df['PUBCHEM_EXT_DATASOURCE_SMILES'] = cleaned_smiles
cleaned_df['PUBCHEM_ACTIVITY_OUTCOME'] = labels
cleaned_df.head()

Unnamed: 0,PUBCHEM_EXT_DATASOURCE_SMILES,PUBCHEM_ACTIVITY_OUTCOME
0,C1CN(CCN1CC(C(C2=CC=CC=C2)C3=CC=CC=C3)O)C4=CC(...,0
1,C1CN(CC=C1C2=CNC3=CC=CC=C32)CC4=CC=CC=C4,1
2,C1=CC(=CC=C1C2=CC3=C(N2)C=C(C=C3)C(=N)N)C(=N)N,1
3,C1=CC(=CC=C1C2=NC3=C(N2)C=C(C=C3)C4=CC5=C(C=C4...,1
4,C1=CC(=CC=C1C2=NC3=C(N2)C=C(C=C3)C4=CC5=C(C=C4...,1


## Generate QSAR dataset

In [18]:
PandasTools.AddMoleculeColumnToFrame(cleaned_df,'PUBCHEM_EXT_DATASOURCE_SMILES','molecule')
mol_list = cleaned_df['molecule'].tolist()

descriptors_list = [x[0] for x in Descriptors._descList]
calc = MolDCal(descriptors_list)

desc = [[]]*len(mol_list)
for i in range(len(mol_list)):
  desc[i] = calc.CalcDescriptors(mol_list[i])

df_desc = pd.DataFrame(desc, columns = descriptors_list)

In [19]:
df1 = pd.concat(
    [
        cleaned_df,
     df_desc
    ], axis = 1, join='inner'
)
df1.head()

Unnamed: 0,PUBCHEM_EXT_DATASOURCE_SMILES,PUBCHEM_ACTIVITY_OUTCOME,molecule,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,C1CN(CCN1CC(C(C2=CC=CC=C2)C3=CC=CC=C3)O)C4=CC(...,0,<rdkit.Chem.rdchem.Mol object at 0x7f397465e3f0>,11.21467,-0.46063,11.21467,0.024667,0.644772,406.957,379.741,...,0,0,0,0,0,0,0,0,0,0
1,C1CN(CC=C1C2=CNC3=CC=CC=C32)CC4=CC=CC=C4,1,<rdkit.Chem.rdchem.Mol object at 0x7f397465e450>,3.382067,1.031483,3.382067,1.031483,0.753755,288.394,268.234,...,0,0,0,0,0,0,0,0,0,0
2,C1=CC(=CC=C1C2=CC3=C(N2)C=C(C=C3)C(=N)N)C(=N)N,1,<rdkit.Chem.rdchem.Mol object at 0x7f397465e510>,7.477208,0.054501,7.477208,0.054501,0.373396,277.331,262.211,...,0,0,0,0,0,0,0,0,0,0
3,C1=CC(=CC=C1C2=NC3=C(N2)C=C(C=C3)C4=CC5=C(C=C4...,1,<rdkit.Chem.rdchem.Mol object at 0x7f397465e630>,5.801419,0.736943,5.801419,0.736943,0.281312,416.488,396.328,...,0,0,0,0,0,0,0,0,0,0
4,C1=CC(=CC=C1C2=NC3=C(N2)C=C(C=C3)C4=CC5=C(C=C4...,1,<rdkit.Chem.rdchem.Mol object at 0x7f397465e690>,6.007486,0.708857,6.007486,0.708857,0.286709,455.348,439.22,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# remove any rows with NaN if any 
df1 = df1.dropna(axis=0)

# check if dataframe contains any NaN value
print("There are {} Nan values in the dataset.".format(df1.isna().sum().sum()))

There are 0 Nan values in the dataset.


In [21]:
df1.shape

(13379, 211)

In [None]:
desc_val = df1.iloc[:,3:].to_numpy()
desc_val.shape

In [None]:
label_desc = df1['PUBCHEM_ACTIVITY_OUTCOME'].to_numpy()

In [None]:
tasks = ['label']
data_desc = dc.data.DiskDataset.from_numpy(X=desc_val, y=label_desc, ids=df1['PUBCHEM_EXT_DATASOURCE_SMILES'].to_numpy(), tasks = tasks)

In [51]:
data_desc

<DiskDataset X.shape: (13379, 208), y.shape: (13379,), w.shape: (13379,), task_names: ['label']>

In [52]:
splitter = dc.splits.RandomSplitter()
data_fold_desc = splitter.k_fold_split(data_desc, k=10)

## Generate ECFP dataset

In [53]:
morgan_fing_list = np.array([AllChem.GetMorganFingerprintAsBitVect(m,2,nBits=1024) for m in cleaned_df['molecule']])

In [54]:
tasks = ['label']
data_ecfp = dc.data.DiskDataset.from_numpy(X=morgan_fing_list, y=labels, ids=np.array(cleaned_smiles), tasks = tasks)

In [55]:
data_ecfp

<DiskDataset X.shape: (13456, 1024), y.shape: (13456,), w.shape: (13456,), task_names: ['label']>

In [56]:
data_fold_ecfp = splitter.k_fold_split(data_ecfp, k=10)

# Random Forest model

## Hyperparameter optimization

In [None]:
x = morgan_fing_list
y = cleaned_df['PUBCHEM_ACTIVITY_OUTCOME']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1)

In [None]:
param = ParameterGrid(
    {'n_estimators': np.arange(50, 201, 50),
         'criterion': ['gini', 'entropy'],
         'max_depth': np.arange(5, 21, 5)
         }
)
acc = 0
for par in param:
  crit, maxd, nest = par['criterion'], par['max_depth'], par['n_estimators']
  rf1 = RandomForestClassifier(criterion=crit, max_depth=maxd, n_estimators=nest, random_state=0)
  rf1.fit(X_train,y_train)
  y_pred1 = rf1.predict(X_test)
  acc1 = metrics.accuracy_score(y_test, y_pred1)
  if acc1>acc:
    acc=acc1
    par1=par
print('The best combination is ', par1)
print("The accuracy is :",acc)

The best combination is  {'criterion': 'gini', 'max_depth': 20, 'n_estimators': 50}
The accuracy is : 0.7184249628528975


In [None]:
x = df1.iloc[:,3:]
y = df1['PUBCHEM_ACTIVITY_OUTCOME']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1)

In [None]:
acc = 0
for par in param:
  crit, maxd, nest = par['criterion'], par['max_depth'], par['n_estimators']
  rf1 = RandomForestClassifier(criterion=crit, max_depth=maxd, n_estimators=nest, random_state=0)
  rf1.fit(X_train,y_train)
  y_pred1 = rf1.predict(X_test)
  acc1 = metrics.accuracy_score(y_test, y_pred1)
  if acc1>acc:
    acc=acc1
    par1=par
print('The best combination is ', par1)
print("The accuracy is :",acc)

The best combination is  {'criterion': 'entropy', 'max_depth': 15, 'n_estimators': 200}
The accuracy is : 0.6046337817638266


## Model using QSAR descriptors

In [62]:
mcc_qsar = []
acc_qsar = []
bal_acc_qsar = []
prec_qsar = []
rec_qsar = []
roc_auc_qsar = []

for i in range(len(data_fold_desc)):
  rf_qsar = RandomForestClassifier(criterion = 'entropy', max_depth = 15, n_estimators = 200)
  rf_qsar.fit(data_fold_desc[i][0].X, data_fold_desc[i][0].y)
  y_pred = rf_qsar.predict(data_fold_desc[i][1].X)
  prob = rf_qsar.predict_proba(data_fold_desc[i][1].X)
  mcc_qsar.append(dc.metrics.matthews_corrcoef(data_fold_desc[i][1].y, y_pred))
  acc_qsar.append(dc.metrics.accuracy_score(data_fold_desc[i][1].y, y_pred))
  bal_acc_qsar.append(dc.metrics.balanced_accuracy_score(data_fold_desc[i][1].y, y_pred))
  prec_qsar.append(dc.metrics.precision_score(data_fold_desc[i][1].y, y_pred))
  rec_qsar.append(dc.metrics.recall_score(data_fold_desc[i][1].y, y_pred))
  roc_auc_qsar.append(dc.metrics.roc_auc_score(data_fold_desc[i][1].y, prob[:,1]))

  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.


### Results

In [64]:
print('Accuracy scores = ',acc_qsar)
print('Balanced Accuracy scores = ',bal_acc_qsar)
print('Precision scores = ',prec_qsar)
print('Recall scores = ',rec_qsar)
print('ROC AUC scores = ',roc_auc_qsar)
print('MCC scores = ',mcc_qsar)

Accuracy scores =  [0.6080777860882572, 0.6068759342301944, 0.6315396113602392, 0.6128550074738416, 0.5881913303437967, 0.6165919282511211, 0.594170403587444, 0.609118086696562, 0.5904334828101644, 0.6038863976083707]
Balanced Accuracy scores =  [0.554129742453614, 0.5423002481849163, 0.5733408690990875, 0.5586522972101591, 0.5350599907706507, 0.5508476941973425, 0.5410670410670411, 0.5608278894960007, 0.5353343954910484, 0.5526534938299644]
Precision scores =  [0.6191780821917808, 0.617338003502627, 0.6455223880597015, 0.6262719703977798, 0.6100278551532033, 0.6284444444444445, 0.6046511627906976, 0.6087344028520499, 0.6066907775768535, 0.611764705882353]
Recall scores =  [0.8636942675159236, 0.8879093198992444, 0.8596273291925466, 0.8558786346396966, 0.833756345177665, 0.8815461346633416, 0.87001287001287, 0.8904823989569752, 0.8558673469387755, 0.87001287001287]
ROC AUC scores =  [0.6056090187390382, 0.5925345421543933, 0.6179972731404332, 0.6091564839360539, 0.589730041532072, 0.62

In [66]:
results_qsar = pd.DataFrame({'Metric': ['Accuracy', 'Balanced Accuracy', 'Precision', 'Recall', 'ROC AUC', 'MCC'], 
                        'Mean Value':[np.array(acc_qsar).mean(), np.array(bal_acc_qsar).mean(), np.array(prec_qsar).mean(),
                                      np.array(rec_qsar).mean(), np.array(roc_auc_qsar).mean(), np.array(mcc_qsar).mean()],
                        'Max Variation': [np.array(acc_qsar).max()-np.array(acc_qsar).mean(),
                                      np.array(bal_acc_qsar).max()-np.array(bal_acc_qsar).mean(),
                                      np.array(prec_qsar).max()-np.array(prec_qsar).mean(),
                                      np.array(rec_qsar).max()-np.array(rec_qsar).mean(),
                                      np.array(roc_auc_qsar).max()-np.array(roc_auc_qsar).mean(),
                                      np.array(mcc_qsar).max()-np.array(mcc_qsar).mean()],
                        'Min Variation': [np.array(acc_qsar).min()-np.array(acc_qsar).mean(),
                                      np.array(bal_acc_qsar).min()-np.array(bal_acc_qsar).mean(),
                                      np.array(prec_qsar).min()-np.array(prec_qsar).mean(),
                                      np.array(rec_qsar).min()-np.array(rec_qsar).mean(),
                                      np.array(roc_auc_qsar).min()-np.array(roc_auc_qsar).mean(),
                                      np.array(mcc_qsar).min()-np.array(mcc_qsar).mean()]})

In [67]:
results_qsar

Unnamed: 0,Metric,Mean Value,Max Variation,Min Variation
0,Accuracy,0.606174,0.025366,-0.017983
1,Balanced Accuracy,0.550421,0.02292,-0.015361
2,Precision,0.617862,0.02766,-0.013211
3,Recall,0.866879,0.023604,-0.033122
4,ROC AUC,0.609231,0.031216,-0.020053
5,MCC,0.13074,0.049188,-0.043672


## Model using morgan fingerprints

In [68]:
mcc_ecfp = []
acc_ecfp = []
bal_acc_ecfp = []
prec_ecfp = []
rec_ecfp = []
roc_auc_ecfp = []

for i in range(len(data_fold_ecfp)):
  rf_ecfp = RandomForestClassifier(criterion = 'gini', max_depth = 20, n_estimators = 50)
  rf_ecfp.fit(data_fold_ecfp[i][0].X, data_fold_ecfp[i][0].y)
  y_pred = rf_ecfp.predict(data_fold_ecfp[i][1].X)
  prob = rf_ecfp.predict_proba(data_fold_ecfp[i][1].X)
  mcc_ecfp.append(dc.metrics.matthews_corrcoef(data_fold_ecfp[i][1].y, y_pred))
  acc_ecfp.append(dc.metrics.accuracy_score(data_fold_ecfp[i][1].y, y_pred))
  bal_acc_ecfp.append(dc.metrics.balanced_accuracy_score(data_fold_ecfp[i][1].y, y_pred))
  prec_ecfp.append(dc.metrics.precision_score(data_fold_ecfp[i][1].y, y_pred))
  rec_ecfp.append(dc.metrics.recall_score(data_fold_ecfp[i][1].y, y_pred))
  roc_auc_ecfp.append(dc.metrics.roc_auc_score(data_fold_ecfp[i][1].y, prob[:,1]))

  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.


### Results

In [69]:
print('Accuracy scores = ',acc_ecfp)
print('Balanced Accuracy scores = ',bal_acc_ecfp)
print('Precision scores = ',prec_ecfp)
print('Recall scores = ',rec_ecfp)
print('ROC AUC scores = ',roc_auc_ecfp)
print('MCC scores = ',mcc_ecfp)

Accuracy scores =  [0.7241635687732342, 0.7122676579925651, 0.7100371747211895, 0.7063197026022305, 0.7072808320950966, 0.7206537890044576, 0.7392273402674592, 0.7228826151560178, 0.7206537890044576, 0.7317979197622585]
Balanced Accuracy scores =  [0.6857140575152779, 0.6680439376291911, 0.6828533214845135, 0.6709944350789422, 0.6638215054941043, 0.6773526322894478, 0.699972511424939, 0.6853860779069556, 0.6754021127019765, 0.6933020580077389]
Precision scores =  [0.7079207920792079, 0.7059980334316618, 0.6812749003984063, 0.6883629191321499, 0.6926782273603083, 0.7055393586005831, 0.7245508982035929, 0.705242334322453, 0.7109375, 0.7157057654075547]
Recall scores =  [0.9039190897597977, 0.890818858560794, 0.9071618037135278, 0.8983268983268984, 0.9055415617128464, 0.9086357947434293, 0.9063670411985019, 0.9048223350253807, 0.900990099009901, 0.9056603773584906]
ROC AUC scores =  [0.7877635128042464, 0.7884051432438529, 0.7942445704129584, 0.7787105515978755, 0.7878358522250211, 0.8046

In [70]:
results_ecfp = pd.DataFrame({'Metric': ['Accuracy', 'Balanced Accuracy', 'Precision', 'Recall', 'ROC AUC', 'MCC'], 
                        'Mean Value':[np.array(acc_ecfp).mean(), np.array(bal_acc_ecfp).mean(), np.array(prec_ecfp).mean(),
                                      np.array(rec_ecfp).mean(), np.array(roc_auc_ecfp).mean(), np.array(mcc_ecfp).mean()],
                        'Max Variation': [np.array(acc_ecfp).max()-np.array(acc_ecfp).mean(),
                                      np.array(bal_acc_ecfp).max()-np.array(bal_acc_ecfp).mean(),
                                      np.array(prec_ecfp).max()-np.array(prec_ecfp).mean(),
                                      np.array(rec_ecfp).max()-np.array(rec_ecfp).mean(),
                                      np.array(roc_auc_ecfp).max()-np.array(roc_auc_ecfp).mean(),
                                      np.array(mcc_ecfp).max()-np.array(mcc_ecfp).mean()],
                        'Min Variation': [np.array(acc_ecfp).min()-np.array(acc_ecfp).mean(),
                                      np.array(bal_acc_ecfp).min()-np.array(bal_acc_ecfp).mean(),
                                      np.array(prec_ecfp).min()-np.array(prec_ecfp).mean(),
                                      np.array(rec_ecfp).min()-np.array(rec_ecfp).mean(),
                                      np.array(roc_auc_ecfp).min()-np.array(roc_auc_ecfp).mean(),
                                      np.array(mcc_ecfp).min()-np.array(mcc_ecfp).mean()]})

In [71]:
results_ecfp

Unnamed: 0,Metric,Mean Value,Max Variation,Min Variation
0,Accuracy,0.719528,0.019699,-0.013209
1,Balanced Accuracy,0.680284,0.019688,-0.016463
2,Precision,0.703821,0.02073,-0.022546
3,Recall,0.903224,0.005411,-0.012406
4,ROC AUC,0.796629,0.017701,-0.017919
5,MCC,0.41227,0.037823,-0.028724


# Logistic regression model with ECFP

## Model

In [72]:
mcc_lr = []
acc_lr = []
bal_acc_lr = []
prec_lr = []
rec_lr = []
roc_auc_lr = []

for i in range(len(data_fold_ecfp)):
  lr = LogisticRegression()
  lr.fit(data_fold_ecfp[i][0].X, data_fold_ecfp[i][0].y)
  y_pred = lr.predict(data_fold_ecfp[i][1].X)
  prob = lr.predict_proba(data_fold_ecfp[i][1].X)
  mcc_lr.append(dc.metrics.matthews_corrcoef(data_fold_ecfp[i][1].y, y_pred))
  acc_lr.append(dc.metrics.accuracy_score(data_fold_ecfp[i][1].y, y_pred))
  bal_acc_lr.append(dc.metrics.balanced_accuracy_score(data_fold_ecfp[i][1].y, y_pred))
  prec_lr.append(dc.metrics.precision_score(data_fold_ecfp[i][1].y, y_pred))
  rec_lr.append(dc.metrics.recall_score(data_fold_ecfp[i][1].y, y_pred))
  roc_auc_lr.append(dc.metrics.roc_auc_score(data_fold_ecfp[i][1].y, prob[:,1]))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
  

## Results

In [73]:
print('Accuracy scores = ',acc_lr)
print('Balanced Accuracy scores = ',bal_acc_lr)
print('Precision scores = ',prec_lr)
print('Recall scores = ',rec_lr)
print('ROC AUC scores = ',roc_auc_lr)
print('MCC scores = ',mcc_lr)

Accuracy scores =  [0.6840148698884758, 0.6795539033457249, 0.6795539033457249, 0.6951672862453532, 0.6924219910846954, 0.700594353640416, 0.712481426448737, 0.7057949479940565, 0.687221396731055, 0.6708766716196136]
Balanced Accuracy scores =  [0.6664529202627028, 0.657649953732903, 0.6653583594770361, 0.6807579712509291, 0.6713804621618662, 0.6809254255204746, 0.6939204434823443, 0.6877922420537452, 0.6664995031101624, 0.6553767307011836]
Precision scores =  [0.7163120567375887, 0.7172653534183082, 0.6884480746791132, 0.7197604790419162, 0.7178899082568807, 0.7302325581395349, 0.7423887587822015, 0.7284382284382285, 0.7257876312718786, 0.7130750605326877]
Recall scores =  [0.7661188369152971, 0.7679900744416873, 0.7824933687002652, 0.7734877734877735, 0.7884130982367759, 0.785982478097622, 0.7915106117353309, 0.7931472081218274, 0.7698019801980198, 0.740880503144654]
ROC AUC scores =  [0.7331121324284482, 0.7255037589138971, 0.7343429515230671, 0.7396767995359546, 0.7339922607965539,

In [74]:
results_lr = pd.DataFrame({'Metric': ['Accuracy', 'Balanced Accuracy', 'Precision', 'Recall', 'ROC AUC', 'MCC'], 
                        'Mean Value':[np.array(acc_lr).mean(), np.array(bal_acc_lr).mean(), np.array(prec_lr).mean(),
                                      np.array(rec_lr).mean(), np.array(roc_auc_lr).mean(), np.array(mcc_lr).mean()],
                        'Max Variation': [np.array(acc_lr).max()-np.array(acc_lr).mean(),
                                      np.array(bal_acc_lr).max()-np.array(bal_acc_lr).mean(),
                                      np.array(prec_lr).max()-np.array(prec_lr).mean(),
                                      np.array(rec_lr).max()-np.array(rec_lr).mean(),
                                      np.array(roc_auc_lr).max()-np.array(roc_auc_lr).mean(),
                                      np.array(mcc_lr).max()-np.array(mcc_lr).mean()],
                        'Min Variation': [np.array(acc_lr).min()-np.array(acc_lr).mean(),
                                      np.array(bal_acc_lr).min()-np.array(bal_acc_lr).mean(),
                                      np.array(prec_lr).min()-np.array(prec_lr).mean(),
                                      np.array(rec_lr).min()-np.array(rec_lr).mean(),
                                      np.array(roc_auc_lr).min()-np.array(roc_auc_lr).mean(),
                                      np.array(mcc_lr).min()-np.array(mcc_lr).mean()]})

In [75]:
results_lr

Unnamed: 0,Metric,Mean Value,Max Variation,Min Variation
0,Accuracy,0.690768,0.021713,-0.019891
1,Balanced Accuracy,0.672611,0.021309,-0.017235
2,Precision,0.71996,0.022429,-0.031512
3,Recall,0.775983,0.017165,-0.035102
4,ROC AUC,0.739332,0.02909,-0.024358
5,MCC,0.352698,0.042629,-0.038876
