categorical -> one hot

numerical -> [0,1]

pubchem -> one hot

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

# carico i dati
db = pd.read_csv('data/lc_db_processed.csv').drop(columns = 'Unnamed: 0')

# trasformo i pubchem in dataset
pub = pd.DataFrame(pd.DataFrame(db['pubchem2d'].values).apply(lambda x: x.str.replace('', ' ').str.strip().str.split(' '),
                                                        axis = 1)[0].to_list(),
                   columns = ['pub'+ str(i) for i in range(1,882)])

# tolgo dal dataset info inutili
db.drop(columns = ['test_cas', 'smiles', 'pubchem2d'], inplace = True)

# unisco al dataset i pub
db = pd.concat([db,pub], axis = 1)

# one hot encoding di categoriche
categorical = ['obs_duration_mean', 'conc1_type', 'exposure_type', 'control_type', 'media_type',
               'application_freq_unit', 'species', 'class', 'tax_order', 'family', 'genus',
#               aggiunte perchè secondo me sono categoriche
              'alone_atom_number', 'doubleBond', 'tripleBond', 'ring_number', 'oh_count']

ohe = OneHotEncoder(sparse = False)
ohe.fit(db[categorical])
cat_df = pd.DataFrame(ohe.transform(db[categorical]), columns = ohe.get_feature_names(categorical))
# unisco al dataset (senza variabili categoriche) le categoriche-one-hot
finaldb = pd.concat([db.drop(columns =categorical), cat_df], axis = 1)
finaldb

#trasformo in variabile dicotomica la target 
finaldb['conc1_mean'] = np.where(finaldb['conc1_mean'].values > 1, 0, 1)

# mi preparo
X = finaldb.drop(columns = 'conc1_mean')
y = finaldb['conc1_mean'].values

In [26]:
# splitto
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, stratify = y, random_state = 42)

In [27]:
# riscalaggio numeriche solo su training set
numerical = ['atom_number', 'bonds_number','Mol', 'MorganDensity', 'LogP']

minmax = MinMaxScaler()
minmax.fit(X_train[numerical])
new_train = X_train.copy()
new_train.loc[:, numerical] = minmax.transform(X_train[numerical])

In [30]:
# applico il modello
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import recall_score, precision_score, accuracy_score, f1_score
from sklearn.metrics import confusion_matrix

knn = KNeighborsClassifier(n_neighbors = 1)
knn.fit(new_train, y_train)
y_pred = knn.predict(X_test)

tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Recall: ', recall_score(y_test, y_pred))
print('F1: ', f1_score(y_test, y_pred))
print('Specificity:', tn/(tn+fp))

Accuracy:  0.8900677200902934
Recall:  0.8623361144219309
F1:  0.8559597752144336
Specificity: 0.9069767441860465


In [46]:
from sklearn.model_selection import KFold

numerical = ['atom_number', 'bonds_number','Mol', 'MorganDensity', 'LogP']

kf = KFold(n_splits=5, shuffle=True, random_state = 5645)
accs = []
sens = []
specs = []
for train_index, test_index in kf.split(X):
    X_train = X.iloc[train_index]
    X_test = X.iloc[test_index]
    y_train = y[train_index]
    y_test = y[test_index]
    
    minmax = MinMaxScaler()
    minmax.fit(X_train[numerical])
    new_train = X_train.copy()
    new_train.loc[:, numerical] = minmax.transform(X_train[numerical])
    
    knn = KNeighborsClassifier(n_neighbors = 1, n_jobs = -1)
    knn.fit(new_train, y_train)
    y_pred = knn.predict(X_test)

    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    
    accs.append(accuracy_score(y_test, y_pred))
    sens.append(recall_score(y_test, y_pred))
    specs.append(tn/(tn+fp))
    
    
print(np.mean(accs))
print(np.mean(sens))
print(np.mean(specs))

0.904380676294321
0.8653861052689606
0.9281564396781465


In [48]:
from scipy.stats import sem

print(sem(accs))
print(sem(sens))
print(sem(specs))

0.0009362106719225122
0.001983691300450497
0.0012885390904627343
