In [1]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
# vedere DistanceMetric di sklearn per le misure di distanza
# vedere il parametro metric, che permette di definire una matrice di distanze

# Load the modelling dataset and drop the 'Unnamed: 0' column
# (presumably a row index saved by an earlier to_csv — verify).
db = pd.read_csv('dataset/db_modelli.csv').drop(columns = 'Unnamed: 0')
# Binarise the target: 1 when conc1_mean > 1, otherwise 0
db['conc1_mean'] = np.where(db['conc1_mean'].values > 1, 1, 0)

db = db.drop(columns = 'test_cas')
# DataFrame.info() prints its own report and returns None, so wrapping it in
# print() only appends a stray "None" line to the output.
db.info()

# Columns with object dtype, i.e. the candidates for ordinal encoding
lst = [col for col in db.columns if db[col].dtypes == 'object']

print(lst)

# Ordinal encoding: the column list is declared once and reused
ordinal_cols = ['species', 'conc1_type', 'exposure_type', 'class', 'tax_order', 'family', 'genus']
encoder = OrdinalEncoder(dtype = int)

# fit_transform fits the encoder and encodes the same data in one call;
# the +1 shifts the integer codes so that they start at 1 instead of 0.
db[ordinal_cols] = encoder.fit_transform(db[ordinal_cols]) + 1

db.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23332 entries, 0 to 23331
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            23332 non-null  object 
 1   conc1_type         23332 non-null  object 
 2   exposure_type      23332 non-null  object 
 3   obs_duration_mean  23332 non-null  float64
 4   conc1_mean         23332 non-null  int32  
 5   atom_number        23332 non-null  float64
 6   alone_atom_number  23332 non-null  int64  
 7   bonds_number       23332 non-null  float64
 8   doubleBond         23332 non-null  int64  
 9   tripleBond         23332 non-null  int64  
 10  ring_number        23332 non-null  float64
 11  Mol                23332 non-null  float64
 12  MorganDensity      23332 non-null  float64
 13  LogP               23332 non-null  float64
 14  class              23332 non-null  object 
 15  tax_order          23332 non-null  object 
 16  family             233

Unnamed: 0,species,conc1_type,exposure_type,obs_duration_mean,conc1_mean,atom_number,alone_atom_number,bonds_number,doubleBond,tripleBond,ring_number,Mol,MorganDensity,LogP,class,tax_order,family,genus,oh_count
0,379,3,3,48.0,1,0.317908,2,0.488106,1,0,1.0,0.535725,1.3,2.2482,1,10,35,80,0
1,379,3,3,96.0,1,0.317908,2,0.488106,1,0,1.0,0.535725,1.3,2.2482,1,10,35,80,0
2,379,3,9,96.0,1,0.317908,2,0.488106,1,0,1.0,0.510371,1.3,1.177,1,10,35,80,0
3,218,3,1,48.0,1,0.317908,2,0.488106,1,0,1.0,0.510371,1.3,1.177,1,10,35,134,0
4,237,1,9,24.0,1,0.317908,2,0.488106,1,0,1.0,0.510371,1.3,1.177,1,6,2,182,0


# KNN Semplice
#### Con Ordinal Encoder

In [2]:
# Feature matrix and binary target as plain NumPy arrays
X = db.drop(columns='conc1_mean').values
y = db['conc1_mean'].values

# Hold out 33% of the rows for testing; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [14]:
# Baseline k-NN with k = 5 on the ordinal-encoded features.
# fit() returns the estimator itself, so construction and fitting can be chained.
neigh = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train.ravel())
y_pred = neigh.predict(X_test)


In [15]:
# Test-set accuracy of the baseline k-NN. Arguments follow the documented
# (y_true, y_pred) order; accuracy is symmetric, so the value is unchanged.
accuracy_score(y_test, y_pred)

0.7322077922077922

# KNN con matrice distanza

Per prima cosa occorre costruire la matrice di distanza. L'idea è di costruire 4 distanze:

1) Euclidea per variabili continue

2) Hamming per variabili categoriche

3) Hamming basato su pubchem2d

4) Tanimoto basato su Fingerprint

Per le prime due distanze si riprende il lavoro fatto in precedenza: si dividono le variabili tra categoriche e non categoriche, si fa lo split tra train e test, si salva la lunghezza del train e poi si riuniscono i dati train con i dati test. Questo perché alla fine la matrice di distanze verrà nuovamente suddivisa in base alla lunghezza del train. Tramite **scipy** si calcolano le distanze a coppie con **pdist** (metrica *euclidea* per le variabili continue, metrica *hamming* per le variabili categoriche trasformate in _ordinali numeriche_) e le si converte in matrice quadrata con **squareform**. Si tiene conto anche di *alpha*, un parametro che moltiplica la distanza di Hamming e permette quindi di diminuirne o aumentarne l'importanza rispetto a quella euclidea. Una volta ottenuta la matrice di distanze e suddivisa tra train e test, impostando nel KNeighborsClassifier il parametro *metric = 'precomputed'* è possibile addestrare il modello sulla matrice train e valutare le performance sulla matrice di distanze test.

Per prima cosa, quindi, costruiamo la matrice di distanze.

In [6]:
# Matrice di distanze 1+2 (euclidea + hamming per categoriche)
import numpy as np
from scipy.spatial.distance import hamming
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform

def new_distance_matrix(X, len_X_train, cat_features = [], num_features = [], alpha = 1):
    '''Build a combined pairwise distance matrix and split it into train/test parts.

    The distance between two rows is
    ``alpha * hamming(categorical columns) + euclidean(numerical columns)``.

    Parameters
    ----------
    X : pandas.DataFrame
        All samples, training rows first and test rows after them. Columns are
        selected by label, so a plain NumPy array is not supported.
    len_X_train : int
        Number of leading rows of ``X`` that belong to the training set.
    cat_features : list of column labels
        Columns compared with the Hamming distance. An empty list contributes
        nothing to the result.
    num_features : list of column labels
        Columns compared with the Euclidean distance. An empty list contributes
        nothing to the result.
    alpha : float
        Weight applied to the Hamming term.

    Returns
    -------
    dist_matr_train : numpy.ndarray, shape (len_X_train, len_X_train)
        Distances between training rows.
    dist_matr_test : numpy.ndarray, shape (n_samples - len_X_train, len_X_train)
        Distances from every test row to every training row.
    '''
    # Copy the selectors so the (shared) default lists are never handed out
    cat_cols = list(cat_features)
    num_cols = list(num_features)
    n_samples = len(X)

    # Accumulate in condensed form (one value per unordered pair) so that only
    # a single full n x n matrix is materialised at the end.
    condensed = np.zeros(n_samples * (n_samples - 1) // 2)

    print('Inizio Hamming per variabili categoriche')
    if cat_cols:
        condensed += alpha * pdist(X[cat_cols], metric = "hamming")

    print('Fine Hamming... inizio Euclidean per variabili continue')
    if num_cols:
        condensed += pdist(X[num_cols], metric = "euclidean")

    print('Fine')

    dist_matr = squareform(condensed)

    # Train block: train rows vs train rows. Test block: test rows vs train rows.
    dist_matr_train = dist_matr[:len_X_train, :len_X_train]
    dist_matr_test = dist_matr[len_X_train:, :len_X_train]

    return dist_matr_train, dist_matr_test

In [26]:
# Show the column index and the (rows, columns) shape, one per line
print(db.columns, db.shape, sep='\n')

Index(['species', 'conc1_type', 'exposure_type', 'obs_duration_mean',
       'conc1_mean', 'atom_number', 'alone_atom_number', 'bonds_number',
       'doubleBond', 'tripleBond', 'ring_number', 'Mol', 'MorganDensity',
       'LogP', 'class', 'tax_order', 'family', 'genus', 'oh_count'],
      dtype='object')
(23332, 19)


In [3]:
# Besides the columns listed here there is conc1_mean, i.e. the target variable.

# Columns handled as categorical
categorical = [
    "ring_number",
    "exposure_type",
    "conc1_type",
    "species",
    "tripleBond",
    "obs_duration_mean",
    "doubleBond",
    "alone_atom_number",
    "class",
    "tax_order",
    "family",
    "genus",
    "oh_count",
]

# Columns handled as continuous
non_categorical = ["atom_number", "bonds_number", "Mol", "MorganDensity", "LogP"]

# Total number of feature columns across the two groups
len(categorical + non_categorical)

18

In [4]:
# Split the full dataset into features and target
X = db.drop(columns = 'conc1_mean')
y = db['conc1_mean'].values

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Stack train on top of test again. DataFrame.append was removed in pandas 2.0;
# pd.concat gives the same result (rows in order, original index labels kept).
X_try = pd.concat([X_train, X_test])

# Keep track of the training-set length, needed later to split the distance matrix
len_X_train = len(X_train)
print(len_X_train)
# >>> 15632

15632


In [10]:
# X_train_new, X_test_new = new_distance_matrix(X_try, len_X_train, categorical,non_categorical, alpha = 1)

In [None]:
# Distance matrix no. 2: Hamming distance over the categorical columns
hamming_cat = squareform(
    pdist(X_try.loc[:, categorical], metric="hamming")
)
hamming_cat

In [7]:
# Distance matrix no. 1: Euclidean distance over the continuous columns
euc = squareform(
    pdist(X_try.loc[:, non_categorical], metric="euclidean")
)
euc

array([[ 0.        ,  5.77211186,  4.56015449, ...,  1.34194013,
        15.03736938,  6.51247651],
       [ 5.77211186,  0.        ,  1.30280288, ...,  7.0992439 ,
         9.31067809,  0.88434375],
       [ 4.56015449,  1.30280288,  0.        , ...,  5.87023513,
        10.479009  ,  2.15486325],
       ...,
       [ 1.34194013,  7.0992439 ,  5.87023513, ...,  0.        ,
        16.34179219,  7.84441797],
       [15.03736938,  9.31067809, 10.479009  , ..., 16.34179219,
         0.        ,  8.6996705 ],
       [ 6.51247651,  0.88434375,  2.15486325, ...,  7.84441797,
         8.6996705 ,  0.        ]])

In [8]:
# Sanity check: squareform yields a square matrix with one row/column per row of X_try
euc.shape

(23332, 23332)

In [9]:
# Wrap the NumPy distance matrix in a DataFrame (default integer row/column labels)
euc_db = pd.DataFrame(euc)
euc_db

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,23322,23323,23324,23325,23326,23327,23328,23329,23330,23331
0,0.000000,5.772112,4.560154,3.868839,5.050017,5.051815,1.584681,1.943110,1.111838,1.180418,...,12.610934,1.461687,3.304772,0.611675,1.042476,2.829352,1.544289,1.341940,15.037369,6.512477
1,5.772112,0.000000,1.302803,1.912996,0.735711,0.733783,7.353579,3.845464,4.748032,4.628525,...,6.869061,4.318475,2.501444,6.271750,6.738620,8.586771,4.260352,7.099244,9.310678,0.884344
2,4.560154,1.302803,0.000000,0.726216,0.633033,0.643671,6.135221,2.677859,3.589771,3.443019,...,8.059977,3.102620,1.267974,5.090484,5.494730,7.359495,3.095772,5.870235,10.479009,2.154863
3,3.868839,1.912996,0.726216,0.000000,1.193438,1.197417,5.449440,1.965266,2.877159,2.740783,...,8.747225,2.411600,0.602435,4.385763,4.826741,6.681333,2.383075,5.191243,11.177508,2.709105
4,5.050017,0.735711,0.633033,1.193438,0.000000,0.057467,6.629783,3.126374,4.030342,3.906717,...,7.588090,3.598167,1.789987,5.553828,6.014485,7.860392,3.542538,6.374385,10.020947,1.540924
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23327,2.829352,8.586771,7.359495,6.681333,7.860392,7.861352,1.253547,4.755764,3.888827,3.971487,...,15.416444,4.278815,6.113401,2.396881,1.895830,0.000000,4.351877,1.502294,17.829909,9.322550
23328,1.544289,4.260352,3.095772,2.383075,3.542538,3.543016,3.113666,0.418958,0.505136,0.402646,...,11.120518,0.393788,1.860618,2.016017,2.563682,4.351877,0.000000,2.878837,13.557027,4.979145
23329,1.341940,7.099244,5.870235,5.191243,6.374385,6.375710,0.305786,3.277488,2.429155,2.502741,...,13.925837,2.786129,4.620009,1.007256,0.446962,1.502294,2.878837,0.000000,16.341792,7.844418
23330,15.037369,9.310678,10.479009,11.177508,10.020947,10.021384,16.611248,13.139341,14.047832,13.914466,...,2.491646,13.578000,11.734647,15.562096,15.954827,17.829909,13.557027,16.341792,0.000000,8.699670


In [None]:
# Persist the Euclidean distance matrix as CSV in the current working directory.
# NOTE(review): the matrix is 23332 x 23332 (see the shape printed earlier), so the
# text file will be very large — consider a binary format if this becomes a bottleneck.
euc_db.to_csv('distanza_euclidea.csv')