In [2]:
#Ignorar warnings
import warnings
warnings.filterwarnings('ignore')

import numpy as np #operaciones matriciales y con vectores
import pandas as pd #tratamiento de datos
import matplotlib.pyplot as plt #gráficos
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_validate #método para evaluar varios particionamientos de C-V
from sklearn.model_selection import KFold

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.feature_selection import RFECV

from sklearn.preprocessing import StandardScaler
import math
import seaborn as sns

from scipy.stats import uniform

In [3]:
data = pd.read_csv('data_set_cleaned.csv', header = 0, names=None)
data.head()

Unnamed: 0,Case1_Control0,Edad,Genero,Cycle Time,Stance Percent,Stance Time,Step Length,Step Number,Step Time,Step Cadence,...,Step Timel,Step Cadencel,Stride Numberl,Stride Lengthl,Swing Percentl,Swing Timel,Distancel,Duration Timel,Speedl,Accelerationl
0,Positivo,79,M,0.803979,0.644524,1.327075,0.796338,2,0.335486,58.28075,...,0.312041,88.428185,1,0.185463,0.459888,0.624083,1.890646,1.357033,1.39322,1.026666
1,Positivo,79,M,0.826989,0.633677,1.326004,0.823259,2,0.352115,57.346196,...,0.429035,59.169468,1,1.198056,0.423096,0.85807,1.923441,2.028073,0.948408,0.46764
2,Positivo,79,M,0.789131,0.610778,1.219237,0.739892,2,0.359121,60.114133,...,0.405821,56.550877,1,1.059078,0.382493,0.811643,1.974094,2.121983,0.930306,0.438414
3,Positivo,68,M,1.193026,0.536687,1.263887,0.741654,2,0.476885,50.955888,...,0.435792,50.936203,1,1.085094,0.36996,0.871583,1.951583,2.355888,0.828385,0.351623
4,Positivo,68,M,1.045567,0.508093,1.061983,0.817604,2,0.446011,57.412502,...,0.358499,59.64247,1,1.189026,0.356362,0.716997,1.858186,2.011989,0.923557,0.459027


In [4]:
x = data.loc[: , ~data.columns.isin(['Case1_Control0'])]
x = pd.get_dummies(data=x)
x = x.values

y = data['Case1_Control0'].values

In [6]:
clf = LogisticRegression()

kf = KFold(n_splits=10, shuffle=True, random_state=1234)
acc_test_vec=[]
for indices_train, indices_test in kf.split(x):    
    #print("%s %s" % (x[indices_train], indices_test))        
    x_train = x[indices_train]
    x_test = x[indices_test]
    clf.fit(x_train,y[indices_train])
    y_pred = clf.predict(x_test)
    acc_test_vec.append(metrics.accuracy_score(y[indices_test], y_pred))  
acc_test_vec
print("Maxima exactitud=%0.5f" % (np.mean(acc_test_vec)))

Maxima exactitud=0.69298


In [17]:
param_grid = {
    'penalty' : ['l2', 'l1'],
    'C' : [0.001, 0.01, 0.1, 0.5, 1, 100, 1000]
}

GS_LR = GridSearchCV(LogisticRegression(), param_grid = param_grid, scoring = 'accuracy', cv = 10)

GS_LR.fit(x,y)

print('Mejores parametros:\n', GS_LR.best_params_)
print('Mejor Exactitud:\n', GS_LR.best_score_)

Mejores parametros:
 {'C': 0.5, 'penalty': 'l2'}
Mejor Exactitud:
 0.6813186813186813


In [36]:
param_grid = {
    'penalty' : ['l2', 'l1'],
    'C' : uniform(loc=0, scale=4)
}

RS_LR = RandomizedSearchCV(LogisticRegression(), param_distributions=param_grid, random_state=1234,
                           cv=10, scoring='accuracy', n_iter=20)

RS_LR.fit(x,y)

print('Mejores parametros:\n', RS_LR.best_params_)
print('Mejor Exactitud:\n', RS_LR.best_score_)



Mejores parametros:
 {'C': 0.7660778015155691, 'penalty': 'l2'}
Mejor Exactitud:
 0.6813186813186813


PCA

In [37]:
data_pca = pd.read_csv('data_set_pca.csv', header = 0, names=None)
x_pca = data_pca.values

In [40]:
RS_LR.fit(x_pca, y)

print('Mejores parametros:\n', RS_LR.best_params_)
print('Mejor Exactitud:\n', RS_LR.best_score_)

Mejores parametros:
 {'C': 1.1058570205723868, 'penalty': 'l1'}
Mejor Exactitud:
 0.6428571428571429


## RFE

In [6]:
rfecv = RFECV(estimator=LogisticRegression(penalty='l2', C=0.5), scoring='accuracy', cv=10, step=1)

rfecv.fit(x,y)

print("Caracteristicas seleccionadas:",data.columns[rfecv.support_])
print("Mejor Exactitud:", np.max(rfecv.grid_scores_))

Caracteristicas seleccionadas: Index(['Edad', 'Stance Time', 'Duration Time', 'Speed', 'Cycle Timel',
       'Step Timel', 'Swing Percentl', 'Swing Timel', 'Distancel',
       'Duration Timel', 'Speedl', 'Accelerationl'],
      dtype='object')
Mejor Exactitud: 0.7213450292397662


## Evaluación de métricas: 

In [1]:
def wrap_precision(y_test,y_pred):
    return metrics.precision_score(y_test,y_pred,pos_label='Positivo')

def wrap_recall(y_test,y_pred):
    return metrics.recall_score(y_test,y_pred,pos_label='Positivo')

def wrap_f1(y_test,y_pred):
    return metrics.f1_score(y_test,y_pred,pos_label='Positivo')

def wrap_kappa(y_test,y_pred):
    return metrics.cohen_kappa_score(y_test,y_pred)

In [10]:
x_rfe = x[:, rfecv.support_]

scores = {
    'acc':'accuracy', 
    'kappa' : metrics.make_scorer(wrap_kappa),
    'precision' : metrics.make_scorer(wrap_precision),
    'recall' : metrics.make_scorer(wrap_recall),
    'f1' : metrics.make_scorer(wrap_f1)
}

cv_scores = cross_validate(estimator=LogisticRegression(penalty='l2', C=0.5), X=x_rfe,y=y,
                           cv=10, return_train_score=False, scoring=scores)

print('Exactitud: ', np.average(cv_scores['test_acc']))
print('Kappa_Score: ', np.average(cv_scores['test_kappa']))
print()
print('Precisión: ', np.average(cv_scores['test_precision']))
print('Recall: ', np.average(cv_scores['test_recall']))
print('F1_score: ', np.average(cv_scores['test_f1']))

Exactitud:  0.7213450292397662
Kappa_Score:  0.4445169082125604

Precisión:  0.8134157509157509
Recall:  0.6555555555555556
F1_score:  0.6693983957219252


#### Interpretación de métricas:

El modelo tiene una buena exactitud muy similar a la encontrada en los modelos de árboles (ver 5. Trees), sin embargo no tiene tan buena sensibilidad (Recall) y ya que es para diagnóstico de Parkinson, es importante que el modelo pueda diferenciar mejor los pacientes positivos de los negativos.

Por esta razón, este modelo es la segunda opción para realizar diagnóstico de los modelos explorados.