In [23]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

In [24]:
labels_f = pd.read_csv('../../data/labels_f_filtrado.csv')
datos = pd.read_csv('../../data/datos_filtrado.csv')

In [25]:
atributos = labels_f.columns.tolist()

In [26]:
atributos.remove('person')
atributos.remove('label')

In [27]:
y = labels_f['label'].ravel()
X = labels_f.loc[:,atributos]

In [28]:
def aplicarSmote(X_train_p, y_train_p, X_test):
    smote = SMOTE(ratio='minority')
    X_smt_train, y_smt_train = smote.fit_sample(X_train_p, y_train_p)
    #X_smt_train = pd.DataFrame(X_smt_train)
    #X_smt_train.columns = X_test.columns
    return (X_smt_train, y_smt_train)

In [29]:
def aplicarRus(X_train_p, y_train_p,X_test_p):
    rus= RandomUnderSampler(return_indices=True)
    #id_rus son los índices
    X_rus_train, y_rus_train, id_rus = rus.fit_sample(X_train_p, y_train_p)
    #X_rus_train = pd.DataFrame(X_rus_train)
    #X_rus_train.columns = X_test_p.columns
    return(X_rus_train, y_rus_train)

In [30]:
seed = 7

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state= seed)

In [32]:
X_rus_train, y_rus_train = aplicarRus(X_train,y_train,X_test)
X_smt_train, y_smt_train = aplicarSmote(X_train,y_train,X_test)

In [50]:
num_folds = 10
seed = 7
scoring = 'accuracy'

In [51]:
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis())) 
models.append(('KNN', KNeighborsClassifier())) 
models.append(('CART', DecisionTreeClassifier())) 
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))

In [52]:
results = []
names = []
for name, model in models:
    kfold = KFold(n_splits=num_folds, random_state=seed)
    cv_results = cross_val_score(model, X_rus_train, y_rus_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)



LR: 0.738309 (0.037134)




LDA: 0.750959 (0.044886)
KNN: 0.541752 (0.037147)
CART: 0.675570 (0.030670)
NB: 0.596618 (0.280828)




SVM: 0.001903 (0.002907)


dict_keys(['explained_variance', 'r2', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'accuracy', 'roc_auc', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'brier_score_loss', 'adjusted_rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted'])