In [95]:
from algorithms.knn import KNearestNeighbors
from sklearn.neighbors import KNeighborsClassifier

from algorithms.cart import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier as SklearnDecisionTreeClassifier

from algorithms.logit_regression import LogisticRegression
from sklearn.linear_model import LogisticRegression as SklearnLogisticRegression

from algorithms.metrics import metrics, compare

import pandas as pd

from sklearn.model_selection import train_test_split
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK

# Consumo de drogas según características de personalidad

In [96]:
df = pd.read_csv("./datasets/drug_consumption.csv")
df.head()

Unnamed: 0,id,age,gender,education,country,ethnicity,Nscore,Escore,Oscore,Ascore,...,ecstacy,heroin,ketamine,legalh,lsd,meth,mushrooms,nicotine,semer,vsa
0,1,0.49788,0.48246,-0.05921,0.96082,0.126,0.31287,-0.57545,-0.58331,-0.91699,...,0,0,0,0,0,0,0,1,0,0
1,2,-0.07854,-0.48246,1.98437,0.96082,-0.31685,-0.67825,1.93886,1.43533,0.76096,...,1,0,1,0,1,1,0,1,0,0
2,3,0.49788,-0.48246,-0.05921,0.96082,-0.31685,-0.46725,0.80523,-0.84732,-1.6209,...,0,0,0,0,0,0,0,0,0,0
3,4,-0.95197,0.48246,1.16365,0.96082,-0.31685,-0.14882,-0.80615,-0.01928,0.59042,...,0,0,1,0,0,0,0,1,0,0
4,5,0.49788,0.48246,1.98437,0.96082,-0.31685,0.73545,-1.6334,-0.45174,-0.30172,...,0,0,0,0,0,0,1,1,0,0


## Feature Engineering

In [97]:
used_features = df[[
                    "age", "gender", "education",
                    "Nscore", "Escore", "Oscore",
                    "Ascore", "Cscore", "impulsiviness", "SS"]]

used_features.head()

Unnamed: 0,age,gender,education,Nscore,Escore,Oscore,Ascore,Cscore,impulsiviness,SS
0,0.49788,0.48246,-0.05921,0.31287,-0.57545,-0.58331,-0.91699,-0.00665,-0.21712,-1.18084
1,-0.07854,-0.48246,1.98437,-0.67825,1.93886,1.43533,0.76096,-0.14277,-0.71126,-0.21575
2,0.49788,-0.48246,-0.05921,-0.46725,0.80523,-0.84732,-1.6209,-1.0145,-1.37983,0.40148
3,-0.95197,0.48246,1.16365,-0.14882,-0.80615,-0.01928,0.59042,0.58489,-1.37983,-1.18084
4,0.49788,0.48246,1.98437,0.73545,-1.6334,-0.45174,-0.30172,1.30612,-0.21712,-0.21575


In [118]:
X = used_features.to_numpy()
# Se selecciona el tipo de droga que se quiere evaluar
y = df.cannabis.to_numpy() 

print(X.shape)
print(y.shape)

(1885, 10)
(1885,)


### División del set de datos

In [119]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Modelo de clasificación con K-Nearest Neighbors

### Optimización de hiperparámetros

In [120]:
def knn_objective(params):
    n_neighbors = 2 * int(params['n_neighbors']) + 1
    weights = params['weights']

    our_knn_classifier = KNearestNeighbors(X_train, y_train, n_neighbors=n_neighbors, n_classes=2, weights=weights)
    score = our_knn_classifier.score(X_test, y_test)

    return {
    'loss': -score, # Función de pérdida a minimizar
    'status': STATUS_OK # Estatus de la evaluación (no se indefine, por lo que siempre será exitoso)
    }

In [124]:
# Almacena el resultado de todas las evaluaciones de la función objetivo
trials = Trials()

# Espacio de búsqueda de hiperparámetros
search_space = {
    'n_neighbors': hp.quniform('n_neighbors', 1, 150, 1),
    'weights': hp.choice('weights', ['uniform', 'distance'])
}

# Optimiza la precisión del modelo
fmin(knn_objective,
     search_space,
     algo=tpe.suggest,
     max_evals=150,
     trials=trials)

# Muestra los mejores hiperparámetros
best = trials.best_trial['misc']['vals']
print(best)

100%|██████████| 150/150 [02:42<00:00,  1.08s/trial, best loss: -0.7851458885941645]
{'n_neighbors': [36.0], 'weights': [0]}


### Entrenamiento del modelo

In [127]:
n_neighbors = 2 * int(best['n_neighbors'][0]) + 1
weights = 'uniform' if best['weights'][0] == 0 else 'distance'

our_knn_classifier = KNearestNeighbors(X_train, y_train, n_neighbors=n_neighbors, n_classes=2, weights=weights)
our_knn_y_pred = our_knn_classifier.predict(X_test)

sklearn_knn_classifier = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights).fit(X_train, y_train)
sklearn_knn_y_pred = sklearn_knn_classifier.predict(X_test)

### Evaluación del modelo

In [128]:
our_knn_metrics = metrics(y_test, our_knn_y_pred)
sklearn_knn_metrics = metrics(y_test, sklearn_knn_y_pred)

compare(our_knn_metrics, sklearn_knn_metrics)

Unnamed: 0,Our Implementation,Sklearn's Implementation
Accuracy,0.785146,0.785146
Precision,0.789214,0.789214
Recall,0.785146,0.785146


## Modelo de clasificación con Árboles de Decisión

### Optimización de hiperparámetros

In [104]:
X = used_features
y = df.cannabis

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def cart_objective(params):
    max_depth = int(params['max_depth'])

    our_cart_classifier = DecisionTreeClassifier(max_depth=max_depth)
    our_cart_classifier.fit(X_train, y_train)
    
    score = our_cart_classifier.score(X_test, y_test)

    return {
    'loss': -score, # Función de pérdida a minimizar
    'status': STATUS_OK # Estatus de la evaluación (no se indefine, por lo que siempre será exitoso)
    }

In [105]:
# Almacena el resultado de todas las evaluaciones de la función objetivo
trials = Trials()

# Espacio de búsqueda de hiperparámetros
search_space = {'max_depth': hp.quniform('max_depth', 10, 300, 1)}

# Optimiza la precisión del modelo
fmin(cart_objective,
     search_space,
     algo=tpe.suggest,
     max_evals=10,
     trials=trials)

# Muestra los mejores hiperparámetros
best = trials.best_trial['misc']['vals']
print(best)

100%|██████████| 25/25 [19:55<00:00, 47.83s/trial, best loss: -0.7214854111405835]
{'max_depth': [274.0]}


### Entrenamiento del modelo

In [106]:
max_depth = int(best['max_depth'][0])

our_cart_classifier = DecisionTreeClassifier(max_depth=max_depth)
our_cart_classifier.fit(X_train, y_train)
our_cart_y_pred = our_cart_classifier.predict(X_test)

sklearn_cart_classifier = SklearnDecisionTreeClassifier(max_depth=max_depth)
sklearn_cart_classifier.fit(X_train,y_train)
sklearn_cart_classifier_y_pred = sklearn_cart_classifier.predict(X_test)

### Evaluación del modelo

In [107]:
our_cart_metrics = metrics(y_test, our_cart_y_pred)
sklearn_cart_metrics = metrics(y_test, sklearn_cart_classifier_y_pred)

compare(our_cart_metrics, sklearn_cart_metrics)

Unnamed: 0,Our Implementation,Sklearn's Implementation
Accuracy,0.721485,0.71618
Precision,0.746934,0.734449
Recall,0.721485,0.71618


## Modelo de clasificación con Regresión Logística

### Optimización de hiperparámetros

In [108]:
def logit_objective(params):
    lr = params['lr']
    num_iter = int(params['num_iter'])
    threshold = params['threshold']

    our_logit_classifier = LogisticRegression(lr=lr, num_iter=num_iter)
    our_logit_classifier.fit(X_train, y_train)
    y_pred = our_logit_classifier.predict(X_test, threshold)
    
    score = our_logit_classifier.score(y_test, y_pred)

    return {
    'loss': -score, # Función de pérdida a minimizar
    'status': STATUS_OK # Estatus de la evaluación (no se indefine, por lo que siempre será exitoso)
    }

In [109]:
# Almacena el resultado de todas las evaluaciones de la función objetivo
trials = Trials()

# Espacio de búsqueda de hiperparámetros
search_space = {'lr': hp.uniform('lr', 0.001, 1.0 ),
                'num_iter': hp.quniform('num_iter', 1, 5000, 1),
                'threshold': hp.uniform('threshold', 0.01, 1.0)}

# Optimiza la precisión del modelo
fmin(logit_objective,
     search_space,
     algo=tpe.suggest,
     max_evals=125,
     trials=trials)

# Muestra los mejores hiperparámetros
best = trials.best_trial['misc']['vals']
print(best)

100%|██████████| 200/200 [16:32<00:00,  4.96s/trial, best loss: -0.7957559681697612]
{'lr': [0.37764938279536114], 'num_iter': [1996.0], 'threshold': [0.5878881704793822]}


### Entrenamiento del modelo

In [110]:
lr = best['lr'][0]
num_iter = int(best['num_iter'][0])
threshold = best['threshold'][0]

our_logit_classifier = LogisticRegression(lr=lr, num_iter=num_iter)
our_logit_classifier.fit(X_train, y_train)
our_logit_y_pred = our_logit_classifier.predict(X_test, threshold)

sklearn_logit_classifier = SklearnLogisticRegression(max_iter=num_iter)
sklearn_logit_classifier.fit(X_train, y_train)
sklearn_logit_y_pred = sklearn_logit_classifier.predict(X_test)

### Evaluación del modelo

In [111]:
our_logit_metrics = metrics(y_test, our_logit_y_pred)
sklearn_logit_metrics = metrics(y_test, sklearn_logit_y_pred)

compare(our_logit_metrics, sklearn_logit_metrics)

Unnamed: 0,Our Implementation,Sklearn's Implementation
Accuracy,0.795756,0.785146
Precision,0.816376,0.790496
Recall,0.795756,0.785146
