<h1> <center> Proyecto #1  </h1>
<h2> <center> Explorando los Algoritmos de Aprendizaje Supervisado  </h2>

<div> <center>
  <h3> Integrantes: </h3>
  <ul style="list-style:none; padding:0">
    <li> Ricardo Gatgens Rodríguez </li>
    <li> Valeria Morales Alvarado </li>
  </ul>
</div>


In [1]:
import math
import numpy as np
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier as SklearnDecisionTreeClassifier
from sklearn.linear_model import LogisticRegression as SklearnLogisticRegression

from sklearn.model_selection import train_test_split
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from sklearn.metrics import accuracy_score, precision_score, recall_score


# Algoritmo K-Nearst Neighbors

In [2]:
class KNearestNeighbors():
    def __init__(self, X_train, y_train, n_neighbors=5, n_classes=3, weights='uniform'):
        self.X_train = X_train
        self.y_train = y_train

        self.n_neighbors = n_neighbors
        self.n_classes = n_classes
        
        self.weights = weights

    def euclidian_distance(self, a, b):
        return np.sqrt(np.sum((a - b)**2, axis=1))

    def kneighbors(self, X_test, return_distance=False):
        dist = []
        neigh_ind = []

        point_dist = []
        for x_test in X_test:
            point_dist += [self.euclidian_distance(x_test, self.X_train)]

        for row in point_dist:
            enum_neigh = enumerate(row)
            sorted_neigh = sorted(enum_neigh,
                                  key=lambda x: x[1])[:self.n_neighbors]

            ind_list = [tup[0] for tup in sorted_neigh]
            dist_list = [tup[1] for tup in sorted_neigh]

            dist.append(dist_list)
            neigh_ind.append(ind_list)

        if return_distance:
            return np.array(dist), np.array(neigh_ind)

        return np.array(neigh_ind)

    def predict(self, X_test):
        if self.weights == 'uniform':
            neighbors = self.kneighbors(X_test)
            y_pred = np.array([
                np.argmax(np.bincount(self.y_train[neighbor]))
                for neighbor in neighbors
            ])
            return y_pred

        if self.weights == 'distance':
            distances, neighbors = self.kneighbors(
                X_test=X_test, return_distance=True)
            y_pred = []
            for i in range(len(X_test)):
                # Los k vecinos más cercanos
                neighbor = neighbors[i]
                # Distancias a los k vecinos más cercanos
                distance = distances[i]

                # Etiquetas de los k vecinos más cercanos
                neighbors_tag = self.y_train[neighbor]
                # Cantidad de vecinos por clase
                class_count = np.bincount(
                    neighbors_tag, minlength=self.n_classes)

                # Promedio de distancias a los vecinos de cada clase
                weights = []
                for j in range(self.n_classes):
                    # Vecinos de una clase dada
                    class_neighbors = np.where(neighbors_tag == j)
                    if len(class_neighbors[0]) > 0:
                        # Si hay vecinos de la clase j, calcula el promedio de sus distancias
                        class_weight = np.sum(
                            distance[class_neighbors]) / class_count[j]
                    else:
                        # Si no hay vecinos de la clase j, asigna un peso infinito
                        class_weight = float('inf')

                    # Guarda el peso de la clase j en el índice j
                    weights.append(class_weight)

                # Selecciona la clase con el menor peso
                y_pred.append(np.argmin(weights))

            return np.array(y_pred)

    def score(self, X_test, y_test):
        y_pred = self.predict(X_test)
        return float(sum(y_pred == y_test)) / float(len(y_test))

# Algoritmo Clasification and Regression Trees

In [3]:
def entropy_func(c, n):
    return -(c*1.0/n)*math.log(c*1.0/n, 2)


def entropy_cal(c1, c2):
    if c1 == 0 or c2 == 0:
        return 0
    return entropy_func(c1, c1+c2) + entropy_func(c2, c1+c2)


def entropy_of_one_division(division):
    s = 0
    n = len(division)
    classes = set(division)
    for c in classes:
        n_c = sum(division == c)
        e = n_c*1.0/n * entropy_cal(sum(division == c),
                                    sum(division != c))
        s += e
    return s, n


def get_entropy(y_predict, y_real):
    if len(y_predict) != len(y_real):
        print('They have to be the same length')
        return None
    n = len(y_real)
    s_true, n_true = entropy_of_one_division(
        y_real[y_predict])
    s_false, n_false = entropy_of_one_division(
        y_real[~y_predict])
    s = n_true*1.0/n * s_true + n_false*1.0/n * s_false
    return s


class DecisionTreeClassifier(object):
    def __init__(self, max_depth):
        self.depth = 0
        self.max_depth = max_depth

    def fit(self, x, y, par_node={}):
        if par_node is None:
            return None
        elif len(y) == 0:
            return None
        elif self.all_same(y):
            return {'val': y.iloc[0]}
        elif self.depth >= self.max_depth:
            return None
        else:
            self.depth += 1

            col, cutoff, entropy = self.find_best_split_of_all(
                x, y)    # find one split given an information gain
            y_left = y[x.iloc[:, col] < cutoff]
            y_right = y[x.iloc[:, col] >= cutoff]
            par_node = {'col': x.columns[col], 'index_col': col,
                        'cutoff': cutoff,
                        'val': np.round(np.mean(y))}
            par_node['left'] = self.fit(
                x[x.iloc[:, col] < cutoff], y_left, {})
            par_node['right'] = self.fit(
                x[x.iloc[:, col] >= cutoff], y_right, {})
            self.trees = par_node
            return par_node

    # all features versus values, get best
    def find_best_split_of_all(self, x, y):
        # print(x.shape, y.shape)
        col = None
        min_entropy = 1
        cutoff = None
        # x.T es la transpuesta de x (x.T analiza las columnas)
        for i, c in enumerate(x.columns):
            # x.columns son las columnas
            entropy, cur_cutoff = self.find_best_split(x[c], y)
            if entropy == 0:    # find the first perfect cutoff. Stop Iterating
                return i, cur_cutoff, entropy
            elif entropy <= min_entropy:
                min_entropy = entropy
                col = i
                cutoff = cur_cutoff
        return col, cutoff, min_entropy

    # one feature versus values
    def find_best_split(self, col, y):
        min_entropy = 10
        n = len(y)
        for value in set(col):
            y_predict = col < value  # get which ones are less than
            my_entropy = get_entropy(y_predict, y)
            if my_entropy <= min_entropy:
                min_entropy = my_entropy
                cutoff = value
        return min_entropy, cutoff

    def all_same(self, items):
        return all(x == items.iloc[0] for x in items)

    def predict(self, x):
        tree = self.trees
        x = x.reset_index(drop=True)
        results = np.array([0]*x.shape[0])

        for i, row in x.iterrows():
            rowList = list(row)  # Convertir filas en listas
            results[i] = self._get_prediction(rowList)
        return results

    def _get_prediction(self, row):
        cur_layer = self.trees
        while cur_layer is not None and cur_layer.get('cutoff'):
            if row[cur_layer['index_col']] < cur_layer['cutoff']:
                cur_layer = cur_layer['left']
            else:
                cur_layer = cur_layer['right']

        if cur_layer is not None:
            return cur_layer.get('val')
        else:
            return 0

    def score(self, X_test, y_test):
        y_pred = self.predict(X_test)
        return float(sum(y_pred == y_test)) / float(len(y_test))

# Algoritmo Logistic Regression

In [4]:
class LogisticRegression:
    def __init__(self, lr=0.01, num_iter=100000, fit_intercept=True):
        self.lr = lr
        self.num_iter = num_iter
        self.fit_intercept = fit_intercept

    def __add_intercept(self, X):
        intercept = np.ones((X.shape[0], 1))
        return np.concatenate((intercept, X), axis=1)

    def __sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def __loss(self, h, y):
        return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()

    def fit(self, X, y):
        if self.fit_intercept:
            X = self.__add_intercept(X)

        # weights initialization
        self.w = np.zeros(X.shape[1])

        for i in range(self.num_iter):
            z = np.dot(X, self.w)
            h = self.__sigmoid(z)
            gradient = np.dot(X.T, (h-y)) / y.size
            self.w -= self.lr * gradient

    def predict_prob(self, X):
        if self.fit_intercept:
            X = self.__add_intercept(X)

        return self.__sigmoid(np.dot(X, self.w))

    def predict(self, X, threshold):
        return self.predict_prob(X) >= threshold

    def score(self, y_pred, y_test):
        return float(sum(y_pred == y_test)) / float(len(y_test))

# Métricas de evaluación

In [5]:
def get_metrics(y_test, y_pred):
    score = float(sum(y_pred == y_test)) / float(len(y_test))
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(
        y_test, y_pred, average='weighted', labels=np.unique(y_pred))
    recall = recall_score(
        y_test, y_pred, average='weighted', labels=np.unique(y_pred))

    metrics = [accuracy, precision, recall]

    return metrics

def compare_metrics(our_metrics, sklearn_metrics):
    res = pd.DataFrame([[our_metrics[0], sklearn_metrics[0]],
                        [our_metrics[1], sklearn_metrics[1]],
                        [our_metrics[2], sklearn_metrics[2]]],
                       ['Accuracy', 'Precision', 'Recall'],
                       ['Our Implementation', 'Sklearn\'s Implementation'])
    return res

# Calidad de vino rojo según características químicas

In [6]:
df = pd.read_csv("./datasets/winequality-red.csv")
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11,34,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25,67,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15,54,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17,60,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11,34,0.9978,3.51,0.56,9.4,5


## Feature engineering

### Selección de características

In [7]:
features=['fixed acidity','volatile acidity', 'citric acid',
          'residual sugar','chlorides','free sulfur dioxide',
          'total sulfur dioxide', 'density', 'pH',
          'sulphates', 'alcohol']

X = df[features]

### Normalización de los datos

In [8]:
X = (X - X.min()) / (X.max() - X.min())
X.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,0.247788,0.39726,0.0,0.068493,0.106845,0.140845,0.09894,0.567548,0.606299,0.137725,0.153846
1,0.283186,0.520548,0.0,0.116438,0.143573,0.338028,0.215548,0.494126,0.362205,0.209581,0.215385
2,0.283186,0.438356,0.04,0.09589,0.133556,0.197183,0.169611,0.508811,0.409449,0.191617,0.215385
3,0.584071,0.109589,0.56,0.068493,0.105175,0.225352,0.190813,0.582232,0.330709,0.149701,0.215385
4,0.247788,0.39726,0.0,0.068493,0.106845,0.140845,0.09894,0.567548,0.606299,0.137725,0.153846


### Conversión a problema de clasificación binaria

In [9]:
y = df.quality
Outcome = []

for i in y:
    if(y[i] <= 5):
        Outcome.append(0)
    else:
        Outcome.append(1)

df['Outcome'] = Outcome
y=df.Outcome
y_name = df['Outcome'].name

## Modelo de clasificación con K-Nearest Neighbors

### División del set de datos

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X.to_numpy(), y.to_numpy(), test_size=0.3, random_state=45)

### Optimización de hiperparámetros

In [11]:
def knn_objective(params):
    n_neighbors = 2 * int(params['n_neighbors']) + 1
    weights = params['weights']

    our_knn_classifier = KNearestNeighbors(X_train, y_train, n_neighbors=n_neighbors, n_classes=2, weights=weights)
    score = our_knn_classifier.score(X_test, y_test)

    return {
    'loss': -score, # Función de pérdida a minimizar
    'status': STATUS_OK # Estatus de la evaluación (no se indefine, por lo que siempre será exitoso)
    }

In [12]:
# Almacena el resultado de todas las evaluaciones de la función objetivo
trials = Trials()

# Espacio de búsqueda de hiperparámetros
search_space = {
    'n_neighbors': hp.quniform('n_neighbors', 1, 150, 1),
    'weights': hp.choice('weights', ['uniform', 'distance'])
}

# Optimiza la precisión del modelo
fmin(knn_objective,
     search_space,
     algo=tpe.suggest,
     max_evals=150,
     trials=trials)

# Muestra los mejores hiperparámetros
best = trials.best_trial['misc']['vals']
print(best)

100%|██████████| 150/150 [00:46<00:00,  3.24trial/s, best loss: -0.8875]           
{'n_neighbors': [57.0], 'weights': [0]}


### Entrenamiento del modelo

In [13]:
n_neighbors = 2 * int(best['n_neighbors'][0]) + 1
weights = 'uniform' if best['weights'][0] == 0 else 'distance'

our_knn_classifier = KNearestNeighbors(X_train, y_train, n_neighbors=n_neighbors, n_classes=2, weights=weights)
our_knn_y_pred = our_knn_classifier.predict(X_test)

sklearn_knn_classifier = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights).fit(X_train, y_train)
sklearn_knn_y_pred = sklearn_knn_classifier.predict(X_test)

### Evaluación del modelo

In [14]:
our_knn_metrics = get_metrics(y_test, our_knn_y_pred)
sklearn_knn_metrics = get_metrics(y_test, sklearn_knn_y_pred)

compare_metrics(our_knn_metrics, sklearn_knn_metrics)

Unnamed: 0,Our Implementation,Sklearn's Implementation
Accuracy,0.8875,0.8875
Precision,0.871101,0.871101
Recall,0.8875,0.8875


## Modelo de clasificación con Árboles de Decisión

### División del set de datos

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50, random_state=100)

### Optimización de hiperparámetros

In [16]:
def cart_objective(params):
    max_depth = int(params['max_depth'])

    our_cart_classifier = DecisionTreeClassifier(max_depth=max_depth)
    our_cart_classifier.fit(X_train, y_train)
    
    score = our_cart_classifier.score(X_test, y_test)

    return {
    'loss': -score, # Función de pérdida a minimizar
    'status': STATUS_OK # Estatus de la evaluación (no se indefine, por lo que siempre será exitoso)
    }

In [17]:
# Almacena el resultado de todas las evaluaciones de la función objetivo
trials = Trials()

# Espacio de búsqueda de hiperparámetros
search_space = {'max_depth': hp.quniform('max_depth', 10, 300, 1)}

# Optimiza la precisión del modelo
fmin(cart_objective,
     search_space,
     algo=tpe.suggest,
     max_evals=10,
     trials=trials)

# Muestra los mejores hiperparámetros
best = trials.best_trial['misc']['vals']
print(best)

100%|██████████| 10/10 [03:44<00:00, 22.44s/trial, best loss: -0.86]  
{'max_depth': [52.0]}


### Entrenamiento del modelo

In [18]:
max_depth = int(best['max_depth'][0])

our_cart_classifier = DecisionTreeClassifier(max_depth=max_depth)
our_cart_classifier.fit(X_train, y_train)
our_cart_y_pred = our_cart_classifier.predict(X_test)

sklearn_cart_classifier = SklearnDecisionTreeClassifier(max_depth=max_depth)
sklearn_cart_classifier.fit(X_train,y_train)
sklearn_cart_classifier_y_pred = sklearn_cart_classifier.predict(X_test)

### Evaluación del modelo

In [19]:
our_cart_metrics = get_metrics(y_test, our_cart_y_pred)
sklearn_cart_metrics = get_metrics(y_test, sklearn_cart_classifier_y_pred)

compare_metrics(our_cart_metrics, sklearn_cart_metrics)

Unnamed: 0,Our Implementation,Sklearn's Implementation
Accuracy,0.86,0.835
Precision,0.837068,0.848903
Recall,0.86,0.835


## Modelo de clasificación con Regresión Logística

### División del set de datos

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50, random_state=100)

### Optimización de hiperparámetros

In [21]:
def logit_objective(params):
    lr = params['lr']
    num_iter = int(params['num_iter'])
    threshold = params['threshold']

    our_logit_classifier = LogisticRegression(lr=lr, num_iter=num_iter)
    our_logit_classifier.fit(X_train, y_train)
    y_pred = our_logit_classifier.predict(X_test, threshold)
    
    score = our_logit_classifier.score(y_test, y_pred)

    return {
    'loss': -score, # Función de pérdida a minimizar
    'status': STATUS_OK # Estatus de la evaluación (no se indefine, por lo que siempre será exitoso)
    }

In [22]:
# Almacena el resultado de todas las evaluaciones de la función objetivo
trials = Trials()

# Espacio de búsqueda de hiperparámetros
search_space = {'lr': hp.uniform('lr', 0.01, 1.0 ),
                'num_iter': hp.quniform('num_iter', 1, 5000, 1),
                'threshold': hp.uniform('threshold', 0.01, 1.0)}

# Optimiza la precisión del modelo
fmin(logit_objective,
     search_space,
     algo=tpe.suggest,
     max_evals=125,
     trials=trials)

# Muestra los mejores hiperparámetros
best = trials.best_trial['misc']['vals']
print(best)

100%|██████████| 125/125 [04:37<00:00,  2.22s/trial, best loss: -0.88125]
{'lr': [0.8378462524728069], 'num_iter': [3306.0], 'threshold': [0.5300060835033367]}


### Entrenamiento del modelo

In [23]:
lr = best['lr'][0]
num_iter = int(best['num_iter'][0])
threshold = best['threshold'][0]

our_logit_classifier = LogisticRegression(lr=lr, num_iter=num_iter)
our_logit_classifier.fit(X_train, y_train)
our_logit_y_pred = our_logit_classifier.predict(X_test, threshold)

sklearn_logit_classifier = SklearnLogisticRegression(max_iter=num_iter)
sklearn_logit_classifier.fit(X_train, y_train)
sklearn_logit_y_pred = sklearn_logit_classifier.predict(X_test)

### Evaluación del modelo

In [24]:
our_logit_metrics = get_metrics(y_test, our_logit_y_pred)
sklearn_logit_metrics = get_metrics(y_test, sklearn_logit_y_pred)

compare_metrics(our_logit_metrics, sklearn_logit_metrics)

Unnamed: 0,Our Implementation,Sklearn's Implementation
Accuracy,0.88125,0.8675
Precision,0.865129,0.836647
Recall,0.88125,0.8675


# Aprobación del curso Arquitectura de Computadores I según notas

In [25]:
df = pd.read_csv("./datasets/grades.csv")
df.head()

Unnamed: 0,Proyecto1,Proyecto2,Examen1,Taller1,Tarea1,Final,Resultado
0,100.0,100.0,53.3,80.0,76.0,80.0,1
1,0.0,0.0,12.6,40.0,0.0,5.0,0
2,100.0,45.0,49.6,100.0,100.0,80.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0
4,100.0,100.0,61.6,100.0,66.7,80.0,1


## Feature Engineering

### Selección de características

In [26]:
#Se seleccionaron las caracteristicas solicitadas en el enunciado
features=['Proyecto1','Proyecto2','Examen1','Tarea1']

X=df[features]
y=df['Resultado']

### Normalización de los datos

In [27]:
X = (X - X.min()) / (X.max() - X.min())
X.head()

Unnamed: 0,Proyecto1,Proyecto2,Examen1,Tarea1
0,1.0,1.0,0.616185,0.76
1,0.0,0.0,0.145665,0.0
2,1.0,0.45,0.57341,1.0
3,0.0,0.0,0.0,0.0
4,1.0,1.0,0.712139,0.667


## Modelo de clasificación con K-Nearest Neighbors

### División del set de datos

In [28]:
X_train,X_test,y_train,y_test=train_test_split(X.to_numpy(), y.to_numpy(), test_size=0.50, random_state=100)

### Optimización de hiperparámetros

In [29]:
def knn_objective(params):
    n_neighbors = 2 * int(params['n_neighbors']) + 1
    weights = params['weights']

    our_knn_classifier = KNearestNeighbors(X_train, y_train, n_neighbors=n_neighbors, n_classes=2, weights=weights)
    score = our_knn_classifier.score(X_test, y_test)

    return {
    'loss': -score, # Función de pérdida a minimizar
    'status': STATUS_OK # Estatus de la evaluación (no se indefine, por lo que siempre será exitoso)
    }

In [30]:
# Almacena el resultado de todas las evaluaciones de la función objetivo
trials = Trials()

# Espacio de búsqueda de hiperparámetros
search_space = {
    'n_neighbors': hp.quniform('n_neighbors', 1, 20, 1),
    'weights': hp.choice('weights', ['uniform', 'distance'])
}

# Optimiza la precisión del modelo
fmin(knn_objective,
     search_space,
     algo=tpe.suggest,
     max_evals=20,
     trials=trials)

# Muestra los mejores hiperparámetros
best = trials.best_trial['misc']['vals']
print(best)

100%|██████████| 20/20 [00:00<00:00, 202.89trial/s, best loss: -0.9245283018867925]
{'n_neighbors': [11.0], 'weights': [0]}


### Entrenamiento del modelo

In [31]:
n_neighbors = 2 * int(best['n_neighbors'][0]) + 1
weights = 'uniform' if best['weights'][0] == 0 else 'distance'

our_knn_classifier = KNearestNeighbors(X_train, y_train, n_neighbors=n_neighbors, n_classes=2, weights=weights)
our_knn_y_pred = our_knn_classifier.predict(X_test)

sklearn_knn_classifier = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights).fit(X_train, y_train)
sklearn_knn_y_pred = sklearn_knn_classifier.predict(X_test)

### Evaluación del modelo

In [32]:
our_knn_metrics = get_metrics(y_test, our_knn_y_pred)
sklearn_knn_metrics = get_metrics(y_test, sklearn_knn_y_pred)

compare_metrics(our_knn_metrics, sklearn_knn_metrics)

Unnamed: 0,Our Implementation,Sklearn's Implementation
Accuracy,0.924528,0.924528
Precision,0.931891,0.931891
Recall,0.924528,0.924528


## Modelo de clasificación con Árboles de Decisión

### División del set de datos

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50, random_state=100)

### Optimización de hiperparámetros

In [34]:
def cart_objective(params):
    max_depth = int(params['max_depth'])

    our_cart_classifier = DecisionTreeClassifier(max_depth=max_depth)
    our_cart_classifier.fit(X_train, y_train)
    
    score = our_cart_classifier.score(X_test, y_test)

    return {
    'loss': -score, # Función de pérdida a minimizar
    'status': STATUS_OK # Estatus de la evaluación (no se indefine, por lo que siempre será exitoso)
    }

In [35]:
# Almacena el resultado de todas las evaluaciones de la función objetivo
trials = Trials()

# Espacio de búsqueda de hiperparámetros
search_space = {'max_depth': hp.quniform('max_depth', 10, 300, 1)}

# Optimiza la precisión del modelo
fmin(cart_objective,
     search_space,
     algo=tpe.suggest,
     max_evals=10,
     trials=trials)

# Muestra los mejores hiperparámetros
best = trials.best_trial['misc']['vals']
print(best)

100%|██████████| 10/10 [00:02<00:00,  3.82trial/s, best loss: -0.8490566037735849]
{'max_depth': [78.0]}


### Entrenamiento del modelo

In [36]:
max_depth = int(best['max_depth'][0])

our_cart_classifier = DecisionTreeClassifier(max_depth=max_depth)
our_cart_classifier.fit(X_train, y_train)
our_cart_y_pred = our_cart_classifier.predict(X_test)

sklearn_cart_classifier = SklearnDecisionTreeClassifier(max_depth=max_depth)
sklearn_cart_classifier.fit(X_train,y_train)
sklearn_cart_classifier_y_pred = sklearn_cart_classifier.predict(X_test)

### Evaluación del modelo

In [37]:
our_cart_metrics = get_metrics(y_test, our_cart_y_pred)
sklearn_cart_metrics = get_metrics(y_test, sklearn_cart_classifier_y_pred)

compare_metrics(our_cart_metrics, sklearn_cart_metrics)

Unnamed: 0,Our Implementation,Sklearn's Implementation
Accuracy,0.849057,0.849057
Precision,0.856304,0.849057
Recall,0.849057,0.849057


## Modelo de clasificación con Regresión Logística

### División del set de datos

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50, random_state=100)

### Optimización de hiperparámetros

In [39]:
def logit_objective(params):
    lr = params['lr']
    num_iter = int(params['num_iter'])
    threshold = params['threshold']

    our_logit_classifier = LogisticRegression(lr=lr, num_iter=num_iter)
    our_logit_classifier.fit(X_train, y_train)
    y_pred = our_logit_classifier.predict(X_test, threshold)
    
    score = our_logit_classifier.score(y_test, y_pred)

    return {
    'loss': -score, # Función de pérdida a minimizar
    'status': STATUS_OK # Estatus de la evaluación (no se indefine, por lo que siempre será exitoso)
    }

In [40]:
# Almacena el resultado de todas las evaluaciones de la función objetivo
trials = Trials()

# Espacio de búsqueda de hiperparámetros
search_space = {'lr': hp.uniform('lr', 0.01, 1.0 ),
                'num_iter': hp.quniform('num_iter', 1, 5000, 1),
                'threshold': hp.uniform('threshold', 0.01, 1.0)}

# Optimiza la precisión del modelo
fmin(logit_objective,
     search_space,
     algo=tpe.suggest,
     max_evals=125,
     trials=trials)

# Muestra los mejores hiperparámetros
best = trials.best_trial['misc']['vals']
print(best)

100%|██████████| 125/125 [00:44<00:00,  2.80trial/s, best loss: -0.9433962264150944]
{'lr': [0.3554911209405314], 'num_iter': [2593.0], 'threshold': [0.3602958105895245]}


### Entrenamiento del modelo

In [41]:
lr = best['lr'][0]
num_iter = int(best['num_iter'][0])
threshold = best['threshold'][0]

our_logit_classifier = LogisticRegression(lr=lr, num_iter=num_iter)
our_logit_classifier.fit(X_train, y_train)
our_logit_y_pred = our_logit_classifier.predict(X_test, threshold)

sklearn_logit_classifier = SklearnLogisticRegression(max_iter=num_iter)
sklearn_logit_classifier.fit(X_train, y_train)
sklearn_logit_y_pred = sklearn_logit_classifier.predict(X_test)

### Evaluación del modelo

In [42]:
our_logit_metrics = get_metrics(y_test, our_logit_y_pred)
sklearn_logit_metrics = get_metrics(y_test, sklearn_logit_y_pred)

compare_metrics(our_logit_metrics, sklearn_logit_metrics)

Unnamed: 0,Our Implementation,Sklearn's Implementation
Accuracy,0.943396,0.943396
Precision,0.947642,0.947642
Recall,0.943396,0.943396


# Consumo de drogas según características de personalidad

In [43]:
df = pd.read_csv("./datasets/drug_consumption.csv")
df.head()

Unnamed: 0,id,age,gender,education,country,ethnicity,Nscore,Escore,Oscore,Ascore,...,ecstacy,heroin,ketamine,legalh,lsd,meth,mushrooms,nicotine,semer,vsa
0,1,0.49788,0.48246,-0.05921,0.96082,0.126,0.31287,-0.57545,-0.58331,-0.91699,...,0,0,0,0,0,0,0,1,0,0
1,2,-0.07854,-0.48246,1.98437,0.96082,-0.31685,-0.67825,1.93886,1.43533,0.76096,...,1,0,1,0,1,1,0,1,0,0
2,3,0.49788,-0.48246,-0.05921,0.96082,-0.31685,-0.46725,0.80523,-0.84732,-1.6209,...,0,0,0,0,0,0,0,0,0,0
3,4,-0.95197,0.48246,1.16365,0.96082,-0.31685,-0.14882,-0.80615,-0.01928,0.59042,...,0,0,1,0,0,0,0,1,0,0
4,5,0.49788,0.48246,1.98437,0.96082,-0.31685,0.73545,-1.6334,-0.45174,-0.30172,...,0,0,0,0,0,0,1,1,0,0


## Feature Engineering

### Selección de características

In [46]:
used_features = df[["age", "gender", "education",
                    "Nscore", "Escore", "Oscore",
                    "Ascore", "Cscore", "impulsiviness", "SS"]]

X = used_features
# Se selecciona el tipo de droga que se quiere evaluar
y = df.cannabis

## Modelo de clasificación con K-Nearest Neighbors

### División del set de datos

In [47]:
X_train,X_test,y_train,y_test=train_test_split(X.to_numpy(), y.to_numpy(), test_size=0.20, random_state=100)

### Optimización de hiperparámetros

In [48]:
def knn_objective(params):
    n_neighbors = 2 * int(params['n_neighbors']) + 1
    weights = params['weights']

    our_knn_classifier = KNearestNeighbors(X_train, y_train, n_neighbors=n_neighbors, n_classes=2, weights=weights)
    score = our_knn_classifier.score(X_test, y_test)

    return {
    'loss': -score, # Función de pérdida a minimizar
    'status': STATUS_OK # Estatus de la evaluación (no se indefine, por lo que siempre será exitoso)
    }

In [49]:
# Almacena el resultado de todas las evaluaciones de la función objetivo
trials = Trials()

# Espacio de búsqueda de hiperparámetros
search_space = {
    'n_neighbors': hp.quniform('n_neighbors', 1, 150, 1),
    'weights': hp.choice('weights', ['uniform', 'distance'])
}

# Optimiza la precisión del modelo
fmin(knn_objective,
     search_space,
     algo=tpe.suggest,
     max_evals=150,
     trials=trials)

# Muestra los mejores hiperparámetros
best = trials.best_trial['misc']['vals']
print(best)

100%|██████████| 150/150 [00:52<00:00,  2.83trial/s, best loss: -0.8010610079575596]
{'n_neighbors': [73.0], 'weights': [0]}


### Entrenamiento del modelo

In [50]:
n_neighbors = 2 * int(best['n_neighbors'][0]) + 1
weights = 'uniform' if best['weights'][0] == 0 else 'distance'

our_knn_classifier = KNearestNeighbors(X_train, y_train, n_neighbors=n_neighbors, n_classes=2, weights=weights)
our_knn_y_pred = our_knn_classifier.predict(X_test)

sklearn_knn_classifier = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights).fit(X_train, y_train)
sklearn_knn_y_pred = sklearn_knn_classifier.predict(X_test)

### Evaluación del modelo

In [51]:
our_knn_metrics = get_metrics(y_test, our_knn_y_pred)
sklearn_knn_metrics = get_metrics(y_test, sklearn_knn_y_pred)

compare_metrics(our_knn_metrics, sklearn_knn_metrics)

Unnamed: 0,Our Implementation,Sklearn's Implementation
Accuracy,0.801061,0.801061
Precision,0.797654,0.797654
Recall,0.801061,0.801061


## Modelo de clasificación con Árboles de Decisión

### División del set de datos

In [63]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=100)

### Optimización de hiperparámetros

In [64]:
def cart_objective(params):
    max_depth = int(params['max_depth'])

    our_cart_classifier = DecisionTreeClassifier(max_depth=max_depth)
    our_cart_classifier.fit(X_train, y_train)
    
    score = our_cart_classifier.score(X_test, y_test)

    return {
    'loss': -score, # Función de pérdida a minimizar
    'status': STATUS_OK # Estatus de la evaluación (no se indefine, por lo que siempre será exitoso)
    }

In [65]:
# Almacena el resultado de todas las evaluaciones de la función objetivo
trials = Trials()

# Espacio de búsqueda de hiperparámetros
search_space = {'max_depth': hp.quniform('max_depth', 10, 300, 1)}

# Optimiza la precisión del modelo
fmin(cart_objective,
     search_space,
     algo=tpe.suggest,
     max_evals=12,
     trials=trials)

# Muestra los mejores hiperparámetros
best = trials.best_trial['misc']['vals']
print(best)

100%|██████████| 12/12 [04:06<00:00, 20.56s/trial, best loss: -0.6896551724137931]
{'max_depth': [283.0]}


### Entrenamiento del modelo

In [66]:
max_depth = int(best['max_depth'][0])

our_cart_classifier = DecisionTreeClassifier(max_depth=max_depth)
our_cart_classifier.fit(X_train, y_train)
our_cart_y_pred = our_cart_classifier.predict(X_test)

sklearn_cart_classifier = SklearnDecisionTreeClassifier(max_depth=max_depth)
sklearn_cart_classifier.fit(X_train,y_train)
sklearn_cart_classifier_y_pred = sklearn_cart_classifier.predict(X_test)

### Evaluación del modelo

In [67]:
our_cart_metrics = get_metrics(y_test, our_cart_y_pred)
sklearn_cart_metrics = get_metrics(y_test, sklearn_cart_classifier_y_pred)

compare_metrics(our_cart_metrics, sklearn_cart_metrics)

Unnamed: 0,Our Implementation,Sklearn's Implementation
Accuracy,0.689655,0.681698
Precision,0.700902,0.697269
Recall,0.689655,0.681698


## Modelo de clasificación con Regresión Logística

### División del set de datos

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=100)

### Optimización de hiperparámetros

In [58]:
def logit_objective(params):
    lr = params['lr']
    num_iter = int(params['num_iter'])
    threshold = params['threshold']

    our_logit_classifier = LogisticRegression(lr=lr, num_iter=num_iter)
    our_logit_classifier.fit(X_train, y_train)
    y_pred = our_logit_classifier.predict(X_test, threshold)
    
    score = our_logit_classifier.score(y_test, y_pred)

    return {
    'loss': -score, # Función de pérdida a minimizar
    'status': STATUS_OK # Estatus de la evaluación (no se indefine, por lo que siempre será exitoso)
    }

In [59]:
# Almacena el resultado de todas las evaluaciones de la función objetivo
trials = Trials()

# Espacio de búsqueda de hiperparámetros
search_space = {'lr': hp.uniform('lr', 0.01, 1.0 ),
                'num_iter': hp.quniform('num_iter', 1, 5000, 1),
                'threshold': hp.uniform('threshold', 0.01, 1.0)}

# Optimiza la precisión del modelo
fmin(logit_objective,
     search_space,
     algo=tpe.suggest,
     max_evals=125,
     trials=trials)

# Muestra los mejores hiperparámetros
best = trials.best_trial['misc']['vals']
print(best)

100%|██████████| 125/125 [04:08<00:00,  1.99s/trial, best loss: -0.8037135278514589]
{'lr': [0.451150802331397], 'num_iter': [2458.0], 'threshold': [0.40992823292974306]}


### Entrenamiento del modelo

In [60]:
lr = best['lr'][0]
num_iter = int(best['num_iter'][0])
threshold = best['threshold'][0]

our_logit_classifier = LogisticRegression(lr=lr, num_iter=num_iter)
our_logit_classifier.fit(X_train, y_train)
our_logit_y_pred = our_logit_classifier.predict(X_test, threshold)

sklearn_logit_classifier = SklearnLogisticRegression(max_iter=num_iter)
sklearn_logit_classifier.fit(X_train, y_train)
sklearn_logit_y_pred = sklearn_logit_classifier.predict(X_test)

### Evaluación del modelo

In [61]:
our_logit_metrics = get_metrics(y_test, our_logit_y_pred)
sklearn_logit_metrics = get_metrics(y_test, sklearn_logit_y_pred)

compare_metrics(our_logit_metrics, sklearn_logit_metrics)

Unnamed: 0,Our Implementation,Sklearn's Implementation
Accuracy,0.803714,0.793103
Precision,0.798058,0.79407
Recall,0.803714,0.793103
