# Solución Parcial Práctico 1

## Librerías utilizadas

In [8]:
from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
import mglearn
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, mean_squared_error, mean_absolute_percentage_error, r2_score
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
import numpy as np
from sklearn.model_selection import cross_val_score

# Punto 1

## Breast Cancer: (KNN, LogisticRegression)

In [10]:
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state=0)

La métrica de mayor importancia para la clasificación de cancer es recall puesto que se quiere limitar el número de falsos negativos y predecir con mayor presición los casos de cáncer positivo.

### i) GridSearchCV y Pipeline

In [20]:
#Modelo de KNN----------------------------------------------------
pipe_knn = Pipeline([
    ("knn", KNeighborsClassifier())
])

param_grid_knn = {
    'knn__n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8],
    'knn__weights': ['uniform', 'distance']
}

grid_knn = GridSearchCV(pipe_knn, param_grid=param_grid_knn, cv=5)
grid_knn.fit(X_train, y_train)
y_pred_knn = grid_knn.predict(X_test)

precision_knn = precision_score(y_test, y_pred_knn)
recall_knn = recall_score(y_test, y_pred_knn)
f1_knn = f1_score(y_test, y_pred_knn)
auc_knn = roc_auc_score(y_test, y_pred_knn)

results_knn = pd.DataFrame({
    'Model': ['KNN'],
    'Precision': [precision_knn],
    'Recall': [recall_knn],
    'F1-score': [f1_knn],
    'AUC': [auc_knn]
})

# Modelo de regresión logística---------------------------------------------------------
pipe_logistic = Pipeline([
    ("logistic", LogisticRegression())
])

param_grid_logistic = {
    'logistic__C': [0.1, 1.0, 10.0, 100, 1000]
}

grid_logistic = GridSearchCV(pipe_logistic, param_grid=param_grid_logistic, cv=5)
grid_logistic.fit(X_train, y_train)
y_pred_logistic = grid_logistic.predict(X_test)

precision_logistic = precision_score(y_test, y_pred_logistic)
recall_logistic = recall_score(y_test, y_pred_logistic)
f1_logistic = f1_score(y_test, y_pred_logistic)
auc_logistic = roc_auc_score(y_test, y_pred_logistic)

results_logistic = pd.DataFrame({
    'Model': ['Logistic'],
    'Precision': [precision_logistic],
    'Recall': [recall_logistic],
    'F1-score': [f1_logistic],
    'AUC': [auc_logistic]
})

results_combined = pd.concat([results_knn, results_logistic])
results_combined.set_index('Model', inplace=True)

print("\nTabla de resultados:")
print(results_combined)



Tabla de resultados:
          Precision    Recall  F1-score       AUC
Model                                            
KNN        0.955556  0.955556  0.955556  0.940042
Logistic   0.965517  0.933333  0.949153  0.938365


### ii) Manualmente

In [22]:
best_knn_score = -1
best_knn_parameters = {}

best_lr_score = -1
best_lr_parameters = {}

for neighbors in [1, 2, 3, 4, 5, 6, 7, 8]:
    for weights in ['uniform', 'distance']:
        knn = KNeighborsClassifier(n_neighbors=neighbors, weights=weights)
        scores = cross_val_score(knn, X_train, y_train, cv=5)
        score = np.mean(scores)
        if score > best_knn_score:
            best_knn_score = score
            best_knn_parameters = {'neighbors': neighbors, 'weights': weights}
            best_knn_model = knn.fit(X_train, y_train)

y_pred_knn = best_knn_model.predict(X_test)

precision_knn = precision_score(y_test, y_pred_knn)
recall_knn = recall_score(y_test, y_pred_knn)
f1_knn = f1_score(y_test, y_pred_knn)
auc_knn = roc_auc_score(y_test, y_pred_knn)

for c in [0.1, 1.0, 10.0, 100, 1000]:
    lr = LogisticRegression(C=c, max_iter=1000)
    scores = cross_val_score(lr, X_train, y_train, cv=5)
    score = np.mean(scores)
    if score > best_lr_score:
        best_lr_score = score
        best_lr_parameters = {'C': c}
        best_lr_model = lr.fit(X_train, y_train)
        
y_pred_lr = best_lr_model.predict(X_test)

precision_lr = precision_score(y_test, y_pred_lr)
recall_lr = recall_score(y_test, y_pred_lr)
f1_lr = f1_score(y_test, y_pred_lr)
auc_lr = roc_auc_score(y_test, y_pred_lr)

results_df = pd.DataFrame({
    'Model': ['KNN', 'Logistic Regression'],
    'Precision': [precision_knn, precision_lr],
    'Recall': [recall_knn, recall_lr],
    'F1-score': [f1_knn, f1_lr],
    'AUC': [auc_knn, auc_lr]
})

print(results_df)


                 Model  Precision    Recall  F1-score       AUC
0                  KNN   0.955556  0.955556  0.955556  0.940042
1  Logistic Regression   0.988235  0.933333  0.960000  0.957233


En primer lugar se puede observar que en general los modelos clasifican bien las muestras positivas de cancer. Veamos con más detalle las métricas de cada modelo:
**KNN**
- Precisión (0.955556): el 95.56% de las muestras clasificadas como positivas por el modelo KNN eran realmente positivas.
- Recall (0.955556): el 95.56% de todas las muestras positivas en el conjunto de datos fueron identificadas correctamente por el modelo.
- F1-score (0.955556): hay un buen equilibrio entre precisión y recall.
- AUC (0.940042): el modelo tiene una buena capaciedad discriminativa.

**Regresión Logística**
- Precisión (0.965517): el 96.55% de las muestras clasificadas como positivas por el modelo de regresión logística eran realmente positivas.
- Recall (0.933333): el 93.33% de todas las muestras positivas en el conjunto de datos fueron identificadas correctamente por el modelo.
- F1-score (0.949153): hay un buen equilibrio entre precisión y recall.
- AUC (0.938365): el modelo tiene una buena capacidad de discriminación, aunque ligeramente menor que el modelo KNN.

En conclusión, dado que la nuestra métrica de interés es *recall* puesto que nos interesa que el modelo tenga una buena capacidad de clasificar las muestras positivas del conjunto de datos, se selecciona el modelo **KNN** ya que es el modelo con el recall más alto. Esto indica que es capaz de identificar más muestras positivas de cáncer de mama del conjunto de datos, lo cual es fundamental para que los casos positivos no pasen por alto.

## Boston Housing: (LinearRegression, Ridge, Lasso)

In [25]:
X, y = mglearn.datasets.load_extended_boston()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

### i) GridSearchCV y Pipeline

In [26]:
# Modelo de regresión lineal---------------------------------------------------------
linear = LinearRegression()

param_grid_linear = {
}

grid_linear = GridSearchCV(linear, param_grid=param_grid_linear, cv=5)
grid_linear.fit(X_train, y_train)
y_pred_linear = grid_linear.predict(X_test)

mape_linear = mean_absolute_percentage_error(y_test, y_pred_linear)
rmse_linear = np.sqrt(mean_squared_error(y_test, y_pred_linear))
r2_linear = r2_score(y_test, y_pred_linear)

results_linear = pd.DataFrame({
    'Model': ['LinearRegression'],
    'MAPE': [mape_linear],
    'RMSE': [rmse_linear],
    'R^2': [r2_linear]
})

# Modelo Ridge--------------------------------------------------------------------------
ridge = Ridge()

param_grid_ridge = {
    'alpha': [0.1, 1.0, 10.0, 100],
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
}

grid_ridge = GridSearchCV(ridge, param_grid=param_grid_ridge, cv=5)
grid_ridge.fit(X_train, y_train)
y_pred_ridge = grid_ridge.predict(X_test)

mape_ridge = mean_absolute_percentage_error(y_test, y_pred_ridge)
rmse_ridge = np.sqrt(mean_squared_error(y_test, y_pred_ridge))
r2_ridge = r2_score(y_test, y_pred_ridge)

results_ridge = pd.DataFrame({
    'Model': ['Ridge'],
    'MAPE': [mape_ridge],
    'RMSE': [rmse_ridge],
    'R^2': [r2_ridge]
})

# Modelo Lasso-----------------------------------------------------------------------------
lasso = Lasso()

param_grid_lasso = {
    'alpha': [0.1, 1.0, 10.0, 100]
}

grid_lasso = GridSearchCV(lasso, param_grid=param_grid_lasso, cv=5)
grid_lasso.fit(X_train, y_train)
y_pred_lasso = grid_lasso.predict(X_test)

mape_lasso = mean_absolute_percentage_error(y_test, y_pred_lasso)
rmse_lasso = np.sqrt(mean_squared_error(y_test, y_pred_lasso))
r2_lasso = r2_score(y_test, y_pred_lasso)

results_lasso = pd.DataFrame({
    'Model': ['Lasso'],
    'MAPE': [mape_lasso],
    'RMSE': [rmse_lasso],
    'R^2': [r2_lasso]
})


results_combined = pd.concat([results_linear, results_ridge, results_lasso])
results_combined.set_index('Model', inplace=True)

print("\nTabla de resultados:")
print(results_combined)



Tabla de resultados:
                      MAPE      RMSE       R^2
Model                                         
LinearRegression  0.154947  5.662962  0.607472
Ridge             0.140119  4.308045  0.772834
Lasso             0.163203  5.496564  0.630201


### ii) Manualmente

In [28]:
best_linear_score = -1
best_linear_parameters = {}

best_ridge_score = -1
best_ridge_parameters = {}

best_lasso_score = -1
best_lasso_parameters = {}

#Regresión lineal------------------------------------------------------
linear = LinearRegression()
scores = cross_val_score(linear, X_train, y_train, cv=5)
best_linear_score = np.mean(scores)
best_linear_model = linear.fit(X_train, y_train)

y_pred_linear = best_linear_model.predict(X_test)

mape_linear = mean_absolute_percentage_error(y_test, y_pred_linear)
rmse_linear = np.sqrt(mean_squared_error(y_test, y_pred_linear))
r2_linear = r2_score(y_test, y_pred_linear)


#Regresión Ridge------------------------------------------------------
for alpha in [0.1, 1.0, 10.0, 100]:
    ridge = Ridge(alpha=alpha)
    scores = cross_val_score(ridge, X_train, y_train, cv=5)
    score = np.mean(scores)
    if score > best_ridge_score:
        best_ridge_score = score
        best_ridge_parameters = {'alpha': alpha}
        best_ridge_model = ridge.fit(X_train, y_train)
        
y_pred_ridge = best_ridge_model.predict(X_test)

mape_ridge = mean_absolute_percentage_error(y_test, y_pred_ridge)
rmse_ridge = np.sqrt(mean_squared_error(y_test, y_pred_ridge))
r2_ridge = r2_score(y_test, y_pred_ridge)

#Regresión Lasso-------------------------------------------------------
for alpha in [0.1, 1.0, 10.0, 100]:
    lasso = Lasso(alpha=alpha)
    scores = cross_val_score(lasso, X_train, y_train, cv=5)
    score = np.mean(scores)
    if score > best_lasso_score:
        best_lasso_score = score
        best_lasso_parameters = {'alpha': alpha}
        best_lasso_model = lasso.fit(X_train, y_train)
        
y_pred_lasso = best_lasso_model.predict(X_test)

mape_lasso = mean_absolute_percentage_error(y_test, y_pred_lasso)
rmse_lasso = np.sqrt(mean_squared_error(y_test, y_pred_lasso))
r2_lasso = r2_score(y_test, y_pred_lasso)


results_df = pd.DataFrame({
    'Model': ['Linear Regression', 'Ridge', 'Lasso'],
    'MAPE': [mape_linear, mape_ridge, mape_lasso],
    'RMSE': [rmse_linear, rmse_ridge, rmse_lasso],
    'R^2': [r2_linear, r2_ridge, r2_lasso]
})

print(results_df)


               Model      MAPE      RMSE       R^2
0  Linear Regression  0.154947  5.662962  0.607472
1              Ridge  0.140289  4.313987  0.772207
2              Lasso  0.163203  5.496564  0.630201


A primera vista se puede observar que los modelos tienen un ajuste aceptable. Veamos cada métrica por separado:

- MAPE: el modelo de Ridge Regression tiene el menor error absoluto medio porcentual (0.140), seguido por el modelo de Linear Regression (0.155) y el modelo de Lasso Regression (0.163). Esto sugiere que el modelo de Ridge es el más preciso en términos de la diferencia entre las predicciones y los valores reales.

- RMSE:el modelo de Ridge Regression tiene el menor error cuadrático medio (4.308), seguido por el modelo de Linear Regression (5.663) y el modelo de Lasso Regression (5.497). Esto sugiere que el modelo de Ridge tiene la menor dispersión de errores.

- R²: el modelo de Ridge Regression tiene el R² más alto (0.773), seguido por el modelo de Lasso Regression (0.630) y el modelo de Linear Regression (0.607). Esto sugiere que el modelo de Ridge tiene el mejor ajuste, explicando más varianza en los datos en comparación con los otros modelos.

En resumen, el modelo **Ridge** es el mejor modelo de los 3, ya que tiene los mejores MAPE y RMSE, y el más alto R^2, lo que sugiere una mayor precisión y capacidad de ajuste con respecto a los otros modelos