# Regresión logística
La documentación sobre el modelo la obtenemos de:  
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

from preprocessing import cargarDatasets
from preprocessing import prepararSet
from preprocessing import ingenieriaDeFeaturesVariablesNormalizadas
from preprocessing import ingenieriaDeFeauturesVariablesNormalizadasME

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score,mean_squared_error
from sklearn.model_selection import train_test_split,GridSearchCV

## Carga y preprocesamiento de los datos

Cargamos los datasets y preparamos los datos para entrenar la regresión.

In [2]:
# Load the training and hold-out datasets; the raw training frame is then
# run through `prepararSet` before any feature engineering.
train_raw_df, final_df = cargarDatasets()
train_df = prepararSet(train_raw_df)

In [3]:
# First feature set (normalized variables, judging by the helper's name).
# 10% of the rows are held out as a stratified test split.
X, y, df, y_encoder = ingenieriaDeFeaturesVariablesNormalizadas(train_df)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    random_state=117,
)

# Second feature set (presumably the mean-encoded variant; the helper also
# returns a `meanEncoding` object). Same split settings as above.
(
    X_2,
    y_2,
    df_2,
    y_encoder_2,
    meanEncoding,
) = ingenieriaDeFeauturesVariablesNormalizadasME(train_df)
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(
    X_2,
    y_2,
    test_size=0.1,
    stratify=y_2,
    random_state=117,
)

Queremos encontrar los mejores hiperparámetros para la regresión. Con estos hiperparámetros buscamos minimizar el error en las predicciones, construyendo un modelo robusto y regularizado que logre generalizar a datos futuros.

Parámetros:
* C: qué tanto regularizamos. Similar al C de SVM: cuanto más pequeño es este valor, más fuerte será la regularización.  
* penalty: tipo de regularización; puede ser l1 (lasso), l2 (ridge) o elastic net.


In [4]:
# Regularization strengths to explore (smaller C = stronger regularization).
C_VALUES = [0.001, 0.005, 0.01, 0.05, 0.1, 0.3, 1, 3, 5, 10, 30, 50, 55, 75, 100, 120, 150]

# Hyperparameter grid for GridSearchCV, given as a list of sub-grids.
#
# LogisticRegression's default solver (lbfgs) only supports the l2 penalty, so
# asking for 'l1' or 'elasticnet' without changing the solver makes those fits
# fail and they are scored as NaN. Each penalty is therefore paired with a
# solver that supports it:
#   * l2         -> default solver (same candidates as before)
#   * l1         -> saga
#   * elasticnet -> saga, which also requires an l1_ratio
# saga converges more slowly than lbfgs, hence the larger iteration budget.
params = [
    {
        'penalty': ['l2'],
        'C': C_VALUES,
    },
    {
        'penalty': ['l1'],
        'solver': ['saga'],
        'max_iter': [1000],
        'C': C_VALUES,
    },
    {
        'penalty': ['elasticnet'],
        'solver': ['saga'],
        'l1_ratio': [0.25, 0.5, 0.75],
        'max_iter': [1000],
        'C': C_VALUES,
    },
]

In [None]:
# Grid search over `params` on the first feature set, scored by ROC AUC with
# 2-fold cross-validation and using all available cores.
regresion1 = LogisticRegression(random_state=0)

gscv1 = GridSearchCV(
    estimator=regresion1,
    param_grid=params,
    scoring='roc_auc',
    n_jobs=-1,
    cv=2,
    return_train_score=True,
)
gscv1.fit(X_train, y_train)

print(f"Best score: {gscv1.best_score_}")
print(f"Best params {gscv1.best_params_}")

Obtenemos:  
Best score: 0.8930353283762277  
Best params {'C': 0.3, 'penalty': 'l2'}

In [None]:
# Same grid search as for the first regression, run on the second feature set
# so the two preprocessing variants can be compared by ROC AUC.
regresion2 = LogisticRegression(random_state=0)

gscv2 = GridSearchCV(
    estimator=regresion2,
    param_grid=params,
    scoring='roc_auc',
    n_jobs=-1,
    cv=2,
    return_train_score=True,
)
gscv2.fit(X_train_2, y_train_2)

print(f"Best score: {gscv2.best_score_}")
print(f"Best params {gscv2.best_params_}")

Obtenemos:  
Best score: 0.888072427164029  
Best params {'C': 0.1, 'penalty': 'l2'}

Nos quedamos con la primera regresión, para la cual obtuvimos un ROC AUC ligeramente mejor.

In [None]:
# Best hyperparameters reported for the first regression (gscv1), which is the
# model being fitted here: {'C': 0.3, 'penalty': 'l2'}. The previous value
# (C=0.1) was the result of the second regression.
# Stored under its own name so the search grid in `params` is not overwritten
# and the grid-search cells can still be re-run afterwards.
best_params = {'C': 0.3, 'penalty': 'l2'}

# Refit on the full training split and predict the held-out test split.
regresion1.set_params(**best_params)
regresion1.fit(X_train, y_train)
y_pred = regresion1.predict(X_test)

## Evaluación de métricas  
Las métricas a utilizar son:
* AUC-ROC  
* Matriz de confusión  
* Accuracy  
* Precisión  
* Recall 

Primero observamos las métricas obtenidas de las predicciones con el set de prueba.

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the test split.
report = classification_report(y_test, regresion1.predict(X_test))
print(report)

### Curva AUC ROC

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# `plot_roc_curve` was removed from scikit-learn in version 1.2. On newer
# releases fall back to `RocCurveDisplay.from_estimator`, which accepts the
# same (estimator, X, y, response_method=...) arguments, so existing calls to
# `plot_roc_curve` keep working on both old and new versions.
try:
    from sklearn.metrics import plot_roc_curve
except ImportError:
    from sklearn.metrics import RocCurveDisplay

    plot_roc_curve = RocCurveDisplay.from_estimator

In [None]:
# ROC curve of the fitted model on the test split, using predicted
# probabilities as the decision scores.
plot_roc_curve(
    regresion1,
    X_test,
    y_test,
    response_method='predict_proba',
)
plt.show()

### Matriz de confusión

In [None]:
# `plot_confusion_matrix` was removed from scikit-learn in version 1.2. On
# newer releases fall back to `ConfusionMatrixDisplay.from_estimator`, which
# accepts the same estimator/X/y positional arguments and the `cmap`,
# `display_labels` and `ax` keywords used below.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    from sklearn.metrics import ConfusionMatrixDisplay

    plot_confusion_matrix = ConfusionMatrixDisplay.from_estimator

# Confusion matrix of the fitted model on the test split, drawn on a wide
# figure with the grid disabled so the cell annotations stay readable.
fig, ax = plt.subplots(figsize=(15, 7))
plt.grid(False)
plot_confusion_matrix(
    regresion1,
    X_test,
    y_test,
    cmap=plt.cm.Blues,
    display_labels=['no tiene alto valor adquisitivo', 'tiene alto valor adquisitivo'],
    ax=ax,
)
plt.show()

### Accuracy

In [None]:
# Fraction of test samples whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

### Precisión

In [None]:
# Precision of the positive class on the test split: TP / (TP + FP).
precision_score(y_true=y_test, y_pred=y_pred)

### Recall

In [None]:
# Recall of the positive class on the test split: TP / (TP + FN).
recall_score(y_true=y_test, y_pred=y_pred)

## Predicciones sobre el set de hold out

In [None]:
from preprocessing import prepararSetDeHoldOutRegresion

In [None]:
# Build the model inputs from the hold-out set and predict its labels.
final_df_copy = prepararSetDeHoldOutRegresion(final_df)
ho_prediction = regresion1.predict(final_df_copy)

# Attach the predictions to a new frame instead of adding the column to
# `final_df` in place. That way `final_df` keeps its original columns and this
# cell can be re-run without feeding the prediction column back into the
# hold-out preprocessing. The CSV written is the same as before: the original
# columns followed by the prediction column.
final_predictions_df = final_df.assign(tiene_alto_valor_adquisitivo=ho_prediction)
final_predictions_df.to_csv('predicciones/regresion.csv')