---
title: "Caso Allianz"
format:
  html:
    embed-resources: true
---

## Librerías

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import kurtosis
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
#!pip install xgboost
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, f1_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler

import warnings 
warnings.filterwarnings("ignore")

## Funciones

In [None]:
def remove_outliers_iqr(df, columns):
    """Drop rows falling outside the 1.5 * IQR fences of the given columns.

    Columns are processed in order, and each column's quartiles are computed
    on the rows that survived the filters of the columns before it. Rows
    whose value is missing in a processed column are dropped too, because a
    missing value never satisfies the range test.

    Args:
        df: DataFrame to filter; it is not modified.
        columns: Iterable of numeric column names to screen for outliers.

    Returns:
        The filtered DataFrame (the input object itself when ``columns`` is
        empty).
    """
    filtered = df
    for name in columns:
        values = filtered[name]
        first_quartile = values.quantile(0.25)
        third_quartile = values.quantile(0.75)
        fence = 1.5 * (third_quartile - first_quartile)
        # Inclusive on both ends, matching ">= lower and <= upper"
        in_range = values.between(first_quartile - fence, third_quartile + fence)
        filtered = filtered[in_range]
    return filtered
  
def crear_surrogate_columnas(df, columnas):
    """Add a 0/1 missing-value indicator column for each listed column.

    For every name in ``columnas`` a column called ``<name>.surrogate`` is
    written into ``df``: 1 where the original value is null, 0 otherwise.

    Args:
        df: DataFrame to extend; it is modified in place.
        columnas: Iterable of column names (strings) to flag.

    Returns:
        The same ``df`` object, with the indicator columns added.
    """
    for nombre in columnas:
        indicador = df[nombre].isna().astype(int)
        df[nombre + '.surrogate'] = indicador
    return df

# Exploración de datos

In [None]:
# Load the raw dataset and print its column / dtype summary
df = pd.read_csv("data-raw/bd_allianz.csv")
df.info()

# Show the rows that exactly repeat an earlier row (first occurrences are
# not flagged by duplicated()) before discarding them
filas_repetidas = df.duplicated()
duplicadas = df[filas_repetidas]
print("\nFilas duplicadas:", duplicadas, sep="\n")

# Keep only the first occurrence of each row
df = df.drop_duplicates()

## Transformación de tipo de datos

In [None]:
# Broker_cor is read as text; strip the commas (presumably thousands
# separators -- confirm against the raw file) and parse it as float BEFORE
# the categorical cast, so it is not turned into a category and then
# immediately converted back.
df["Broker_cor"] = df["Broker_cor"].str.replace(',', '').astype("float64")

# Every remaining column that is not float64/int64 becomes categorical
convertir = df.select_dtypes(exclude=['float64', 'int64']).columns.tolist()

# Customer_ID and Is_direct_debit are treated as categorical as well.
# Only append them when missing: a duplicated label in the selection would
# break the column assignment below.
for extra in ('Customer_ID', "Is_direct_debit"):
    if extra not in convertir:
        convertir.append(extra)

# DataFrame.astype('category') converts each selected column independently
df[convertir] = df[convertir].astype('category')

# Report the final dtypes once all conversions are done
df.info()

In [None]:
# Default describe() summary of the frame
df.describe()

# Fraction of missing values per column (mean of the null mask)
df.isna().mean()

# Análisis exploratorio de datos

In [None]:
# Number of rows in each Is_direct_debit class
conteo_objetivo = df["Is_direct_debit"].value_counts()
conteo_objetivo

In [None]:
# Bar chart with one bar per Is_direct_debit class, labelled through the
# Axes object that seaborn returns
ax = sns.countplot(data=df, x="Is_direct_debit", hue="Is_direct_debit")
ax.set_xlabel("Is Direct Debit")
ax.set_ylabel("Count")
ax.set_title("Distribución de cuentas domiciliados")
plt.show()

In [None]:
# Names of the categorical columns, leaving out three columns that look
# like identifiers (presumably not useful as plotted categories -- confirm)
identificadores = ['Broker_account_number', 'Contract_number', 'Customer_ID']
cat_cols = df.select_dtypes(include="category").columns.drop(identificadores)

## Distribución de variables categóricas

In [None]:
#| label: gráficos
# Categorical columns whose distribution is compared across Is_direct_debit
columns = ["Broker_urbanization", "Customer_urbanization", "Broker_province",
           "Customer_province", "Customer_age", "Customer_type"]

# One subplot per column, stacked vertically. The row count follows the
# list above, and squeeze=False keeps `axs` an array even for a single
# column, so the loop below works for any list length.
fig, axs = plt.subplots(nrows=len(columns), ncols=1, figsize=(8, 20), squeeze=False)
axs = axs.ravel()

# Draw a per-class countplot for each column on its own axis
for ax, col in zip(axs, columns):
    sns.countplot(data=df, x=col, hue="Is_direct_debit", ax=ax)
    ax.set_title(f"Distribution of {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Count")
    # Slant the category labels so long names do not overlap
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')

# Extra padding between subplots so rotated labels do not run into titles
fig.tight_layout(h_pad=2, w_pad=2)

# Show the plots
plt.show()

## Distribución de variables numéricas

In [None]:
# Standardise Broker_cor with a scaler fitted on the whole column and keep
# the result alongside the original values
scaler = StandardScaler()
scaler.fit(df[["Broker_cor"]])
df["Broker_cor_scaled"] = scaler.transform(df[["Broker_cor"]])

In [None]:
# Violin plot of the standardised Broker_cor, split by Is_direct_debit
plt.figure(figsize=(10, 6))
ax = sns.violinplot(data=df, x="Broker_cor_scaled", hue="Is_direct_debit")
ax.set_title("Violin Plot of Scaled Broker_cor")
ax.set_xlabel("Broker_cor_scaled")
plt.show()

In [None]:
# Split the rows by target value; class 0 is treated as the majority class
df_majority = df[df['Is_direct_debit'] == 0]
df_minority = df[df['Is_direct_debit'] == 1]

# Upsample the minority class with replacement until it has as many rows as
# the majority class. The target size is taken from the data instead of a
# hard-coded count, so it stays correct if the input or the de-duplication
# changes.
# NOTE(review): resampling the full frame before any train/test split means
# copies of the same minority row can end up on both sides of a later split
# -- confirm this is intended.
df_minority_upsampled = resample(df_minority,
                                 replace=True,                  # sample with replacement
                                 n_samples=len(df_majority),    # match the majority class size
                                 random_state=0)                # reproducible results

# Combine the upsampled minority rows with the untouched majority rows
df = pd.concat([df_minority_upsampled, df_majority])

## ~~*Invención*~~ Balanceo de datos

In [None]:
# Bar chart of the Is_direct_debit class counts in the current frame
plt.figure(figsize=(10, 6))
ax = sns.countplot(data=df, x="Is_direct_debit", hue="Is_direct_debit")
ax.set_xlabel("Is Direct Debit")
ax.set_ylabel("Count")
ax.set_title("Distribución de cuentas domiciliados")
plt.show()

## One hot encoding

In [None]:
#| label: one hot encoding
# Categorical columns to expand into one indicator column per category
columnas_dummies = [
    "Broker_urbanization",
    "Customer_urbanization",
    "Broker_province",
    "Customer_province",
    "Customer_age",
    "Customer_type",
]
df_encoded = pd.get_dummies(df, columns=columnas_dummies)

# (rows, columns) of the encoded frame
df_encoded.shape

# Modelos

## 1. Definiendo vector de características (X) y variable target (y)

In [None]:
# Target vector: the direct-debit flag
y = df_encoded['Is_direct_debit']

# Feature matrix: every other column of the encoded frame.
# NOTE(review): unless they are dropped elsewhere, this still includes the
# identifier-like columns (Customer_ID, Contract_number,
# Broker_account_number) and both Broker_cor and Broker_cor_scaled --
# confirm that is intended before fitting a model.
X = df_encoded.drop(columns=['Is_direct_debit'])