# Proyecto final grupo 7: Clasificación binaria (aprobar o denegar préstamos)
Realizada por:
- Jaime Benedí
- Miguel Sevilla

Objetivo: predecir si un préstamo será pagado en su totalidad o terminará en impago, para ayudar a la toma de decisiones de la SBA o de la entidad financiera.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score


## 1. Importar los datos de Kaggle
- Carga y visión general
- Estadísticas descriptivas: Media, mediana, desviación, percentiles para variables numéricas
- Identificar columnas con nulos
- Correlación y relaciones (matriz de correlación)

In [4]:
# Relative path of the SBA loans CSV file loaded by this notebook.
DATASET_PATH: str = './data/SBAnational.csv'

In [5]:
# Load the full CSV into a DataFrame. low_memory=False disables pandas'
# chunked parsing (see the read_csv documentation for its effect on dtype inference).
original_dataset = pd.read_csv(filepath_or_buffer=DATASET_PATH, low_memory=False)

# Trailing bare expression: the notebook renders a preview of the DataFrame.
original_dataset

Unnamed: 0,LoanNr_ChkDgt,Name,City,State,Zip,Bank,BankState,NAICS,ApprovalDate,ApprovalFY,...,RevLineCr,LowDoc,ChgOffDate,DisbursementDate,DisbursementGross,BalanceGross,MIS_Status,ChgOffPrinGr,GrAppv,SBA_Appv
0,1000014003,ABC HOBBYCRAFT,EVANSVILLE,IN,47711,FIFTH THIRD BANK,OH,451120,28-Feb-97,1997,...,N,Y,,28-Feb-99,"$60,000.00",$0.00,P I F,$0.00,"$60,000.00","$48,000.00"
1,1000024006,LANDMARK BAR & GRILLE (THE),NEW PARIS,IN,46526,1ST SOURCE BANK,IN,722410,28-Feb-97,1997,...,N,Y,,31-May-97,"$40,000.00",$0.00,P I F,$0.00,"$40,000.00","$32,000.00"
2,1000034009,"WHITLOCK DDS, TODD M.",BLOOMINGTON,IN,47401,GRANT COUNTY STATE BANK,IN,621210,28-Feb-97,1997,...,N,N,,31-Dec-97,"$287,000.00",$0.00,P I F,$0.00,"$287,000.00","$215,250.00"
3,1000044001,"BIG BUCKS PAWN & JEWELRY, LLC",BROKEN ARROW,OK,74012,1ST NATL BK & TR CO OF BROKEN,OK,0,28-Feb-97,1997,...,N,Y,,30-Jun-97,"$35,000.00",$0.00,P I F,$0.00,"$35,000.00","$28,000.00"
4,1000054004,"ANASTASIA CONFECTIONS, INC.",ORLANDO,FL,32801,FLORIDA BUS. DEVEL CORP,FL,0,28-Feb-97,1997,...,N,N,,14-May-97,"$229,000.00",$0.00,P I F,$0.00,"$229,000.00","$229,000.00"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
899159,9995573004,FABRIC FARMS,UPPER ARLINGTON,OH,43221,JPMORGAN CHASE BANK NATL ASSOC,IL,451120,27-Feb-97,1997,...,0,N,,30-Sep-97,"$70,000.00",$0.00,P I F,$0.00,"$70,000.00","$56,000.00"
899160,9995603000,FABRIC FARMS,COLUMBUS,OH,43221,JPMORGAN CHASE BANK NATL ASSOC,IL,451130,27-Feb-97,1997,...,Y,N,,31-Oct-97,"$85,000.00",$0.00,P I F,$0.00,"$85,000.00","$42,500.00"
899161,9995613003,"RADCO MANUFACTURING CO.,INC.",SANTA MARIA,CA,93455,"RABOBANK, NATIONAL ASSOCIATION",CA,332321,27-Feb-97,1997,...,N,N,,30-Sep-97,"$300,000.00",$0.00,P I F,$0.00,"$300,000.00","$225,000.00"
899162,9995973006,"MARUTAMA HAWAII, INC.",HONOLULU,HI,96830,BANK OF HAWAII,HI,0,27-Feb-97,1997,...,N,Y,8-Mar-00,31-Mar-97,"$75,000.00",$0.00,CHGOFF,"$46,383.00","$75,000.00","$60,000.00"


In [None]:
# Quick exploratory overview of the raw dataset.
# A notebook cell only renders its LAST bare expression, so every intermediate
# result is printed explicitly; as bare expressions they were computed and discarded.

# First rows
print(original_dataset.head())

# General information about the dataset (info() prints its report by itself)
original_dataset.info()

# Descriptive statistics
print(original_dataset.describe())

# The 20 columns with the most missing values
print(original_dataset.isnull().sum().sort_values(ascending=False).head(20))

# Number of columns per dtype
print(original_dataset.dtypes.value_counts())


## 2. Preparar el dataset
- Eliminar columnas irrelevantes
- Transformaciones (codificar variables categóricas)
- Convertir fechas a características (antigüedad del préstamo, días hasta cobro, etc.)
- Normalizar/Estandarizar


In [None]:
# Data preparation: drop unusable columns, encode the target, parse currency
# amounts, one-hot encode categorical features and standardize numeric ones.
from sklearn.preprocessing import StandardScaler

# 1. Drop uninformative or redundant columns.
#    The target (MIS_Status) must NOT be listed here: every later step needs it,
#    and dropping it made the dropna(subset=['MIS_Status']) below raise KeyError.
#    ChgOffPrinGr is dropped together with ChgOffDate: in the sample rows it is
#    non-zero only for the CHGOFF loan, so it would leak the target into the features.
columns_to_drop = ['Name', 'City', 'State', 'Zip', 'Bank', 'BankState', 'NAICS', 'ApprovalDate',
                   'DisbursementDate', 'ChgOffDate', 'ChgOffPrinGr', 'BalanceGross',
                   'LoanNr_ChkDgt', 'Address']
original_dataset = original_dataset.drop(columns=columns_to_drop, errors='ignore')

# 2. Missing values: drop the rows where the target (MIS_Status) is missing.
original_dataset = original_dataset.dropna(subset=['MIS_Status'])

# 3. Target encoding (paid in full -> 1 / charged off -> 0).
#    The raw file spells the paid label as 'P I F', so blanks are removed before
#    mapping; both 'P I F' and 'PIF' are therefore accepted.
status_labels = original_dataset['MIS_Status'].astype(str).str.replace(' ', '', regex=False)
original_dataset['MIS_Status'] = status_labels.map({'PIF': 1, 'CHGOFF': 0})
# Rows with an unrecognized label cannot be used for training.
original_dataset = original_dataset.dropna(subset=['MIS_Status'])
original_dataset['MIS_Status'] = original_dataset['MIS_Status'].astype('int64')

# 4. Currency amounts arrive as text such as "$60,000.00". Parse them to floats;
#    left as text they would be one-hot encoded into one dummy column per amount.
currency_cols = ['DisbursementGross', 'GrAppv', 'SBA_Appv']
for col in currency_cols:
    if col in original_dataset.columns and not pd.api.types.is_numeric_dtype(original_dataset[col]):
        cleaned = original_dataset[col].astype(str).str.replace(r'[$,]', '', regex=True)
        # Unparseable values become NaN and are removed in step 6.
        original_dataset[col] = pd.to_numeric(cleaned, errors='coerce')

# 5. Categorical encoding: turn the remaining object columns into dummy variables.
categorical_cols = original_dataset.select_dtypes(include=['object']).columns
original_dataset = pd.get_dummies(original_dataset, columns=categorical_cols, drop_first=True)

# 6. Standardize the numeric features (the target is excluded from scaling).
numeric_cols = (original_dataset
                .select_dtypes(include=['float64', 'int64'])
                .columns
                .difference(['MIS_Status']))
# Rows with missing numeric values are removed: the estimator trained later
# does not accept NaN inputs.
original_dataset = original_dataset.dropna(subset=numeric_cols)

# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split, so test-set statistics influence the scaling. Consider fitting it on the
# training partition only (e.g. inside a sklearn Pipeline).
scaler = StandardScaler()
original_dataset[numeric_cols] = scaler.fit_transform(original_dataset[numeric_cols])

# Report the final shape of the dataset
print(f"Dataset preparado con {original_dataset.shape[0]} filas y {original_dataset.shape[1]} columnas.")
original_dataset.head()



## 3. Aplicación y evaluación del modelo
- Selección del modelo
- Validación (entrenamiento)
- Métricas (Matriz de confusión, Accuracy, Precision, Recall, F1-score)

In [None]:
# Hold-out split: isolate the target column and reserve 20% of the rows for testing.
from sklearn.model_selection import train_test_split

# Target vector and feature matrix (every column except the target)
y = original_dataset['MIS_Status']
X = original_dataset.drop(columns='MIS_Status')

# Stratify on y so both partitions keep its class proportions;
# the fixed seed makes the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f'Tamaño de Train: {X_train.shape}')
print(f'Tamaño de Test: {X_test.shape}')


In [None]:
# Baseline classifier: fit a logistic regression and evaluate it on the test split.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# 1. Train the logistic regression model
logreg = LogisticRegression(max_iter=1000, random_state=42)
logreg.fit(X_train, y_train)

# 2. Predictions: hard labels for the confusion matrix / classification report,
#    and the probability of class 1 for the ranking metric.
y_pred_logreg = logreg.predict(X_test)
# predict_proba columns follow logreg.classes_ (sorted), so column 1 is label 1.
y_proba_logreg = logreg.predict_proba(X_test)[:, 1]

# 3. Evaluate the model
print("Resultados - Regresión Logística:")
print(confusion_matrix(y_test, y_pred_logreg))
print(classification_report(y_test, y_pred_logreg))
# ROC-AUC must be computed from scores, not from thresholded 0/1 predictions;
# with hard labels the curve collapses to a single operating point.
print(f"ROC-AUC: {roc_auc_score(y_test, y_proba_logreg):.4f}")

## 4. Técnicas XAI