In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv("Base.csv")
df.head()

Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,payment_type,zip_count_4w,...,has_other_cards,proposed_credit_limit,foreign_request,source,session_length_in_minutes,device_os,keep_alive_session,device_distinct_emails_8w,device_fraud_count,month
0,0,0.3,0.986506,-1,25,40,0.006735,102.453711,AA,1059,...,0,1500.0,0,INTERNET,16.224843,linux,1,1,0,0
1,0,0.8,0.617426,-1,89,20,0.010095,-0.849551,AD,1658,...,0,1500.0,0,INTERNET,3.363854,other,1,1,0,0
2,0,0.8,0.996707,9,14,40,0.012316,-1.490386,AB,1095,...,0,200.0,0,INTERNET,22.730559,windows,0,1,0,0
3,0,0.6,0.4751,11,14,30,0.006991,-1.863101,AB,3483,...,0,200.0,0,INTERNET,15.215816,linux,1,1,0,0
4,0,0.9,0.842307,-1,29,40,5.742626,47.152498,AA,2339,...,0,200.0,0,INTERNET,3.743048,other,0,1,0,0


In [4]:
df.describe()

Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,zip_count_4w,velocity_6h,...,phone_mobile_valid,bank_months_count,has_other_cards,proposed_credit_limit,foreign_request,session_length_in_minutes,keep_alive_session,device_distinct_emails_8w,device_fraud_count,month
count,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,...,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0
mean,0.011029,0.562696,0.493694,16.718568,86.587867,33.68908,1.025705,8.661499,1572.692049,5665.296605,...,0.889676,10.839303,0.222988,515.85101,0.025242,7.54494,0.576947,1.018312,0.0,3.288674
std,0.104438,0.290343,0.289125,44.04623,88.406599,12.025799,5.381835,20.236155,1005.374565,3009.380665,...,0.313293,12.116875,0.416251,487.559902,0.156859,8.033106,0.494044,0.180761,0.0,2.209994
min,0.0,0.1,1e-06,-1.0,-1.0,10.0,4.03686e-09,-15.530555,1.0,-170.603072,...,0.0,-1.0,0.0,190.0,0.0,-1.0,0.0,-1.0,0.0,0.0
25%,0.0,0.3,0.225216,-1.0,19.0,20.0,0.007193246,-1.181488,894.0,3436.365848,...,1.0,-1.0,0.0,200.0,0.0,3.103053,0.0,1.0,0.0,1.0
50%,0.0,0.6,0.492153,-1.0,52.0,30.0,0.01517574,-0.830507,1263.0,5319.769349,...,1.0,5.0,0.0,200.0,0.0,5.114321,1.0,1.0,0.0,3.0
75%,0.0,0.8,0.755567,12.0,130.0,40.0,0.02633069,4.984176,1944.0,7680.717827,...,1.0,25.0,0.0,500.0,0.0,8.866131,1.0,1.0,0.0,5.0
max,1.0,0.9,0.999999,383.0,428.0,90.0,78.4569,112.956928,6700.0,16715.565404,...,1.0,32.0,1.0,2100.0,1.0,85.899143,1.0,2.0,0.0,7.0


In [6]:
# Convertir variables categóricas a tipo 'category'
df['payment_type'] = df.payment_type.astype('category')
df['employment_status'] = df.employment_status.astype('category')
df['housing_status'] = df.housing_status.astype('category')
df['source'] = df.source.astype('category')
df['device_os'] = df.device_os.astype('category')

In [8]:
# Aplicar One-Hot Encoding
df = pd.get_dummies(df, drop_first=True)

In [10]:
# Separar variables predictoras (X) y variable objetivo (y)
X = df.drop(['fraud_bool'], axis=1)
y = df['fraud_bool']

In [12]:
# Separar datos de entrenamiento y test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [14]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(700000, 47) (300000, 47) (700000,) (300000,)


In [28]:
# Aplicar Borderline-SMOTE
smote = BorderlineSMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [30]:
# Escalar datos
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_smote)
X_test_scaled = scaler.transform(X_test)

In [32]:
# Entrenar modelo SVM lineal
svm_model = LinearSVC(max_iter=10000, random_state=42)
svm_model.fit(X_train_scaled, y_train_smote)

In [34]:
# Predicciones
y_pred = svm_model.predict(X_test_scaled)

In [36]:
# Evaluación
print("Matriz de Confusión:")
print(confusion_matrix(y_test, y_pred))

Matriz de Confusión:
[[280361  16330]
 [  2059   1250]]


In [38]:
print("\nReporte de Clasificación:")
print(classification_report(y_test, y_pred))


Reporte de Clasificación:
              precision    recall  f1-score   support

           0       0.99      0.94      0.97    296691
           1       0.07      0.38      0.12      3309

    accuracy                           0.94    300000
   macro avg       0.53      0.66      0.54    300000
weighted avg       0.98      0.94      0.96    300000



In [None]:
📊 Análisis Rápido del Resultado
Métrica	Antes de SMOTE	Con BorderlineSMOTE
Recall (1)	~0.00 - 0.04	0.38 ✅
Precision (1)	Alta pero engañosa	0.07 (más realista)
F1-score (1)	~0.00 - 0.06	0.12 (aumento claro)
Accuracy	98-99%	94% (esperado descenso)

In [None]:
¿Qué significa esto?
BorderlineSMOTE está generando mejores ejemplos sintéticos en los "bordes", lo cual ayuda al modelo a detectar fraudes, aunque aún con baja precisión.

Más falsos positivos (16330), pero más fraudes captados (1250 de 3309).