In [7]:
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Preprocesamiento

In [8]:
# Load the prepared dataset; the first CSV column is used as the row index.
df = pd.read_csv(filepath_or_buffer='../Data/df.csv', index_col=0)
# Preview the first three rows as the cell output.
df.head(n=3)

Unnamed: 0_level_0,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N,Warehouse_block_B,Warehouse_block_C,Warehouse_block_D,Warehouse_block_F,Mode_of_Shipment_Road,Mode_of_Shipment_Ship,Product_importance_low,Product_importance_medium,Gender_M
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,4,2,177,3,44,1233,1,False,False,True,False,False,False,True,False,False
2,4,5,216,2,59,3088,1,False,False,False,True,False,False,True,False,True
3,2,2,183,4,48,3374,1,False,False,False,False,False,False,True,False,True


In [9]:
# Print column dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10999 entries, 1 to 10999
Data columns (total 16 columns):
 #   Column                     Non-Null Count  Dtype
---  ------                     --------------  -----
 0   Customer_care_calls        10999 non-null  int64
 1   Customer_rating            10999 non-null  int64
 2   Cost_of_the_Product        10999 non-null  int64
 3   Prior_purchases            10999 non-null  int64
 4   Discount_offered           10999 non-null  int64
 5   Weight_in_gms              10999 non-null  int64
 6   Reached.on.Time_Y.N        10999 non-null  int64
 7   Warehouse_block_B          10999 non-null  bool 
 8   Warehouse_block_C          10999 non-null  bool 
 9   Warehouse_block_D          10999 non-null  bool 
 10  Warehouse_block_F          10999 non-null  bool 
 11  Mode_of_Shipment_Road      10999 non-null  bool 
 12  Mode_of_Shipment_Ship      10999 non-null  bool 
 13  Product_importance_low     10999 non-null  bool 
 14  Product_importance_medium  

In [10]:
# Class balance of the target column: number of rows per label.
df['Reached.on.Time_Y.N'].value_counts()

Reached.on.Time_Y.N
1    6563
0    4436
Name: count, dtype: int64

In [11]:
# Separate the target column (y) from the predictor columns (X).
y = df.loc[:, 'Reached.on.Time_Y.N']
X = df.drop('Reached.on.Time_Y.N', axis=1)

# FLAML

In [22]:
# Seed reused by the train/test split and by the FLAML and LazyClassifier runs below.
RANDOM_STATE = 22

# Hold-out split consumed by every modelling cell that follows. Without it,
# X_train / X_test / y_train / y_test are undefined on a fresh kernel.
# test_size=0.2 matches the 2,200-row test support in the recorded reports
# (out of 10,999 rows).
# NOTE(review): the seed of the original split is not recorded in this notebook,
# so the metrics will differ slightly from the saved outputs — confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

In [25]:
from flaml import AutoML
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Training: let FLAML search for a classifier within a 60-second budget.
# NOTE(review): FLAML documents `seed` (passed to fit below) as its reproducibility
# setting; the `random_state` constructor argument may be ignored — confirm.
automl = AutoML(random_state=RANDOM_STATE)
automl.fit(X_train, y_train, task="classification", time_budget=60, verbose=0, seed=RANDOM_STATE)

# Hard class predictions on the held-out set
y_pred = automl.predict(X_test)

# Class probabilities (optional; only if the selected model supports them)
y_proba = automl.predict_proba(X_test)

# Headline metrics; precision/recall/F1 are support-weighted over both classes
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"✅ Accuracy: {accuracy:.4f}")
print(f"✅ Precision: {precision:.4f}")
print(f"✅ Recall: {recall:.4f}")
print(f"✅ F1-score: {f1:.4f}")

# Per-class breakdown
print("\n--- Classification Report ---")
print(classification_report(y_test, y_pred))

# Confusion matrix (rows: true class, columns: predicted class)
print("\n--- Confusion Matrix ---")
print(confusion_matrix(y_test, y_pred))

# Best model found by the search
print("\n--- Mejor modelo ---")
print(automl.best_estimator)
print(automl.best_config)
# `best_loss` is the minimized validation loss of the search, not an accuracy.
# Per the FLAML docs the default metric for binary classification is
# 1 - ROC AUC (confirm for the installed version), so lower is better.
print(f"Best validation loss (lower is better): {automl.best_loss:.4f}")


✅ Accuracy: 0.6745
✅ Precision: 0.6953
✅ Recall: 0.6745
✅ F1-score: 0.6781

--- Classification Report ---
              precision    recall  f1-score   support

           0       0.57      0.72      0.64       872
           1       0.78      0.65      0.71      1328

    accuracy                           0.67      2200
   macro avg       0.67      0.68      0.67      2200
weighted avg       0.70      0.67      0.68      2200


--- Confusion Matrix ---
[[625 247]
 [469 859]]

--- Mejor modelo ---
xgboost
{'n_estimators': 4, 'max_leaves': 5, 'min_child_weight': np.float64(1.6452423122285615), 'learning_rate': np.float64(0.21955330141120502), 'subsample': 1.0, 'colsample_bylevel': 1.0, 'colsample_bytree': 1.0, 'reg_alpha': np.float64(0.0017957174943059567), 'reg_lambda': np.float64(0.7298481028528299)}
Best accuracy (cv): 0.2541


In [26]:
from flaml import AutoML
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

def evaluate_sampling(X_train, y_train, X_test, y_test, sampler_name, sampler,
                      time_budget=60, seed=None):
    """Resample the training data, fit FLAML on it and report test metrics.

    Args:
        X_train, y_train: Training features and labels; only these are resampled.
        X_test, y_test: Held-out features and labels, used as-is for scoring.
        sampler_name: Label printed in the report header.
        sampler: Object exposing ``fit_resample(X, y)`` (e.g. an imblearn sampler).
        time_budget: FLAML search budget in seconds (default 60).
        seed: Seed passed to ``AutoML.fit``; ``None`` falls back to the
            notebook-level ``RANDOM_STATE``.

    Returns:
        The fitted ``AutoML`` instance.
    """
    if seed is None:
        seed = RANDOM_STATE

    # Apply the resampling strategy to the training split only
    X_res, y_res = sampler.fit_resample(X_train, y_train)
    print(f"\n🔹 {sampler_name}: distribución original {Counter(y_train)} → nueva {Counter(y_res)}")

    # Fit the AutoML search on the resampled data
    search = AutoML()
    search.fit(X_res, y_res, task="classification", time_budget=time_budget, verbose=0, seed=seed)

    # Predict on the untouched test split
    y_pred = search.predict(X_test)

    # Support-weighted metrics; zero_division=0 matches the classification report
    # below and avoids warnings when a class is never predicted.
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    rec = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    print(f"✅ Accuracy: {acc:.4f} | Precision: {prec:.4f} | Recall: {rec:.4f} | F1: {f1:.4f}")
    print(classification_report(y_test, y_pred, zero_division=0))
    print(confusion_matrix(y_test, y_pred))
    print(f"Mejor modelo: {search.best_estimator}")

    return search


# Build both resamplers up front, then run the same FLAML evaluation on each.
# NOTE(review): the samplers are seeded with 42 while the rest of the notebook
# uses RANDOM_STATE — confirm this is intentional.
rus = RandomUnderSampler(random_state=42)
ros = RandomOverSampler(random_state=42)

# Random under-sampling run
automl_rus = evaluate_sampling(
    X_train, y_train, X_test, y_test,
    sampler_name="RandomUnderSampler", sampler=rus,
)

# Random over-sampling run
automl_ros = evaluate_sampling(
    X_train, y_train, X_test, y_test,
    sampler_name="RandomOverSampler", sampler=ros,
)



🔹 RandomUnderSampler: distribución original Counter({1: 5235, 0: 3564}) → nueva Counter({0: 3564, 1: 3564})
✅ Accuracy: 0.6809 | Precision: 0.8115 | Recall: 0.6809 | F1: 0.6710
              precision    recall  f1-score   support

           0       0.55      0.99      0.71       872
           1       0.98      0.48      0.65      1328

    accuracy                           0.68      2200
   macro avg       0.77      0.73      0.68      2200
weighted avg       0.81      0.68      0.67      2200

[[859  13]
 [689 639]]
Mejor modelo: rf

🔹 RandomOverSampler: distribución original Counter({1: 5235, 0: 3564}) → nueva Counter({1: 5235, 0: 5235})
✅ Accuracy: 0.6491 | Precision: 0.6578 | Recall: 0.6491 | F1: 0.6519
              precision    recall  f1-score   support

           0       0.55      0.62      0.58       872
           1       0.73      0.67      0.70      1328

    accuracy                           0.65      2200
   macro avg       0.64      0.64      0.64      2200
weight

In [None]:
import joblib 

# Persist both fitted FLAML searches next to the notebook for later reuse.
joblib.dump(value=automl_rus, filename='automl_rus.pkl')
joblib.dump(value=automl_ros, filename='automl_ros.pkl')

# LazyClassifier

In [31]:
# Under-sample the training split with the `rus` sampler created earlier;
# the test split is left untouched.
X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)

In [32]:
# Fit LazyClassifier's set of models on the under-sampled training data and
# score each one on the original test split.
clf = LazyClassifier(
    random_state=RANDOM_STATE,
    custom_metric=None,
    ignore_warnings=True,
    verbose=0,
)
models, predictions = clf.fit(X_train_rus, X_test, y_train_rus, y_test)

# Leaderboard of the fitted models
print(models)

  0%|          | 0/32 [00:00<?, ?it/s]

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
SVC                                0.68               0.73     0.73      0.66   
BernoulliNB                        0.68               0.72     0.72      0.67   
LGBMClassifier                     0.68               0.72     0.72      0.68   
QuadraticDiscriminantAnalysis      0.66               0.72     0.72      0.65   
RandomForestClassifier             0.68               0.71     0.71      0.67   
GaussianNB                         0.64               0.71     0.71      0.63   
AdaBoostClassifier                 0.64               0.70     0.70      0.62   
ExtraTreesClassifier               0.67               0.70     0.70      0.67   
NuSVC                              0.67               0.70     0.70      0.67   
BaggingClassifier                  0.67               0.70     0.70      0.67   
RidgeClassifierCV           

In [39]:
# Persist the fitted LazyClassifier object to disk.
joblib.dump(clf, 'lc.pkl')

['lc.pkl']

# PyCaret

In [None]:
from pycaret.classification import *
# Initialise the PyCaret experiment on this notebook's dataset: `df` still
# contains the target column 'Reached.on.Time_Y.N'.
# NOTE(review): presumably `setup` performs its own train/test split, so results
# are not directly comparable with the hold-out used above — confirm.
# NOTE(review): experiment_name 'juice1' looks carried over from another
# project; rename if the MLflow experiment should reflect this dataset.
clf1 = setup(
    data=df,
    target='Reached.on.Time_Y.N',
    session_id=123,
    log_experiment=True,
    experiment_name='juice1',
)