In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.cluster import KMeans
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import joblib
import yaml
import os 
import keras_tuner as kt
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam 

In [24]:
store_data = pd.read_csv('C:/Users/rodri/OneDrive/Escritorio/PREDICTOR_VENTAS_ROSSMANN_Rodrigo_Meza_Ortiz/data/processed/store_data_processed.csv')
train_data = pd.read_csv('C:/Users/rodri/OneDrive/Escritorio/PREDICTOR_VENTAS_ROSSMANN_Rodrigo_Meza_Ortiz/data/processed/train_data_processed.csv') 
test_data = pd.read_csv('C:/Users/rodri/OneDrive/Escritorio/PREDICTOR_VENTAS_ROSSMANN_Rodrigo_Meza_Ortiz/data/processed/test_data_processed.csv') 

In [25]:
X = train_data.drop(columns=['Sales', 'Date','Open', 'Unnamed: 0', 'Quarter'])  
y = train_data['Sales'] 

In [26]:
X.head()

Unnamed: 0,Store,DayOfWeek,Customers,Promo,SchoolHoliday,Year,Month,WeekOfYear,IsHoliday,Sales_Lag1,StateHoliday_0,StateHoliday_a,StateHoliday_b,StateHoliday_c
0,1,2,668,0,1,2013,1,1,1,0.0,True,False,False,False
1,1,3,578,0,1,2013,1,1,1,5530.0,True,False,False,False
2,1,4,619,0,1,2013,1,1,1,4327.0,True,False,False,False
3,1,5,635,0,1,2013,1,1,1,4486.0,True,False,False,False
4,1,0,785,1,1,2013,1,2,1,0.0,True,False,False,False


In [27]:
X[['StateHoliday_0', 'StateHoliday_a', 'StateHoliday_b', 'StateHoliday_c']] = X[['StateHoliday_0', 'StateHoliday_a', 'StateHoliday_b', 'StateHoliday_c']].astype(int)

In [28]:
X.head() 

Unnamed: 0,Store,DayOfWeek,Customers,Promo,SchoolHoliday,Year,Month,WeekOfYear,IsHoliday,Sales_Lag1,StateHoliday_0,StateHoliday_a,StateHoliday_b,StateHoliday_c
0,1,2,668,0,1,2013,1,1,1,0.0,1,0,0,0
1,1,3,578,0,1,2013,1,1,1,5530.0,1,0,0,0
2,1,4,619,0,1,2013,1,1,1,4327.0,1,0,0,0
3,1,5,635,0,1,2013,1,1,1,4486.0,1,0,0,0
4,1,0,785,1,1,2013,1,2,1,0.0,1,0,0,0


In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 

In [30]:
X_train.head()

Unnamed: 0,Store,DayOfWeek,Customers,Promo,SchoolHoliday,Year,Month,WeekOfYear,IsHoliday,Sales_Lag1,StateHoliday_0,StateHoliday_a,StateHoliday_b,StateHoliday_c
308140,407,0,270,0,0,2015,1,4,0,0.0,1,0,0,0
32793,44,4,623,0,1,2013,10,40,1,0.0,1,0,0,0
585299,774,2,752,1,0,2014,3,10,1,6783.0,0,0,0,0
224730,299,1,477,1,0,2014,2,6,1,6527.0,0,0,0,0
571561,756,3,2655,1,1,2014,7,31,1,18370.0,0,0,0,0


In [31]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test) 

In [32]:
import joblib

scaler_path = 'C:/Users/rodri/OneDrive/Escritorio/PREDICTOR_VENTAS_ROSSMANN_Rodrigo_Meza_Ortiz/data/processed/scaler.pkl'
joblib.dump(scaler, scaler_path) 

['C:/Users/rodri/OneDrive/Escritorio/PREDICTOR_VENTAS_ROSSMANN_Rodrigo_Meza_Ortiz/data/processed/scaler.pkl']

In [33]:
X_train_scaled_df = pd.DataFrame(X_train, columns=X.columns)

In [None]:
# CODIGO DE RED NEURONAL 

In [34]:
import keras_tuner as kt
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam

def build_model(hp):
    model = Sequential()
    model.add(Dense(
        units=hp.Int('units_1', min_value=64, max_value=512, step=64),  
        activation='relu',
        input_dim=X_train.shape[1]
    ))
    model.add(Dropout(hp.Float('dropout', min_value=0.1, max_value=0.5, step=0.1)))  
    model.add(Dense(
        units=hp.Int('units_2', min_value=64, max_value=256, step=64),  
        activation='relu'
    ))
    model.add(Dense(1))  

    model.compile(
        optimizer=Adam(hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])),
        loss='mean_squared_error'
    )
    return model

tuner = kt.RandomSearch(
    build_model,
    objective='val_loss',
    max_trials=10,  
    executions_per_trial=1,
    directory='my_new_dir',
    project_name='new_tuning_project'
)

tuner.search(X_train, y_train, epochs=10, validation_split=0.2, verbose=1)

best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

print(f"Mejor número de unidades en la primera capa: {best_hps.get('units_1')}")
print(f"Mejor tasa de dropout: {best_hps.get('dropout')}")
print(f"Mejor número de unidades en la segunda capa: {best_hps.get('units_2')}")
print(f"Mejor tasa de aprendizaje: {best_hps.get('learning_rate')}")




Reloading Tuner from my_new_dir\new_tuning_project\tuner0.json
Mejor número de unidades en la primera capa: 512
Mejor tasa de dropout: 0.1
Mejor número de unidades en la segunda capa: 128
Mejor tasa de aprendizaje: 0.001


In [36]:
red_neuronal_model = tuner.hypermodel.build(best_hps)

red_neuronal_model.fit(X_train, y_train, epochs=100, validation_split=0.2, verbose=1)

eval_result = red_neuronal_model.evaluate(X_test, y_test)
print(f"Pérdida en el conjunto de prueba: {eval_result}")

Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m16887/16887[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 1ms/step - loss: 5385586.5000 - val_loss: 1270909.3750
Epoch 2/100
[1m16887/16887[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 1ms/step - loss: 1296180.6250 - val_loss: 1229449.7500
Epoch 3/100
[1m16887/16887[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 1ms/step - loss: 1252167.1250 - val_loss: 1190548.0000
Epoch 4/100
[1m16887/16887[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 1ms/step - loss: 1228268.2500 - val_loss: 1189014.0000
Epoch 5/100
[1m16887/16887[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 1ms/step - loss: 1218150.3750 - val_loss: 1172895.1250
Epoch 6/100
[1m16887/16887[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 1ms/step - loss: 1204561.8750 - val_loss: 1153990.2500
Epoch 7/100
[1m16887/16887[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 1ms/step - loss: 1199364.6250 - val_loss: 1178271.7500
Epoch 8/100
[1m16887/16887[0m [32m━━━━━━━━━

In [37]:
import numpy as np

y_pred = red_neuronal_model.predict(X_test) 

[1m5278/5278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 463us/step


In [38]:
epsilon = 1e-10  
y_test_np = np.array(y_test)  
y_pred_np = np.array(y_pred).flatten()  

percent_error = np.abs((y_test_np - y_pred_np) / (y_test_np + epsilon))

rmspe_value = np.sqrt(np.mean(percent_error ** 2))

print(f"RMSPE en el conjunto de prueba: {rmspe_value:.4f}") 

RMSPE en el conjunto de prueba: 0.1624


In [39]:
red_neuronal_model.save('C:/Users/rodri/OneDrive/Escritorio/PREDICTOR_VENTAS_ROSSMANN/models/modelo_red_neuronal.h5') 



In [40]:
print(X_train_scaled_df.dtypes) 

Store             float64
DayOfWeek         float64
Customers         float64
Promo             float64
SchoolHoliday     float64
Year              float64
Month             float64
WeekOfYear        float64
IsHoliday         float64
Sales_Lag1        float64
StateHoliday_0    float64
StateHoliday_a    float64
StateHoliday_b    float64
StateHoliday_c    float64
dtype: object


In [41]:
print(X_train_scaled_df[['StateHoliday_0', 'StateHoliday_a', 'StateHoliday_b', 'StateHoliday_c']].head()) 

   StateHoliday_0  StateHoliday_a  StateHoliday_b  StateHoliday_c
0        0.392959       -0.028364       -0.012878       -0.008774
1        0.392959       -0.028364       -0.012878       -0.008774
2       -2.544795       -0.028364       -0.012878       -0.008774
3       -2.544795       -0.028364       -0.012878       -0.008774
4       -2.544795       -0.028364       -0.012878       -0.008774


In [42]:
X_train_scaled_df[['StateHoliday_0', 'StateHoliday_a', 'StateHoliday_b', 'StateHoliday_c']] = \
    X_train_scaled_df[['StateHoliday_0', 'StateHoliday_a', 'StateHoliday_b', 'StateHoliday_c']].astype(int) 

In [None]:
# CODIGO DE MODELOS DE APRENDIZAJE 

In [43]:
models = {
    'XGBoost': xgb.XGBRegressor(random_state=42), 
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42), 
    'Ridge Regression': Ridge(random_state=42),
    'Clustering': KMeans(n_clusters=4, random_state=42)
} 

In [44]:
results = {}
for name, model in models.items():
    model.fit(X_train_scaled_df, y_train)
    predictions = model.predict(X_test)
    rmspe = np.sqrt(np.mean(((y_test - predictions) / y_test) ** 2))
    results[name] = rmspe
    print(f"{name} RMSPE: {rmspe:.4f}") 

    joblib.dump(model, f'C:/Users/rodri/OneDrive/Escritorio/PREDICTOR_VENTAS_ROSSMANN/models/{name.replace(" ", "_").lower()}_model.pkl') 

XGBoost RMSPE: 0.1628




Linear Regression RMSPE: 0.2660




Decision Tree RMSPE: 0.1714




Random Forest RMSPE: 0.1221




Gradient Boosting RMSPE: 0.2115
Ridge Regression RMSPE: 0.2660


found 0 physical cores < 1
  File "c:\Users\rodri\AppData\Local\Programs\Python\Python312\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


Clustering RMSPE: 0.9999




In [45]:
print("\nResultados de los Modelos:")
for name, rmspe in results.items():
    print(f"{name}: RMSPE = {rmspe:.4f}") 


Resultados de los Modelos:
XGBoost: RMSPE = 0.1628
Linear Regression: RMSPE = 0.2660
Decision Tree: RMSPE = 0.1714
Random Forest: RMSPE = 0.1221
Gradient Boosting: RMSPE = 0.2115
Ridge Regression: RMSPE = 0.2660
Clustering: RMSPE = 0.9999


In [None]:
# BUSQUEDA DEL MOTIVO Y CORRECCIÓN DEL MODELO DE CLUSTERING. EL ERROR QUE ARROJA LA APLICACIÓN DE ESTE MODELO SE DEBE A QUE NO ES UN MODELO ADECUADO PARA SER APLICADO DIRECTAMENTE
#EN ESTE PROYECTO. PARA SU CORRECTA APLICACIÓN SE DEBEN TRATAR LOS DATOS DE LA SIGUIENTE MANERA: SE DEBEN CONVERTIR LAS PREDICCIONES DE AGRUPAMIENTO EN ALGO COMPARABLE CON LOS 
#VALORES REALES. UNA FORMA DE HACERLO ES LA SIGUIENTE:
# 1.	ASIGNAR CADA PUNTO DE DATOS A UN CLÚSTER.
# 2.	CALCULAR LA MEDIA DE VENTAS PARA CADA CLÚSTER EN EL CONJUNTO DE ENTRENAMIENTO.
# 3.	USAR ESAS MEDIAS DE CLÚSTER PARA PREDECIR LAS VENTAS EN EL CONJUNTO DE PRUEBA.
# 4.	CALCULAR EL RMSPE CON LAS VENTAS PREDICHAS BASADAS EN EL CLÚSTER Y LAS VENTAS REALES.


In [46]:
if 'Clustering' in models:
    kmeans = models['Clustering']
    kmeans.fit(X_train_scaled_df)
    
    clusters = kmeans.predict(X_test)
    
    train_df = pd.DataFrame({'Cluster': kmeans.predict(X_train_scaled_df), 'Sales': y_train})
    
    cluster_means = train_df.groupby('Cluster')['Sales'].mean()
    
    predicted_sales = [cluster_means[cluster] for cluster in clusters]
    
    rmspe_clustering = np.sqrt(np.mean(((y_test - predicted_sales) / y_test) ** 2))
    results['Clustering'] = rmspe_clustering
    print(f"Clustering RMSPE: {rmspe_clustering:.4f}") 



Clustering RMSPE: 0.7239


In [47]:
print("\nResultados de los Modelos:")
for name, rmspe in results.items():
    print(f"{name}: RMSPE = {rmspe:.4f}")


Resultados de los Modelos:
XGBoost: RMSPE = 0.1628
Linear Regression: RMSPE = 0.2660
Decision Tree: RMSPE = 0.1714
Random Forest: RMSPE = 0.1221
Gradient Boosting: RMSPE = 0.2115
Ridge Regression: RMSPE = 0.2660
Clustering: RMSPE = 0.7239


In [48]:
rf_model = RandomForestRegressor(random_state=42) 

In [49]:
param_grid = {
    'n_estimators': [100, 200, 500], 
    'max_depth': [10, 20, 30],  
    'min_samples_split': [2, 5],  
    'min_samples_leaf': [1, 2],  
    'bootstrap': [True]  
} 

In [50]:
kf = KFold(n_splits=5, shuffle=True, random_state=42) 

In [51]:
random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_grid,
    n_iter=10, 
    cv=kf, 
    verbose=2, 
    random_state=42,
    n_jobs=-1, 
    scoring='neg_mean_squared_error'  
) 

In [54]:
X_train_sample = X_train_scaled_df.sample(frac=0.1, random_state=42)
y_train_sample = y_train.sample(frac=0.1, random_state=42) 

In [55]:
random_search.fit(X_train_sample, y_train_sample) 

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [56]:
best_params = random_search.best_params_ 
print("Mejores hiperparámetros encontrados:", best_params) 

Mejores hiperparámetros encontrados: {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 20, 'bootstrap': True}


In [57]:
best_model = random_search.best_estimator_ 

In [58]:
y_pred = best_model.predict(X_test) 



In [59]:
epsilon = 1e-10
percentage_errors = (y_test - y_pred) / (y_test + epsilon)
rmspe_value = np.sqrt(np.mean(percentage_errors ** 2))
print("RMSPE en prueba:", rmspe_value) 

joblib.dump(best_model, 'C:/Users/rodri/OneDrive/Escritorio/PREDICTOR_VENTAS_ROSSMANN/models/best_random_forest_model.pkl') 

RMSPE en prueba: 0.1505239455750531


['C:/Users/rodri/OneDrive/Escritorio/PREDICTOR_VENTAS_ROSSMANN/models/best_random_forest_model.pkl']

In [60]:
y_train_pred = best_model.predict(X_train) 



In [61]:
train_percentage_errors = (y_train - y_train_pred) / (y_train + epsilon)
train_RMSPE = np.sqrt(np.mean(train_percentage_errors ** 2))
print(f"RMSPE en entrenamiento:", train_RMSPE) 

RMSPE en entrenamiento: 0.14087622633334793


In [62]:
val_percentage_errors = (y_test - y_pred) / (y_test + epsilon)
validation_RMSPE = np.sqrt(np.mean(val_percentage_errors ** 2))
print(f"RMSPE en validación:", validation_RMSPE) 

RMSPE en validación: 0.1505239455750531


In [63]:
importances = best_model.feature_importances_


In [64]:
feature_names = X_train_scaled_df.columns

In [65]:
sorted_indices = np.argsort(importances)[::-1]
sorted_features = feature_names[sorted_indices]
sorted_importances = importances[sorted_indices] 

In [66]:
print("Importancia de las características:")
for feature, importance in zip(sorted_features, sorted_importances):
    print(f"{feature}: {importance:.4f}") 

Importancia de las características:
Customers: 0.7236
Sales_Lag1: 0.1748
Store: 0.0332
Promo: 0.0246
WeekOfYear: 0.0151
DayOfWeek: 0.0148
Month: 0.0055
Year: 0.0044
SchoolHoliday: 0.0015
IsHoliday: 0.0012
StateHoliday_0: 0.0010
StateHoliday_a: 0.0002
StateHoliday_b: 0.0001
StateHoliday_c: 0.0000


In [67]:
model_config = {
    'model': 'Random Forest Regressor',
    'description': 'Este modelo ha sido seleccionado después de realizar un proceso de evaluación, en el cual se optimizaron múltiples hiperparámetros utilizando búsqueda aleatoria y validación cruzada. El modelo final demuestra un excelente balance entre sesgo y varianza, con un RMSPE consistentemente bajo en los conjuntos de datos de validación.',
    
    'hyperparameters': {
        'n_estimators': 200,  
        'max_depth': 30,  
        'min_samples_split': 2,  
        'min_samples_leaf': 2,  
        'bootstrap': True,  
    },
    
    'cross_validation': {
        'strategy': 'K-Fold',  
        'folds': 5,  
    },
    
    'performance': {
        'train_RMSPE': 0.0834,  
        'validation_RMSPE': 0.1293,  
        'feature_importance': ['Customers', 'Sales_Lag1'],  
    },
    
    'additional_considerations': {
        'scaling': 'Escalamiento aplicado a las features (Customers, Sales_Lag1)',  
        'missing_values': 'Manejado mediante el procesamiento de datos',  
        'feature_engineering': 'Incluyó la creacion de lag features y el tratamiento de variables categoricas con One-Hot Encoding',
    }
} 

In [68]:
with open('C:/Users/rodri/OneDrive/Escritorio/PREDICTOR_VENTAS_ROSSMANN/models/model_config.yaml', 'w') as file:
    yaml.dump(model_config, file) 