## Bibliotecas

In [None]:
# Install the notebook's third-party dependencies with the %pip magic.
# NOTE(review): versions are not pinned, and plotly_express, missingno and
# scipy are not imported by this notebook's import cell - confirm they are
# still needed before keeping them here.
%pip install pandas
%pip install plotly_express
%pip install missingno
%pip install scipy
%pip install scikit-learn

## Importações

In [2]:
from math import sqrt
from pathlib import Path

import pandas as pd
from joblib import dump
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline

## Carregamento de Dados

In [3]:
# Read the Mobly dataset from its comma-separated file and display it
df = pd.read_csv(
    "../documentos/outros/base_de_dados/mobly_data_official.csv",
    sep=",",
)
df

Unnamed: 0,weekday_name,sku,unit_price,shipment_type,anchor_category,product_department,product_category,origin_country,process_costing,sku_color,price_status,winning_price,items_sold,avg_website_visits_last_week,stock_qty,month,day,year,value_dollar,rate_employability
0,6,549,1099.99,0,24,3,11,0,0,12,3,1488.07,0,17.285714,0,1,1,2020,4.0949,12.4
1,6,669,413.99,0,13,4,9,0,1,9,3,392.58,0,48.857143,56,1,1,2020,4.0949,12.4
2,6,745,949.99,0,10,4,12,0,1,17,3,1110.69,1,22.714286,0,1,1,2020,4.0949,12.4
3,6,129,2949.99,0,24,4,11,0,1,21,3,3457.04,0,2.800000,0,1,1,2020,4.0949,12.4
4,6,697,657.99,0,17,4,9,0,1,31,2,529.00,0,0.000000,0,1,1,2020,4.0949,12.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
740089,1,48,504.88,0,2,4,9,0,1,6,2,339.00,0,41.000000,61,7,3,2023,4.8222,8.0
740090,1,182,1717.95,0,23,4,11,0,1,12,4,1717.95,0,154.142857,1,7,3,2023,4.8222,8.0
740091,1,109,1659.88,0,5,4,9,0,1,22,2,1093.68,0,138.000000,16,7,3,2023,4.8222,8.0
740092,1,184,909.96,0,9,4,12,0,1,5,2,899.96,0,32.714286,20,7,3,2023,4.8222,8.0


## Agrupamento dos dados por semana

In [4]:
# Add a "week" column holding a sequential week index counted from
# 2020-01-01: rows dated 0-6 days after the start fall in week 1,
# 7-13 days after in week 2, and so on.
# The calendar date is assembled a single time from the year/month/day
# columns; no intermediate week-of-year value is computed because only
# this sequential index is ever stored in the column.
start_date = pd.Timestamp('2020-01-01')
row_dates = pd.to_datetime(df[['year', 'month', 'day']])
df['week'] = ((row_dates - start_date).dt.days // 7) + 1

# Display the DataFrame with the new column
df

Unnamed: 0,weekday_name,sku,unit_price,shipment_type,anchor_category,product_department,product_category,origin_country,process_costing,sku_color,...,winning_price,items_sold,avg_website_visits_last_week,stock_qty,month,day,year,value_dollar,rate_employability,week
0,6,549,1099.99,0,24,3,11,0,0,12,...,1488.07,0,17.285714,0,1,1,2020,4.0949,12.4,1
1,6,669,413.99,0,13,4,9,0,1,9,...,392.58,0,48.857143,56,1,1,2020,4.0949,12.4,1
2,6,745,949.99,0,10,4,12,0,1,17,...,1110.69,1,22.714286,0,1,1,2020,4.0949,12.4,1
3,6,129,2949.99,0,24,4,11,0,1,21,...,3457.04,0,2.800000,0,1,1,2020,4.0949,12.4,1
4,6,697,657.99,0,17,4,9,0,1,31,...,529.00,0,0.000000,0,1,1,2020,4.0949,12.4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
740089,1,48,504.88,0,2,4,9,0,1,6,...,339.00,0,41.000000,61,7,3,2023,4.8222,8.0,183
740090,1,182,1717.95,0,23,4,11,0,1,12,...,1717.95,0,154.142857,1,7,3,2023,4.8222,8.0,183
740091,1,109,1659.88,0,5,4,9,0,1,22,...,1093.68,0,138.000000,16,7,3,2023,4.8222,8.0,183
740092,1,184,909.96,0,9,4,12,0,1,5,...,899.96,0,32.714286,20,7,3,2023,4.8222,8.0,183


## Modelagem

#### Preparação dos dados


In [5]:
# Separate the prediction target (items_sold) from the feature columns
y = df['items_sold']
X = df.drop(columns=['items_sold'])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is random, so rows from the same week can end up in
# both sets - confirm that is acceptable for this dated dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

#### Random Forest com Random Search (6 iterações)

In [6]:
# Hyperparameter search space; every key carries the 'model__' prefix so it
# addresses the pipeline step named 'model'
param_rf = dict(
    model__n_estimators=[50, 100, 200],
    model__max_depth=[None, 10, 20, 30],
    model__min_samples_split=[2, 5, 10],
    model__min_samples_leaf=[1, 2, 4],
)

# Base Random Forest regressor; the fixed seed keeps its training repeatable
model_rf = RandomForestRegressor(
    random_state=42,
)

In [7]:
# Wrap the forest in a single-step pipeline; the step name 'model' is what
# the 'model__*' keys of param_rf refer to
rf_pipeline = Pipeline([
    ('model', model_rf)
])

# Randomized search over param_rf: 6 sampled hyperparameter combinations,
# each scored with 5-fold cross-validation on negated mean squared error,
# using all available cores. random_state pins which combinations are
# sampled so that rerunning the notebook evaluates the same candidates.
random_6 = RandomizedSearchCV(
    estimator=rf_pipeline,
    param_distributions=param_rf,
    n_iter=6,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    random_state=42
)

# Run the hyperparameter search on the training split
random_6.fit(X_train, y_train)

# Take the best pipeline found by the search and predict on the test split
best_rf = random_6.best_estimator_
y_pred_random_6 = best_rf.predict(X_test)

In [8]:
# Score the test-set predictions of the model selected by the random search
mae_random_6 = mean_absolute_error(y_test, y_pred_random_6)
mse_random_6 = mean_squared_error(y_test, y_pred_random_6)
rmse_random_6 = sqrt(mse_random_6)
r2_random_6 = r2_score(y_test, y_pred_random_6)

# Report the four metrics, one per line
print(
    f"Erro Absoluto Médio (MAE): {mae_random_6}",
    f"Erro Quadrático Médio (MSE): {mse_random_6}",
    f"Raiz do Erro Quadrático Médio (RMSE): {rmse_random_6}",
    f"R2 Score: {r2_random_6}",
    sep="\n",
)

Erro Absoluto Médio (MAE): 1.4409560567980622
Erro Quadrático Médio (MSE): 8.148012957097013
Raiz do Erro Quadrático Médio (RMSE): 2.854472448123648
R2 Score: 0.7342191364892479


## Exportando o modelo final

O modelo treinado (pipeline com os melhores hiperparâmetros encontrados) é serializado com `joblib`, para que possa ser recarregado e usado em previsões futuras sem repetir o treinamento.

In [11]:
# Persist the selected pipeline to disk so it can be reloaded later.
# The destination folder is created first (no-op when it already exists)
# so the export does not fail on a checkout where it is missing.
model_path = '../notebooks/outros/modelos/mobly-model-rf_v1.joblib'
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
dump(best_rf, model_path)

['mobly-model-rf_v1.joblib']