In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# --- Load and prepare the raw dairy sales data ------------------------------
# Reads the CSV, rescales the price columns, derives calendar features from
# the date columns, encodes 'Farm Size' as an integer and finally drops the
# columns that are not used for modelling. Result: `df_clean`.
df = pd.read_csv('dairy_dataset.csv')

# Both price columns are rescaled by the same fixed factor. Keeping the factor
# in a single named constant guarantees the two columns cannot drift apart.
# NOTE(review): presumably an INR -> BRL exchange rate (the dataset carries an
# "(INR)" revenue column and the final message mentions Reais) -- confirm the
# value before reusing it.
PRICE_CONVERSION_RATE = 0.06
for price_col in ('Price per Unit', 'Price per Unit (sold)'):
    df[price_col] = df[price_col] * PRICE_CONVERSION_RATE

# Parse the date columns so the `.dt` accessor and date arithmetic work below.
df['Date'] = pd.to_datetime(df['Date'])
df['Expiration Date'] = pd.to_datetime(df['Expiration Date'])

# Calendar features derived from 'Date'.
df['Month'] = df['Date'].dt.month
df['Year'] = df['Date'].dt.year
df['Day_of_Week'] = df['Date'].dt.dayofweek
# Whole days between 'Date' and 'Expiration Date'.
df['Days_to_Expire'] = (df['Expiration Date'] - df['Date']).dt.days

# Encode farm size as an ordered integer.
size_map = {'Small': 0, 'Medium': 1, 'Large': 2}
farm_size_raw = df['Farm Size']
farm_size_code = farm_size_raw.map(size_map)

# Series.map() yields NaN for every label that is not a key of `size_map`,
# without any warning. Fail fast on labels that are present but unmapped
# (typos, different casing, new categories) instead of letting NaN leak into
# the model. Values that were already missing in the CSV are left untouched.
unmapped_labels = farm_size_raw[farm_size_code.isna() & farm_size_raw.notna()].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        "Unexpected 'Farm Size' labels (not in size_map): "
        f"{sorted(map(str, unmapped_labels))}"
    )
df['Farm Size'] = farm_size_code

# Columns removed before modelling: revenue/value totals, identifiers, the raw
# date columns (already turned into features above) and the stock quantity.
cols_drop = ['Approx. Total Revenue(INR)', 'Total Value', 'Product ID',
             'Date', 'Production Date', 'Expiration Date',
             'Quantity in Stock (liters/kg)']
df_clean = df.drop(columns=cols_drop)

# Separate the regression target from the predictor columns.
target_col = 'Quantity Sold (liters/kg)'
y = df_clean[target_col]
X = df_clean.drop(columns=[target_col])

# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Columns that are one-hot encoded.
categorical_features = [
    'Location',
    'Product Name',
    'Brand',
    'Storage Condition',
    'Customer Location',
    'Sales Channel',
]

# Columns that are standardised.
numeric_features = [
    'Total Land Area (acres)',
    'Number of Cows',
    'Farm Size',
    'Quantity (liters/kg)',
    'Price per Unit',
    'Price per Unit (sold)',
    'Shelf Life (days)',
    'Minimum Stock Threshold (liters/kg)',
    'Reorder Quantity (liters/kg)',
    'Month',
    'Day_of_Week',
    'Year',
    'Days_to_Expire',
]

# One (name, transformer, columns) entry per column group. Only the columns
# listed above are handed to a transformer.
numeric_step = ('num', StandardScaler(), numeric_features)
# handle_unknown='ignore' is set so categories not seen during fit do not
# raise at transform time.
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Preprocessing and the random-forest regressor chained into one estimator.
# The step names are referenced by the `regressor__...` keys of the grid below.
model_rf = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42)),
])

# 2 x 3 = 6 hyper-parameter combinations to evaluate.
param_grid = dict(
    regressor__n_estimators=[50, 100],
    regressor__max_depth=[10, 20, None],
)

# 3-fold cross-validated search scored by (negated) RMSE, using all CPU cores.
grid = GridSearchCV(
    estimator=model_rf,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=3,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

# Keep the pipeline that the search reports as best.
best_rf = grid.best_estimator_

# Score the selected pipeline on the held-out test split.
y_pred = best_rf.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
print(f"RMSE Final: {rmse:.2f}")

# Persist the fitted pipeline (preprocessing + regressor) to disk.
joblib.dump(value=best_rf, filename='modelo_final_laticinios.pkl')
print("Modelo salvo (Inglês/Reais)!")

RMSE Final: 165.09
Modelo salvo (Inglês/Reais)!
