<a href="https://colab.research.google.com/github/ryyutku/DSGP/blob/anuk/Modelling/Model%207/Demand_forecast_model_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install pycaret



In [31]:
import pandas as pd
from pycaret.time_series import *
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [42]:
df = pd.read_csv('ciec_data.csv')

In [43]:
df.shape

(2820, 28)

In [44]:
df['date'] = pd.to_datetime(df['date'])

In [45]:
# finding the columns that have the values of data ranging from 2000 to 2014, 2004 to 2014

In [46]:
def filter_data(df, date, start_year, end_year, threshold=0.05):
  df_filtered = df[(df['date'].dt.year >= start_year) & (df['date'].dt.year <= end_year)]

  missing_ratio = df_filtered.isnull().mean()
  columns_to_keep = missing_ratio[missing_ratio <= threshold].index
  df_filtered = df_filtered[columns_to_keep]

  return df_filtered

In [47]:
# values from 2000 to 2014
df1 = filter_data(df, 'date', 2000, 2014)

In [48]:
df1.shape

(783, 15)

## **Training a model out of this**

### **For time series**

In [14]:
df = df1.sort_values(by='date').set_index('date')

In [22]:
# Initializing the pycaret time sries setup
setup(data=df1, target= 'fuel_consumption', session_id=123)

best_model = compare_models()
print(best_model)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,fuel_consumption
2,Approach,Univariate
3,Exogenous Variables,Present
4,Original data shape,"(783, 15)"
5,Transformed data shape,"(783, 15)"
6,Transformed train set shape,"(782, 15)"
7,Transformed test set shape,"(1, 15)"
8,Rows with missing values,0.0%
9,Fold Generator,ExpandingWindowSplitter


Unnamed: 0,Model,MASE,RMSSE,MAE,RMSE,MAPE,SMAPE,TT (Sec)
croston,Croston,0.8774,0.0863,0.0005,0.0005,0.0005,0.0005,0.0367


Processing:   0%|          | 0/85 [00:00<?, ?it/s]

Croston()


In [28]:
# evaluate_model(best_model)

In [25]:
evaluation_results = pull()
print("Evaluation results:\n", evaluation_results)

Evaluation results:
            Model    MASE   RMSSE     MAE    RMSE    MAPE   SMAPE  TT (Sec)
croston  Croston  0.8774  0.0863  0.0005  0.0005  0.0005  0.0005    0.0367


In [38]:
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_absolute_error, mean_squared_error

df = df1
# Assuming df is your dataframe and fuel_consumption is your target column
train_size = int(len(df) * 0.8)  # Split data 80-20 (train-test)
train_df = df[:train_size]
test_df = df[train_size:]

y_train = train_df['fuel_consumption']
y_test = test_df['fuel_consumption']

# ARIMA Model
arima_model = ARIMA(y_train, order=(5, 1, 0))  # Adjust p, d, q for tuning
arima_model_fit = arima_model.fit()

# SARIMA Model
sarima_model = SARIMAX(y_train, order=(5, 1, 0), seasonal_order=(1, 1, 1, 12))  # Tune order
sarima_model_fit = sarima_model.fit()

# Croston Method - Custom Implementation
def croston_method(y_train, forecast_steps=12):
    # Croston's method assumes intermittent demand (non-zero values appear at random intervals)
    demand = y_train[y_train > 0]
    time = np.arange(len(demand))

    # Estimate demand and interval components
    model = ARIMA(demand, order=(1, 1, 0))
    model_fit = model.fit()

    # Forecast demand and interval separately
    demand_forecast = model_fit.forecast(steps=forecast_steps)

    return demand_forecast

# Apply Croston method (forecast 12 steps ahead)
croston_forecast = croston_method(y_train, forecast_steps=len(y_test))

# Forecast using the models
y_train_pred_arima = arima_model_fit.predict(start=0, end=len(y_train)-1)
y_train_pred_sarima = sarima_model_fit.predict(start=0, end=len(y_train)-1)

y_test_pred_arima = arima_model_fit.predict(start=len(y_train), end=len(y_train)+len(y_test)-1)
y_test_pred_sarima = sarima_model_fit.predict(start=len(y_train), end=len(y_train)+len(y_test)-1)

# Evaluate Models
def evaluate_model(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    return mae, rmse, mape

# Train and Test Evaluation
train_mae_arima, train_rmse_arima, train_mape_arima = evaluate_model(y_train, y_train_pred_arima)
train_mae_sarima, train_rmse_sarima, train_mape_sarima = evaluate_model(y_train, y_train_pred_sarima)
train_mae_croston, train_rmse_croston, train_mape_croston = evaluate_model(y_train, croston_forecast[:len(y_train)])

test_mae_arima, test_rmse_arima, test_mape_arima = evaluate_model(y_test, y_test_pred_arima)
test_mae_sarima, test_rmse_sarima, test_mape_sarima = evaluate_model(y_test, y_test_pred_sarima)
test_mae_croston, test_rmse_croston, test_mape_croston = evaluate_model(y_test, croston_forecast[len(y_train):])

# Print results
print("Training Set Metrics:")
print(f"ARIMA - MAE: {train_mae_arima}, RMSE: {train_rmse_arima}, MAPE: {train_mape_arima}%")
print(f"SARIMA - MAE: {train_mae_sarima}, RMSE: {train_rmse_sarima}, MAPE: {train_mape_sarima}%")
print(f"Croston - MAE: {train_mae_croston}, RMSE: {train_rmse_croston}, MAPE: {train_mape_croston}%")

print("\nTest Set Metrics:")
print(f"ARIMA - MAE: {test_mae_arima}, RMSE: {test_rmse_arima}, MAPE: {test_mape_arima}%")
print(f"SARIMA - MAE: {test_mae_sarima}, RMSE: {test_rmse_sarima}, MAPE: {test_mape_sarima}%")
print(f"Croston - MAE: {test_mae_croston}, RMSE: {test_rmse_croston}, MAPE: {test_mape_croston}%")


ValueError: Found input variables with inconsistent numbers of samples: [626, 157]

In [39]:
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Assuming df is your dataframe and 'fuel_consumption' is the target column
# Convert 'date' column to datetime if it's not already
df['date'] = pd.to_datetime(df['date'])

# Set the 'date' column as the index
df.set_index('date', inplace=True)

# Split the data into train and test sets (80% for training and 20% for testing)
train_size = int(len(df) * 0.8)
train, test = df[:train_size], df[train_size:]

# Target variable: 'fuel_consumption'
y_train = train['fuel_consumption']
y_test = test['fuel_consumption']

# Train ARIMA model (choose (p,d,q) based on your data or through hyperparameter tuning)
arima_model = ARIMA(y_train, order=(5, 1, 0))  # Example (p=5, d=1, q=0)
arima_model_fit = arima_model.fit()

# Predict on training data
train_pred = arima_model_fit.predict(start=0, end=len(y_train)-1)

# Predict on testing data
test_pred = arima_model_fit.predict(start=len(y_train), end=len(y_train)+len(y_test)-1)

# Evaluate model performance on training set
def evaluate_model(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    return mae, rmse, mape

# Calculate metrics for training set
train_mae, train_rmse, train_mape = evaluate_model(y_train, train_pred)

# Calculate metrics for testing set
test_mae, test_rmse, test_mape = evaluate_model(y_test, test_pred)

# Print results
print("Training Set Metrics:")
print(f"MAE: {train_mae}, RMSE: {train_rmse}, MAPE: {train_mape}%")

print("\nTesting Set Metrics:")
print(f"MAE: {test_mae}, RMSE: {test_rmse}, MAPE: {test_mape}%")

# Display the first few predictions for inspection
print("\nTraining Predictions:")
print(train_pred.head())

print("\nTesting Predictions:")
print(test_pred.head())


Training Set Metrics:
MAE: 0.0018194888180486036, RMSE: 0.03347711535356901, MAPE: 0.2161833190578769%

Testing Set Metrics:
MAE: 0.045337579617834356, RMSE: 0.046380041831102024, MAPE: 5.003885029773028%

Training Predictions:
2000-01-03    0.000
2000-01-10    0.827
2000-01-17    0.827
2000-01-24    0.827
2000-01-31    0.827
Freq: W-MON, Name: predicted_mean, dtype: float64

Testing Predictions:
2012-01-02    0.939
2012-01-09    0.939
2012-01-16    0.939
2012-01-23    0.939
2012-01-30    0.939
Freq: W-MON, Name: predicted_mean, dtype: float64


In [50]:
# Import the necessary library
from pycaret.regression import *

# Initialize PyCaret setup with your dataset and target column
# Replace 'fuel_demand' with your target variable column name
setup(data=df1, target='fuel_consumption', session_id=123)

# Compare all models available in PyCaret
# It will automatically train and compare several regression models
best_model = compare_models()

# View the results of the best model (based on R2 or other metrics)
print(best_model)

# Evaluate the best model
evaluate_model(best_model)

# Predict on the testing data (if you have a separate test set)
# Here, we're using the best model for predictions
predictions = predict_model(best_model)

# You can also check the performance metrics directly
# In this case, PyCaret will show the performance metrics like MAE, MSE, R2, etc.


Unnamed: 0,Description,Value
0,Session id,123
1,Target,fuel_consumption
2,Target type,Regression
3,Original data shape,"(783, 14)"
4,Transformed data shape,"(783, 14)"
5,Transformed train set shape,"(548, 14)"
6,Transformed test set shape,"(235, 14)"
7,Numeric features,13
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,0.0,0.0,0.0001,1.0,0.0001,0.0,0.128
xgboost,Extreme Gradient Boosting,0.0001,0.0,0.0002,0.9999,0.0001,0.0001,0.053
gbr,Gradient Boosting Regressor,0.0,0.0,0.0002,0.9999,0.0001,0.0,0.105
dt,Decision Tree Regressor,0.0,0.0,0.0001,0.9998,0.0001,0.0,0.035
lightgbm,Light Gradient Boosting Machine,0.0001,0.0,0.0002,0.9998,0.0001,0.0001,0.092
rf,Random Forest Regressor,0.0,0.0,0.0002,0.9998,0.0001,0.0,0.283
ada,AdaBoost Regressor,0.0055,0.0001,0.0079,0.962,0.0043,0.0065,0.039
lr,Linear Regression,0.0088,0.0001,0.0108,0.9318,0.0058,0.0103,0.793
br,Bayesian Ridge,0.0088,0.0001,0.0108,0.9318,0.0058,0.0103,0.024
knn,K Neighbors Regressor,0.007,0.0003,0.0161,0.8493,0.0085,0.0078,0.029


Processing:   0%|          | 0/81 [00:00<?, ?it/s]

ExtraTreesRegressor(n_jobs=-1, random_state=123)


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…