<a href="https://colab.research.google.com/github/ryyutku/DSGP/blob/anuk/Modelling/Model%206/Demand_forecast_model_6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Training model without differencing or any other augmentation and transformations**

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.arima.model import ARIMA
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import pickle

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [None]:
df = pd.read_csv('CIEC.csv')

In [None]:
df.columns

In [None]:
df['date'] = pd.to_datetime(df['date'])

In [None]:
# Column names of interest: the 'date' column, the 'fuel_consumption' column (used as the
# target further down) and the remaining indicator columns.
# NOTE(review): this list is not referenced again in the visible notebook — confirm whether it
# was meant to subset `df`.
features = ['date','fuel_consumption', 'petroleum_imports_crudeOil',
       'Taxes_on_Customs_and_Other_Import Duties',
       'Foreign Direct Investments', 'GDP Goods and Services',
       'GDP: Gross National Income', 'Government Debt',
       'New Vehicle Registrations', 'Vehicle Sales', 'Port Stay Duration',
       'Vehicle Sales Asia', 'No.of Vessels Colombo',
       'Imports of Refined Products', 'Colombo port calls',
       'Tax income profits_gains', 'Tax on Export', 'Tax Goods & Services',
       'Tax Road Transport', 'GDP FCE Households', 'Diesel User Price',
       'Petrol User Price', 'Consumption_Oil', 'Sales 90 Octane',
       'Sales 95 Octane', 'Sales Auto Diesel', 'Household_income',
       'Fuel_other_manufacture']

In [None]:
df_original = df.copy()

## **Checking the time frame with the most columns available**

In [None]:
print(df.isnull().sum())

In [None]:
len(df.columns)

In [None]:
df.dtypes

# Exploratory Data Analysis

## **Outlier detection**

In [None]:
#Detecting outliers using Zscore
import pandas as pd
import numpy as np
from scipy.stats import zscore

def detect_outliers_zscore(df, feature, threshold=2, time_column='date'):
    """Flag rows whose value in ``feature`` has an absolute z-score above ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is never modified.
    feature : str
        Column to analyse.
    threshold : float, default 2
        Absolute z-score above which a row is flagged as an anomaly.
    time_column : str, default 'date'
        Column reported alongside each anomaly.

    Returns
    -------
    anomalies : pandas.DataFrame or None
        The flagged rows with the columns ``time_column``, ``feature``, ``'zscore'`` and
        ``feature + '_isanomaly'``; ``None`` when ``feature`` is not numeric.
    df : pandas.DataFrame
        A copy of the input. For a numeric ``feature`` the rows where it is missing are
        removed and the columns ``'zscore'`` and ``feature + '_isanomaly'`` are added.
        ``'zscore'`` is overwritten on every call, so it only describes the feature processed
        last.
    """
    dtype = df[feature].dtype

    # pandas' dtype helpers also understand extension dtypes (category, string, nullable
    # integers), on which np.issubdtype raises TypeError. Booleans stay excluded, as before.
    is_numeric = pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype)
    if not is_numeric:
        print(f"Skipping non-numeric column: {feature}")
        return None, df.copy()

    # Rows without a value cannot be scored. The explicit copy makes sure the helper columns
    # below are written to an independent frame and never to the caller's data.
    df = df.dropna(subset=[feature]).copy()

    flag_column = feature + '_isanomaly'
    df['zscore'] = zscore(df[feature].to_numpy(dtype=float))
    df[flag_column] = df['zscore'].abs() > threshold

    # Report only the flagged rows, restricted to the columns needed to inspect them
    anomalies = df.loc[df[flag_column], [time_column, feature, 'zscore', flag_column]]

    return anomalies, df



In [None]:
# Work on a copy so the frame loaded from the CSV stays untouched
processed_df = df.copy()

# Thread the frame through the detector one column at a time; each call returns the frame
# that is handed to the next call.
for feature in list(df.columns):
    print("----", feature, "----")
    anomalies, processed_df = detect_outliers_zscore(processed_df, feature, threshold=2)

    if anomalies is None:
        # Nothing to show for a column the detector skipped
        print()
        continue

    print(f"Feature: {feature}")
    print(anomalies.head())
    print()

In [None]:
processed_df.columns

### **Removing the outliers**

In [None]:
# Flag columns of the features whose outlier rows are removed
outlier_flags = [
    'New Vehicle Registrations_isanomaly',
    'No.of Vessels Colombo_isanomaly',
    'Imports of Refined Products_isanomaly',
    'Tax Road Transport_isanomaly',
    'Petrol User Price_isanomaly',
    'Sales 90 Octane_isanomaly',
]

# Apply the filters one after another, keeping only the rows that are not flagged
for outlier_flag in outlier_flags:
    processed_df = processed_df[processed_df[outlier_flag] == False]

### **Test running a model**

In [None]:
# Remove the helper columns added by detect_outliers_zscore: the per-feature '_isanomaly'
# flags and the 'zscore' scratch column. 'zscore' only holds the z-scores of the feature that
# was processed last; left in place it would be scaled and handed to the model as a predictor.
helper_columns = [col for col in processed_df.columns if '_isanomaly' in col or col == 'zscore']
processed_df = processed_df.drop(columns=helper_columns)

In [None]:
# Scaling
from sklearn.preprocessing import StandardScaler

# Standardise every column except 'date' and the 'fuel_consumption' target
features_to_scale = processed_df.columns.drop(['date', 'fuel_consumption'])

scaler = StandardScaler()
scaled_values = scaler.fit_transform(processed_df[features_to_scale])
processed_df[features_to_scale] = scaled_values

In [None]:
pip install pycaret

In [None]:
# NOTE(review): the star import pulls every public pycaret.time_series name into the notebook
# namespace; only setup() and compare_models() are used in this cell.
from pycaret.time_series import *

# Initialize PyCaret for time series
# NOTE(review): the output recorded below this cell is
# "ValueError: You must pass a freq argument as current index has none." Presumably the 'date'
# index has no regular frequency once the outlier rows were filtered out above — confirm, and
# give the data a regular frequency before calling setup().
ts_exp = setup(
    processed_df,
    target='fuel_consumption',
    index='date',
    session_id=123
)

# Compare models
best_model = compare_models()


ValueError: You must pass a freq argument as current index has none.

## **ADF Test**

In [None]:
from statsmodels.tsa.stattools import adfuller

def stationary_test(df, feature):
    """Run the Augmented Dickey-Fuller test on one series and print the outcome.

    Parameters
    ----------
    df : pandas.DataFrame
        Not read by the test; kept so existing calls keep working.
    feature : pandas.Series
        Series to test. Missing values are dropped before the test.

    Returns
    -------
    tuple
        ``(adf_results, is_stationary, stationary_check)``: ``adf_results`` is a dict with the
        keys ``'ADF Statistic'``, ``'p-value'`` and ``'Critical Values'``; ``is_stationary`` is
        True when the p-value is below 0.05; ``stationary_check`` is True when the ADF
        statistic lies below the 5% critical value.
    """
    # The frame is never read, so it is no longer copied on every call.
    result = adfuller(feature.dropna())
    adf_statistic, p_value, critical_values = result[0], result[1], result[4]

    adf_results = {
        'ADF Statistic': adf_statistic,
        'p-value': p_value,
        'Critical Values': critical_values,
    }

    # Two views on the same decision: the null hypothesis (non-stationary series) is rejected
    # when the p-value is below 0.05, or when the statistic is below the 5% critical value.
    is_stationary = bool(p_value < 0.05)
    stationary_check = bool(adf_statistic < critical_values['5%'])

    # Output results with the determination of stationarity
    print(f"ADF Statistic: {adf_statistic}")
    print(f"p-value: {p_value}")
    print(f"Critical Values: {critical_values}")
    print(f"Is Stationary Based on p-value: {is_stationary}")
    print(f"Is Stationary Based on ADF Statistic vs Critical Value: {stationary_check}")
    print()

    return adf_results, is_stationary, stationary_check

In [None]:
# Test the numeric columns only (the same selection the rolling-statistics loop uses): the
# ADF test is not meaningful for 'date' or any other non-numeric column.
for feature in df.select_dtypes(include=['number']).columns:
  print(f"Performing stationary test for {feature}")
  adf_results = stationary_test(df,df[feature])
  print(adf_results)
  print()

Differencing needs to be applied to the data because it is non-stationary.

## **Rolling statistics**

In [None]:
import matplotlib.pyplot as plt

def rolling_stats(df, feature, window=4):
    """Plot a column together with its rolling mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is never modified.
    feature : str
        Column to plot.
    window : int, default 4
        Number of consecutive rows averaged for the rolling mean.

    Returns
    -------
    None
        The figure is shown as a side effect. Non-numeric columns are reported and skipped.
    """
    column_dtype = df[feature].dtype

    # Accept every numeric dtype (int32, float32, nullable integers, ...) instead of only
    # int64/float64. Booleans are still skipped, as before.
    if not pd.api.types.is_numeric_dtype(column_dtype) or pd.api.types.is_bool_dtype(column_dtype):
        print(f"Skipping {feature}: Not a numeric column")
        return

    # Build a small two-column frame for plotting rather than copying the whole DataFrame
    rolling_column = feature + "_rmean"
    plot_data = df[[feature]].copy()
    plot_data[rolling_column] = plot_data[feature].rolling(window=window).mean()

    # Plot original feature and rolling mean
    plot_data.plot(figsize=(6, 3), title="Rolling Mean of " + feature)
    plt.show()

In [None]:
# Iterate only over numeric columns
for feature in df.select_dtypes(include=['number']).columns:
    rolling_stats(df, feature)

## **ACF/PACF Test**

In [None]:
from statsmodels.tsa.stattools import acf,pacf
def acf_pacf(df,feature,nlags=10):
  """Print the autocorrelation and partial autocorrelation values of a column.

  Parameters
  ----------
  df : pandas.DataFrame
      Input data.
  feature : str
      Column to analyse; missing values are dropped first.
  nlags : int, default 10
      Number of lags passed to ``acf`` and ``pacf``.

  Returns
  -------
  None
      The two tables are printed.
  """
  series = df[feature].dropna()
  acf_values = acf(series, nlags=nlags)
  pacf_values = pacf(series, nlags=nlags)

  # Derive the lag axis from the values actually returned so both columns always line up
  acf_df = pd.DataFrame({'Lag':range(len(acf_values)), 'ACF':acf_values})
  pacf_df = pd.DataFrame({'Lag':range(len(pacf_values)), 'PACF':pacf_values})
  print("ACF and PACF for ",feature)
  print("Autocorrelation Values:")
  print(acf_df)
  print("\nPartial Autocorrelation Values:")
  print(pacf_df)


In [None]:
# Numeric columns only (the same selection the rolling-statistics loop uses); autocorrelation
# is not meaningful for 'date' or any other non-numeric column.
for feature in df.select_dtypes(include=['number']).columns:
  acf_pacf(df,feature)

## **ACF/PACF Strength test**

In [None]:
from statsmodels.tsa.stattools import acf, pacf
import numpy as np

def acf_pacf_strength(df, feature, nlags=10):
    """Condense the autocorrelation of a column into two numbers, print and return them.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data.
    feature : str
        Column to analyse; missing values are dropped first.
    nlags : int, default 10
        Number of lags passed to ``acf`` and ``pacf``.

    Returns
    -------
    tuple
        ``(acf_strength, pacf_strength)``: the sum of the absolute ACF values from lag 1
        onwards divided by ``nlags``, and the absolute PACF value at lag 1.
    """
    series = df[feature].dropna()
    acf_values = acf(series, nlags=nlags)
    pacf_values = pacf(series, nlags=nlags)

    # Lag 0 is left out of the ACF summary (it is always 1)
    lagged_acf = np.abs(acf_values[1:])
    acf_strength = np.sum(lagged_acf) / nlags

    # Direct correlation with the immediately preceding observation
    pacf_strength = abs(pacf_values[1])

    print(f"{feature}: ACF Strength = {acf_strength:.4f}, PACF Strength = {pacf_strength:.4f}")

    return acf_strength, pacf_strength

# Run for all numeric columns (the same selection the rolling-statistics loop uses)
for feature in df.select_dtypes(include=['number']).columns:
    acf_pacf_strength(df, feature)

## **Lag Analysis**

In [None]:
def lag_analysis(df,feature):
  """Print and return the correlation between a column and itself shifted by one row.

  Parameters
  ----------
  df : pandas.DataFrame
      Input data; it is never modified.
  feature : str
      Column to analyse.

  Returns
  -------
  float or None
      The lag-1 correlation, or ``None`` when the column is not numeric.
  """
  if not pd.api.types.is_numeric_dtype(df[feature].dtype):
    print(f"Skipping {feature}: Not a numeric column")
    return None

  # Correlate against a shifted Series instead of writing a '<feature>lag1' column into df:
  # the caller's frame is reused afterwards, so columns added here would travel along with it.
  lag_corr = df[feature].corr(df[feature].shift(1))
  print("Correlation between",feature,"and its 1-day lag:",lag_corr)
  return lag_corr

In [None]:
# Numeric columns only (the same selection the rolling-statistics loop uses); a lag
# correlation is not meaningful for 'date' or any other non-numeric column.
for feature in df.select_dtypes(include=['number']).columns:
  lag_analysis(df,feature)

In [None]:
df.columns

# Modelling

In [None]:
df.dropna(inplace=True)

In [None]:
# Derive the calendar features from the timestamp, then discard the raw 'date' column
date_parts = df['date'].dt
for part in ('year', 'month', 'day', 'weekday', 'quarter'):
    df[part] = getattr(date_parts, part)

df.drop(columns='date', inplace=True)

## **Scaling**

In [None]:
# Feature scaling
# Only float64/int64 columns are selected; the target is removed from the list because it gets
# its own scaler below.
# NOTE(review): the calendar columns created in the previous cell are only scaled if their
# dtype is int64 — presumably this depends on the pandas version; confirm.
numerical_cols = df.select_dtypes(include=['float64','int64']).columns
numerical_cols = numerical_cols.drop('fuel_consumption')

# NOTE(review): both scalers are fitted on the complete frame, before the train/test split made
# further down, so the test rows contribute to the scaling statistics — consider fitting on the
# training rows only.
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Separate scaler for the target so predictions can be mapped back with inverse_transform later
fuel_scaler = StandardScaler()
df['fuel_consumption'] = fuel_scaler.fit_transform(df[['fuel_consumption']])

df.head()

In [None]:
df.to_csv("data.csv",index=False)

In [None]:
df.columns

In [None]:
# splitting data into feature and target variables
X = df.drop('fuel_consumption',axis=1)
y = df['fuel_consumption']

In [None]:
# Chronological hold-out: keep the row order and use the last 20% of the rows as the test set.
# A shuffled split places observations from before and after each test point into the training
# data, which makes the evaluation of a forecasting model overly optimistic.
# NOTE(review): assumes the rows of df are in chronological order — confirm for CIEC.csv.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
print("Training shape",X_train.shape)
print("Test df shape",X_test.shape)

In [None]:
# Pairwise correlations of all columns, drawn as an annotated heatmap
corr = df.corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', ax=heatmap_ax)
heatmap_ax.set_title('Correlation Matrix')
plt.show()

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_absolute_error
import plotly.express as px

In [None]:
# Build a 100-tree random forest with a fixed random_state and fit it on the training split
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predictions for the held-out rows
y_pred_rf = rf_model.predict(X_test)


In [None]:
import plotly.graph_objects as go

# Plot the random forest predictions against the test targets (both on the scaled target).
# NOTE(review): the previous comments mentioned a `y_pred_lr`; no linear regression model is
# defined in the visible notebook.
predictions = y_pred_rf  # alias read by the metrics cell below

# Create traces for actual and predicted values, both plotted against the test-set row labels
trace_actual = go.Scatter(
    x=y_test.index, y=y_test, mode='lines+markers', name='Actual Fuel Demand', line=dict(color='blue')
)

trace_predicted = go.Scatter(
    x=y_test.index, y=predictions, mode='lines+markers', name='Predicted Fuel Demand', line=dict(color='red', dash='dash')
)

# Create the layout for the plot
layout = go.Layout(
    title='Actual vs Predicted Fuel Demand (Test Set)',
    xaxis=dict(title='Index (Test Set)'),
    yaxis=dict(title='Fuel Demand'),
    showlegend=True
)

# Plot the figure
fig = go.Figure(data=[trace_actual, trace_predicted], layout=layout)
fig.show()


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Calculate the evaluation metrics
# y_test comes from the 'fuel_consumption' column after it was standardised with fuel_scaler,
# so MAE, MSE and RMSE are expressed in units of the scaled target, not in original fuel units.
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)  # square root of the MSE computed on the line above
r2 = r2_score(y_test, predictions)

# Print out all the metrics
print(f'Mean Absolute Error (MAE): {mae}')
print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'R-squared (R²): {r2}')


In [None]:
# Inverse transform on predicted and actual values
predicted_values = fuel_scaler.inverse_transform(predictions.reshape(-1, 1))
actual_values = fuel_scaler.inverse_transform(y_test.values.reshape(-1, 1))


In [None]:
print(y_test.index)

In [None]:
# Look up the dates of the test rows in df_original, the copy taken before 'date' was dropped.
# This relies on y_test still carrying the row labels of the loaded frame (no reset_index is
# visible between the copy and the split).
y_test_dates = df_original.loc[y_test.index, 'date']

# Flatten the inverse-transformed (original scale) values to 1-D arrays for plotting.
# NOTE(review): `predictions` is rebound here from the scaled model output to original-scale
# values. The metrics cell above reads the same name, so re-running it after this cell would
# compare scaled targets with unscaled predictions.
predictions = predicted_values.flatten()  # Use inverse-transformed predictions
actuals = actual_values.flatten()  # Use inverse-transformed actual values

# Create traces for actual and predicted values
trace_actual = go.Scatter(
    x=y_test_dates, y=actuals, mode='lines+markers', name='Actual Fuel Demand', line=dict(color='blue')
)

trace_predicted = go.Scatter(
    x=y_test_dates, y=predictions, mode='lines+markers', name='Predicted Fuel Demand', line=dict(color='red', dash='dash')
)

# Create the layout for the plot
layout = go.Layout(
    title='Actual vs Predicted Fuel Demand (Test Set)',
    xaxis=dict(title='Date', tickformat="%Y-%m-%d"),  # Format dates properly
    yaxis=dict(title='Fuel Demand (Original Scale)'),
    showlegend=True
)

# Plot the figure
fig = go.Figure(data=[trace_actual, trace_predicted], layout=layout)
fig.show()


In [None]:
import plotly.graph_objects as go

# Create figure
fig = go.Figure()

# Add actual values as scatter plot (points only)
fig.add_trace(go.Scatter(
    x=y_test_dates, y=actuals, mode='markers',
    name='Actual Fuel Demand', marker=dict(color='blue', size=8, symbol='circle')
))

# Add predicted values as scatter plot (points only)
fig.add_trace(go.Scatter(
    x=y_test_dates, y=predictions, mode='markers',
    name='Predicted Fuel Demand', marker=dict(color='red', size=8, symbol='x')
))

# Update layout
fig.update_layout(title="Actual vs Predicted Fuel Demand (Test Set)",
                  xaxis_title="Date", yaxis_title="Fuel Demand",
                  xaxis=dict(showgrid=False),
                  yaxis=dict(showgrid=True, zeroline=False),
                  showlegend=True)

fig.show()


## **Predicting future demand**

In [None]:
# Save the trained model as fuel_demand.pkl
with open("fuel_demand.pkl", "wb") as f:
    pickle.dump(rf_model, f)

# The forest was fitted on standardised inputs and predicts the standardised target, so the
# model file alone is not enough for inference. Persist the fitted scalers and the expected
# column layout next to it.
preprocessing = {
    "feature_scaler": scaler,
    "target_scaler": fuel_scaler,
    "scaled_columns": list(numerical_cols),
    "feature_columns": list(X.columns),
}
with open("fuel_demand_preprocessing.pkl", "wb") as f:
    pickle.dump(preprocessing, f)

print("Model saved successfully!")

In [None]:
df.columns