<a href="https://colab.research.google.com/github/ryyutku/DSGP/blob/anuk/Modelling/Model%203/Demand_forecast_model_3_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [181]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler

import statsmodels.api as sm
from scipy.stats import linregress
from statsmodels.tsa.stattools import adfuller
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from scipy.optimize import curve_fit

In [182]:
files = ["Avg_Daily_Sales_Litres_ceypetco.csv","Avg_Daily_Sales_MT_ceypetco.csv","CIEC Data.csv","GDP_historical_data.csv","Import_Data_WITS.csv","Population_colombo.csv","Sales_IOC.csv","Transport_Report_Tables.csv" ]

In [183]:
ceypetco_sales_l_df = pd.read_csv(files[0]) # int64
ceypetco_sales_m_df = pd.read_csv(files[1]) # int64
ciec_df = pd.read_csv(files[2]) # float64
gdp_df = pd.read_csv(files[3]) # converted to float64
import_df = pd.read_csv(files[4]) # contains float64, int64 for years, object for the partner names and countries
population_df = pd.read_csv(files[5]) # converted float64
ioc_sales_df = pd.read_csv(files[6]) # converted to float64
transport_df = pd.read_csv(files[7]) # converted to float64

In [184]:
# Converting the ioc_sales data into float
ioc_sales_df = ioc_sales_df.replace({r'[\n,]': ''}, regex=True)
ioc_sales_df = ioc_sales_df.astype(float)

In [185]:
# Converting the gdp historical data into float
gdp_df = gdp_df.replace({r'[$%B,]':''},regex=True)
gdp_df = gdp_df.replace('',np.nan)
gdp_df = gdp_df.astype(float)

In [186]:
# converting the population df to float
population_df = population_df.replace({r'[,%]':''},regex=True)
population_df = population_df.replace('',np.nan)
population_df = population_df.astype(float)

In [187]:
# converting the transport df to float
transport_df_str = transport_df.iloc[:,0:2]
transport_df_n = transport_df.iloc[:, 2:].replace({r'[,%]':''},regex=True)
transport_df_n = transport_df_n.astype(float)
transport_df = transport_df_str.join(transport_df_n)

In [188]:
ciec_df['date'] = pd.to_datetime((ciec_df['date']))

In [189]:
ciec_df.columns

Index(['date', 'fuel_consumption', 'petroleum_imports_crudeOil',
       'Taxes_on_Customs_and_Other_Import Duties',
       'Foreign Direct Investments', 'GDP Goods and Services',
       'GDP: Gross National Income', 'Government Debt',
       'New Vehicle Registrations', 'Vehicle Sales', 'Port Stay Duration',
       'Vehicle Sales Asia', 'No.of Vessels Colombo',
       'Imports of Refined Products', 'Colombo port calls',
       'Tax income profits_gains', 'Tax on Export', 'Tax Goods & Services',
       'Tax Road Transport', 'GDP FCE Households', 'Diesel User Price',
       'Petrol User Price', 'Consumption_Oil', 'Sales 90 Octane',
       'Sales 95 Octane', 'Sales Auto Diesel', 'Household_income',
       'Fuel_other_manufacture'],
      dtype='object')

In [190]:
ciec_df_cols = ['date', 'fuel_consumption', 'petroleum_imports_crudeOil',
       'Taxes_on_Customs_and_Other_Import Duties',
       'Foreign Direct Investments', 'GDP Goods and Services',
       'GDP: Gross National Income', 'Government Debt',
       'New Vehicle Registrations', 'Vehicle Sales', 'Port Stay Duration',
       'Vehicle Sales Asia', 'No.of Vessels Colombo',
       'Imports of Refined Products', 'Colombo port calls',
       'Tax income profits_gains', 'Tax on Export', 'Tax Goods & Services',
       'Tax Road Transport', 'GDP FCE Households', 'Diesel User Price',
       'Petrol User Price', 'Consumption_Oil', 'Sales 90 Octane',
       'Sales 95 Octane', 'Sales Auto Diesel', 'Household_income',
       'Fuel_other_manufacture']

In [191]:
ciec_df.dtypes

Unnamed: 0,0
date,datetime64[ns]
fuel_consumption,float64
petroleum_imports_crudeOil,float64
Taxes_on_Customs_and_Other_Import Duties,float64
Foreign Direct Investments,float64
GDP Goods and Services,float64
GDP: Gross National Income,float64
Government Debt,float64
New Vehicle Registrations,float64
Vehicle Sales,float64


## **Checking the relationships of the columns to see how they must be interpolated**

Checking how the columns are related to other columns, For forecasting the null values

In [192]:
def calculate_category_correlation(df,feature1, feature2):
    # Drop rows where 'feature1' is NaN
    df_valid_feature1 = df.dropna(subset=[feature1])

    # Drop rows where 'feature2' are NaN
    df_valid_feature2 = df.dropna(subset=[feature2], how='all')

    # Find the overlapping date range where both 'feature1' and the 'feature2' exist
    start_date = max(df_valid_feature1['date'].min(), df_valid_feature2['date'].min())
    end_date = min(df_valid_feature1['date'].max(), df_valid_feature2['date'].max())

    # Filter data to include only the overlapping date range
    df_filtered = df[(df['date'] >= start_date) & (df['date'] <= end_date)]

    # Compute correlation between 'fuel_consumption' and the aggregated category
    correlation = df_filtered[[feature1, feature2]].corr().iloc[0, 1]

    return correlation

In [193]:
def find_strong_correlations(df, columns, threshold=0.8):
  strong_correlations = {}
  for i in columns:
    for j in columns:
      if i != j:
        correlation = calculate_category_correlation(df, i, j)
        if abs(correlation) >= threshold:
          if i not in strong_correlations:
            strong_correlations[i] = []
          strong_correlations[i].append((j,correlation))


  if strong_correlations:
    print("THE STRONG CORRELATIONS")
    for feature, related_features  in strong_correlations.items():
      for related_feature, corr in related_features:
        print(f"{feature} and {related_feature} have a correlation of {corr:.2f}")
  else:
    print("NO STRONG CORRELATIONS")

  return strong_correlations


In [194]:
strong_correlations = find_strong_correlations(ciec_df,ciec_df_cols)

THE STRONG CORRELATIONS
date and fuel_consumption have a correlation of 0.90
date and GDP Goods and Services have a correlation of 0.91
date and GDP: Gross National Income have a correlation of 0.93
date and Government Debt have a correlation of 0.94
date and Imports of Refined Products have a correlation of 0.88
date and Tax income profits_gains have a correlation of 0.81
date and Tax Goods & Services have a correlation of -0.93
date and GDP FCE Households have a correlation of 0.89
date and Petrol User Price have a correlation of -0.87
date and Consumption_Oil have a correlation of 0.94
date and Sales 90 Octane have a correlation of 0.96
date and Sales 95 Octane have a correlation of 0.83
date and Sales Auto Diesel have a correlation of 0.81
date and Household_income have a correlation of 0.82
fuel_consumption and date have a correlation of 0.90
fuel_consumption and Vehicle Sales have a correlation of 0.83
fuel_consumption and Petrol User Price have a correlation of -0.84
fuel_consum

In [195]:
ciec_df.isnull().mean() *100

Unnamed: 0,0
date,0.0
fuel_consumption,28.649535
petroleum_imports_crudeOil,8.74795
Taxes_on_Customs_and_Other_Import Duties,3.007108
Foreign Direct Investments,37.288136
GDP Goods and Services,57.900492
GDP: Gross National Income,57.900492
Government Debt,69.327501
New Vehicle Registrations,0.601422
Vehicle Sales,57.189721


**Null value analysis**\
0-10% - date, consumption_oil

10-30% - fuel_consumption, New Vehicle Registrations (Interpolation will be applied)

30-50% -petroleum_imports_crude_oil, Taxes_on_Customs_and_Other_Import Duties, Imports of Refined Products, Tax income profits_gains, Tax Goods & Services, Diesel User Price, Petrol User Price, Sales 90 Octane, Sales Auto Diesel,

50% + - Foreign Direct Investments, GDP Goods and Services, GDP: Gross National Income, GDP: Gross National Income, Government Debt, Vehicle Sales, Port Stay Duration, Vehicle Sales Asia, No.of Vessels Colombo, Colombo port calls, Tax on Export, Tax Road Transport, GDP FCE Households, Sales 95 Octane, Household_income, Fuel_other_manufacture. (Interplation will be applied)

**Need to cutdown the range from which data is considered, few possible time frames to start with are 1990,1999,2005,2006,2010 **

In [196]:
def get_correlating_list(feature, dict):
  list = []
  values = dict[feature]
  for v1,v2 in values:
    list.append(v1)
  return list

In [197]:
proxy_df = pd.DataFrame()

In [198]:
# Making a copy of ciec_df
df = ciec_df.copy()

## **Scaling The data**

In [199]:
scaler = StandardScaler()
m_scaler = MinMaxScaler()

Using z-score scaling (Standard Scaler) for the columns following normal distribution

In [200]:
z_cols = ['fuel_consumption', 'GDP: Gross National Income', 'GDP Goods and Services',
          'Foreign Direct Investments', 'Household_income', 'Tax income profits_gains',
          'Tax on Export', 'Tax Goods & Services', 'Tax Road Transport', 'Government Debt', 'Vehicle Sales']

In [201]:
df[z_cols] = scaler.fit_transform(ciec_df[z_cols])

Using min-max scaling for price related values

In [202]:
m_cols = ['Diesel User Price', 'Petrol User Price', 'petroleum_imports_crudeOil','Imports of Refined Products', 'Consumption_Oil']

In [203]:
df[m_cols] = m_scaler.fit_transform(ciec_df[m_cols])

Adding log transformation for highly skewed data

In [204]:
l_cols = ['GDP: Gross National Income', 'Foreign Direct Investments', 'Household_income', 'Tax income profits_gains']

In [205]:
df[l_cols] = np.log1p(df[l_cols])

  result = func(self.values, **kwargs)


## **Building Proxy features for the columns using the columns inside the df**

In [206]:
# from sklearn.linear_model import LinearRegression
# import numpy as np
# import pandas as pd

# def fill_missing_with_regression(df, target_feature, correlated_features):
#     df = df.copy()  # Avoid modifying original DataFrame

#     if 'date' in correlated_features:
#       correlated_features.remove('date')

#     # Find missing values in the target column
#     missing_mask = df[target_feature].isna()

#     # Drop rows where either the target or any correlated feature is NaN
#     valid_data = df.dropna(subset=[target_feature] + correlated_features)

#     if valid_data.empty:
#         print(f"Not enough valid data to train regression for {target_feature}.")
#         return df  # Return original DF if no valid training data

#     # Define X (independent variables) and y (dependent variable)
#     X_train = valid_data[correlated_features]
#     y_train = valid_data[target_feature]

#     # Train regression model
#     model = LinearRegression()
#     model.fit(X_train, y_train)

#     # Find rows where target_feature is missing but correlated features exist
#     X_missing = df.loc[missing_mask, correlated_features].dropna()

#     if X_missing.empty:
#         print(f"No valid correlated features available to predict missing {target_feature}.")
#         return df  # No valid data to fill missing values

#     # Predict missing values
#     predicted_values = model.predict(X_missing)

#     # Fill missing values with predicted values
#     df.loc[X_missing.index, target_feature] = predicted_values

#     return df



In [207]:
from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd

def fill_missing_with_regression(df, target_feature, correlated_features):
    df = df.copy()  # Avoid modifying original DataFrame

    # Ensure 'date' column is not used
    correlated_features = [col for col in correlated_features if col != 'date']

    # Check if correlated features exist
    if len(correlated_features) == 0:
        print(f"No valid correlated features for {target_feature}.")
        return df

    # Find missing values in the target column
    missing_mask = df[target_feature].isna()

    # Drop rows where either the target or any correlated feature is NaN
    valid_data = df.dropna(subset=[target_feature] + correlated_features)

    # Debug: Print how many rows are valid
    print(f"Valid data available for {target_feature}: {len(valid_data)} rows")

    if valid_data.empty:
        print(f"Not enough valid data to train regression for {target_feature}.")
        return df  # Return original DF if no valid training data

    # Fill missing values in correlated features (forward + backward fill)
    df[correlated_features] = df[correlated_features].fillna(method='ffill').fillna(method='bfill')

    # Define X (independent variables) and y (dependent variable)
    X_train = valid_data[correlated_features]
    y_train = valid_data[target_feature]

    # Convert data types to float
    X_train = X_train.astype(float)
    y_train = y_train.astype(float)

    # Train regression model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Find rows where target_feature is missing but correlated features exist
    X_missing = df.loc[missing_mask, correlated_features].dropna()

    # Debug: Print how many missing rows can be filled
    print(f"Missing values to predict for {target_feature}: {len(X_missing)} rows")

    if X_missing.empty:
        print(f"No valid correlated features available to predict missing {target_feature}.")
        return df  # No valid data to fill missing values

    # Convert missing data to float
    X_missing = X_missing.astype(float)

    # Predict missing values
    predicted_values = model.predict(X_missing)

    # Fill missing values with predicted values
    df.loc[X_missing.index, target_feature] = predicted_values

    return df


In [208]:
# Filling missing values for 'fuel_consumption'
fuel_consumption_list = get_correlating_list('fuel_consumption', strong_correlations)
df = fill_missing_with_regression(df, 'fuel_consumption', fuel_consumption_list[:5])

Valid data available for fuel_consumption: 522 rows
Missing values to predict for fuel_consumption: 524 rows


  df[correlated_features] = df[correlated_features].fillna(method='ffill').fillna(method='bfill')


In [209]:
ciec_df['fuel_consumption'].corr(df['fuel_consumption'])

0.9999999999999998

In [210]:
df['fuel_consumption'].isnull().mean()

0.0

In [211]:
# Filling missing values for 'GDP Goods and Services'
GDP_Goods_Services_list = get_correlating_list('GDP Goods and Services', strong_correlations)
df = fill_missing_with_regression(df, 'GDP Goods and Services', GDP_Goods_Services_list[:5])

Valid data available for GDP Goods and Services: 521 rows
Missing values to predict for GDP Goods and Services: 1059 rows


  df[correlated_features] = df[correlated_features].fillna(method='ffill').fillna(method='bfill')


In [212]:
ciec_df['GDP Goods and Services'].corr(df['GDP Goods and Services'])

1.0

In [213]:
df['GDP Goods and Services'].isnull().mean()

0.0

In [214]:
# Filling missing values for 'GDP: Gross National Income'
GDP_Gross_National_Income_list = get_correlating_list('GDP: Gross National Income', strong_correlations)
df = fill_missing_with_regression(df, 'GDP: Gross National Income', GDP_Gross_National_Income_list[:5])

  df[correlated_features] = df[correlated_features].fillna(method='ffill').fillna(method='bfill')


Valid data available for GDP: Gross National Income: 1829 rows
Missing values to predict for GDP: Gross National Income: 0 rows
No valid correlated features available to predict missing GDP: Gross National Income.


In [215]:
ciec_df['GDP: Gross National Income'].corr(df['GDP: Gross National Income'])

0.9050850073862409

In [216]:
df['GDP: Gross National Income'].isnull().mean()

0.0

In [217]:
# Filling values for 'Government Debt'
Government_Debt_list = get_correlating_list('Government Debt', strong_correlations)
df = fill_missing_with_regression(df, 'Government Debt', Government_Debt_list[:5])

Valid data available for Government Debt: 1818 rows
Missing values to predict for Government Debt: 0 rows
No valid correlated features available to predict missing Government Debt.


  df[correlated_features] = df[correlated_features].fillna(method='ffill').fillna(method='bfill')


In [218]:
ciec_df['Government Debt'].corr(df['Government Debt'])

1.0

In [219]:
df['Government Debt'].isnull().mean()

0.0

In [220]:
# Filling for 'Vehicle Sales'
Vehicle_Sales_list = get_correlating_list('Vehicle Sales', strong_correlations)
df = fill_missing_with_regression(df, 'Vehicle Sales', Vehicle_Sales_list[:5])

Valid data available for Vehicle Sales: 105 rows
Missing values to predict for Vehicle Sales: 0 rows
No valid correlated features available to predict missing Vehicle Sales.


  df[correlated_features] = df[correlated_features].fillna(method='ffill').fillna(method='bfill')


In [221]:
ciec_df['Vehicle Sales'].corr(df['Vehicle Sales'])

1.0

In [222]:
df['Vehicle Sales'].isnull().mean()

0.0

In [223]:
# For 'Tax on Export'
tax_on_export_list = get_correlating_list('Tax on Export', strong_correlations)
df = fill_missing_with_regression(df,'Tax on Export',tax_on_export_list[:5])

Valid data available for Tax on Export: 1829 rows
Missing values to predict for Tax on Export: 0 rows
No valid correlated features available to predict missing Tax on Export.


  df[correlated_features] = df[correlated_features].fillna(method='ffill').fillna(method='bfill')


In [224]:
df['Tax on Export'].isnull().mean()

0.0

In [225]:
ciec_df['Tax on Export'].corr(df['Tax on Export'])

1.0

In [226]:
# for 'Tax on Road Transport'
tax_road_transport_list = get_correlating_list('Tax Road Transport', strong_correlations)
df = fill_missing_with_regression(df,'Tax Road Transport',tax_road_transport_list[:5])

Valid data available for Tax Road Transport: 887 rows
Missing values to predict for Tax Road Transport: 942 rows


  df[correlated_features] = df[correlated_features].fillna(method='ffill').fillna(method='bfill')


In [227]:
df['Tax Road Transport'].isnull().mean()

0.0

In [228]:
df['Tax Road Transport'].corr(ciec_df['Tax Road Transport'])

1.0

'Tax Road Transport' still has a high values

In [229]:
# GDP FCE Households
gdp_fce_households_list = get_correlating_list('GDP FCE Households', strong_correlations)
df = fill_missing_with_regression(df,'GDP FCE Households',gdp_fce_households_list[:5])

Valid data available for GDP FCE Households: 1829 rows
Missing values to predict for GDP FCE Households: 0 rows
No valid correlated features available to predict missing GDP FCE Households.


  df[correlated_features] = df[correlated_features].fillna(method='ffill').fillna(method='bfill')


In [230]:
ciec_df['GDP FCE Households'].corr(df['GDP FCE Households'])

1.0

In [231]:
df['GDP FCE Households'].isnull().mean()

0.0

In [232]:
# For 'Sales 95 Octane'
sales_95_octane_list = get_correlating_list('Sales 95 Octane', strong_correlations)
df = fill_missing_with_regression(df,'Sales 95 Octane',sales_95_octane_list[:5])

Valid data available for Sales 95 Octane: 1252 rows


  df[correlated_features] = df[correlated_features].fillna(method='ffill').fillna(method='bfill')


Missing values to predict for Sales 95 Octane: 577 rows


In [233]:
df['Sales 95 Octane'].isnull().mean()

0.0

In [234]:
ciec_df['Sales 95 Octane'].corr(df['Sales 95 Octane'])


1.0

In [235]:
# For 'Fuel and other manufacture'
fuel_other_manufacture_list = get_correlating_list('Fuel_other_manufacture', strong_correlations)
df = fill_missing_with_regression(df,'Fuel_other_manufacture',fuel_other_manufacture_list[:5])

Valid data available for Fuel_other_manufacture: 569 rows
Missing values to predict for Fuel_other_manufacture: 1260 rows


  df[correlated_features] = df[correlated_features].fillna(method='ffill').fillna(method='bfill')


In [236]:
df['Fuel_other_manufacture'].isnull().mean()

0.0

In [237]:
ciec_df['Fuel_other_manufacture'].corr(df['Fuel_other_manufacture'])

0.9999999999999998

In [238]:
# For 'petroleum_import_crudeOil'
petroleum_imports_crudeOil_list = get_correlating_list('petroleum_imports_crudeOil', strong_correlations)
df = fill_missing_with_regression(df,'petroleum_imports_crudeOil',petroleum_imports_crudeOil_list[:5])

Valid data available for petroleum_imports_crudeOil: 1617 rows


  df[correlated_features] = df[correlated_features].fillna(method='ffill').fillna(method='bfill')


Missing values to predict for petroleum_imports_crudeOil: 160 rows


In [239]:
ciec_df['petroleum_imports_crudeOil'].corr(df['petroleum_imports_crudeOil'])

0.9999999999999994

In [240]:
df['petroleum_imports_crudeOil'].isnull().mean()

0.0

In [251]:
# For 'Imports of Refined Products'
Imports_of_Refined_Products_list = get_correlating_list('Imports of Refined Products', strong_correlations)
df = fill_missing_with_regression(df,'Imports of Refined Products',Imports_of_Refined_Products_list[:5])

Valid data available for Imports of Refined Products: 1669 rows
Missing values to predict for Imports of Refined Products: 160 rows


  df[correlated_features] = df[correlated_features].fillna(method='ffill').fillna(method='bfill')


In [252]:
ciec_df['Imports of Refined Products'].corr(df['Imports of Refined Products'])

1.0

#### **Summary of performing regression filling**

In [253]:
df.isnull().mean() *100

Unnamed: 0,0
date,0.0
fuel_consumption,0.0
petroleum_imports_crudeOil,0.0
Taxes_on_Customs_and_Other_Import Duties,3.007108
Foreign Direct Investments,42.974303
GDP Goods and Services,0.0
GDP: Gross National Income,0.0
Government Debt,0.0
New Vehicle Registrations,0.0
Vehicle Sales,0.0


The following columns doesnt have any strong correlations \
'Foreign Direct Investments' \
'Household_income' \
'Colombo port calls' \
'Port Stay Duration' \
'No.of Vessels Colomob' \

## **Building a Demand Proxy Target Variable**

## **Analyzing Data**