<a href="https://colab.research.google.com/github/ryyutku/DSGP/blob/anuk/Modelling/Model%203/Demand_forecast_model_3_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler

import statsmodels.api as sm
from scipy.stats import linregress
from statsmodels.tsa.stattools import adfuller
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from scipy.optimize import curve_fit

In [None]:
# Raw CSV inputs. They are read by position further down, so the order matters.
files = [
    "Avg_Daily_Sales_Litres_ceypetco.csv",
    "Avg_Daily_Sales_MT_ceypetco.csv",
    "CIEC Data.csv",
    "GDP_historical_data.csv",
    "Import_Data_WITS.csv",
    "Population_colombo.csv",
    "Sales_IOC.csv",
    "Transport_Report_Tables.csv",
]

In [None]:
# Load each raw CSV by its position in `files`. The trailing notes are the author's
# record of column dtypes; frames marked "converted" are cast to float by the
# cleaning cells that follow, not at load time.
ceypetco_sales_l_df = pd.read_csv(files[0]) # int64
ceypetco_sales_m_df = pd.read_csv(files[1]) # int64
ciec_df = pd.read_csv(files[2]) # float64
gdp_df = pd.read_csv(files[3]) # converted to float64
import_df = pd.read_csv(files[4]) # contains float64, int64 for years, object for the partner names and countries
population_df = pd.read_csv(files[5]) # converted float64
ioc_sales_df = pd.read_csv(files[6]) # converted to float64
transport_df = pd.read_csv(files[7]) # converted to float64

In [None]:
# IOC sales: strip embedded newlines and commas from every cell, then cast to float
ioc_sales_df = ioc_sales_df.replace({r'[\n,]': ''}, regex=True).astype(float)

In [None]:
# GDP history: drop the characters $, %, B and commas, treat blanks as missing, cast to float
gdp_df = (
    gdp_df
    .replace({r'[$%B,]':''},regex=True)
    .replace('',np.nan)
    .astype(float)
)

In [None]:
# Population: drop commas and % signs, treat blanks as missing, cast to float
population_df = (
    population_df
    .replace({r'[,%]':''},regex=True)
    .replace('',np.nan)
    .astype(float)
)

In [None]:
# Transport table: the first two columns are kept as they are (presumably text
# labels, going by the `_str` name); the remaining columns are stripped of
# ',' and '%' and cast to float before the two halves are joined back on the index.
transport_df_str = transport_df.iloc[:, :2]
transport_df_n = (
    transport_df.iloc[:, 2:]
    .replace({r'[,%]':''},regex=True)
    .astype(float)
)
transport_df = transport_df_str.join(transport_df_n)

In [None]:
# Parse the date column so it can be compared and used for range filtering
ciec_df['date'] = pd.to_datetime(ciec_df['date'])

In [None]:
# Show the column names available in the CIEC data
ciec_df.columns

Index(['date', 'fuel_consumption', 'petroleum_imports_crudeOil',
       'Taxes_on_Customs_and_Other_Import Duties ',
       'Foreign Direct Investments', 'GDP Goods and Services',
       'GDP: Gross National Income', 'Government Debt',
       'New Vehicle Registrations', 'Vehicle Sales', 'Port Stay Duration',
       'Vehicle Sales Asia', 'No.of Vessels Colombo',
       'Imports of Refined Products', 'Colombo port calls',
       'Tax income profits_gains', 'Tax on Export', 'Tax Goods & Services',
       'Tax Road Transport', 'GDP FCE Households', 'Diesel User Price',
       'Petrol User Price', 'Consumption_Oil', 'Sales 90 Octane',
       'Sales 95 Octane', 'Sales Auto Diesel', 'Household_income',
       'Fuel_other_manufacture'],
      dtype='object')

In [None]:
# Columns compared pairwise in the correlation analysis.
# Note the trailing space in the customs-tax column name; it is part of the name.
ciec_df_cols = [
    'date',
    'fuel_consumption',
    'petroleum_imports_crudeOil',
    'Taxes_on_Customs_and_Other_Import Duties ',
    'Foreign Direct Investments',
    'GDP Goods and Services',
    'GDP: Gross National Income',
    'Government Debt',
    'New Vehicle Registrations',
    'Vehicle Sales',
    'Port Stay Duration',
    'Vehicle Sales Asia',
    'No.of Vessels Colombo',
    'Imports of Refined Products',
    'Colombo port calls',
    'Tax income profits_gains',
    'Tax on Export',
    'Tax Goods & Services',
    'Tax Road Transport',
    'GDP FCE Households',
    'Diesel User Price',
    'Petrol User Price',
    'Consumption_Oil',
    'Sales 90 Octane',
    'Sales 95 Octane',
    'Sales Auto Diesel',
    'Household_income',
    'Fuel_other_manufacture',
]

In [None]:
# Check the dtype of every CIEC column after the date conversion
ciec_df.dtypes

Unnamed: 0,0
date,datetime64[ns]
fuel_consumption,float64
petroleum_imports_crudeOil,float64
Taxes_on_Customs_and_Other_Import Duties,float64
Foreign Direct Investments,float64
GDP Goods and Services,float64
GDP: Gross National Income,float64
Government Debt,float64
New Vehicle Registrations,float64
Vehicle Sales,float64


## **Checking the relationships of the columns to see how they must be interpolated**

Checking how each column relates to the others, so that missing values can be estimated from correlated columns.

In [None]:
def calculate_category_correlation(df,feature1, feature2):
    """Correlate two columns of ``df`` over the date span in which both have data.

    Args:
        df: DataFrame with a 'date' column plus the two feature columns.
        feature1: name of the first column.
        feature2: name of the second column.

    Returns:
        The off-diagonal entry of ``DataFrame.corr()`` for the two columns,
        computed on the rows whose date lies inside the shared window.
    """
    dates = df['date']

    # Dates on which each feature has a non-null value
    dates_feature1 = dates[df[feature1].notna()]
    dates_feature2 = dates[df[feature2].notna()]

    # Shared window: latest first observation up to earliest last observation
    window_start = max(dates_feature1.min(), dates_feature2.min())
    window_end = min(dates_feature1.max(), dates_feature2.max())

    # Keep only the rows inside the window (both bounds inclusive)
    in_window = (dates >= window_start) & (dates <= window_end)
    pair = df.loc[in_window, [feature1, feature2]]

    return pair.corr().iloc[0, 1]

In [None]:
def find_strong_correlations(df, columns, threshold=0.8):
  """Find and print every pair of columns whose correlation magnitude reaches ``threshold``.

  Args:
    df: DataFrame handed on to ``calculate_category_correlation``.
    columns: iterable of column names to compare pairwise.
    threshold: minimum absolute correlation for a pair to count as strong.

  Returns:
    dict mapping a column name to a list of ``(other_column, correlation)``
    tuples. A column with no strong partner has no key in the dict. Each
    strong pair is listed under both of its members.
  """
  columns = list(columns)  # accept any iterable; it is walked in a nested loop
  pair_cache = {}          # unordered pair -> correlation, so each pair is computed once
  strong_correlations = {}
  for i in columns:
    for j in columns:
      if i == j:
        continue
      pair = frozenset((i, j))
      if pair not in pair_cache:
        # The correlation is symmetric, so (j, i) reuses the value computed for (i, j)
        pair_cache[pair] = calculate_category_correlation(df, i, j)
      correlation = pair_cache[pair]
      # A NaN correlation fails this comparison and is skipped
      if abs(correlation) >= threshold:
        strong_correlations.setdefault(i, []).append((j, correlation))

  if strong_correlations:
    print("THE STRONG CORRELATIONS")
    for feature, related_features in strong_correlations.items():
      for related_feature, corr in related_features:
        print(f"{feature} and {related_feature} have a correlation of {corr:.2f}")
  else:
    print("NO STRONG CORRELATIONS")

  return strong_correlations


In [None]:
# Find strongly correlated column pairs using the default threshold of 0.8.
# NOTE: ciec_df_cols includes 'date', so correlations with the date column are reported too.
strong_correlations = find_strong_correlations(ciec_df,ciec_df_cols)

THE STRONG CORRELATIONS
date and fuel_consumption have a correlation of 0.90
date and GDP Goods and Services have a correlation of 0.91
date and GDP: Gross National Income have a correlation of 0.93
date and Government Debt have a correlation of 0.94
date and Imports of Refined Products have a correlation of 0.88
date and Tax income profits_gains have a correlation of 0.81
date and Tax Goods & Services have a correlation of -0.93
date and GDP FCE Households have a correlation of 0.89
date and Petrol User Price have a correlation of -0.87
date and Consumption_Oil have a correlation of 0.94
date and Sales 90 Octane have a correlation of 0.96
date and Sales 95 Octane have a correlation of 0.83
date and Sales Auto Diesel have a correlation of 0.81
date and Household_income have a correlation of 0.82
fuel_consumption and date have a correlation of 0.90
fuel_consumption and Vehicle Sales have a correlation of 0.83
fuel_consumption and Petrol User Price have a correlation of -0.84
fuel_consum

In [None]:
# Percentage of missing values in each column
ciec_df.isnull().mean() *100

Unnamed: 0,0
date,0.0
fuel_consumption,28.649535
petroleum_imports_crudeOil,8.74795
Taxes_on_Customs_and_Other_Import Duties,3.007108
Foreign Direct Investments,37.288136
GDP Goods and Services,57.900492
GDP: Gross National Income,57.900492
Government Debt,69.327501
New Vehicle Registrations,0.601422
Vehicle Sales,57.189721


**Null value analysis**\
0-10% - date, Consumption_Oil, petroleum_imports_crudeOil, Taxes_on_Customs_and_Other_Import Duties, New Vehicle Registrations

10-30% - fuel_consumption (Interpolation will be applied)

30-50% - Foreign Direct Investments, Imports of Refined Products, Tax income profits_gains, Tax Goods & Services, Diesel User Price, Petrol User Price, Sales 90 Octane, Sales Auto Diesel

50% + - GDP Goods and Services, GDP: Gross National Income, Government Debt, Vehicle Sales, Port Stay Duration, Vehicle Sales Asia, No.of Vessels Colombo, Colombo port calls, Tax on Export, Tax Road Transport, GDP FCE Households, Sales 95 Octane, Household_income, Fuel_other_manufacture. (Interpolation will be applied)

**Need to cut down the date range from which data is considered; possible start years to try are 1990, 1999, 2005, 2006 and 2010.**

In [None]:
def get_correlating_list(feature, dict):
  """Return the names of the columns recorded as strongly correlated with ``feature``.

  Args:
    feature: column name to look up.
    dict: mapping of column name -> list of ``(other_column, correlation)``
      tuples, in the shape returned by ``find_strong_correlations``. The
      parameter name shadows the builtin; it is kept so keyword callers still work.

  Returns:
    list of the correlated column names in stored order, or an empty list
    when ``feature`` has no entry.
  """
  # find_strong_correlations only creates a key for a feature that has at least
  # one strong partner, so a missing key means "no correlates", not an error.
  return [name for name, _ in dict.get(feature, [])]

In [None]:
# Empty frame, presumably meant to collect the proxy features; nothing is written to it in the cells shown here
proxy_df = pd.DataFrame()

In [None]:
# Work on a copy so that ciec_df keeps the raw values; the scaling cells below
# read from ciec_df and write into df
df = ciec_df.copy()

## **Scaling The data**

In [None]:
# One scaler per strategy: z-score standardization and min-max scaling
scaler = StandardScaler()
m_scaler = MinMaxScaler()

Using z-score scaling (StandardScaler) for the columns that follow an approximately normal distribution

In [None]:
# Columns to be standardized (z-score)
z_cols = [
    'fuel_consumption',
    'GDP: Gross National Income',
    'GDP Goods and Services',
    'Foreign Direct Investments',
    'Household_income',
    'Tax income profits_gains',
    'Tax on Export',
    'Tax Goods & Services',
    'Tax Road Transport',
    'Government Debt',
    'Vehicle Sales',
]

In [None]:
# Fit the standard scaler on the raw ciec_df values and store the scaled columns in the working copy
df[z_cols] = scaler.fit_transform(ciec_df[z_cols])

Using min-max scaling for the price, import and consumption columns

In [None]:
# Columns to be min-max scaled
m_cols = [
    'Diesel User Price',
    'Petrol User Price',
    'petroleum_imports_crudeOil',
    'Imports of Refined Products',
    'Consumption_Oil',
]

In [None]:
# Fit the min-max scaler on the raw ciec_df values and store the scaled columns in the working copy
df[m_cols] = m_scaler.fit_transform(ciec_df[m_cols])

Applying a log transformation to the highly skewed columns

In [None]:
# Columns that receive a log transformation (each one is also listed in z_cols)
l_cols = [
    'GDP: Gross National Income',
    'Foreign Direct Investments',
    'Household_income',
    'Tax income profits_gains',
]

In [None]:
# Log-transform the raw (unscaled) values first, then standardize the result.
# The previous version applied log1p to the already z-scored columns in df.
# log1p is undefined for values <= -1, which standardized data routinely contains,
# so it yielded NaN/-inf and the warning shown in this cell's output.
# A separate StandardScaler instance is used so that `scaler`, fitted on z_cols,
# keeps its fitted state.
# NOTE(review): assumes the raw l_cols values are > -1 - confirm for 'Foreign Direct Investments'.
df[l_cols] = StandardScaler().fit_transform(np.log1p(ciec_df[l_cols]))

  result = func(self.values, **kwargs)


## **Building Proxy features for the columns using the columns inside the df**

Proxy for fuel consumption

In [None]:
# Names of the columns recorded as strongly correlated with fuel_consumption
fuel_consumption_list = get_correlating_list('fuel_consumption',strong_correlations)


In [None]:
# Build a proxy from two correlated features and use it only where fuel_consumption is missing.
# Positions 3 and 4 depend on the order in which the strong correlations were collected.
fuel_proxy_first = df[fuel_consumption_list[3]]
fuel_proxy_second = df[fuel_consumption_list[4]]
fuel_consumption_proxy = (15 * fuel_proxy_first) + (0.4 * fuel_proxy_second)
df['fuel_consumption'] = df['fuel_consumption'].fillna(fuel_consumption_proxy)


In [None]:
# NOTE: this compares only the rows where the raw column is non-null. On those rows
# df holds the standardized original (a linear rescaling), so a value of ~1.0 is
# expected and says nothing about the quality of the filled-in proxy values.
df['fuel_consumption'].corr(ciec_df['fuel_consumption'])

0.9999999999999998

In [None]:
# Fraction (not percentage) of fuel_consumption values still missing after the proxy fill
df['fuel_consumption'].isnull().mean()

0.11536358665937671

petroleum_imports_crudeOil

In [None]:
# Names of the columns recorded as strongly correlated with petroleum_imports_crudeOil
petroleum_imports_crudeOil_list = get_correlating_list('petroleum_imports_crudeOil',strong_correlations)
print(petroleum_imports_crudeOil_list)

['Diesel User Price']


In [None]:
# Correlation between the raw crude-oil imports and the raw diesel price.
# need to get this above 0.95
ciec_df['petroleum_imports_crudeOil'].corr(ciec_df['Diesel User Price'])

0.812372739052149

In [None]:
# Fill the gaps in crude-oil imports with a least-squares linear proxy built from
# 'Diesel User Price', the only strongly correlated feature found above.
# The previous version called fillna() with no argument, which raises ValueError.
crude_known = df[['petroleum_imports_crudeOil', 'Diesel User Price']].dropna()
if len(crude_known) >= 2:  # a straight-line fit needs at least two complete rows
    crude_slope, crude_intercept = np.polyfit(
        crude_known['Diesel User Price'],
        crude_known['petroleum_imports_crudeOil'],
        1,
    )
    df['petroleum_imports_crudeOil'] = df['petroleum_imports_crudeOil'].fillna(
        crude_slope * df['Diesel User Price'] + crude_intercept
    )

ValueError: Must specify a fill 'value' or 'method'.

GDP Goods and Services

In [None]:
# Names of the columns recorded as strongly correlated with GDP Goods and Services
goods_services_list = get_correlating_list('GDP Goods and Services',strong_correlations)
print(goods_services_list)

In [None]:
# Candidate weights for a five-feature proxy; only the fixed two-feature mix below is evaluated
w1,w2,w3,w4,w5 = 1,0,0,0,0

# Draft of the fully weighted proxy, kept for reference:
#   w1*df[goods_services_list[1]] + w2*df[goods_services_list[2]] + w3*df[goods_services_list[3]]
#   + w4*df[goods_services_list[4]] + w5*df[goods_services_list[5]]

# Correlation of the raw target with the current two-feature proxy
goods_services_proxy = 0.9*df[goods_services_list[1]] + df[goods_services_list[4]]
ciec_df['GDP Goods and Services'].corr(goods_services_proxy)


GDP: Gross National Income

In [None]:
# Names of the columns recorded as strongly correlated with GDP: Gross National Income
national_income_list = get_correlating_list('GDP: Gross National Income',strong_correlations)

Government Debt

In [None]:
# Names of the columns recorded as strongly correlated with Government Debt
government_debt_list = get_correlating_list('Government Debt',strong_correlations)

Vehicle Sales

In [None]:
# Names of the columns recorded as strongly correlated with Vehicle Sales
vehicle_sales_list = get_correlating_list('Vehicle Sales',strong_correlations)

Vessels in Colombo

In [None]:
# Names of the columns recorded as strongly correlated with No.of Vessels Colombo
vessels_colombo_list = get_correlating_list('No.of Vessels Colombo',strong_correlations)

Tax on export

In [None]:
# Names of the columns recorded as strongly correlated with Tax on Export
tax_export_list = get_correlating_list('Tax on Export',strong_correlations)

Tax on Road transport

In [None]:
# Names of the columns recorded as strongly correlated with Tax Road Transport
tax_road_transport_list = get_correlating_list('Tax Road Transport',strong_correlations)

GDP FCE Households

In [None]:
# Names of the columns recorded as strongly correlated with GDP FCE Households
fce_households_list = get_correlating_list('GDP FCE Households',strong_correlations)

Sales 95 Octane

In [None]:
# Names of the columns recorded as strongly correlated with Sales 95 Octane
sales_95_list = get_correlating_list('Sales 95 Octane',strong_correlations)

Fuel and other manufacture

In [None]:
# Names of the columns recorded as strongly correlated with Fuel_other_manufacture
fuel_manufacture_list = get_correlating_list('Fuel_other_manufacture',strong_correlations)

## **Building the Demand Proxy Feature Column**

## **Scaling and preprocessing**

## **Analyzing data and Transforming**