<a href="https://colab.research.google.com/github/ryyutku/DSGP/blob/anuk/Modelling/Model%201/Demand_Forecast_demand_proxy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [2]:
files = ["Avg_Daily_Sales_Litres_ceypetco.csv","Avg_Daily_Sales_MT_ceypetco.csv","CIEC Data.csv","GDP_historical_data.csv","Import_Data_WITS.csv","Population_colombo.csv","Sales_IOC.csv","Transport_Report_Tables.csv" ]

In [3]:
ceypetco_sales_l_df = pd.read_csv(files[0]) # int64
ceypetco_sales_m_df = pd.read_csv(files[1]) # int64
ciec_df = pd.read_csv(files[2]) # float64
gdp_df = pd.read_csv(files[3]) # converted to float64
import_df = pd.read_csv(files[4]) # contains float64, int64 for years, object for the partner names and countries
population_df = pd.read_csv(files[5]) # converted float64
ioc_sales_df = pd.read_csv(files[6]) # converted to float64
transport_df = pd.read_csv(files[7]) # converted to float64


In [4]:
# Converting the ioc_sales data into float
ioc_sales_df = ioc_sales_df.replace({r'[\n,]': ''}, regex=True)
ioc_sales_df = ioc_sales_df.astype(float)

In [5]:
# Converting the gdp historical data into float
gdp_df = gdp_df.replace({r'[$%B,]':''},regex=True)
gdp_df = gdp_df.replace('',np.nan)
gdp_df = gdp_df.astype(float)

In [6]:
# converting the population df to float
population_df = population_df.replace({r'[,%]':''},regex=True)
population_df = population_df.replace('',np.nan)
population_df = population_df.astype(float)

In [7]:
# converting the transport df to float
transport_df_str = transport_df.iloc[:,0:2]
transport_df_n = transport_df.iloc[:, 2:].replace({r'[,%]':''},regex=True)
transport_df_n = transport_df_n.astype(float)
transport_df = transport_df_str.join(transport_df_n)

In [8]:
ciec_df['date'] = pd.to_datetime((ciec_df['date']))

In [9]:
ciec_df.columns

Index(['date', 'fuel_consumption', 'petroleum_imports_crudeOil',
       'Taxes_on_Customs_and_Other_Import Duties ',
       'Foreign Direct Investments', 'GDP Goods and Services',
       'GDP: Gross National Income', 'Government Debt',
       'New Vehicle Registrations', 'Vehicle Sales', 'Port Stay Duration',
       'Vehicle Sales Asia', 'No.of Vessels Colombo',
       'Imports of Refined Products', 'Colombo port calls',
       'Tax income profits_gains', 'Tax on Export', 'Tax Goods & Services',
       'Tax Road Transport', 'GDP FCE Households', 'Diesel User Price',
       'Petrol User Price', 'Consumption_Oil', 'Sales 90 Octane',
       'Sales 95 Octane', 'Sales Auto Diesel', 'Household_income',
       'Fuel_other_manufacture'],
      dtype='object')

In [10]:
len(ciec_df.columns)

28

In [11]:
ciec_cols = ['fuel_consumption', 'petroleum_imports_crudeOil',
       'Taxes_on_Customs_and_Other_Import Duties ',
       'Foreign Direct Investments', 'GDP Goods and Services',
       'GDP: Gross National Income', 'Government Debt',
       'New Vehicle Registrations', 'Vehicle Sales', 'Port Stay Duration',
       'Vehicle Sales Asia', 'No.of Vessels Colombo',
       'Imports of Refined Products', 'Colombo port calls',
       'Tax income profits_gains', 'Tax on Export', 'Tax Goods & Services',
       'Tax Road Transport', 'GDP FCE Households', 'Diesel User Price',
       'Petrol User Price', 'Consumption_Oil', 'Sales 90 Octane',
       'Sales 95 Octane', 'Sales Auto Diesel', 'Household_income',
       'Fuel_other_manufacture']

### **Scaling and Normalizing the Data**

In [12]:
# making a copy
ciec = ciec_df.copy()

In [13]:
date = ciec_df['date']

In [14]:
scaler = MinMaxScaler()

In [15]:
ciec[ciec_cols] = scaler.fit_transform(ciec[ciec_cols])
ciec['date'] = date

### **Building the Demand Proxy Feature Column**

In the dataset of CIEC data, the supposed to be y variable 'fuel_consumption' contains data until 2014, so it is necessary to build a proxy variable with combination of other variables to determine the fuel demand in Sri Lanka.

In [16]:
# Method to calculate correlation against 'fuel_consumption' and other features
def fuel_consumption_corr(df,feature):
  # Get the columns of the features to be correlated agains fuel_consumption in ciec
  fuel_consumption = df[ciec['fuel_consumption'].notna()]
  feature_corr = df[df[feature].notna()]

  # Identifying the data ranges which both the columns are valid
  start_date = max(fuel_consumption['date'].min(),feature_corr['date'].min())
  end_date = min(fuel_consumption['date'].max(),feature_corr['date'].max())

  # Filtering the data range to only include the intersecting data range
  filtered_df = df[(df['date'] >= start_date) & (df['date'] <= end_date)]

  # Calculating the correlation
  correlation = filtered_df['fuel_consumption'].corr(filtered_df[feature])

  return correlation




There are 7 factors that can summarized as to which can affect demand.


*   Income of consumers
*   Price of substitutes
*   Price of complements
*   Changes in tast/Fashion
*   Seasonal
*   Advertising
*   Price

Features of the CIEC data dataset will be distributed accordingly to have a proper fixed "target variable"



In [17]:
income_of_consumers = [
    'GDP: Gross National Income', 'Foreign Direct Investments',
    'GDP FCE Households', 'Tax income profits_gains', 'Household_income'
]

price_of_substitutes = ['Sales 90 Octane', 'Sales 95 Octane','Sales Auto Diesel','Consumption_Oil']
price_of_complements = ['New Vehicle Registrations', 'Vehicle Sales','Vehicle Sales Asia','No.of Vessels Colombo','Port Stay Duration','Fuel_other_manufacture']
price = ['Diesel User Price','Petrol User Price']
seasonal = ['Imports of Refined Products','Colombo port calls']


In [18]:
demand_factors = income_of_consumers + price_of_substitutes + price_of_complements + price + seasonal

In [19]:
for cols in demand_factors:
  print(cols,":",fuel_consumption_corr(ciec,cols))

GDP: Gross National Income : 0.5771154097165387
Foreign Direct Investments : 0.016701608742766813
GDP FCE Households : 0.6213174075360737
Tax income profits_gains : 0.6797987093453305
Household_income : -0.7464069423338254
Sales 90 Octane : 0.7638219560761796
Sales 95 Octane : 0.7767870472492998
Sales Auto Diesel : 0.9642519644854153
Consumption_Oil : 0.9598220451897032
New Vehicle Registrations : 0.710940691420204
Vehicle Sales : 0.8283821219636843
Vehicle Sales Asia : -0.07213670624393759
No.of Vessels Colombo : 0.17888589812176617
Port Stay Duration : nan
Fuel_other_manufacture : nan
Diesel User Price : 0.42929679961766626
Petrol User Price : -0.8393110671118198
Imports of Refined Products : 0.6530356879648631
Colombo port calls : nan


  c /= stddev[:, None]
  c /= stddev[None, :]


Checking correlation with other features that are not in demand factors

In [20]:
selected_columns = set(income_of_consumers + price_of_complements + price_of_substitutes + price + seasonal)
other_cols = [col for col in ciec_cols if col not in selected_columns]

print(other_cols)


['fuel_consumption', 'petroleum_imports_crudeOil', 'Taxes_on_Customs_and_Other_Import Duties ', 'GDP Goods and Services', 'Government Debt', 'Tax on Export', 'Tax Goods & Services', 'Tax Road Transport']


In [21]:
for col in other_cols:
  print(col,":",fuel_consumption_corr(ciec,col))

fuel_consumption : 0.9999999999999998
petroleum_imports_crudeOil : 0.7242259405166621
Taxes_on_Customs_and_Other_Import Duties  : -0.7757441984816745
GDP Goods and Services : 0.7256692082264405
Government Debt : nan
Tax on Export : 0.4514010595209357
Tax Goods & Services : -0.7490537772963195
Tax Road Transport : 0.7285810018850449


Checking the correlation with the demand forecasting factors

In [22]:
def calculate_category_correlation(df, category_features, category_name):

    # Convert 'date' column to datetime if not already
    df['date'] = pd.to_datetime(df['date'])

    # Drop rows where 'fuel_consumption' is NaN
    df_valid_fuel = df.dropna(subset=['fuel_consumption'])

    # Drop rows where all category features are NaN
    df_valid_category = df.dropna(subset=category_features, how='all')

    # Find the overlapping date range where both 'fuel_consumption' and the category features exist
    start_date = max(df_valid_fuel['date'].min(), df_valid_category['date'].min())
    end_date = min(df_valid_fuel['date'].max(), df_valid_category['date'].max())

    # Filter data to include only the overlapping date range
    df_filtered = df[(df['date'] >= start_date) & (df['date'] <= end_date)]

    # Create a new column representing the aggregated category (e.g., mean of all category features)
    df_filtered[category_name] = df_filtered[category_features].mean(axis=1, skipna=True)

    # Compute correlation between 'fuel_consumption' and the aggregated category
    correlation = df_filtered[['fuel_consumption', category_name]].corr().iloc[0, 1]

    return correlation



In [23]:
print(ciec.columns)

Index(['date', 'fuel_consumption', 'petroleum_imports_crudeOil',
       'Taxes_on_Customs_and_Other_Import Duties ',
       'Foreign Direct Investments', 'GDP Goods and Services',
       'GDP: Gross National Income', 'Government Debt',
       'New Vehicle Registrations', 'Vehicle Sales', 'Port Stay Duration',
       'Vehicle Sales Asia', 'No.of Vessels Colombo',
       'Imports of Refined Products', 'Colombo port calls',
       'Tax income profits_gains', 'Tax on Export', 'Tax Goods & Services',
       'Tax Road Transport', 'GDP FCE Households', 'Diesel User Price',
       'Petrol User Price', 'Consumption_Oil', 'Sales 90 Octane',
       'Sales 95 Octane', 'Sales Auto Diesel', 'Household_income',
       'Fuel_other_manufacture'],
      dtype='object')


In [24]:
print(income_of_consumers)

['GDP: Gross National Income', 'Foreign Direct Investments', 'GDP FCE Households', 'Tax income profits_gains', 'Household_income']


In [25]:
# Compute correlation between 'fuel_consumption' and 'income_of_consumers'
correlation = calculate_category_correlation(ciec, income_of_consumers, "Income_of_Consumers")

print(f"Correlation between fuel_consumption and Income of Consumers: {correlation}")

Correlation between fuel_consumption and Income of Consumers: 0.01178330649220857


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[category_name] = df_filtered[category_features].mean(axis=1, skipna=True)


In [26]:
# Compute correlation between 'fuel_consumption' and 'income_of_consumers'
correlation = calculate_category_correlation(ciec, price_of_substitutes, "Price_Of_Substitutes")

print(f"Correlation between fuel_consumption and Price of Substitute: {correlation}")

Correlation between fuel_consumption and Price of Substitute: 0.9567724317957925


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[category_name] = df_filtered[category_features].mean(axis=1, skipna=True)


In [27]:
# Compute correlation between 'fuel_consumption' and 'income_of_consumers'
correlation = calculate_category_correlation(ciec, price_of_complements, "Price_Of_Complements")

print(f"Correlation between fuel_consumption and Price of Complements: {correlation}")

Correlation between fuel_consumption and Price of Complements: 0.78387068377676


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[category_name] = df_filtered[category_features].mean(axis=1, skipna=True)


In [28]:
# Compute correlation between 'fuel_consumption' and 'income_of_consumers'
correlation = calculate_category_correlation(ciec, price, "Price")

print(f"Correlation between fuel_consumption and Price: {correlation}")

Correlation between fuel_consumption and Price: -0.15192262464733997


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[category_name] = df_filtered[category_features].mean(axis=1, skipna=True)


In [29]:
# Compute correlation between 'fuel_consumption' and 'income_of_consumers'
correlation = calculate_category_correlation(ciec, seasonal, "Seasonal")

print(f"Correlation between fuel_consumption and Seasonal: {correlation}")

Correlation between fuel_consumption and Seasonal: 0.6530356879648683


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[category_name] = df_filtered[category_features].mean(axis=1, skipna=True)


Setting the proxy target variable

In [30]:
w1= 1.0 # price of substitutes
w2= 0.4 # price of complements
w3= 0.4 # Sales 90 Octane
w4= 0.4 # Sales 95 Octane
w5= 1.0 # Sales Auto Diesel
w6= 1.0 # Consumption Oil
w7= 0.7 # Vehicle Sales
w8= -0.7 # Petrol User Price
w9= -0.5 # Taxes on Customs and Other Imports

In [153]:
# + (w2*ciec[price_of_complements].mean(axis=1, skipna=True))
# (w3*ciec['Sales 90 Octane']).fillna(0) + (w4*ciec['Sales 95 Octane']).fillna(0) +
#
#  + (w8*ciec['Petrol User Price'].fillna(0).fillna(0)) +
#

demand = ((w1*ciec[price_of_substitutes].mean(axis=1, skipna=True)) +(w5*ciec['Sales Auto Diesel']).fillna(0)+(ciec['fuel_consumption'].fillna(0))
 + (w9*ciec['Taxes_on_Customs_and_Other_Import Duties '].fillna(0)) )

#
# + (w7*ciec['Vehicle Sales'].fillna(0))
# (w6*ciec['Consumption_Oil'].fillna(0))

In [154]:
ciec['demand'] = demand

In [155]:
ciec[['date', 'fuel_consumption', 'demand']].dropna()

Unnamed: 0,date,fuel_consumption,demand
0,1971-01-04,0.090559,0.118957
1,1971-01-11,0.090559,0.118957
2,1971-01-18,0.090559,0.118957
3,1971-01-25,0.090559,0.118957
4,1971-02-01,0.090559,0.118957
...,...,...,...
2291,2014-12-01,1.000000,2.370318
2292,2014-12-08,1.000000,2.370318
2293,2014-12-15,1.000000,2.370318
2294,2014-12-22,1.000000,2.370318


In [156]:
ciec_df['fuel_consumption'].corr(ciec['demand'])

0.9855394883281546