In [608]:
import pandas as pd
import numpy as np
import seaborn as sns
import datetime
import matplotlib.pyplot as plt
from scipy.stats import randint
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor 
from sklearn.linear_model import LinearRegression, SGDRegressor
from xgboost import XGBRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

### This notebook contains the modeling without categorical encoding and without scaling

## Training the models on data from May 2021 - April 2022

In [609]:
# Source CSVs: one file covering 2018-2021 and one covering 2022.
path_20xx = r"/Volumes/Extreme Pro/WBS Bootcamp Data Science/Final Project/Data/For Modeling/Final DF_gasprice_2018-2021.csv"
path_2022 = r"/Volumes/Extreme Pro/WBS Bootcamp Data Science/Final Project/Data/For Modeling/Final DF_gasprice_2022.csv"

data_20xx = pd.read_csv(path_20xx)
data_2022 = pd.read_csv(path_2022)

In [610]:
# Display the 2018-2021 frame exactly as loaded from the CSV.
data_20xx

Unnamed: 0.1,Unnamed: 0,day,month,month_no,year,Gas supply by Russia,Gas Consumption,price,inflation rate
0,1,1,January,1,2018,177.50,3118.40,10.16,1.30
1,2,2,January,1,2018,176.30,4055.60,19.32,1.30
2,3,3,January,1,2018,175.10,4470.10,19.32,1.30
3,4,4,January,1,2018,175.50,4203.20,19.20,1.30
4,5,5,January,1,2018,176.20,4190.70,18.91,1.30
...,...,...,...,...,...,...,...,...,...
1456,1457,27,December,12,2021,225.30,4442.80,106.89,5.00
1457,1458,28,December,12,2021,223.90,4063.20,106.59,5.00
1458,1459,29,December,12,2021,225.30,3208.60,96.48,5.00
1459,1460,30,December,12,2021,219.60,2880.00,87.03,5.00


In [611]:
# Remove the leftover index column that pandas labels "Unnamed: ..." when a CSV
# was saved together with its index. Matching on the column name (rather than
# dropping whatever sits at position 0) keeps this cell idempotent: running it
# again leaves `day`, `month`, ... untouched.
is_leftover_index = data_20xx.columns.str.startswith("Unnamed")
data_20xx = data_20xx.loc[:, ~is_leftover_index]
data_20xx.head()

Unnamed: 0,day,month,month_no,year,Gas supply by Russia,Gas Consumption,price,inflation rate
0,1,January,1,2018,177.5,3118.4,10.16,1.3
1,2,January,1,2018,176.3,4055.6,19.32,1.3
2,3,January,1,2018,175.1,4470.1,19.32,1.3
3,4,January,1,2018,175.5,4203.2,19.2,1.3
4,5,January,1,2018,176.2,4190.7,18.91,1.3


In [612]:
# Check column dtypes and non-null counts of the 2018-2021 frame.
data_20xx.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1461 entries, 0 to 1460
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   day                   1461 non-null   int64  
 1   month                 1461 non-null   object 
 2   month_no              1461 non-null   int64  
 3   year                  1461 non-null   int64  
 4   Gas supply by Russia  1461 non-null   float64
 5   Gas Consumption       1461 non-null   float64
 6   price                 1461 non-null   float64
 7   inflation rate        1461 non-null   float64
dtypes: float64(4), int64(3), object(1)
memory usage: 91.4+ KB


In [613]:
# Keep only the rows from May 2021 through December 2021.
months_may_to_dec = ["May", "June", "July", "August", "September", "October", "November", "December"]
is_2021 = data_20xx["year"] == 2021
in_selected_months = data_20xx["month"].isin(months_may_to_dec)
data_20xx = data_20xx.loc[is_2021 & in_selected_months]
data_20xx

Unnamed: 0,day,month,month_no,year,Gas supply by Russia,Gas Consumption,price,inflation rate
1216,1,May,5,2021,203.90,352.50,23.46,2.00
1217,2,May,5,2021,204.70,371.70,23.64,2.00
1218,3,May,5,2021,207.40,914.60,23.82,2.00
1219,4,May,5,2021,206.70,804.20,23.05,2.00
1220,5,May,5,2021,210.20,1415.70,23.96,2.00
...,...,...,...,...,...,...,...,...
1456,27,December,12,2021,225.30,4442.80,106.89,5.00
1457,28,December,12,2021,223.90,4063.20,106.59,5.00
1458,29,December,12,2021,225.30,3208.60,96.48,5.00
1459,30,December,12,2021,219.60,2880.00,87.03,5.00


In [614]:
# Display the 2022 frame exactly as loaded from the CSV.
data_2022

Unnamed: 0.1,Unnamed: 0,day,month,month_no,year,Gas supply by Russia,Gas Consumption,inflation rate,price
0,1,1,January,1,2022,184.30,1898.40,5.10,73.71
1,2,2,January,1,2022,186.10,2377.00,5.10,77.07
2,3,3,January,1,2022,185.90,3188.20,5.10,80.44
3,4,4,January,1,2022,187.30,3961.60,5.10,88.74
4,5,5,January,1,2022,189.10,4843.60,5.10,91.52
...,...,...,...,...,...,...,...,...,...
111,112,22,April,4,2022,196.80,364.70,7.40,94.88
112,113,23,April,4,2022,195.50,175.30,7.40,94.19
113,114,24,April,4,2022,195.40,206.30,7.40,93.52
114,115,25,April,4,2022,196.70,657.00,7.40,92.83


In [615]:
# Remove the leftover index column that pandas labels "Unnamed: ..." when a CSV
# was saved together with its index. Matching on the column name (rather than
# dropping whatever sits at position 0) keeps this cell idempotent: running it
# again leaves `day`, `month`, ... untouched.
is_leftover_index = data_2022.columns.str.startswith("Unnamed")
data_2022 = data_2022.loc[:, ~is_leftover_index]
data_2022.head()

Unnamed: 0,day,month,month_no,year,Gas supply by Russia,Gas Consumption,inflation rate,price
0,1,January,1,2022,184.3,1898.4,5.1,73.71
1,2,January,1,2022,186.1,2377.0,5.1,77.07
2,3,January,1,2022,185.9,3188.2,5.1,80.44
3,4,January,1,2022,187.3,3961.6,5.1,88.74
4,5,January,1,2022,189.1,4843.6,5.1,91.52


In [616]:
# Check column dtypes and non-null counts of the 2022 frame.
data_2022.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116 entries, 0 to 115
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   day                   116 non-null    int64  
 1   month                 116 non-null    object 
 2   month_no              116 non-null    int64  
 3   year                  116 non-null    int64  
 4   Gas supply by Russia  116 non-null    float64
 5   Gas Consumption       116 non-null    float64
 6   inflation rate        116 non-null    float64
 7   price                 116 non-null    float64
dtypes: float64(4), int64(3), object(1)
memory usage: 7.4+ KB


In [617]:
# Keep only the rows from January to April 2022.
months_jan_to_apr = ["January", "February", "March", "April"]
in_selected_months = data_2022["month"].isin(months_jan_to_apr)
data_2022 = data_2022[in_selected_months]
data_2022

Unnamed: 0,day,month,month_no,year,Gas supply by Russia,Gas Consumption,inflation rate,price
0,1,January,1,2022,184.30,1898.40,5.10,73.71
1,2,January,1,2022,186.10,2377.00,5.10,77.07
2,3,January,1,2022,185.90,3188.20,5.10,80.44
3,4,January,1,2022,187.30,3961.60,5.10,88.74
4,5,January,1,2022,189.10,4843.60,5.10,91.52
...,...,...,...,...,...,...,...,...
111,22,April,4,2022,196.80,364.70,7.40,94.88
112,23,April,4,2022,195.50,175.30,7.40,94.19
113,24,April,4,2022,195.40,206.30,7.40,93.52
114,25,April,4,2022,196.70,657.00,7.40,92.83


In [618]:
# Stack the 2021 slice and the 2022 slice into one frame with a fresh 0..n-1 index.
# concat aligns columns by name, so the different column order of the two
# frames (price / inflation rate swapped) does not matter.
data = pd.concat([data_20xx, data_2022], ignore_index=True)
data

Unnamed: 0,day,month,month_no,year,Gas supply by Russia,Gas Consumption,price,inflation rate
0,1,May,5,2021,203.90,352.50,23.46,2.00
1,2,May,5,2021,204.70,371.70,23.64,2.00
2,3,May,5,2021,207.40,914.60,23.82,2.00
3,4,May,5,2021,206.70,804.20,23.05,2.00
4,5,May,5,2021,210.20,1415.70,23.96,2.00
...,...,...,...,...,...,...,...,...
356,22,April,4,2022,196.80,364.70,94.88,7.40
357,23,April,4,2022,195.50,175.30,94.19,7.40
358,24,April,4,2022,195.40,206.30,93.52,7.40
359,25,April,4,2022,196.70,657.00,92.83,7.40


In [619]:
# Focus on the influence of gas supply on price: keep only the month number,
# the Russian gas supply and the target `price`.
# Columns are selected by name instead of dropped by position, so the cell does
# not depend on the column order of the CSVs and can be re-run safely.
model_columns = ["month_no", "Gas supply by Russia", "price"]
data = data[model_columns]
data

Unnamed: 0,month_no,Gas supply by Russia,price
0,5,203.90,23.46
1,5,204.70,23.64
2,5,207.40,23.82
3,5,206.70,23.05
4,5,210.20,23.96
...,...,...,...
356,4,196.80,94.88
357,4,195.50,94.19
358,4,195.40,93.52
359,4,196.70,92.83


In [620]:
# Split into features X and target y.
# `data` itself is left untouched: X is a new frame without the target column,
# so `data` keeps its `price` column and this cell can be re-run without a KeyError.
X = data.drop(columns="price")
y = data["price"]

In [621]:
# 80/20 train/test split; the fixed random_state makes the partition repeatable.
# NOTE(review): the rows are consecutive days and the split shuffles them, so
# test days are interleaved with training days - confirm that a random (rather
# than chronological) split is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=123000,
)

In [622]:
# Peek at the first ten test rows (the index keeps the row labels from the combined frame).
X_test.head(10)

Unnamed: 0,month_no,Gas supply by Russia
77,7,53.1
354,4,200.6
348,4,199.7
117,8,221.8
331,3,215.9
135,9,202.2
193,11,203.5
288,2,205.2
64,7,216.0
152,9,196.1


## Linear Regression

In [623]:
# Baseline: ordinary least squares on the two features.
LR = LinearRegression().fit(X_train, y_train)

# Predictions for both splits.
X_train_pred_LR = LR.predict(X_train)
X_test_pred_LR = LR.predict(X_test)

# .score() of a regressor is the R^2 on the given data.
lr_train_score = LR.score(X_train, y_train)
lr_test_score = LR.score(X_test, y_test)
print("LR trainind score is",lr_train_score)
print("LR testing score is",lr_test_score)

LR trainind score is 0.025055339469254978
LR testing score is 0.005126233281866899


## Random Forest

In [624]:
# Random forest with default hyper-parameters.
# RandomForestRegressor is already imported in the notebook's import cell, so it
# is not imported again here. The fixed random_state (same value as the
# train/test split) makes the forest - and the scores printed below -
# reproducible from run to run.
RF = RandomForestRegressor(random_state=123000)
RF.fit(X_train, y_train)  # fit on the training split

x_test_pred_RF = RF.predict(X_test)    # predictions for the test split
X_train_pred_RF = RF.predict(X_train)  # predictions for the training split

# .score() of a regressor is the R^2 on the given data.
print("RF score for training is",RF.score(X_train,y_train))
print("RF score for testing is",RF.score(X_test,y_test))

RF score for training is 0.9713462068605712
RF score for testing is 0.7265170169983104


## Comparing 5 different models

In [625]:
# Fit five candidate regressors on the same training split, all with default
# hyper-parameters. The randomised learners get a fixed random_state so that the
# comparison tables and the scenario forecasts (which use `RF`) are reproducible.
# Note: this re-fits `RF` and `LR`, replacing the instances created further up.
SEED = 123000  # same value as used for the train/test split
RF = RandomForestRegressor(random_state=SEED).fit(X_train, y_train)
DT = DecisionTreeRegressor(random_state=SEED).fit(X_train, y_train)
GBR = GradientBoostingRegressor(random_state=SEED).fit(X_train, y_train)
LR = LinearRegression().fit(X_train, y_train)  # closed-form fit, takes no seed
XGB = XGBRegressor(random_state=SEED).fit(X_train, y_train)

In [626]:
# Evaluation metrics on the test split, one entry per model in `models` order.
models = [LR, DT, RF, GBR, XGB]

# Predict once per model and reuse the result for all three metrics.
test_predictions = [mod.predict(X_test) for mod in models]

RMSE = [mean_squared_error(y_test, pred) ** 0.5 for pred in test_predictions]
MAPE = [mean_absolute_percentage_error(y_test, pred) for pred in test_predictions]
R2_Score = [r2_score(y_test, pred) for pred in test_predictions]

In [627]:
# Collect the test-split metrics of the five models in one comparison table.
Models = ["Linear Regression", "Decision Tree", "Random Forest", "Gradient Boosting", "XgBoost"]
evaluation = pd.DataFrame(
    {
        "Models": Models,
        "RMSE": RMSE,
        "MAPE": MAPE,
        "R2_Score": R2_Score,
    }
)

In [628]:
# Show the comparison table built from the test-split metrics.
evaluation

Unnamed: 0,Models,RMSE,MAPE,R2_Score
0,Linear Regression,42.56,0.51,0.01
1,Decision Tree,27.9,0.14,0.57
2,Random Forest,22.39,0.11,0.72
3,Gradient Boosting,23.72,0.13,0.69
4,XgBoost,26.18,0.14,0.62


In [629]:
# Evaluation metrics on the training split, one entry per model in `models` order.
# These reuse the names RMSE / MAPE / R2_Score, replacing the test-split values.
models = [LR, DT, RF, GBR, XGB]

# Predict once per model and reuse the result for all three metrics.
train_predictions = [mod.predict(X_train) for mod in models]

RMSE = [mean_squared_error(y_train, pred) ** 0.5 for pred in train_predictions]

MAPE = [mean_absolute_percentage_error(y_train, pred) for pred in train_predictions]

R2_Score = [r2_score(y_train, pred) for pred in train_predictions]

In [630]:
# Collect the training-split metrics of the five models in one comparison table.
# Note: this reuses the name `evaluation`, replacing the test-split table.
Models = ['Linear Regression', 'Decision Tree', 'Random Forest', 'Gradient Boosting', 'XgBoost']
evaluation = pd.DataFrame(
    {
        'Models': Models,
        'RMSE': RMSE,
        'MAPE': MAPE,
        'R2_Score': R2_Score,
    }
)

In [631]:
# Show the comparison table, now holding the training-split metrics.
evaluation

Unnamed: 0,Models,RMSE,MAPE,R2_Score
0,Linear Regression,34.15,0.57,0.03
1,Decision Tree,3.02,0.01,0.99
2,Random Forest,6.0,0.03,0.97
3,Gradient Boosting,6.28,0.06,0.97
4,XgBoost,3.06,0.01,0.99


#### Scenario 1 (Russia supplies the same amount of gas as in the previous 12 months): prediction for May 2022 - April 2023

In [632]:
# Load the daily supply values for Scenario 1 and report (rows, columns).
path_scen1 = r"/Volumes/Extreme Pro/WBS Bootcamp Data Science/Final Project/Data/For Modeling/Final DF_Supply_Scen1_daily.csv"
test_data_original_Scen1 = pd.read_csv(path_scen1)
test_data_original_Scen1.shape

(361, 6)

In [633]:
# Display the Scenario 1 input frame as loaded.
test_data_original_Scen1

Unnamed: 0.1,Unnamed: 0,day,month,month_no,year,Gas supply by Russia
0,0,1,May,5,2022,203.90
1,1,2,May,5,2022,204.70
2,2,3,May,5,2022,207.40
3,3,4,May,5,2022,206.70
4,4,5,May,5,2022,210.20
...,...,...,...,...,...,...
356,356,22,April,4,2023,196.80
357,357,23,April,4,2023,195.50
358,358,24,April,4,2023,195.40
359,359,25,April,4,2023,196.70


In [634]:
# Keep exactly the two features the models were fitted on, in the same order
# (month_no, Gas supply by Russia). Selecting by name avoids depending on the
# column positions of the scenario CSV.
feature_columns = ["month_no", "Gas supply by Russia"]
test_data_Scen1 = test_data_original_Scen1[feature_columns]
test_data_Scen1

Unnamed: 0,month_no,Gas supply by Russia
0,5,203.90
1,5,204.70
2,5,207.40
3,5,206.70
4,5,210.20
...,...,...
356,4,196.80
357,4,195.50
358,4,195.40
359,4,196.70


In [635]:
# Predict the Scenario 1 prices with the random forest (`RF` as last fitted in
# the five-model comparison section).
test_pred_scen1 = RF.predict(test_data_Scen1)

In [636]:
# Wrap the prediction array in a one-column DataFrame so it can be exported.
result_Scen1 = pd.DataFrame(test_pred_scen1, columns=["Predict_price"])

In [637]:
# Display the predicted prices for Scenario 1.
result_Scen1

Unnamed: 0,Predict_price
0,23.96
1,24.35
2,25.37
3,26.14
4,24.19
...,...
356,96.79
357,94.48
358,94.20
359,96.71


In [638]:
# Write the Scenario 1 forecast to the current working directory, without the index.
scen1_output_file = "Predict_price 2022-23_wo ENC_wo Scal_RF_Scen1.csv"
result_Scen1.to_csv(scen1_output_file, index=False)

#### Scenario 2 (Russia supplies 50% of the amount of gas supplied in the previous 12 months)

In [639]:
# Load the daily supply values for Scenario 2 and report (rows, columns).
path_scen2 = r"/Volumes/Extreme Pro/WBS Bootcamp Data Science/Final Project/Data/For Modeling/Final DF_Supply_Scen2_daily.csv"
test_data_original_Scen2 = pd.read_csv(path_scen2)
test_data_original_Scen2.shape

(361, 6)

In [640]:
# Display the Scenario 2 input frame as loaded.
test_data_original_Scen2

Unnamed: 0.1,Unnamed: 0,day,month,month_no,year,Gas supply by Russia
0,0,1,May,5,2022,101.95
1,1,2,May,5,2022,102.35
2,2,3,May,5,2022,103.70
3,3,4,May,5,2022,103.35
4,4,5,May,5,2022,105.10
...,...,...,...,...,...,...
356,356,22,April,4,2023,98.40
357,357,23,April,4,2023,97.75
358,358,24,April,4,2023,97.70
359,359,25,April,4,2023,98.35


In [641]:
# Keep exactly the two features the models were fitted on, in the same order
# (month_no, Gas supply by Russia). Selecting by name avoids depending on the
# column positions of the scenario CSV.
feature_columns = ["month_no", "Gas supply by Russia"]
test_data_Scen2 = test_data_original_Scen2[feature_columns]
test_data_Scen2

Unnamed: 0,month_no,Gas supply by Russia
0,5,101.95
1,5,102.35
2,5,103.70
3,5,103.35
4,5,105.10
...,...,...
356,4,98.40
357,4,97.75
358,4,97.70
359,4,98.35


In [642]:
# Predict the Scenario 2 prices with the random forest (`RF` as last fitted in
# the five-model comparison section).
# NOTE(review): the Scenario 2 supply values displayed above (~98-105) are about
# half of the supply values shown for the training rows (~196-210). The
# predictions displayed below are identical across consecutive days, which
# suggests the forest cannot distinguish these inputs - confirm that this
# scenario lies within a range the model can meaningfully predict.
test_pred_Scen2 = RF.predict(test_data_Scen2)

In [643]:
# Wrap the prediction array in a one-column DataFrame so it can be exported.
result_Scen2 = pd.DataFrame(test_pred_Scen2, columns=["Predict_price"])

In [644]:
# Display the predicted prices for Scenario 2.
result_Scen2

Unnamed: 0,Predict_price
0,24.87
1,24.87
2,24.87
3,24.87
4,24.87
...,...
356,98.39
357,98.39
358,98.39
359,98.39


In [645]:
# Write the Scenario 2 forecast to the current working directory, without the index.
scen2_output_file = "Predict_price 2022-23_wo ENC_wo Scal_RF_Scen2.csv"
result_Scen2.to_csv(scen2_output_file, index=False)