In [39]:
import pandas as pd
import numpy as np
import seaborn as sns
import datetime
import matplotlib.pyplot as plt
from scipy.stats import randint
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor 
from sklearn.linear_model import LinearRegression, SGDRegressor
from xgboost import XGBRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

## Training models on data from 2018-2021

### This notebook contains the modeling without categorical encoding and without scaling

In [40]:
# Load the 2018-2021 training data.
# Floats are rendered with two decimals in every DataFrame shown below.
pd.options.display.float_format = "{:.2f}".format
# NOTE(review): absolute path to an external volume is hard-coded here; the notebook
# only runs on a machine where this volume is mounted.
train_csv_path = r"/Volumes/Extreme Pro/WBS Bootcamp Data Science/Final Project/Data/For Modeling/Final DF_gasprice_2018-2021.csv"
data = pd.read_csv(train_csv_path)

In [41]:
# Preview the first rows of the freshly loaded training frame
data.head()

Unnamed: 0.1,Unnamed: 0,day,month,month_no,year,Gas supply by Russia,Gas Consumption,price,inflation rate
0,1,1,January,1,2018,177.5,3118.4,10.16,1.3
1,2,2,January,1,2018,176.3,4055.6,19.32,1.3
2,3,3,January,1,2018,175.1,4470.1,19.32,1.3
3,4,4,January,1,2018,175.5,4203.2,19.2,1.3
4,5,5,January,1,2018,176.2,4190.7,18.91,1.3


In [42]:
# Remove the first column by position (in the preview above it only repeats a
# 1-based row counter), then preview the result.
# NOTE: positional drop is not idempotent; re-running this cell removes `day` next.
first_column = data.columns[0]
data = data.drop(columns=first_column)
data.head()

Unnamed: 0,day,month,month_no,year,Gas supply by Russia,Gas Consumption,price,inflation rate
0,1,January,1,2018,177.5,3118.4,10.16,1.3
1,2,January,1,2018,176.3,4055.6,19.32,1.3
2,3,January,1,2018,175.1,4470.1,19.32,1.3
3,4,January,1,2018,175.5,4203.2,19.2,1.3
4,5,January,1,2018,176.2,4190.7,18.91,1.3


In [43]:
# Check column dtypes and non-null counts after dropping the first column
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1461 entries, 0 to 1460
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   day                   1461 non-null   int64  
 1   month                 1461 non-null   object 
 2   month_no              1461 non-null   int64  
 3   year                  1461 non-null   int64  
 4   Gas supply by Russia  1461 non-null   float64
 5   Gas Consumption       1461 non-null   float64
 6   price                 1461 non-null   float64
 7   inflation rate        1461 non-null   float64
dtypes: float64(4), int64(3), object(1)
memory usage: 91.4+ KB


In [44]:
# Position 1 holds the month *name* (object dtype); drop it so only numeric columns remain.
data = data.drop(columns=data.columns[1])
# After the drop, position 1 is `month_no`; give it the plain "month" label.
data = data.rename(columns={data.columns[1]: "month"})
data

Unnamed: 0,day,month,year,Gas supply by Russia,Gas Consumption,price,inflation rate
0,1,1,2018,177.50,3118.40,10.16,1.30
1,2,1,2018,176.30,4055.60,19.32,1.30
2,3,1,2018,175.10,4470.10,19.32,1.30
3,4,1,2018,175.50,4203.20,19.20,1.30
4,5,1,2018,176.20,4190.70,18.91,1.30
...,...,...,...,...,...,...,...
1456,27,12,2021,225.30,4442.80,106.89,5.00
1457,28,12,2021,223.90,4063.20,106.59,5.00
1458,29,12,2021,225.30,3208.60,96.48,5.00
1459,30,12,2021,219.60,2880.00,87.03,5.00


In [45]:
# Put the six feature columns first and the target `price` last
column_order = [
    "day",
    "month",
    "year",
    "Gas supply by Russia",
    "Gas Consumption",
    "inflation rate",
    "price",
]
data = data[column_order]
data

Unnamed: 0,day,month,year,Gas supply by Russia,Gas Consumption,inflation rate,price
0,1,1,2018,177.50,3118.40,1.30,10.16
1,2,1,2018,176.30,4055.60,1.30,19.32
2,3,1,2018,175.10,4470.10,1.30,19.32
3,4,1,2018,175.50,4203.20,1.30,19.20
4,5,1,2018,176.20,4190.70,1.30,18.91
...,...,...,...,...,...,...,...
1456,27,12,2021,225.30,4442.80,5.00,106.89
1457,28,12,2021,223.90,4063.20,5.00,106.59
1458,29,12,2021,225.30,3208.60,5.00,96.48
1459,30,12,2021,219.60,2880.00,5.00,87.03


In [46]:
# Re-check dtypes after the column clean-up and reordering
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1461 entries, 0 to 1460
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   day                   1461 non-null   int64  
 1   month                 1461 non-null   int64  
 2   year                  1461 non-null   int64  
 3   Gas supply by Russia  1461 non-null   float64
 4   Gas Consumption       1461 non-null   float64
 5   inflation rate        1461 non-null   float64
 6   price                 1461 non-null   float64
dtypes: float64(4), int64(3)
memory usage: 80.0 KB


In [47]:
# Split into feature matrix X and target y.
# DataFrame.drop returns a new frame, so `data` keeps its `price` column and this
# cell can be re-run safely (an in-place pop on an alias of `data` would remove the
# target from `data` itself and raise KeyError on the second run).
X = data.drop(columns="price")
y = data["price"]

In [48]:
# Hold out 20% of the rows as a test set; the seed is fixed so the split is reproducible.
# NOTE(review): the rows are daily observations and train_test_split shuffles by default,
# so test days sit between training days. Scores on this split may be optimistic compared
# with a chronological hold-out (shuffle=False) - confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=123000,
)

In [49]:
# Preview the training features (the target column is no longer part of X)
X_train.head()

Unnamed: 0,day,month,year,Gas supply by Russia,Gas Consumption,inflation rate
460,6,4,2019,173.7,405.5,1.7
482,28,4,2019,171.7,410.5,1.7
113,24,4,2018,173.2,178.0,1.2
109,20,4,2018,173.1,187.6,1.2
818,29,3,2020,190.4,1964.8,0.7


## Linear Regression

In [50]:
# Baseline linear regression on the raw (unscaled, unencoded) features
LR = LinearRegression()
LR.fit(X_train, y_train)  # fit on the training split

# Predicted prices for each split (despite the "X_" prefix, these hold predicted y values)
X_test_pred_LR = LR.predict(X_test)
X_train_pred_LR = LR.predict(X_train)

# LinearRegression.score() reports the R^2 of the prediction
print("LR training score is", LR.score(X_train, y_train))
print("LR testing score is", LR.score(X_test, y_test))

LR trainind score is 0.8550828420609261
LR testing score is 0.8881939182467192


## Random Forest

In [51]:
# Baseline random forest with default hyperparameters.
# RandomForestRegressor is already imported in the imports cell at the top of the
# notebook, so it is not re-imported here.
# random_state is fixed (same seed as the train/test split) so the fitted forest and
# its scores are reproducible after a kernel restart.
RF = RandomForestRegressor(random_state=123000)
RF.fit(X_train, y_train)  # fit on the training split

# Predicted prices for each split (despite the "x_"/"X_" prefix, these hold predicted y values)
x_test_pred_RF = RF.predict(X_test)
X_train_pred_RF = RF.predict(X_train)

# RandomForestRegressor.score() reports the R^2 of the prediction
print("RF score for training is", RF.score(X_train, y_train))
print("RF score for testing is", RF.score(X_test, y_test))

RF score for training is 0.9984619692396061
RF score for testing is 0.9902062864925921


## Comparing 5 different models

In [52]:
# Fit five regressors with default hyperparameters on the same training split.
# RF and LR are refit here and replace the models from the two baseline cells.
# Every estimator with a random component gets the same fixed seed as the
# train/test split, so the comparison is reproducible after a kernel restart.
RF = RandomForestRegressor(random_state=123000).fit(X_train, y_train)
DT = DecisionTreeRegressor(random_state=123000).fit(X_train, y_train)
GBR = GradientBoostingRegressor(random_state=123000).fit(X_train, y_train)
LR = LinearRegression().fit(X_train, y_train)  # deterministic, no seed needed
XGB = XGBRegressor(random_state=123000).fit(X_train, y_train)

In [53]:
# Evaluation metrics on the held-out test split, one list entry per model.
# The order of `models` must match the `Models` labels used to build the table.
models = [LR, DT, RF, GBR, XGB]

# Predict once per model and reuse the result for all three metrics
test_split_preds = [mod.predict(X_test) for mod in models]

RMSE = [mean_squared_error(y_test, pred) ** 0.5 for pred in test_split_preds]  # sqrt of MSE
MAPE = [mean_absolute_percentage_error(y_test, pred) for pred in test_split_preds]
R2_Score = [r2_score(y_test, pred) for pred in test_split_preds]

In [54]:
# One row per model; labels are listed in the same order as the `models` list
Models = [
    "Linear Regression",
    "Decision Tree",
    "Random Forest",
    "Gradient Boosting",
    "XgBoost",
]
metric_columns = {"Models": Models, "RMSE": RMSE, "MAPE": MAPE, "R2_Score": R2_Score}
evaluation = pd.DataFrame(metric_columns)

In [55]:
# Comparison table; at this point it holds the metrics computed on the test split
evaluation

Unnamed: 0,Models,RMSE,MAPE,R2_Score
0,Linear Regression,7.84,0.33,0.89
1,Decision Tree,3.94,0.04,0.97
2,Random Forest,2.22,0.03,0.99
3,Gradient Boosting,2.7,0.07,0.99
4,XgBoost,2.34,0.04,0.99


In [56]:
# Evaluation metrics on the training split, one list entry per model.
# This overwrites the RMSE / MAPE / R2_Score lists computed for the test split.
models = [LR, DT, RF, GBR, XGB]

# Predict once per model and reuse the result for all three metrics
train_split_preds = [mod.predict(X_train) for mod in models]

RMSE = [mean_squared_error(y_train, pred) ** 0.5 for pred in train_split_preds]  # sqrt of MSE

MAPE = [mean_absolute_percentage_error(y_train, pred) for pred in train_split_preds]

R2_Score = [r2_score(y_train, pred) for pred in train_split_preds]

In [57]:
# One row per model; this rebuilds `evaluation` from the current metric lists
# and replaces the table built earlier.
Models = [
    'Linear Regression',
    'Decision Tree',
    'Random Forest',
    'Gradient Boosting',
    'XgBoost',
]
metric_columns = {'Models': Models, 'RMSE': RMSE, 'MAPE': MAPE, 'R2_Score': R2_Score}
evaluation = pd.DataFrame(metric_columns)

In [58]:
# Comparison table; at this point it holds the metrics computed on the training split
evaluation

Unnamed: 0,Models,RMSE,MAPE,R2_Score
0,Linear Regression,8.32,0.31,0.86
1,Decision Tree,0.0,0.0,1.0
2,Random Forest,0.88,0.01,1.0
3,Gradient Boosting,1.87,0.06,0.99
4,XgBoost,0.16,0.01,1.0


## Predict prices for 2022

In [59]:
# Load the 2022 data the fitted model will be applied to, then report (rows, columns).
# NOTE(review): absolute path to an external volume is hard-coded here.
csv_path_2022 = r"/Volumes/Extreme Pro/WBS Bootcamp Data Science/Final Project/Data/For Modeling/Final DF_gasprice_2022.csv"
test_data_original = pd.read_csv(csv_path_2022)
test_data_original.shape

(116, 9)

In [60]:
# Display the 2022 frame as loaded (it still contains the actual `price` column)
test_data_original

Unnamed: 0.1,Unnamed: 0,day,month,month_no,year,Gas supply by Russia,Gas Consumption,inflation rate,price
0,1,1,January,1,2022,184.30,1898.40,5.10,73.71
1,2,2,January,1,2022,186.10,2377.00,5.10,77.07
2,3,3,January,1,2022,185.90,3188.20,5.10,80.44
3,4,4,January,1,2022,187.30,3961.60,5.10,88.74
4,5,5,January,1,2022,189.10,4843.60,5.10,91.52
...,...,...,...,...,...,...,...,...,...
111,112,22,April,4,2022,196.80,364.70,7.40,94.88
112,113,23,April,4,2022,195.50,175.30,7.40,94.19
113,114,24,April,4,2022,195.40,206.30,7.40,93.52
114,115,25,April,4,2022,196.70,657.00,7.40,92.83


In [61]:
# Keep only model inputs: remove the columns at positions 0 and 8, which in the frame
# shown above are the leftover row counter and the actual `price`.
# The original frame is left untouched so the true prices stay available.
cols_to_drop = test_data_original.columns[[0, 8]]
test_data = test_data_original.drop(columns=cols_to_drop)
test_data

Unnamed: 0,day,month,month_no,year,Gas supply by Russia,Gas Consumption,inflation rate
0,1,January,1,2022,184.30,1898.40,5.10
1,2,January,1,2022,186.10,2377.00,5.10
2,3,January,1,2022,185.90,3188.20,5.10
3,4,January,1,2022,187.30,3961.60,5.10
4,5,January,1,2022,189.10,4843.60,5.10
...,...,...,...,...,...,...,...
111,22,April,4,2022,196.80,364.70,7.40
112,23,April,4,2022,195.50,175.30,7.40
113,24,April,4,2022,195.40,206.30,7.40
114,25,April,4,2022,196.70,657.00,7.40


In [62]:
# Same clean-up as for the training data: position 1 holds the month *name*, drop it.
test_data = test_data.drop(columns=test_data.columns[1])
# After the drop, position 1 is `month_no`; give it the plain "month" label.
test_data = test_data.rename(columns={test_data.columns[1]: "month"})
test_data

Unnamed: 0,day,month,year,Gas supply by Russia,Gas Consumption,inflation rate
0,1,1,2022,184.30,1898.40,5.10
1,2,1,2022,186.10,2377.00,5.10
2,3,1,2022,185.90,3188.20,5.10
3,4,1,2022,187.30,3961.60,5.10
4,5,1,2022,189.10,4843.60,5.10
...,...,...,...,...,...,...
111,22,4,2022,196.80,364.70,7.40
112,23,4,2022,195.50,175.30,7.40
113,24,4,2022,195.40,206.30,7.40
114,25,4,2022,196.70,657.00,7.40


In [63]:
# Predict the 2022 prices with the random forest (`RF` is the model refit in the
# five-model cell). test_data has the same six feature columns, in the same order,
# as the training features shown in X_train.head().
test_preds = RF.predict(test_data)

In [64]:
# Wrap the predictions in a single-column DataFrame (default integer index)
result = pd.DataFrame(test_preds, columns=['Predict_price'])

In [65]:
# Display the predicted 2022 prices
result

Unnamed: 0,Predict_price
0,69.24
1,69.55
2,70.81
3,72.39
4,75.69
...,...
111,89.32
112,89.06
113,88.91
114,87.20


In [67]:
# Write the predictions to a CSV in the current working directory, without the index column
result.to_csv("Predict_price_wo ENC_wo Scal_2022.csv", index= False)