In [1]:
#import some libraries
import pandas as pd
import numpy as np
import seaborn as sns
import datetime
import matplotlib.pyplot as plt
from scipy.stats import randint
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor 
from sklearn.linear_model import LinearRegression, SGDRegressor
from xgboost import XGBRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [2]:
# reading training data (2018-2021)
pd.options.display.float_format = "{:.2f}".format
data = pd.read_csv(r"/Volumes/Extreme Pro/WBS Bootcamp Data Science/Final Project/Data/For Modeling/Final DF_gasprice_2018-2021.csv")

In [3]:
data

Unnamed: 0.1,Unnamed: 0,day,month,month_no,year,Gas supply by Russia,Gas Consumption,price,inflation rate
0,1,1,January,1,2018,177.50,3118.40,10.16,1.30
1,2,2,January,1,2018,176.30,4055.60,19.32,1.30
2,3,3,January,1,2018,175.10,4470.10,19.32,1.30
3,4,4,January,1,2018,175.50,4203.20,19.20,1.30
4,5,5,January,1,2018,176.20,4190.70,18.91,1.30
...,...,...,...,...,...,...,...,...,...
1456,1457,27,December,12,2021,225.30,4442.80,106.89,5.00
1457,1458,28,December,12,2021,223.90,4063.20,106.59,5.00
1458,1459,29,December,12,2021,225.30,3208.60,96.48,5.00
1459,1460,30,December,12,2021,219.60,2880.00,87.03,5.00


In [4]:
data = data.drop(data.columns[0], axis = 1)
data.head()

Unnamed: 0,day,month,month_no,year,Gas supply by Russia,Gas Consumption,price,inflation rate
0,1,January,1,2018,177.5,3118.4,10.16,1.3
1,2,January,1,2018,176.3,4055.6,19.32,1.3
2,3,January,1,2018,175.1,4470.1,19.32,1.3
3,4,January,1,2018,175.5,4203.2,19.2,1.3
4,5,January,1,2018,176.2,4190.7,18.91,1.3


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1461 entries, 0 to 1460
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   day                   1461 non-null   int64  
 1   month                 1461 non-null   object 
 2   month_no              1461 non-null   int64  
 3   year                  1461 non-null   int64  
 4   Gas supply by Russia  1461 non-null   float64
 5   Gas Consumption       1461 non-null   float64
 6   price                 1461 non-null   float64
 7   inflation rate        1461 non-null   float64
dtypes: float64(4), int64(3), object(1)
memory usage: 91.4+ KB


In [6]:
data = data.drop(data.columns[2], axis = 1)
data

Unnamed: 0,day,month,year,Gas supply by Russia,Gas Consumption,price,inflation rate
0,1,January,2018,177.50,3118.40,10.16,1.30
1,2,January,2018,176.30,4055.60,19.32,1.30
2,3,January,2018,175.10,4470.10,19.32,1.30
3,4,January,2018,175.50,4203.20,19.20,1.30
4,5,January,2018,176.20,4190.70,18.91,1.30
...,...,...,...,...,...,...,...
1456,27,December,2021,225.30,4442.80,106.89,5.00
1457,28,December,2021,223.90,4063.20,106.59,5.00
1458,29,December,2021,225.30,3208.60,96.48,5.00
1459,30,December,2021,219.60,2880.00,87.03,5.00


In [7]:
# split into X and y
X = data
y = X.pop("price")

In [8]:
# convert categorical variable into dummy/indicator variables
OneHotEn = pd.get_dummies(X)
OneHotEn

Unnamed: 0,day,year,Gas supply by Russia,Gas Consumption,inflation rate,month_April,month_August,month_December,month_February,month_January,month_July,month_June,month_March,month_May,month_November,month_October,month_September
0,1,2018,177.50,3118.40,1.30,0,0,0,0,1,0,0,0,0,0,0,0
1,2,2018,176.30,4055.60,1.30,0,0,0,0,1,0,0,0,0,0,0,0
2,3,2018,175.10,4470.10,1.30,0,0,0,0,1,0,0,0,0,0,0,0
3,4,2018,175.50,4203.20,1.30,0,0,0,0,1,0,0,0,0,0,0,0
4,5,2018,176.20,4190.70,1.30,0,0,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,27,2021,225.30,4442.80,5.00,0,0,1,0,0,0,0,0,0,0,0,0
1457,28,2021,223.90,4063.20,5.00,0,0,1,0,0,0,0,0,0,0,0,0
1458,29,2021,225.30,3208.60,5.00,0,0,1,0,0,0,0,0,0,0,0,0
1459,30,2021,219.60,2880.00,5.00,0,0,1,0,0,0,0,0,0,0,0,0


In [9]:
# define scaler
scaler = RobustScaler()

# transform data
scaled = scaler.fit_transform(OneHotEn)

OneHotEn = pd.DataFrame(scaled, columns=OneHotEn.columns)
OneHotEn.head()

Unnamed: 0,day,year,Gas supply by Russia,Gas Consumption,inflation rate,month_April,month_August,month_December,month_February,month_January,month_July,month_June,month_March,month_May,month_November,month_October,month_September
0,-1.0,-2.0,-0.3,0.69,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.93,-2.0,-0.35,0.96,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.87,-2.0,-0.4,1.08,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.8,-2.0,-0.38,1.01,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.73,-2.0,-0.35,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# splitting data
X_train, X_test, y_train, y_test = train_test_split(OneHotEn, y, train_size=.8, random_state= 123000)

In [11]:
X_train.head()

Unnamed: 0,day,year,Gas supply by Russia,Gas Consumption,inflation rate,month_April,month_August,month_December,month_February,month_January,month_July,month_June,month_March,month_May,month_November,month_October,month_September
460,-0.67,-1.0,-0.45,-0.12,0.33,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
482,0.8,-1.0,-0.53,-0.11,0.33,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
113,0.53,-2.0,-0.47,-0.18,-0.08,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
109,0.27,-2.0,-0.47,-0.18,-0.08,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
818,0.87,0.0,0.2,0.35,-0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


## Linear Regression

In [12]:
LR= LinearRegression()
LR.fit(X_train,y_train)  ## fitting the training data

X_test_pred_LR = LR.predict(X_test)  ## predicted x test

X_train_pred_LR= LR.predict(X_train) ##predicted x train

print('LR trainind score is',LR.score(X_train,y_train))
print('LR testing score is',LR.score(X_test,y_test))

LR trainind score is 0.8676441112922602
LR testing score is 0.8971196761480712


## Random Forest

In [13]:
from sklearn.ensemble import RandomForestRegressor

RF=RandomForestRegressor()
RF.fit(X_train,y_train) ## fitting the data

x_test_pred_RF = RF.predict(X_test)  ## predicted x test

X_train_pred_RF = RF.predict(X_train)  ## predicted x train

print('RF score for training is',RF.score(X_train,y_train))
print('RF score for testing is',RF.score(X_test,y_test))

RF score for training is 0.9983280040697422
RF score for testing is 0.9923145457936927


## 5 different model

In [14]:
# creating 5 different models
RF = RandomForestRegressor().fit(X_train, y_train)
DT = DecisionTreeRegressor().fit(X_train, y_train)
GBR = GradientBoostingRegressor().fit(X_train, y_train)
LR = LinearRegression().fit(X_train, y_train)
XGB = XGBRegressor().fit(X_train, y_train)

In [15]:
# the evaluation metrics
models = [LR, DT, RF, GBR, XGB]
RMSE = [mean_squared_error(y_test, mod.predict(X_test))**0.5 for mod in models]
MAPE = [mean_absolute_percentage_error(y_test, mod.predict(X_test)) for mod in models]
R2_Score = [r2_score(y_test, mod.predict(X_test)) for mod in models]

In [16]:
# comparing 5 models
Models = ['Linear Regression','Decision Tree','Random Forest','Gradient Boosting','XgBoost']
evaluation = pd.DataFrame({'Models':Models,'RMSE':RMSE,'MAPE':MAPE, 'R2_Score':R2_Score})

In [17]:
evaluation

Unnamed: 0,Models,RMSE,MAPE,R2_Score
0,Linear Regression,7.52,0.32,0.9
1,Decision Tree,3.9,0.04,0.97
2,Random Forest,2.28,0.03,0.99
3,Gradient Boosting,3.03,0.08,0.98
4,XgBoost,2.51,0.04,0.99


In [18]:
#the evaluation metrics for train
models = [LR, DT, RF, GBR, XGB]

RMSE = [mean_squared_error(y_train, mod.predict(X_train))**0.5 for mod in models]

MAPE = [mean_absolute_percentage_error(y_train, mod.predict(X_train)) for mod in models]

R2_Score = [r2_score(y_train, mod.predict(X_train)) for mod in models]

In [19]:
# comparing 5 models
Models = ['Linear Regression','Decision Tree','Random Forest','Gradient Boosting','XgBoost']
evaluation = pd.DataFrame({'Models':Models,'RMSE':RMSE,'MAPE':MAPE, 'R2_Score':R2_Score})

In [20]:
evaluation

Unnamed: 0,Models,RMSE,MAPE,R2_Score
0,Linear Regression,7.95,0.31,0.87
1,Decision Tree,0.0,0.0,1.0
2,Random Forest,0.88,0.01,1.0
3,Gradient Boosting,1.83,0.06,0.99
4,XgBoost,0.15,0.01,1.0


## Predict

In [21]:
test_data_original = pd.read_csv(r"/Volumes/Extreme Pro/WBS Bootcamp Data Science/Final Project/Data/For Modeling/Final DF_gasprice_2022.csv")
test_data_original.shape

(116, 9)

In [22]:
test_data_original

Unnamed: 0.1,Unnamed: 0,day,month,month_no,year,Gas supply by Russia,Gas Consumption,inflation rate,price
0,1,1,January,1,2022,184.30,1898.40,5.10,73.71
1,2,2,January,1,2022,186.10,2377.00,5.10,77.07
2,3,3,January,1,2022,185.90,3188.20,5.10,80.44
3,4,4,January,1,2022,187.30,3961.60,5.10,88.74
4,5,5,January,1,2022,189.10,4843.60,5.10,91.52
...,...,...,...,...,...,...,...,...,...
111,112,22,April,4,2022,196.80,364.70,7.40,94.88
112,113,23,April,4,2022,195.50,175.30,7.40,94.19
113,114,24,April,4,2022,195.40,206.30,7.40,93.52
114,115,25,April,4,2022,196.70,657.00,7.40,92.83


In [23]:
test_data = test_data_original.drop(test_data_original.columns[[0, 3, 8]], axis = 1)
test_data

Unnamed: 0,day,month,year,Gas supply by Russia,Gas Consumption,inflation rate
0,1,January,2022,184.30,1898.40,5.10
1,2,January,2022,186.10,2377.00,5.10
2,3,January,2022,185.90,3188.20,5.10
3,4,January,2022,187.30,3961.60,5.10
4,5,January,2022,189.10,4843.60,5.10
...,...,...,...,...,...,...
111,22,April,2022,196.80,364.70,7.40
112,23,April,2022,195.50,175.30,7.40
113,24,April,2022,195.40,206.30,7.40
114,25,April,2022,196.70,657.00,7.40


In [24]:
# convert categorical variable into dummy/indicator variables
OneHotEn1 = pd.get_dummies(test_data)
OneHotEn1

Unnamed: 0,day,year,Gas supply by Russia,Gas Consumption,inflation rate,month_April,month_February,month_January,month_March
0,1,2022,184.30,1898.40,5.10,0,0,1,0
1,2,2022,186.10,2377.00,5.10,0,0,1,0
2,3,2022,185.90,3188.20,5.10,0,0,1,0
3,4,2022,187.30,3961.60,5.10,0,0,1,0
4,5,2022,189.10,4843.60,5.10,0,0,1,0
...,...,...,...,...,...,...,...,...,...
111,22,2022,196.80,364.70,7.40,1,0,0,0
112,23,2022,195.50,175.30,7.40,1,0,0,0
113,24,2022,195.40,206.30,7.40,1,0,0,0
114,25,2022,196.70,657.00,7.40,1,0,0,0


In [25]:
# define Robust scaler
scaler1 = RobustScaler()

# transform data
scaled1 = scaler1.fit_transform(OneHotEn1)

test_data = pd.DataFrame(scaled1, columns=OneHotEn1.columns)
test_data.head()

Unnamed: 0,day,year,Gas supply by Russia,Gas Consumption,inflation rate,month_April,month_February,month_January,month_March
0,-1.0,0.0,-0.97,-0.41,-0.35,0.0,0.0,1.0,0.0
1,-0.93,0.0,-0.87,-0.27,-0.35,0.0,0.0,1.0,0.0
2,-0.86,0.0,-0.88,-0.01,-0.35,0.0,0.0,1.0,0.0
3,-0.79,0.0,-0.8,0.23,-0.35,0.0,0.0,1.0,0.0
4,-0.71,0.0,-0.69,0.5,-0.35,0.0,0.0,1.0,0.0


In [26]:
test_data["month_May"] = 0.00
test_data["month_June"] = 0.00
test_data["month_July"] = 0.00
test_data["month_August"] = 0.00
test_data["month_September"] = 0.00
test_data["month_October"] = 0.00
test_data["month_November"] = 0.00
test_data["month_December"] = 0.00

In [27]:
test_data.shape

(116, 17)

In [28]:
# To predict test data using LR
test_preds = RF.predict(test_data)

Feature names must be in the same order as they were in fit.



In [29]:
# Two cloumns are needed to make dataframe 
result = pd.DataFrame({"Predict_Price": test_preds})

In [30]:
result

Unnamed: 0,Predict_Price
0,12.36
1,12.33
2,12.75
3,13.67
4,13.75
...,...
111,28.49
112,28.64
113,28.67
114,28.70


In [31]:
test_data.to_csv("Predict_price_w ENC_w Scal_2022.csv", index= False)