# If you like the notebook please upvote it

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in files)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
# Silence every warning for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation warnings from the plotting and
# modelling libraries used below — consider narrowing the filter.
warnings.filterwarnings('ignore')

# **Loading the dataset**

In [None]:
# Load the U.S. gasoline and diesel retail prices CSV from the Kaggle input
# directory and display the resulting frame.
data = pd.read_csv("/kaggle/input/us-gasoline-and-diesel-retail-prices-19952021/PET_PRI_GND_DCUS_NUS_W.csv")
data

# **Saving a copy of the dataset**

In [None]:
# Keep an untouched copy of the loaded frame; later cells modify `data` itself.
data_copy = data.copy()

# **EDA**

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

**Let's split the date column into year, month, and day**

In [None]:
# Parse the date strings once, then derive the calendar parts from the parsed series.
parsed_dates = pd.to_datetime(data['Date'])
data['Date'] = parsed_dates
data['Year'] = parsed_dates.dt.year
data['Month'] = parsed_dates.dt.month
data['Day'] = parsed_dates.dt.day

**Let's look for a trend in the data**

In [None]:
# Distinct calendar years present in the data, used to draw one plot per year.
years = data['Year'].unique()

In [None]:
# One line plot of D1 per calendar year.
# D1 is plotted against the parsed Date column (not the positional row index)
# and both axes are labelled, so each figure can be read on its own.
for year in years:
    yearly = data.loc[data['Year'] == year]
    fig, ax = plt.subplots()
    ax.plot(yearly['Date'], yearly['D1'])
    ax.set(title=f'Data from {year}', xlabel='Date', ylabel='D1')
    fig.autofmt_xdate()  # tilt the date tick labels so they do not overlap
    plt.show()

**all cols vs D1**

In [None]:
# Scatter every feature column against the target D1.
# Columns are selected by name rather than with a positional slice such as
# columns[1:12], which depends on the exact column order and count and can
# silently leave a feature out.
non_feature_cols = {'Date', 'Year', 'Month', 'Day', 'D1'}
feature_cols = [col for col in data.columns if col not in non_feature_cols]

for col in feature_cols:
    fig, ax = plt.subplots()
    ax.scatter(data[col], data['D1'], c='red')
    ax.set(title=f'{col} vs D1', xlabel=col, ylabel='D1')
    plt.show()

**distplot of all columns**

In [None]:
# Histogram + KDE for every value column (everything except the date and the
# calendar parts derived from it).
# sns.distplot is deprecated; histplot(kde=True, stat='density') is the
# replacement seaborn recommends for the same density-scaled view.
date_cols = {'Date', 'Year', 'Month', 'Day'}
value_cols = [col for col in data.columns if col not in date_cols]

for col in value_cols:
    fig, ax = plt.subplots()
    sns.histplot(data[col], kde=True, stat='density', ax=ax)
    ax.set_title(f'Distplot of {col}')
    plt.show()

# **Data processing**

**Let's drop the columns we no longer need**

In [None]:
# Remove the raw date and the calendar parts derived from it.
# Rebinding `data` (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second run is a no-op instead of a KeyError.
data = data.drop(columns=['Date', 'Year', 'Month', 'Day'], errors='ignore')

**null values?**

In [None]:
# True if at least one cell anywhere in the frame is missing.
data.isna().any(axis=None)

**Detecting outliers**

In [None]:
# One boxplot per column to eyeball outliers.
# The series is passed explicitly as x=: the meaning of the first positional
# argument of sns.boxplot has changed between seaborn releases, so a bare
# positional call does not draw the same plot everywhere.
for col in data.columns:
    fig, ax = plt.subplots()
    sns.boxplot(x=data[col], ax=ax)
    ax.set_title(f'Boxplot of {col}')
    plt.show()

**There are no outliers in the data**

**Feature Correlation**

In [None]:
# Pairwise correlation of all remaining columns, drawn as an annotated heatmap.
corr = data.corr()
fig, ax = plt.subplots(figsize=(10, 10))
heatmap_options = dict(annot=True, cmap='coolwarm', square=True, fmt='.2f')
sns.heatmap(corr, ax=ax, **heatmap_options)
plt.show()

**scaling the data**

In [None]:
from sklearn.preprocessing import RobustScaler, MinMaxScaler

In [None]:
# Instantiate both scalers with their default settings.
robust = RobustScaler()
minmax = MinMaxScaler()

In [None]:
# Rescale every column in place: RobustScaler first, then MinMaxScaler.
# Both scaler objects are re-fit on each column in turn, so after the loop
# they only hold the parameters of the last column.
# NOTE(review): this runs on the full frame before the train/test split made
# later in the notebook, so test rows influence the scaling parameters
# (data leakage) — consider fitting the scalers on the training rows only.
# NOTE(review): the target column D1 is rescaled as well, so the errors
# reported later are in scaled units, not in the original price units.
for col in data.columns:
  data[col] = robust.fit_transform(data[col].values.reshape(-1,1))
  data[col] = minmax.fit_transform(data[col].values.reshape(-1,1))

# SPLITTING DATA INTO TRAINING AND TESTING SETS

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Target is D1; every other remaining column is a feature.
y = data['D1']
X = data.drop(columns='D1')

In [None]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=56)

# **SPLITTING TRAINING DATA**

In [None]:
# Carve a validation set (20% of the training rows) out of the training split;
# it is used below to compare candidate models.
x_train_, x_val, y_train_, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=56)

# **Model selection**

In [None]:
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.tree import DecisionTreeRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import SGDRegressor, LinearRegression, Lasso, RidgeCV, ElasticNet

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
def model_selection(x_train_, x_val, y_train_, y_val, model):
  """Fit a default-configured estimator and report train/validation metrics.

  Parameters
  ----------
  x_train_, y_train_
      Features and target used to fit the estimator.
  x_val, y_val
      Held-out features and target used for evaluation.
  model
      Estimator class (not an instance). It is instantiated here with no
      arguments, so the library defaults are used.

  Returns
  -------
  dict
      The same quantities that are printed, as raw values (only
      ``overfitting_by`` is in percentage points): ``error`` (RMSE on the
      validation set), ``acc`` (``r2_score`` on the validation set),
      ``train_score`` and ``val_score`` (``estimator.score`` on each split),
      ``is_overfitting`` and ``overfitting_by``. Returning them lets callers
      keep and compare results; previously the function returned ``None``.
  """
  # Use a separate local name so the `model` argument keeps referring to the class.
  estimator = model()
  estimator.fit(x_train_, y_train_)

  pred = estimator.predict(x_val)

  error = np.sqrt(mean_squared_error(y_val, pred))
  acc = r2_score(y_val, pred)
  train_score = estimator.score(x_train_, y_train_)
  val_score = estimator.score(x_val, y_val)
  is_overfitting = bool(train_score > val_score)
  overfitting_by = train_score*100-val_score*100

  print('Error:', error*100)
  print('\n')
  print('ACC :', acc*100)
  print('\n')
  print('Train Score:', train_score*100)
  print('\n')
  print('Val Score:', val_score*100)
  print('\n')
  print('Is overfitting:', is_overfitting)
  print('\n')
  print('Overfitting by:', overfitting_by)

  return {
    'error': error,
    'acc': acc,
    'train_score': train_score,
    'val_score': val_score,
    'is_overfitting': is_overfitting,
    'overfitting_by': overfitting_by,
  }

In [None]:
# Evaluate ExtraTreesRegressor (default arguments) on the train_/validation split.
extratrees = model_selection(x_train_, x_val, y_train_, y_val, ExtraTreesRegressor)
extratrees

In [None]:
# Evaluate GradientBoostingRegressor (default arguments) on the train_/validation split.
gradient = model_selection(x_train_, x_val, y_train_, y_val, GradientBoostingRegressor)
gradient

In [None]:
# Evaluate RandomForestRegressor (default arguments) on the train_/validation split.
randomforest = model_selection(x_train_, x_val, y_train_, y_val, RandomForestRegressor)
randomforest

In [None]:
# Evaluate AdaBoostRegressor (default arguments) on the train_/validation split.
ada = model_selection(x_train_, x_val, y_train_, y_val, AdaBoostRegressor)
ada

In [None]:
# Evaluate XGBRegressor (default arguments) on the train_/validation split.
xgb = model_selection(x_train_, x_val, y_train_, y_val, XGBRegressor)
xgb

In [None]:
# Evaluate CatBoostRegressor (default arguments) on the train_/validation split.
catboost = model_selection(x_train_, x_val, y_train_, y_val, CatBoostRegressor)
catboost

In [None]:
# Evaluate SGDRegressor (default arguments) on the train_/validation split.
sgd = model_selection(x_train_, x_val, y_train_, y_val, SGDRegressor)
sgd

In [None]:
# Evaluate LinearRegression (default arguments) on the train_/validation split.
linear = model_selection(x_train_, x_val, y_train_, y_val, LinearRegression)
linear

In [None]:
# Evaluate Lasso (default arguments) on the train_/validation split.
lasso = model_selection(x_train_, x_val, y_train_, y_val, Lasso)
lasso

In [None]:
# Evaluate ElasticNet (default arguments) on the train_/validation split.
net = model_selection(x_train_, x_val, y_train_, y_val, ElasticNet)
net

In [None]:
# Evaluate RidgeCV (default arguments) on the train_/validation split.
ridge = model_selection(x_train_, x_val, y_train_, y_val, RidgeCV)
ridge

**I will use Linear Regression**

# Model building and training

In [None]:
# Train the chosen model on the full training split (x_train includes the rows
# that were used as the validation set during model comparison).
model = LinearRegression()
model.fit(x_train, y_train)

# Predictions

In [None]:
# Predict D1 for the held-out test rows and display the resulting array.
pred = model.predict(x_test)
pred

# Metric check

**Root mean squared error**

In [None]:
# Square root of the mean squared error (RMSE) on the test set, displayed times 100.
# y_test comes from the D1 column after it was rescaled earlier in the notebook,
# so this is in scaled units rather than original price units.
error = np.sqrt(mean_squared_error(y_test, pred))
error*100

**r2 score**

In [None]:
# r2_score on the test set, displayed as a percentage.
acc = r2_score(y_test, pred)
acc*100

**overfitting rate**

In [None]:
# Gap between the training and test scores, in percentage points.
train_score_pct = model.score(x_train, y_train)*100
test_score_pct = model.score(x_test, y_test)*100
overfitting_by = train_score_pct - test_score_pct
overfitting_by

**The model is not overfitting at all**