## Introduction

Features:
- Date
- A1
- A2
- A3
- R1
- R2
- R3
- M1
- M2
- M3
- P1
- P2
- P3
- D1

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datetime
import random
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from scipy import stats
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource
import sklearn
from sklearn.linear_model import LinearRegression
import os
# Print the full path of every file found under the Kaggle input directory.
input_root = '/kaggle/input'
for folder, _, names in os.walk(input_root):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

In [None]:
# Load the gasoline and diesel retail prices CSV into a DataFrame
# and report how many rows and columns were read.
path = '../input/us-gasoline-and-diesel-retail-prices-19952021/PET_PRI_GND_DCUS_NUS_W.csv'
df = pd.read_csv(
    path,
    sep=',',
)
print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")

In [None]:
# Preview the first three rows of the raw data.
df.head(n=3)

In [None]:
# Preview the last three rows of the raw data.
df.tail(n=3)

In [None]:
# Distribution of the target column 'D1': density-normalized histogram with a
# KDE overlay. `sns.distplot` is deprecated; `histplot` with `kde=True` and
# `stat='density'` is the documented replacement for its default output.
ax = sns.histplot(df['D1'], kde=True, stat='density')

## Correlation Matrix
Show the pairwise correlations between the features in a heat map.

In [None]:
# Heat map of the `top` columns most positively correlated with the target 'D1'.
top = 15

# Only numeric columns can be correlated. 'Date' has not been parsed at this
# point, and recent pandas versions raise on non-numeric columns in `corr()`
# instead of silently dropping them, so select the numeric columns explicitly.
corr = df.select_dtypes(include='number').corr()

# Columns ranked by their correlation with 'D1' ('D1' itself comes first).
top15 = corr.nlargest(top, 'D1')['D1'].index

# Slice the already computed matrix rather than recomputing the correlations.
corr_top15 = corr.loc[top15, top15]

f, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(
    corr_top15,
    square=True,
    ax=ax,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    annot_kws={'size': 12},
)
plt.title('Top correlated features of dataset', size=16)
plt.show()

Conclusion: In this dataset, all the columns are highly correlated with each other.

In [None]:
# Scatter plot of 'M1' against 'P1', with points colored by the target 'D1'.
sns.scatterplot(data=df, x='M1', y='P1', hue='D1')

Conclusion: In this dataset, all the columns are highly correlated with each other.

## Missing values

In [None]:
# Number of missing (NaN) entries in each column.
df.isnull().sum()

Conclusion: In this dataset there are no missing values.

In [None]:
# Count the rows that are exact duplicates of an earlier row.
duplicate_count = df.duplicated().sum()
print('total number of duplicate values : ', duplicate_count)

Conclusion: In this dataset there are no duplicates.

## Outliers

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Box plot of the target column 'D1' on a white-grid theme, used to spot outliers.
sns.set_theme(style="whitegrid")
ax = sns.boxplot(data=df, x="D1")

Conclusion: There are no outliers or data too far from the mean.

## Feature engineering
Create new calculated features based on existing features.

In [None]:
# Inspect the column dtypes before deriving calendar features from 'Date'.
df.dtypes

In [None]:
# Parse 'Date' once with the vectorized converter, then derive the calendar
# features through the `.dt` accessor instead of a per-row Python `apply`.
df['Date'] = pd.to_datetime(df['Date'])
date_parts = df['Date'].dt
df['day'] = date_parts.day
df['month'] = date_parts.month
df['year'] = date_parts.year
df['dayofweek'] = date_parts.dayofweek  # Monday=0 ... Sunday=6
# Optional extra features, left disabled:
#df['weekofyear'] = date_parts.isocalendar().week
#df['is_weekend'] = date_parts.dayofweek // 5

# Only the derived integer parts are kept; the raw timestamp column is removed.
df = df.drop(columns='Date')
df.dtypes

In [None]:
# Preview the first three rows after feature engineering.
df.head(n=3)

## Splitting data into training and test set

In [None]:
from sklearn.model_selection import train_test_split

# 'D1' is the value to predict. It must be removed from the input features:
# leaving it in lets the model read the answer directly (target leakage) and
# makes every reported score meaningless.
target = df['D1']
features = df.drop(columns='D1')

# Hold out 20% of the rows for testing. A fixed seed keeps the split, and
# therefore the reported metrics, reproducible between runs.
# NOTE(review): rows are shuffled before splitting; if the rows are in
# chronological order, a time-based split may be more appropriate - confirm.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.20, random_state=42
)
print(f"X_train.shape: {X_train.shape}, X_test.shape: {X_test.shape}, y_train.shape: {y_train.shape}, y_test.shape: {y_test.shape}")

## Select and evaluate the model

In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Fit an XGBoost regressor with default hyperparameters on the training split.
model = XGBRegressor()
model.fit(X_train, y_train)

# Report the model's own score on the training split, then on the test split.
for label, split_X, split_y in (
    ('Training Score:', X_train, y_train),
    ('Testing Score:', X_test, y_test),
):
    score = model.score(split_X, split_y)
    print(label, score)

# Predictions on the held-out rows, collected in a one-column frame for export.
Y_pred = model.predict(X_test)
output = pd.DataFrame({'Predicted': Y_pred})

In [None]:
# Mean absolute error of the test predictions, rounded to three decimals.
mae = np.round(mean_absolute_error(y_test, Y_pred), decimals=3)
print('Mean Absolute Error:', mae)

In [None]:
# Mean squared error of the test predictions, rounded to three decimals.
mse = np.round(mean_squared_error(y_test, Y_pred), decimals=3)
print('Mean Squared Error:', mse)

In [None]:
# R2 (coefficient of determination) of the test predictions, rounded to three decimals.
score = np.round(r2_score(y_test, Y_pred), decimals=3)
print('R2 Score:', score)

In [None]:
# Preview the first three predictions.
output.head(n=3)

In [None]:
# Write the predictions to disk without the row index, then confirm completion.
output.to_csv(path_or_buf="submission.csv", index=False)
print('success')

Next Steps: If the score is not acceptable then we can do more feature engineering or test other models, but in this case we are good.