In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the Kaggle input directory,
# then print them one per line.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Dataset Used
US gasoline and diesel retail prices 1995-2021

Contains...
* A1: Weekly U.S. All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)
* A2: Weekly U.S. All Grades Conventional Retail Gasoline Prices (Dollars per Gallon)
* A3: Weekly U.S. All Grades Reformulated Retail Gasoline Prices (Dollars per Gallon)
* R1: Weekly U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)
* R2: Weekly U.S. Regular Conventional Retail Gasoline Prices (Dollars per Gallon)
* R3: Weekly U.S. Regular Reformulated Retail Gasoline Prices (Dollars per Gallon)
* M1: Weekly U.S. Midgrade All Formulations Retail Gasoline Prices (Dollars per Gallon)
* M2: Weekly U.S. Midgrade Conventional Retail Gasoline Prices (Dollars per Gallon)
* M3: Weekly U.S. Midgrade Reformulated Retail Gasoline Prices (Dollars per Gallon)
* P1: Weekly U.S. Premium All Formulations Retail Gasoline Prices (Dollars per Gallon)
* P2: Weekly U.S. Premium Conventional Retail Gasoline Prices (Dollars per Gallon)
* P3: Weekly U.S. Premium Reformulated Retail Gasoline Prices (Dollars per Gallon)
* D1: Weekly U.S. No 2 Diesel Retail Prices (Dollars per Gallon)

# **STEP 1. Dataset Preparation**

In [None]:
# Location of the weekly price table inside the read-only Kaggle input directory
csv_path = '/kaggle/input/us-gasoline-and-diesel-retail-prices-19952021/PET_PRI_GND_DCUS_NUS_W.csv'
dataset = pd.read_csv(csv_path)
# Preview the first five rows
dataset.head(5)

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame
dataset.info()

In [None]:
# Summary statistics (count, mean, spread, quartiles) for the numeric columns
dataset.describe()

# **STEP 2. Convert the type of the 'Date' Column**

Convert the 'Date' column of the dataset from object (string) type to datetime type,
then split it into separate year, month and day columns.

In [None]:
# Parse the 'Date' strings (month/day/year) into datetime values, then expose
# the calendar parts as separate columns for grouping and modelling.
dataset['Date'] = pd.to_datetime(dataset['Date'], format='%m/%d/%Y')
dataset['Year'] = dataset['Date'].dt.year
dataset['Month'] = dataset['Date'].dt.month
dataset['Day'] = dataset['Date'].dt.day
# info() prints its report itself and returns None, so it must not be wrapped
# in print() -- doing so adds a stray "None" line to the output.
dataset.info()
print(dataset.head())

# **STEP 3. Partition yearly / monthly / daily**

To see how prices change by year, by month, and by day of the month, I split the dataset into three views.

In [None]:
# Each view keeps every price column plus exactly one calendar column
# (Year, Month or Day); 'Date' and the other two calendar columns are removed.
yearly_data = dataset.drop(columns=['Date', 'Month', 'Day'])
monthly_data = dataset.drop(columns=['Date', 'Year', 'Day'])
daily_data = dataset.drop(columns=['Date', 'Year', 'Month'])

# Preview the three views in the same order they were built
for period_view in (yearly_data, monthly_data, daily_data):
    print(period_view.head())

Calculate the average price for each year, each month, and each day of the month.

In [None]:
# Average every price column per calendar value; reset_index() turns the
# grouping key back into an ordinary first column with a default integer index.
yearly_mean = yearly_data.groupby('Year').mean().reset_index()
monthly_mean = monthly_data.groupby('Month').mean().reset_index()
daily_mean = daily_data.groupby('Day').mean().reset_index()

# Preview the three averaged tables in the same order they were built
for period_mean in (yearly_mean, monthly_mean, daily_mean):
    print(period_mean.head())

# **STEP 4. Price change (Yearly)**

Plot line graphs to see how prices change over time.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One panel per fuel grade, top to bottom:
# (column prefix, label used in the panel title, number of numbered series).
# Gasoline grades have three formulations (1-3); diesel has the single series D1.
yearly_panels = (
    ('A', 'All Grades', 3),
    ('R', 'Regular', 3),
    ('M', 'Midgrade', 3),
    ('P', 'Premium', 3),
    ('D', 'No 2 Diesel', 1),
)

fig, axes = plt.subplots(len(yearly_panels), 1, figsize=(15,20))
# Take the ticks from the data instead of a hard-coded range, so every year
# that is plotted gets a tick (range(1995, 2021) stopped at 2020).
plt.setp(axes, xticks=yearly_mean['Year'].tolist(), xlabel='Year', ylabel='Dollars per Gallon')

for panel_ax, (prefix, grade_label, series_count) in zip(axes, yearly_panels):
    panel_ax.set_title('Yearly Change ({})'.format(grade_label))
    for n in range(1, series_count + 1):
        series_name = '{}{}'.format(prefix, n)
        sns.lineplot(ax=panel_ax, x='Year', y=series_name, data=yearly_mean, label=series_name, marker='o')
    panel_ax.legend()

fig.tight_layout()

The yearly price fluctuations of gasoline and diesel are very similar to each other: both were highest in 2012 and lowest in 1998.

# **STEP 5. Price change (Monthly)**

In [None]:
# One panel per fuel grade, top to bottom:
# (column prefix, label used in the panel title, number of numbered series).
monthly_panels = (
    ('A', 'All Grades', 3),
    ('R', 'Regular', 3),
    ('M', 'Midgrade', 3),
    ('P', 'Premium', 3),
    ('D', 'No 2 Diesel', 1),
)

fig, axes = plt.subplots(5, 1, figsize=(15,20))
# Shared axis setup: one tick per calendar month
plt.setp(axes, xticks=range(1,13,1), xlabel='Month', ylabel='Dollars per Gallon')

for panel_ax, (prefix, grade_label, series_count) in zip(axes, monthly_panels):
    panel_ax.set_title('Monthly Change ({})'.format(grade_label))
    for n in range(1, series_count + 1):
        series_name = '{}{}'.format(prefix, n)
        sns.lineplot(ax=panel_ax, x='Month', y=series_name, data=monthly_mean, label=series_name, marker='o')
    panel_ax.legend()

fig.tight_layout()

Gasoline prices were highest in June and lowest in January.
Diesel prices were highest in September and lowest in January.

# **STEP 6. Price change (Daily)**

In [None]:
# One panel per fuel grade, top to bottom:
# (column prefix, label used in the panel title, number of numbered series).
daily_panels = (
    ('A', 'All Grades', 3),
    ('R', 'Regular', 3),
    ('M', 'Midgrade', 3),
    ('P', 'Premium', 3),
    ('D', 'No 2 Diesel', 1),
)

fig, axes = plt.subplots(5, 1, figsize=(15,20))
# Shared axis setup: one tick per possible day of the month (1-31)
plt.setp(axes, xticks=range(1,32,1), xlabel='Day', ylabel='Dollars per Gallon')

for panel_ax, (prefix, grade_label, series_count) in zip(axes, daily_panels):
    panel_ax.set_title('Daily Change ({})'.format(grade_label))
    for n in range(1, series_count + 1):
        series_name = '{}{}'.format(prefix, n)
        sns.lineplot(ax=panel_ax, x='Day', y=series_name, data=daily_mean, label=series_name, marker='o')
    panel_ax.legend()

fig.tight_layout()

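Diesel prices rose sharply on the 7th, 14th, 21st and 28th day of the month and dropped sharply right after.
The daily price fluctuation of gasoline follows a similar pattern.
Note: the data is weekly, so each day of the month is averaged over a different set of weeks; these jumps may reflect that sampling rather than a real within-month effect.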

# **STEP 7. Price change (Complete Datetime)**

In [None]:
# One two-column frame (Date + price) per all-grades series
gasoline_A1, gasoline_A2, gasoline_A3 = (
    dataset[['Date', code]] for code in ('A1', 'A2', 'A3')
)

all_grades_fig, all_grades_ax = plt.subplots(figsize=(15,8))
all_grades_ax.set_title('Gasoline Retail Prices All Grades (1995-2020)')
# Plotting by column name with data= makes the y column name the legend label
for series_frame in (gasoline_A1, gasoline_A2, gasoline_A3):
    all_grades_ax.plot('Date', series_frame.columns[1], data=series_frame)
all_grades_ax.set_xlabel('Date')
all_grades_ax.set_ylabel('Dollars per Gallon')
all_grades_ax.legend()
plt.show()

In [None]:
# One two-column frame (Date + price) per regular-grade series
gasoline_R1, gasoline_R2, gasoline_R3 = (
    dataset[['Date', code]] for code in ('R1', 'R2', 'R3')
)

regular_fig, regular_ax = plt.subplots(figsize=(15,8))
regular_ax.set_title('Gasoline Retail Prices Regular (1995-2020)')
# Plotting by column name with data= makes the y column name the legend label
for series_frame in (gasoline_R1, gasoline_R2, gasoline_R3):
    regular_ax.plot('Date', series_frame.columns[1], data=series_frame)
regular_ax.set_xlabel('Date')
regular_ax.set_ylabel('Dollars per Gallon')
regular_ax.legend()
plt.show()

In [None]:
# One two-column frame (Date + price) per midgrade series
gasoline_M1, gasoline_M2, gasoline_M3 = (
    dataset[['Date', code]] for code in ('M1', 'M2', 'M3')
)

midgrade_fig, midgrade_ax = plt.subplots(figsize=(15,8))
midgrade_ax.set_title('Gasoline Retail Prices Midgrade (1995-2020)')
# Plotting by column name with data= makes the y column name the legend label
for series_frame in (gasoline_M1, gasoline_M2, gasoline_M3):
    midgrade_ax.plot('Date', series_frame.columns[1], data=series_frame)
midgrade_ax.set_xlabel('Date')
midgrade_ax.set_ylabel('Dollars per Gallon')
midgrade_ax.legend()
plt.show()

In [None]:
# One two-column frame (Date + price) per premium-grade series
gasoline_P1, gasoline_P2, gasoline_P3 = (
    dataset[['Date', code]] for code in ('P1', 'P2', 'P3')
)

premium_fig, premium_ax = plt.subplots(figsize=(15,8))
premium_ax.set_title('Gasoline Retail Prices Premium (1995-2020)')
# Plotting by column name with data= makes the y column name the legend label
for series_frame in (gasoline_P1, gasoline_P2, gasoline_P3):
    premium_ax.plot('Date', series_frame.columns[1], data=series_frame)
premium_ax.set_xlabel('Date')
premium_ax.set_ylabel('Dollars per Gallon')
premium_ax.legend()
plt.show()

In [None]:
# Date + diesel price only; this frame is reused by the prediction plot later on
diesel = dataset[['Date', 'D1']]

diesel_fig, diesel_ax = plt.subplots(figsize=(15,8))
diesel_ax.set_title("Diesel Retail Prices (1995-2020)")
diesel_ax.plot('Date', 'D1', data=diesel)
diesel_ax.set_xlabel('Date')
diesel_ax.set_ylabel('Dollars per Gallon')
plt.show()

# **STEP 8. Machine Learning Model for Prediction**

In [None]:
# Re-check the frame layout: the Year/Month/Day columns added in STEP 2 become the model inputs
dataset.head()

In [None]:
# Calendar features used as model inputs
input_data = dataset[['Year', 'Month', 'Day']]

# Price series codes in the order they are numbered below (1..13).
# predict_price uses the number as a column position in `dataset`
# (presumably column 0 is 'Date' -- confirm against the CSV header).
price_series_codes = ['A1', 'A2', 'A3', 'R1', 'R2', 'R3', 'M1', 'M2', 'M3', 'P1', 'P2', 'P3', 'D1']
cat_key = {code: position for position, code in enumerate(price_series_codes, start=1)}

I used a Random Forest Regressor to predict the price.

Because prices differ slightly between gasoline formulations and grades, I defined a function that trains the model on whichever price series is requested.

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Shared regressor instance; predict_price() refits it on every call.
# n_jobs=-1 uses all available cores, oob_score=True makes rf.oob_score_
# available after fitting, and random_state=0 keeps runs repeatable.
rf = RandomForestRegressor(n_jobs=-1, oob_score=True, random_state=0)

def predict_price(cat, year, month, day):
    """Refit the shared random forest on one price series and print a prediction.

    The module-level ``rf`` is fitted on ``input_data`` (Year/Month/Day) against
    the requested price column of ``dataset``. The out-of-bag score of that fit
    and the predicted value for the given date are printed. Returns None.

    Args:
        cat: Price series code; must be a key of ``cat_key`` (e.g. 'D1').
        year: Year of the date to predict for.
        month: Month of the date to predict for.
        day: Day of the month of the date to predict for.

    Raises:
        KeyError: If ``cat`` is not a key of ``cat_key``.
    """
    # Validate against cat_key, but pick the target by column name so the
    # lookup does not silently depend on the column order of the CSV.
    if cat not in cat_key:
        raise KeyError(cat)
    rf.fit(input_data, dataset[cat])
    print("OOB Score of Random Forest Regressor Model")
    print(rf.oob_score_)
    print("Prediction ({} / {} / {})".format(year, month, day))
    # Predict from a frame carrying the training column names; a bare list
    # triggers scikit-learn's "X does not have valid feature names" warning.
    query = pd.DataFrame([[year, month, day]], columns=input_data.columns)
    print(rf.predict(query))

# **STEP 9. Predict the price with the model**

I predicted the diesel price for 2021/2/1.

My result was...

In [None]:
# Refit the model on the diesel series (D1) and print its prediction for 2021/2/1
predict_price('D1', 2021, 2, 1)

To see where my prediction falls on the graph, I reused the diesel line graph from STEP 7.

In [None]:
import datetime
# Date being predicted; keep it in sync with the predict_price('D1', ...) call.
test_year, test_month, test_day = 2021, 2, 1
test_date = datetime.date(test_year, test_month, test_day)

# rf was last fitted on the D1 (diesel) column by the predict_price('D1', ...)
# call, so ask it for the value instead of pasting in a number copied from
# printed output (which goes stale when the data, seed or library changes).
test_features = pd.DataFrame([[test_year, test_month, test_day]], columns=input_data.columns)
test_price = rf.predict(test_features)[0]

plt.subplots(figsize=(15,8))
plt.title("Diesel Retail Prices (1995-2020)")
plt.plot('Date', 'D1', data=diesel)
# Label the marker so the figure explains itself
plt.plot(test_date, test_price, marker="^", label='Prediction ({})'.format(test_date))
plt.xlabel('Date')
plt.ylabel('Dollars per Gallon')
plt.legend()
plt.show()

Thanks for reading my notebook!!
... and sorry, I'm not good at English...
I learned about Random Forest a few days ago and found it interesting, so I applied it to this dataset.
I've just started learning machine learning, so I really welcome any feedback!! :)