In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns

In [2]:
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = [12, 6]
sns.set()

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [4]:
## Import Data ##
# Weekly WTI crude-oil futures history; the preview shows the newest week first.
oil_futures = pd.read_csv(filepath_or_buffer='Crude Oil WTI Futures Historical Data_weekly.csv')
oil_futures.head(n=5)

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
0,"Jan 17, 2021",52.27,52.0,53.79,51.44,516.20K,-0.17%
1,"Jan 10, 2021",52.36,52.58,53.93,51.5,1.62M,0.23%
2,"Jan 03, 2021",52.24,48.4,52.75,47.18,2.05M,7.67%
3,"Dec 27, 2020",48.52,48.23,48.96,47.5,901.09K,0.60%
4,"Dec 20, 2020",48.23,48.54,48.62,46.16,835.39K,-1.77%


In [5]:
# Weekly U.S. regular retail gasoline prices (dollars per gallon), oldest week first.
gasoline = pd.read_csv(filepath_or_buffer='gasolin_prices.csv')
gasoline.head(n=5)

Unnamed: 0,Date,Weekly U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)
0,"Aug 20, 1990",1.191
1,"Aug 27, 1990",1.245
2,"Sep 03, 1990",1.242
3,"Sep 10, 1990",1.252
4,"Sep 17, 1990",1.266


In [6]:
# Row label of the first gasoline observation to keep (the week of Jan 03, 2000).
is_start_week = gasoline['Date'].eq('Jan 03, 2000')
gasoline_start_index = gasoline.loc[is_start_week].index.values[0]
gasoline_start_index

489

In [7]:
# Keep the gasoline rows from the start position onward (positional slice).
gas_df = gasoline.iloc[gasoline_start_index:]
gas_df.head(n=5)

Unnamed: 0,Date,Weekly U.S. Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)
489,"Jan 03, 2000",1.272
490,"Jan 10, 2000",1.264
491,"Jan 17, 2000",1.277
492,"Jan 24, 2000",1.315
493,"Jan 31, 2000",1.316


In [8]:
# Week labels taken from the gasoline file, re-indexed from zero.
df_date = gas_df.reset_index(drop=True)['Date']
df_date

0       Jan 03, 2000
1       Jan 10, 2000
2       Jan 17, 2000
3       Jan 24, 2000
4       Jan 31, 2000
            ...     
1094    Dec 21, 2020
1095    Dec 28, 2020
1096    Jan 04, 2021
1097    Jan 11, 2021
1098    Jan 18, 2021
Name: Date, Length: 1099, dtype: object

In [9]:
# Futures prices with the row order reversed (the file lists the newest week first),
# re-indexed from zero.
df_oil_futures = oil_futures['Price'].iloc[::-1].reset_index(drop=True)
df_oil_futures

0       24.22
1       28.02
2       28.20
3       27.22
4       28.82
        ...  
1094    48.23
1095    48.52
1096    52.24
1097    52.36
1098    52.27
Name: Price, Length: 1099, dtype: float64

In [10]:
# Gasoline price column (the header contains a double space before the parenthesis),
# re-indexed from zero.
gas_price_col = 'Weekly U.S. Regular All Formulations Retail Gasoline Prices  (Dollars per Gallon)'
df_gas = gas_df[gas_price_col].reset_index(drop=True)
df_gas

0       1.272
1       1.264
2       1.277
3       1.315
4       1.316
        ...  
1094    2.224
1095    2.243
1096    2.249
1097    2.317
1098    2.379
Name: Weekly U.S. Regular All Formulations Retail Gasoline Prices  (Dollars per Gallon), Length: 1099, dtype: float64

In [11]:
# Combine the three weekly series row by row into one frame.
# The rows are matched by position, not by date: the futures file labels each week
# one day earlier than the gasoline file (e.g. "Jan 17, 2021" vs "Jan 18, 2021").
# pd.DataFrame aligns Series on their index, so a length mismatch would silently
# produce NaN-padded rows instead of failing; check it up front.
assert len(df_date) == len(df_oil_futures) == len(df_gas), (
    "date, oil-futures and gasoline series must cover the same number of weeks"
)
df = pd.DataFrame({'Week': df_date, 'oil_futures': df_oil_futures, 'gasoline': df_gas})
df

Unnamed: 0,Week,oil_futures,gasoline
0,"Jan 03, 2000",24.22,1.272
1,"Jan 10, 2000",28.02,1.264
2,"Jan 17, 2000",28.20,1.277
3,"Jan 24, 2000",27.22,1.315
4,"Jan 31, 2000",28.82,1.316
...,...,...,...
1094,"Dec 21, 2020",48.23,2.224
1095,"Dec 28, 2020",48.52,2.243
1096,"Jan 04, 2021",52.24,2.249
1097,"Jan 11, 2021",52.36,2.317


In [12]:
# Lookback lengths (in weeks) for the oil-futures percentage-change features.
periods = (1, 2, 3, 5, 10)
oil_changes = {
    f'of_chg_{lag}': df['oil_futures'].pct_change(periods=lag)
    for lag in periods
}
# Add the oil features plus the one-week gasoline change that the targets are built from.
df = df.assign(**oil_changes, gas_chg_1=df['gasoline'].pct_change())
# Skip the first 11 rows: the longest lookback has no value for the earliest weeks.
df = df.iloc[11:]
df

Unnamed: 0,Week,oil_futures,gasoline,of_chg_1,of_chg_2,of_chg_3,of_chg_5,of_chg_10,gas_chg_1
11,"Mar 20, 2000",28.02,1.529,-0.093497,-0.117758,-0.110758,-0.050491,0.000000,0.001310
12,"Mar 27, 2000",26.90,1.508,-0.039971,-0.129731,-0.153023,-0.113674,-0.046099,-0.013734
13,"Apr 03, 2000",25.04,1.503,-0.069145,-0.106353,-0.189906,-0.205332,-0.080088,-0.003316
14,"Apr 10, 2000",25.57,1.475,0.021166,-0.049442,-0.087438,-0.194899,-0.112769,-0.018629
15,"Apr 17, 2000",25.88,1.444,0.012124,0.033546,-0.037918,-0.162731,-0.120924,-0.021017
...,...,...,...,...,...,...,...,...,...
1094,"Dec 21, 2020",48.23,2.224,-0.017719,0.035645,0.042585,0.144247,0.179795,0.030584
1095,"Dec 28, 2020",48.52,2.243,0.006013,-0.011813,0.041872,0.065671,0.217566,0.008543
1096,"Jan 04, 2021",52.24,2.249,0.076669,0.083143,0.063951,0.129269,0.459626,0.002675
1097,"Jan 11, 2021",52.36,2.317,0.002297,0.079143,0.085631,0.124329,0.409801,0.030236


In [13]:
# Feature matrix: the futures price level followed by its percentage changes.
chgs = [f'of_chg_{lag}' for lag in periods]
feature_columns = ['oil_futures', *chgs]
X = df[feature_columns].to_numpy()
X

array([[ 2.80200000e+01, -9.34972501e-02, -1.17758186e-01,
        -1.10758489e-01, -5.04913589e-02,  0.00000000e+00],
       [ 2.69000000e+01, -3.99714490e-02, -1.29731478e-01,
        -1.53022670e-01, -1.13673806e-01, -4.60992908e-02],
       [ 2.50400000e+01, -6.91449814e-02, -1.06352605e-01,
        -1.89906179e-01, -2.05331641e-01, -8.00881705e-02],
       ...,
       [ 5.22400000e+01,  7.66694147e-02,  8.31432718e-02,
         6.39511202e-02,  1.29269347e-01,  4.59625594e-01],
       [ 5.23600000e+01,  2.29709035e-03,  7.91426216e-02,
         8.56313498e-02,  1.24328967e-01,  4.09800754e-01],
       [ 5.22700000e+01, -1.71886937e-03,  5.74272588e-04,
         7.72877164e-02,  6.45621181e-02,  3.02516820e-01]])

In [14]:
# Binary target: 1 when the one-week gasoline change is positive, otherwise 0.
y = df['gas_chg_1'].gt(0).to_numpy().astype('int')
y

array([1, 0, 0, ..., 1, 1, 1])

In [15]:
# Sanity check: the feature matrix and the target must have the same number of rows.
(X.shape, y.shape)

((1088, 6), (1088,))

In [16]:
## Create training, validation, and test sets ##
training_ratio = 0.7
validation_ratio = 0.15
test_ratio = 0.15

# Carve off the test set first ...
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_ratio, random_state=42
)
# ... then split the remainder so the validation share matches validation_ratio
# of the full data set.
val_share_of_remainder = validation_ratio / (validation_ratio + training_ratio)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=val_share_of_remainder, random_state=42
)

for split_name, split_array in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("X_val", X_val),
    ("y_train", y_train),
    ("y_val", y_val),
    ("y_test", y_test),
):
    print(split_name + " shape:\t", split_array.shape)

X_train shape:	 (760, 6)
X_test shape:	 (164, 6)
X_val shape:	 (164, 6)
y_train shape:	 (760,)
y_val shape:	 (164,)
y_test shape:	 (164,)


In [17]:
## Normalization ##
# Standardize each feature column in place, using the training column's mean and
# standard deviation for all three splits.
for col in range(X_train.shape[1]):
    X_train_mean = X_train[:, col].mean()
    X_train_std = X_train[:, col].std()
    for split in (X_train, X_test, X_val):
        split[:, col] = (split[:, col] - X_train_mean) / X_train_std

for split_name, split_array in (("X_train", X_train), ("X_val", X_val)):
    print(split_name + " shape:\t", split_array.shape)

X_train shape:	 (760, 6)
X_val shape:	 (164, 6)


In [18]:
## Random Forest Classifier##
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score

# Validation accuracy for max_depth = 1..10; one line is printed per depth,
# in increasing depth order.
for depth in range(1, 11):
    clf = RandomForestClassifier(max_depth=depth, random_state=0)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)
    score_clf = accuracy_score(y_val, y_pred)
    print('score_clf:', "{:.4f}".format(score_clf))

score_clf: 0.7256
score_clf: 0.7256
score_clf: 0.7561
score_clf: 0.7683
score_clf: 0.7622
score_clf: 0.7561
score_clf: 0.7500
score_clf: 0.7561
score_clf: 0.7561
score_clf: 0.7439


In [19]:
# choose max_depth = 4 (highest validation accuracy in the depth sweep) and
# report accuracy on the held-out test split.
clf = RandomForestClassifier(max_depth=4, random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
score_clf = accuracy_score(y_test, y_pred)
print('score_clf:', "{:.4f}".format(score_clf))

score_clf: 0.7134


In [20]:
## Logistic Regression ##
from sklearn.linear_model import LogisticRegression

# L1-penalized logistic regression, evaluated by accuracy on the test split.
logistic_model = LogisticRegression(penalty='l1', solver='liblinear', random_state=0)
logistic_model.fit(X_train, y_train)
y_pred = logistic_model.predict(X_test)
score_logistic = accuracy_score(y_test, y_pred)
print('score_logistic:', "{:.4f}".format(score_logistic))

score_logistic: 0.7317


In [21]:
## Regression Cases ##
# Regression target: the raw one-week gasoline change instead of its sign.
y_orig = df['gas_chg_1']

# Same two-stage split (test first, then validation) and the same seed as the
# classification split.
X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(
    X, y_orig, test_size=test_ratio, random_state=42
)
val_share_of_remainder = validation_ratio / (validation_ratio + training_ratio)
X_train_orig, X_val_orig, y_train_orig, y_val_orig = train_test_split(
    X_train_orig, y_train_orig, test_size=val_share_of_remainder, random_state=42
)

for split_name, split_array in (
    ("X_train_orig", X_train_orig),
    ("X_test_orig", X_test_orig),
    ("X_val_orig", X_val_orig),
    ("y_train_orig", y_train_orig),
    ("y_val_orig", y_val_orig),
    ("y_test_orig", y_test_orig),
):
    print(split_name + " shape:\t", split_array.shape)

X_train_orig shape:	 (760, 6)
X_test_orig shape:	 (164, 6)
X_val_orig shape:	 (164, 6)
y_train_orig shape:	 (760,)
y_val_orig shape:	 (164,)
y_test_orig shape:	 (164,)


In [22]:
## Normalization for Regressor##
# Standardize each feature column in place, using the training column's mean and
# standard deviation for the train, test and validation splits.
for col in range(X_train_orig.shape[1]):
    X_train_orig_mean = X_train_orig[:, col].mean()
    X_train_orig_std = X_train_orig[:, col].std()
    for split in (X_train_orig, X_test_orig, X_val_orig):
        split[:, col] = (split[:, col] - X_train_orig_mean) / X_train_orig_std

for standardized in (X_train_orig, X_val_orig):
    print(standardized)

[[-1.34998705 -4.40716494 -3.71631597 -2.87382397 -2.22473558 -0.97949769]
 [-0.04610926  0.27636994  0.2657335   0.64796948  0.25708184  0.47708589]
 [ 0.43926444  0.28773908 -0.3962217  -0.42403584 -0.33483362  0.04182399]
 ...
 [-0.3490264  -0.04447913 -0.45091021 -0.3624747  -0.05623169  0.96392328]
 [-1.16713811 -0.21572707  0.12766699 -0.30773156  0.22679739 -0.03035669]
 [ 0.07788273 -0.2603349  -0.20036228 -0.07342509  0.42710864  0.59023273]]
[[-1.20755322e+00  5.08282539e-01  2.25402885e-01  2.18062079e-01
   3.84782108e-01  3.66018588e-01]
 [ 1.93634742e-01 -8.33074001e-01 -1.17375497e+00 -8.27445518e-01
  -1.01964358e+00 -7.26147469e-01]
 [ 3.42346654e-01 -4.59756180e-01 -9.35995447e-01 -2.06207811e+00
  -1.36772123e+00 -9.10551523e-01]
 [-1.40492021e+00 -2.51908037e-01 -8.37662899e-01 -7.04191888e-01
  -1.15651786e+00 -9.13827130e-01]
 [-1.30643291e+00 -1.75887433e+00 -1.57050984e+00 -1.26613554e+00
  -5.26934688e-01 -1.62413873e-01]
 [-5.89947547e-01  2.42163263e+00  1.02

In [23]:
## Multiple Linear Regression ##
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the standardized features; .score is evaluated on the
# test split.
multiple_reg = LinearRegression()
multiple_reg.fit(X_train_orig, y_train_orig)
score_multiple_reg = multiple_reg.score(X_test_orig, y_test_orig)
print('score_multiple_reg:', "{:.4f}".format(score_multiple_reg))

score_multiple_reg: 0.4531


In [24]:
## Random Forest Regressor ##
# Fix the seed so the reported score is reproducible across runs, as every other
# estimator in this notebook already does.
clf_reg = RandomForestRegressor(random_state=0).fit(X_train_orig, y_train_orig)
y_pred = clf_reg.predict(X_test_orig)
# .score is evaluated on the held-out test split.
score_clf_reg = clf_reg.score(X_test_orig, y_test_orig)
print('score_clf_reg:', "%.4f"%score_clf_reg)

score_clf_reg: 0.4618


In [25]:
## Polynomial Regression ##
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression

# Degree of the polynomial feature expansion applied before the linear fit.
degree = 1
poly = PolynomialFeatures(degree)

# Fit the expansion on the training features only, then reuse the fitted
# transformer for the validation features (transform, not fit_transform).
X_ = poly.fit_transform(X_train_orig)
# Use a dedicated name: assigning to X_val here would overwrite the standardized
# validation features that the classification cells use.
X_val_poly = poly.transform(X_val_orig)

polyreg = LinearRegression().fit(X_, y_train_orig)
# NOTE: this score is computed on the validation split, not the test split.
poly_reg_score = polyreg.score(X_val_poly, y_val_orig)

In [26]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted classifier on the standardized features; .score is evaluated
# on the test split.
gra_clf = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.50,
    max_depth=5,
    random_state=0,
)
gra_clf.fit(X_train, y_train)
gra_clf_score = gra_clf.score(X_test, y_test)

In [27]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient-boosted regressor with default hyperparameters and a fixed seed.
# (The stray `GradientBoostingRegressor(random_state=0)` line that followed the fit
# was a pasted output repr that built and discarded an unused estimator.)
gra_reg = GradientBoostingRegressor(random_state=0)
gra_reg.fit(X_train_orig, y_train_orig)
# .score is evaluated on the held-out test split.
gra_reg_score = gra_reg.score(X_test_orig, y_test_orig)

In [28]:
# Summary of all model scores, in a fixed order.
# The values are not all comparable: classifier entries are accuracies, regressor
# entries come from the regressors' .score (R^2 in scikit-learn), and
# poly_reg_score is measured on the validation split while the rest use the test split.
scores = [
    score_multiple_reg,  # linear regression, test split
    score_logistic,      # logistic regression accuracy, test split
    score_clf,           # random forest classifier accuracy, test split
    score_clf_reg,       # random forest regressor, test split
    gra_reg_score,       # gradient boosting regressor, test split
    gra_clf_score,       # gradient boosting classifier, test split
    poly_reg_score,      # polynomial regression, validation split
]
scores

[0.453076677406538,
 0.7317073170731707,
 0.7134146341463414,
 0.46178915550796995,
 0.4512491747171764,
 0.7195121951219512,
 0.28649902321880516]