In [130]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sbn
import datetime

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow import keras

In [131]:
# import data TODO: 改成data importer
df = pd.read_csv('..\\Data\\cleansed_data\\test.csv')
df

Unnamed: 0,Date,gasoline,crude_futures,gdp
0,1-Jan-01,1.406,27.95,12999.56990
1,8-Jan-01,1.425,30.05,12293.34053
2,15-Jan-01,1.474,32.19,12708.81038
3,22-Jan-01,1.471,29.77,12287.22655
4,29-Jan-01,1.460,31.19,13353.26284
...,...,...,...,...
412,24-Nov-08,1.892,54.43,15451.90316
413,1-Dec-08,1.811,40.81,15362.37041
414,8-Dec-08,1.699,46.28,14770.91456
415,15-Dec-08,1.659,33.87,15966.69919


# Creating independent and dependent variables 

In [132]:
# calculate independent variables
periods = (1,2,3,5,10)
for i in periods:
    df['of_chg_{}'.format(i)] = df['crude_futures'].pct_change(periods = i)
    
# calculate dependent variable
df['gas_chg'] = df['gasoline'].pct_change()

# eliminate the empty rows
df = df[11:]
df

Unnamed: 0,Date,gasoline,crude_futures,gdp,of_chg_1,of_chg_2,of_chg_3,of_chg_5,of_chg_10,gas_chg
11,19-Mar-01,1.404,27.30,11982.22891,0.020942,-0.025348,-0.019397,-0.063786,-0.091514,-0.005666
12,26-Mar-01,1.404,26.29,13936.69786,-0.036996,-0.016829,-0.061407,-0.094697,-0.183287,0.000000
13,2-Apr-01,1.442,27.06,12809.32454,0.029289,-0.008791,0.011967,-0.028017,-0.091031,0.027066
14,9-Apr-01,1.500,28.25,13474.20064,0.043976,0.074553,0.034799,0.008568,-0.094261,0.040222
15,16-Apr-01,1.571,27.28,13236.22196,-0.034336,0.008130,0.037657,0.020194,-0.120851,0.047333
...,...,...,...,...,...,...,...,...,...,...
412,24-Nov-08,1.892,54.43,15451.90316,0.090126,-0.045757,-0.108290,-0.151520,-0.479388,-0.086873
413,1-Dec-08,1.811,40.81,15362.37041,-0.250230,-0.182656,-0.284537,-0.398171,-0.618206,-0.042812
414,8-Dec-08,1.699,46.28,14770.91456,0.134036,-0.149734,-0.073102,-0.241809,-0.507030,-0.061844
415,15-Dec-08,1.659,33.87,15966.69919,-0.268150,-0.170056,-0.377733,-0.406206,-0.564093,-0.023543


# Seperating training, validation, and test datasets

In [133]:
# create list of independent variable's name
ind_var_name = ['of_chg_{}'.format(i) for i in periods]+['gdp']

# extract the values to X
X = df[['crude_futures'] + ind_var_name].values
X

array([[ 2.73000000e+01,  2.09424084e-02, -2.53480900e-02, ...,
        -6.37860082e-02, -9.15141431e-02,  1.19822289e+04],
       [ 2.62900000e+01, -3.69963370e-02, -1.68287210e-02, ...,
        -9.46969697e-02, -1.83286735e-01,  1.39366979e+04],
       [ 2.70600000e+01,  2.92887029e-02, -8.79120879e-03, ...,
        -2.80172414e-02, -9.10312395e-02,  1.28093245e+04],
       ...,
       [ 4.62800000e+01,  1.34035776e-01, -1.49733603e-01, ...,
        -2.41808650e-01, -5.07030251e-01,  1.47709146e+04],
       [ 3.38700000e+01, -2.68150389e-01, -1.70056359e-01, ...,
        -4.06206171e-01, -5.64092664e-01,  1.59666992e+04],
       [ 3.77100000e+01,  1.13374668e-01, -1.85177182e-01, ...,
        -2.44742640e-01, -4.75156576e-01,  1.53330495e+04]])

In [134]:
# create y
y = (df['gas_chg'] > 0).values.astype('int')

# check correctness of X and y
X.shape, y.shape

((406, 7), (406,))

In [135]:
# create training, validation, and test sets
training_ratio = 0.7
validation_ratio = 0.15
test_ratio = 0.15

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_ratio, random_state = 42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = validation_ratio/(validation_ratio + training_ratio), random_state = 42)


print("X_train shape:\t", X_train.shape)
print("X_test shape:\t", X_test.shape)
print("X_val shape:\t", X_val.shape)
print("y_train shape:\t", y_train.shape)
print("y_val shape:\t", y_val.shape)
print("y_test shape:\t", y_test.shape)

X_train shape:	 (284, 7)
X_test shape:	 (61, 7)
X_val shape:	 (61, 7)
y_train shape:	 (284,)
y_val shape:	 (61,)
y_test shape:	 (61,)


# Principal Component Analysis (PCA)

In [136]:
# normalization 
for i in range(X_train.shape[1]):
    X_train_mean = X_train[:, i]. mean()
    X_train_std = X_train[:, i].std()
    X_train[:, i] = (X_train[:,i] - X_train_mean) / X_train_std
    X_test[:,i] = (X_test[:,i] - X_train_mean) / X_train_std
    X_val[:, i] = (X_val[:,i] - X_train_mean) / X_train_std

print("X_train shape:\t", X_train.shape)
print("X_val shape:\t", X_val.shape)

X_train shape:	 (284, 7)
X_val shape:	 (61, 7)


In [137]:
# fit on training set
scaler = StandardScaler()
scaler.fit(X_train)

# apply transform to training and test set
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
X_val = scaler.transform(X_val)

# set PCA value
pca = PCA(0.95)

# fit PCA training set
pca.fit(X_train)

# apply transform to training and test set
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)
X_val = pca.transform(X_val)
print("X_train shape:\t", X_train.shape)
print("X_test shape:\t", X_test.shape)
print("X_val shape:\t", X_val.shape)
print("y_train shape:\t", y_train.shape)
print("y_val shape:\t", y_val.shape)
print("y_test shape:\t", y_test.shape)

X_train shape:	 (284, 6)
X_test shape:	 (61, 6)
X_val shape:	 (61, 6)
y_train shape:	 (284,)
y_val shape:	 (61,)
y_test shape:	 (61,)


# Random forest classifier

In [138]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score
# TODO: 这里我写的range（1,11）但是我不知道11是哪里来的
for i in range (1, 11):
    clf = RandomForestClassifier(max_depth=i, random_state=0).fit(X_train, y_train)
    y_pred = clf.predict(X_val)
    score_clf = accuracy_score(y_val, y_pred)
    print('score_clf:', "%.4f"%score_clf)

score_clf: 0.6885
score_clf: 0.6721
score_clf: 0.6885
score_clf: 0.6721
score_clf: 0.6885
score_clf: 0.7213
score_clf: 0.7049
score_clf: 0.7377
score_clf: 0.7049
score_clf: 0.7377


In [139]:
# choose max_depth = 4
clf = RandomForestClassifier(max_depth=4, random_state=0).fit(X_train, y_train)
y_pred = clf.predict(X_test)
score_clf = accuracy_score(y_test, y_pred)
print('score_clf:', "%.4f"%score_clf)

score_clf: 0.7541


# Logistic regression

In [140]:
from sklearn.linear_model import LogisticRegression
logistic_model = LogisticRegression(penalty = 'l1', solver='liblinear', random_state=0).fit(X_train, y_train)
y_pred = logistic_model.predict(X_test)
score_logistic = accuracy_score(y_test, y_pred)
print('score_logistic:', "%.4f"%score_logistic)

score_logistic: 0.7705


In [141]:
# regression cases
y_orig = df['gas_chg']
X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(X, y_orig, test_size = test_ratio, random_state = 42)
X_train_orig, X_val_orig, y_train_orig, y_val_orig = train_test_split(X_train_orig, y_train_orig, test_size = validation_ratio/(validation_ratio + training_ratio), random_state = 42)
print("X_train_orig shape:\t", X_train_orig.shape)
print("X_test_orig shape:\t", X_test_orig.shape)
print("X_val_orig shape:\t", X_val_orig.shape)
print("y_train_orig shape:\t", y_train_orig.shape)
print("y_val_orig shape:\t", y_val_orig.shape)
print("y_test_orig shape:\t", y_test_orig.shape)

X_train_orig shape:	 (284, 7)
X_test_orig shape:	 (61, 7)
X_val_orig shape:	 (61, 7)
y_train_orig shape:	 (284,)
y_val_orig shape:	 (61,)
y_test_orig shape:	 (61,)


In [142]:
# TODO: 这个地方PCA怎么弄还是不弄？
# normalization for regressor
for i in range(X_train_orig.shape[1]):
    X_train_orig_mean = X_train_orig[:, i]. mean()
    X_train_orig_std = X_train_orig[:, i].std()
    X_train_orig[:, i] = (X_train_orig[:,i] - X_train_orig_mean) / X_train_orig_std
    X_test_orig[:,i] = (X_test_orig[:,i] - X_train_orig_mean) / X_train_orig_std
    X_val_orig[:, i] = (X_val_orig[:,i] - X_train_orig_mean) / X_train_orig_std

print("X_train shape:\t", X_train.shape)
print("X_val shape:\t", X_val.shape)

X_train shape:	 (284, 6)
X_val shape:	 (61, 6)


# Multiple linear regression

In [144]:
from sklearn.linear_model import LinearRegression
multiple_reg = LinearRegression().fit(X_train_orig, y_train_orig)
# y_pred = multiple_reg.predict(X_test_orig)
score_multiple_reg = multiple_reg.score(X_test_orig, y_test_orig)
print('score_multiple_reg:', "%.4f"%score_multiple_reg)

score_multiple_reg: 0.4886


# Random forest regressor

In [145]:
clf_reg = RandomForestRegressor().fit(X_train_orig, y_train_orig)
y_pred = clf_reg.predict(X_test_orig)
score_clf_reg = clf_reg.score(X_test_orig, y_test_orig)
print('score_clf_reg:', "%.4f"%score_clf_reg)

score_clf_reg: 0.4118


# Multiple polynomial regression

In [146]:
# TODO: 这个score不太对劲
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
degree=5
# polyreg=make_pipeline(PolynomialFeatures(degree),LinearRegression())
poly = PolynomialFeatures(degree)
X_ = poly.fit_transform(X_train_orig)
X_val = poly.fit_transform(X_val_orig)
polyreg = LinearRegression().fit(X_, y_train_orig)
poly_reg_score = polyreg.score(X_val, y_val_orig)
poly_reg_score

-17856.07324108401

# Gradient boosting classifier

In [147]:
from sklearn.ensemble import GradientBoostingClassifier
gra_clf = GradientBoostingClassifier(n_estimators = 200, learning_rate=0.50, max_depth=5, random_state=0).fit(X_train, y_train)
gra_clf_score = gra_clf.score(X_test, y_test)
gra_clf_score

0.7213114754098361

# Gradient boosting regressor 

In [148]:
from sklearn.ensemble import GradientBoostingRegressor
gra_reg = GradientBoostingRegressor(random_state=0)
gra_reg.fit(X_train_orig, y_train_orig)
GradientBoostingRegressor(random_state=0)
gra_reg_score = gra_reg.score(X_test_orig, y_test_orig)
gra_reg_score

0.32641094272503834

# Support vector classification

In [149]:
# Try different kernels
kernels = ('linear', 'poly', 'rbf')


for k in kernels:
    svm_clf = SVC(kernel=k).fit(X_train, y_train)
    y_pred = svm_clf.predict(X_test)
    vars()[f'score_svc_{k}']= accuracy_score(y_test, y_pred)
    
print(score_svc_linear)
print(score_svc_poly)
print(score_svc_rbf)

0.7213114754098361
0.6557377049180327
0.6721311475409836


# Gaussian Naive Bayes

In [150]:
gnb_clf = GaussianNB().fit(X_train, y_train)
y_pred = gnb_clf.predict(X_test)
gnb_score = accuracy_score(y_test, y_pred)
print(gnb_score)

0.7049180327868853


# Neuro networks

In [151]:
#TODO: 看不懂是什么问题，好像是dataset的大小不对，但是我不明白
model = keras.models.Sequential()
model.add(keras.layers.Dense(10, input_dim=6, activation='relu'))
model.add(keras.layers.Dense(5, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(X_train, y_train, epochs=50, validation_data=(X_val, y_val))

Epoch 1/50
1/9 [==>...........................] - ETA: 2s - loss: 0.6108 - accuracy: 0.6562

ValueError: in user code:

    C:\Users\lemon\anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py:1233 test_function  *
        return step_function(self, iterator)
    C:\Users\lemon\anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py:1224 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    C:\Users\lemon\anaconda3\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:1259 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    C:\Users\lemon\anaconda3\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2730 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    C:\Users\lemon\anaconda3\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:3417 _call_for_each_replica
        return fn(*args, **kwargs)
    C:\Users\lemon\anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py:1217 run_step  **
        outputs = model.test_step(data)
    C:\Users\lemon\anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py:1183 test_step
        y_pred = self(x, training=False)
    C:\Users\lemon\anaconda3\lib\site-packages\tensorflow\python\keras\engine\base_layer.py:998 __call__
        input_spec.assert_input_compatibility(self.input_spec, inputs, self.name)
    C:\Users\lemon\anaconda3\lib\site-packages\tensorflow\python\keras\engine\input_spec.py:255 assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer sequential_6 is incompatible with the layer: expected axis -1 of input shape to have value 6 but received input with shape (None, 792)


In [116]:
loss, neuro_score = model.evaluate(X_test, y_test)
print(neuro_score)

0.5245901346206665


# Summary

In [117]:
scores = [score_multiple_reg, poly_reg_score, score_logistic, score_clf, score_clf_reg, gra_reg_score, gra_clf_score,gnb_score, score_svc_linear, score_svc_poly, score_svc_rbf, neuro_score]
models = ['Multiple linear regression', 'Multiple polynomial regression', 'Logistic regression', 'Random forest classifier', 'Random forest regressor', 'Gradient boosting classifier', 'Gradient boosting regressor','Gaussian naive Bayes', 'SVC linear', 'SVC polynomial', 'SVC RBF', 'Neuro network']
result = pd.DataFrame(columns = ['Model','Score'])
result['Model'] = models
result['Score'] = scores
result

Unnamed: 0,Model,Score
0,Multiple linear regression,0.488581
1,Multiple polynomial regression,-17856.073241
2,Logistic regression,0.754098
3,Random forest classifier,0.786885
4,Random forest regressor,0.393384
5,Gradient boosting classifier,0.326411
6,Gradient boosting regressor,0.721311
7,Gaussian naive Bayes,0.704918
8,SVC linear,0.721311
9,SVC polynomial,0.639344
