In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from math import sqrt
import csv

In [2]:
aapl = pd.read_csv("Quarterly_Data_AAPL.csv")
aapl.insert(1, "Ticker", "AAPL")
aapl.head()

Unnamed: 0,Year,Ticker,Change_Net_Income,Change_Cash_Operations,Change_Cash_Investments,Change_Cash_Financials,Change_Cash_Equivalents
0,2014-12-27,AAPL,18024,33722,-21165,6923,5634
1,2015-03-28,AAPL,13569,19081,-22331,-15585,-4989
2,2015-06-25,AAPL,10677,14988,-11403,-2755,830
3,2015-09-26,AAPL,11124,13475,-1375,-6299,5801
4,2015-12-26,AAPL,18361,27463,-20450,-11444,-4431


In [3]:
tsla = pd.read_csv("Quarterly_Data_TSLA.csv")
tsla.insert(1, "Ticker", "TSLA")
tsla.head()

Unnamed: 0,Year,Ticker,Change_Net_Income,Change_Cash_Operations,Change_Cash_Investments,Change_Cash_Financials,Change_Cash_Equivalents
0,2015-03-31,TSLA,-154.181,-131.794,-432.344,186.156,-395.637
1,2015-06-30,TSLA,-184.227,-159.516,-422.837,218.351,-359.403
2,2015-09-30,TSLA,-229.858,-203.34,-404.09,893.978,275.363
3,2015-12-31,TSLA,-320.397,-29.849,-414.28,225.038,-229.128
4,2016-03-31,TSLA,-282.267,-249.605,-233.819,715.435,244.881


In [4]:
#Retrieve adjusted close for each date we have
def get_adj_close(df, tckr):
    dates = list(df["Year"])
    close = np.array([])
    start = ""

    for i in range(len(dates) - 1):
        start = dates[i]
        end = dates[i + 1]
        adj = yf.download(tckr, start = start, end = end)["Adj Close"]
        close = np.append(close, adj[0])

        if i is len(dates) - 2:
            close = np.append(close, adj[-1])
    return close

In [5]:
aapl["Adj_Close"] = get_adj_close(aapl, "AAPL")
tsla["Adj_Close"] = get_adj_close(tsla, "TSLA")

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

In [6]:
aapl["%Change"] = aapl.Adj_Close.pct_change()
aapl.head()

Unnamed: 0,Year,Ticker,Change_Net_Income,Change_Cash_Operations,Change_Cash_Investments,Change_Cash_Financials,Change_Cash_Equivalents,Adj_Close,%Change
0,2014-12-27,AAPL,18024,33722,-21165,6923,5634,25.902882,
1,2015-03-28,AAPL,13569,19081,-22331,-15585,-4989,28.849669,0.113763
2,2015-06-25,AAPL,10677,14988,-11403,-2755,830,29.229227,0.013156
3,2015-09-26,AAPL,11124,13475,-1375,-6299,5801,25.893421,-0.114126
4,2015-12-26,AAPL,18361,27463,-20450,-11444,-4431,24.704502,-0.045916


In [7]:
tsla["%Change"] = tsla.Adj_Close.pct_change()
tsla.head()

Unnamed: 0,Year,Ticker,Change_Net_Income,Change_Cash_Operations,Change_Cash_Investments,Change_Cash_Financials,Change_Cash_Equivalents,Adj_Close,%Change
0,2015-03-31,TSLA,-154.181,-131.794,-432.344,186.156,-395.637,37.754002,
1,2015-06-30,TSLA,-184.227,-159.516,-422.837,218.351,-359.403,53.652,0.421094
2,2015-09-30,TSLA,-229.858,-203.34,-404.09,893.978,275.363,49.68,-0.074033
3,2015-12-31,TSLA,-320.397,-29.849,-414.28,225.038,-229.128,48.001999,-0.033776
4,2016-03-31,TSLA,-282.267,-249.605,-233.819,715.435,244.881,45.953999,-0.042665


In [8]:
df = pd.concat([aapl[1:], tsla[1:]])
df

Unnamed: 0,Year,Ticker,Change_Net_Income,Change_Cash_Operations,Change_Cash_Investments,Change_Cash_Financials,Change_Cash_Equivalents,Adj_Close,%Change
1,2015-03-28,AAPL,13569.0,19081.0,-22331.0,-15585.0,-4989.0,28.849669,0.113763
2,2015-06-25,AAPL,10677.0,14988.0,-11403.0,-2755.0,830.0,29.229227,0.013156
3,2015-09-26,AAPL,11124.0,13475.0,-1375.0,-6299.0,5801.0,25.893421,-0.114126
4,2015-12-26,AAPL,18361.0,27463.0,-20450.0,-11444.0,-4431.0,24.704502,-0.045916
5,2016-03-26,AAPL,10516.0,11601.0,-13660.0,6884.0,4825.0,24.459534,-0.009916
6,2016-06-25,AAPL,7796.0,10634.0,-4470.0,-9441.0,-3277.0,21.53211,-0.119684
7,2016-09-24,AAPL,9014.0,16126.0,-7397.0,-6482.0,2247.0,26.550533,0.233067
8,2016-12-31,AAPL,17891.0,27056.0,-19122.0,-12047.0,-4113.0,27.459938,0.034252
9,2017-04-01,AAPL,11029.0,12523.0,-14202.0,465.0,-1214.0,34.120541,0.242557
10,2017-07-01,AAPL,8717.0,8363.0,-3180.0,-1769.0,3414.0,34.213696,0.00273


In [9]:
#Split data (Train on 2020)
x_train = df[~df["Year"].str.contains("2020")].iloc[:,2:-1].values
y_train = df[~df["Year"].str.contains("2020")].iloc[:,-1:].values

x_test = df[df["Year"].str.contains("2020")].iloc[:,2:-1].values
y_test = df[df["Year"].str.contains("2020")].iloc[:,-1:].values

# print(x_train)
# print(y_train)
print(x_test)
print(y_test)

[[ 11249.          13311.           9013.         -20940.
    1384.             63.21250153]
 [ 11253.          16271.          -5165.         -19116.
   -8010.             89.99225616]
 [ 12673.          20576.           5531.         -21357.
    4750.            111.91937256]
 [    68.           -440.           -480.           2708.
    1764.            104.80000305]
 [   129.            964.           -566.            123.
     559.            215.96200562]
 [   369.           2400.          -1039.           4450.
    5897.            429.01000977]
 [   296.           3019.          -1047.           2692.
    4898.            694.7800293 ]]
[[-0.12385173]
 [ 0.42364649]
 [ 0.24365559]
 [ 0.25259965]
 [ 1.0607061 ]
 [ 0.98650688]
 [ 0.61949608]]


In [10]:
def cv_scores(model):
    scores = cross_val_score(model, x_train, y_train.ravel(), scoring='r2', cv=5)

    #Create KFold with 5 splits
    folds = KFold(n_splits = 5, shuffle = True, random_state = 100)
    scores_r2 = cross_val_score(model, x_train, y_train.ravel(), scoring='r2', cv=folds)

    scores_mse = cross_val_score(model, x_train, y_train.ravel(), scoring='neg_mean_squared_error', cv=5)

    print(scores)
    print(scores_r2)
    print(scores_mse, "\n\n")   

def cv_results(model, params):
    #Create a cross-validation scheme
    folds = KFold(n_splits = 5, shuffle = True, random_state = 100)

    #Grid search
    model.fit(x_train, y_train.ravel())
    model_cv = GridSearchCV(estimator = model,
    param_grid = params,
    scoring= ['neg_root_mean_squared_error', 'r2'],
    cv = folds,
    verbose = 1,
    refit=False,
    return_train_score=True)
    model_cv.fit(x_train, y_train.ravel())

    cv_results = pd.DataFrame(model_cv.cv_results_)
    max_cv_res = cv_results.loc[cv_results["mean_test_r2"] == cv_results["mean_test_r2"].max()]
    print("Max: ", cv_results["mean_test_r2"].max(), "\nParams: ", dict(max_cv_res["params"]))
    print("\n\n")
    max_cv_res = cv_results.loc[cv_results["mean_test_neg_root_mean_squared_error"] == cv_results["mean_test_neg_root_mean_squared_error"].max()]
    print("Max: ", cv_results["mean_test_neg_root_mean_squared_error"].max(), "\nParams: ", dict(max_cv_res["params"]))


In [55]:
#Linear Regression
from sklearn.linear_model import LinearRegression
regressor_lr = LinearRegression()
regressor_lr.fit(x_train, y_train)
y_predicted_lr = regressor_lr.predict(x_test)

# round predictions
y_predicted_lr = np.where(y_predicted_lr < 0.5, 0, 1).reshape(-1,1)
y_predicted_lr

array([[0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1]])

In [56]:
#Linear Regression CV
cv_scores(regressor_lr)
parameters = [{'fit_intercept':[True, False], 'normalize':[True, False], 'positive':[True, False]}]
cv_results(regressor_lr, parameters)

[-4.43348532 -0.4960537  -0.03672486 -0.10959809  0.25443014]
[-6.46674238e+02 -2.83914316e+00 -5.77928986e-01 -1.74189369e-01
 -5.85980496e-01]
[-0.06574282 -0.03935814 -0.02931475 -0.03157729 -0.06823937] 


Fitting 5 folds for each of 8 candidates, totalling 40 fits
Max:  -0.9046671040814134 
Params:  {2: {'fit_intercept': True, 'normalize': False, 'positive': True}}



Max:  -0.1933564899969625 
Params:  {0: {'fit_intercept': True, 'normalize': True, 'positive': True}, 2: {'fit_intercept': True, 'normalize': False, 'positive': True}}


In [57]:
regressor_lr = LinearRegression(fit_intercept=True, normalize=True, positive=True)
regressor_lr.fit(x_train, y_train)
y_predicted_lr = regressor_lr.predict(x_test)

# round predictions
# y_predicted_lr = np.where(y_predicted_lr < 0.4, 0, 1).reshape(-1,1)
y_predicted_lr

array([[0.20739282],
       [0.40393201],
       [0.57264513],
       [0.44441867],
       [1.25274674],
       [2.81003806],
       [4.74438069]])

In [58]:
print("LR RMSE:", np.sqrt(mean_squared_error(y_test, y_predicted_lr)))

LR RMSE: 1.716805958062523


In [15]:
#Polynomial Regression
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree = 4)
x_poly = poly_reg.fit_transform(x_train)
regressor_pr = LinearRegression()
regressor_pr.fit(x_poly, y_train)
y_predicted_pr = regressor_pr.predict(poly_reg.fit_transform(x_test))

# round predictions
y_predicted_pr = np.where(y_predicted_pr < 0.5, 0, 1).reshape(-1,1)
y_predicted_pr

array([[0],
       [1],
       [0],
       [0],
       [1],
       [1],
       [1]])

In [16]:
#Polynomial Regression CV
cv_scores(regressor_pr)
parameters = [{'fit_intercept':[True, False], 'normalize':[True, False], 'positive':[True, False]}]
cv_results(regressor_pr, parameters)

[-4.43348532 -0.4960537  -0.03672486 -0.10959809  0.25443014]
[-6.46674238e+02 -2.83914316e+00 -5.77928986e-01 -1.74189369e-01
 -5.85980496e-01]
[-0.06574282 -0.03935814 -0.02931475 -0.03157729 -0.06823937] 


Fitting 5 folds for each of 8 candidates, totalling 40 fits
Max:  -0.9046671040814134 
Params:  {2: {'fit_intercept': True, 'normalize': False, 'positive': True}}



Max:  -0.1933564899969625 
Params:  {0: {'fit_intercept': True, 'normalize': True, 'positive': True}, 2: {'fit_intercept': True, 'normalize': False, 'positive': True}}


In [59]:
poly_reg = PolynomialFeatures(degree = 4)
x_poly = poly_reg.fit_transform(x_train)
regressor_pr = LinearRegression(fit_intercept=True, normalize=False, positive=True)
regressor_pr.fit(x_poly, y_train)
y_predicted_pr = regressor_pr.predict(poly_reg.fit_transform(x_test))

# round predictions
# y_predicted_pr = np.where(y_predicted_pr < 0.5, 0, 1).reshape(-1,1)
y_predicted_pr

array([[-7.57009923e+05],
       [-5.76311220e+05],
       [-1.98989077e+06],
       [ 3.53756925e+02],
       [-3.91918092e+01],
       [ 5.28408855e+04],
       [ 4.09563267e+04]])

In [60]:
print("PR RMSE:", np.sqrt(mean_squared_error(y_test, y_predicted_pr)))

PR RMSE: 834037.9185468183


In [19]:
#SVR
from sklearn.svm import SVR
regressor_svr = SVR(kernel = 'rbf')
regressor_svr.fit(x_train, y_train.ravel())

y_predicted_svr = regressor_svr.predict(x_test)

# round predictions
y_predicted_svr = np.where(y_predicted_svr < 0.5, 0, 1).reshape(-1,1)
y_predicted_svr

array([[0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0]])

In [20]:
#SVR CV
cv_scores(regressor_svr)
parameters = [{'C': [1, 10, 100, 1000]}]
cv_results(regressor_svr, parameters)

[-0.7497897   0.19282689 -0.24171306  0.01028121 -0.07345004]
[-0.71643953 -2.12867935 -0.39592662 -1.17021791 -0.33423464]
[-0.0211717  -0.02123509 -0.03511106 -0.02816573 -0.09824908] 


Fitting 5 folds for each of 4 candidates, totalling 20 fits
Max:  -0.9490996105515442 
Params:  {0: {'C': 1}}



Max:  -0.20841551696754884 
Params:  {0: {'C': 1}}


In [96]:
regressor_svr = SVR(kernel = 'rbf', C = 1)
regressor_svr.fit(x_train, y_train.ravel())

y_predicted_svr = regressor_svr.predict(x_test)

# round predictions
# y_predicted_svr = np.where(y_predicted_svr < 0.5, 0, 1).reshape(-1,1)
y_predicted_svr

array([0.01226232, 0.06188295, 0.00931888, 0.02468667, 0.02830903,
       0.00408458, 0.01299232])

In [97]:
print("SVR RMSE:", np.sqrt(mean_squared_error(y_test, y_predicted_svr)))

SVR RMSE: 0.6158735970647028


In [23]:
#Decision Tree
from sklearn.tree import DecisionTreeRegressor
regressor_dt = DecisionTreeRegressor(random_state = 0)
regressor_dt.fit(x_train, y_train)

y_predicted_dt = regressor_dt.predict(x_test)

y_predicted_dt = np.where(y_predicted_dt < 0.5, 0, 1).reshape(-1,1)
y_predicted_dt

array([[0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1]])

In [24]:
#DT CV
cv_scores(regressor_dt)
parameters = [{'max_depth':[3, None], 'max_features':[1, 2, 3, 4, 5], 'criterion': ['mse', 'mae']}]
cv_results(regressor_dt, parameters)

[-2.32113558  0.39853406 -0.57370609 -0.65601176  0.04418153]
[-0.48184859 -3.93387827 -2.41004808 -1.07759106 -1.81739359]
[-0.04018431 -0.01582335 -0.0444986  -0.0471273  -0.08748268] 


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Max:  -1.299166473423788 
Params:  {2: {'criterion': 'mse', 'max_depth': 3, 'max_features': 3}}



Max:  -0.22361634899614863 
Params:  {14: {'criterion': 'mae', 'max_depth': 3, 'max_features': 5}}


In [50]:
regressor_dt = DecisionTreeRegressor(random_state = 0, max_depth=3, max_features=5)
regressor_dt.fit(x_train, y_train)

y_predicted_dt = regressor_dt.predict(x_test)

# y_predicted_dt = np.where(y_predicted_dt < 0.5, 0, 1).reshape(-1,1)
y_predicted_dt

array([-0.08717618,  0.73674598,  0.73674598,  0.73674598,  0.73674598,
        0.73674598,  0.73674598])

In [83]:
temp = pd.DataFrame()
temp["Actual"] = pd.Series(y_test.reshape(-1,))
temp["DT Predicted"] = pd.Series(y_predicted_dt)
temp.head(2)

Unnamed: 0,Actual,DT Predicted
0,-0.123852,-0.087176
1,0.423646,0.736746


In [51]:
print("DT RMSE:", np.sqrt(mean_squared_error(y_test, y_predicted_dt)))

DT RMSE: 0.3290658602076817


In [67]:
#Random Forest
from sklearn.ensemble import RandomForestRegressor
regressor_rf = RandomForestRegressor(n_estimators = 10, random_state = 0)
regressor_rf.fit(x_train, y_train.ravel())

y_predicted_rf = regressor_rf.predict(x_test)

# round predictions
y_predicted_rf = np.where(y_predicted_rf < 0.5, 0, 1).reshape(-1,1)
y_predicted_rf

array([[0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0]])

In [28]:
#RF CV
cv_scores(regressor_rf)
parameters = [{'n_estimators': [2, 4, 8], 'criterion': ['mse', 'mae']}]
cv_results(regressor_rf, parameters)

[-1.08455901 -0.16864935  0.27893421 -0.27191849  0.01558702]
[-0.36063027 -3.03440055  0.27948426 -1.33406246 -0.48212623]
[-0.02522226 -0.03074479 -0.02038908 -0.03619665 -0.09009984] 


Fitting 5 folds for each of 6 candidates, totalling 30 fits
Max:  -0.9960506150141999 
Params:  {1: {'criterion': 'mse', 'n_estimators': 4}}



Max:  -0.20728198869039946 
Params:  {2: {'criterion': 'mse', 'n_estimators': 8}}


In [90]:
regressor_rf = RandomForestRegressor(n_estimators = 8, random_state = 0)
regressor_rf.fit(x_train, y_train.ravel())

y_predicted_rf = regressor_rf.predict(x_test)

# round predictions
# y_predicted_rf = np.where(y_predicted_rf < 0.5, 0, 1).reshape(-1,1)
y_predicted_rf

array([0.09342325, 0.52305916, 0.5537647 , 0.51881406, 0.5537647 ,
       0.51881406, 0.51881406])

In [91]:
print("RF RMSE:", np.sqrt(mean_squared_error(y_test, y_predicted_rf)))

RF RMSE: 0.31847675097950073


In [98]:
output = pd.DataFrame(df[df["Year"].str.contains("2020")])
output = output.drop(["Change_Net_Income", "Change_Cash_Operations", "Change_Cash_Investments", "Change_Cash_Financials", "Change_Cash_Equivalents", "Adj_Close"], axis = 1)
output = output.rename(columns = {"Year": "Date", "%Change":"y_true"})
output["y_true"] = np.where(output["y_true"] > 0.0, 1, 0)
output["y_predicted_dt"] = np.where(y_predicted_dt > 0.5, 1, 0)
output["y_predicted_rf"] = np.where(y_predicted_rf > 0.5, 1, 0)
output

Unnamed: 0,Date,Ticker,y_true,y_predicted_dt,y_predicted_rf
21,2020-03-28,AAPL,0,0,0
22,2020-06-27,AAPL,1,1,1
23,2020-09-26,AAPL,1,1,1
20,2020-03-31,TSLA,1,1,1
21,2020-06-30,TSLA,1,1,1
22,2020-09-30,TSLA,1,1,1
23,2020-12-31,TSLA,1,1,1


In [88]:
output.to_csv("regression.csv", index=False)