In [1]:
import yfinance as yp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import plotly.graph_objs as go
from plotly.offline import plot, init_notebook_mode
init_notebook_mode(connected=True)

In [2]:
def create_feature_matrix(data, lags=[1, 2]):
    X = pd.DataFrame()
    for lag in lags:
        X[f'lag_{lag}'] = data['Close'].shift(lag)
    X = X.dropna()
    return X

In [3]:
# Extend feature matrix X with new columns
def extend_feature_matrix(X, new_columns):
    for col_name, col_data in new_columns.items():
        X[col_name] = col_data
    return X

In [4]:
# Split data into training and testing sets
def split_data(X, y):
    mid_point = len(X) // 2
    X_train, X_test = X[:mid_point], X[mid_point:]
    y_train, y_test = y[:mid_point], y[mid_point:]
    return X_train, X_test, y_train, y_test

In [5]:
ticker = 'GOOGL'
start_date = '2013-06-29'
end_date = '2018-06-25'
stock_data = yp.download(ticker, start=start_date, end=end_date)

[*********************100%%**********************]  1 of 1 completed


In [6]:
# Creating the matrix X
X = create_feature_matrix(stock_data)
y = stock_data['Close'][X.index]

In [7]:
# Adding additional features to X (example: (n-1 - n-2)^2)
X = extend_feature_matrix(X, {
    '(n-1 - n-2)^2': (X['lag_1'] - X['lag_2']) ** 2,
    'Volatility': (X['lag_1'] - X['lag_2']) / X['lag_2']
})

In [8]:
y.head()

Date
2013-07-03    22.182934
2013-07-05    22.359610
2013-07-08    22.649900
2013-07-09    22.653654
2013-07-10    22.672422
Name: Close, dtype: float64

In [9]:
# Splitting the data
X_train, X_test, y_train, y_test = split_data(X, y)

In [10]:
# Feature scaling
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
# Machine learning model
lm = LinearRegression()
lm.fit(X_train_scaled, y_train)

# Predict using the model
predicted_values_train = lm.predict(X_train_scaled)
predicted_values_test = lm.predict(X_test_scaled)

In [12]:
# Combine the predictions and actual values
predicted_values = np.concatenate([predicted_values_train, predicted_values_test])
all_dates = np.concatenate([X_train.index, X_test.index])
actual_values = np.concatenate([y_train, y_test])

In [13]:
# Plotting actual vs predicted values
trace0 = go.Scatter(
    x=all_dates,
    y=actual_values,
    mode='markers',
    name='Actual'
)
trace1 = go.Scatter(
    x=all_dates,
    y=predicted_values,
    mode='lines',
    name='Predicted'
)

layout = go.Layout(
    title='Actual vs Predicted Stock Prices',
    xaxis=dict(title='Date'),
    yaxis=dict(title='Stock Price')
)

fig = go.Figure(data=[trace0, trace1], layout=layout)
plot(fig)

'temp-plot.html'

In [14]:
stock_data.describe()
print(stock_data.describe())

              Open         High          Low        Close    Adj Close  \
count  1255.000000  1255.000000  1255.000000  1255.000000  1255.000000   
mean     36.667752    36.951159    36.349563    36.660544    36.618517   
std       9.946374    10.034483     9.866555     9.957473     9.946058   
min      21.247997    21.406656    21.095596    21.193693    21.169397   
25%      27.907000    28.060500    27.665328    27.876750    27.844793   
50%      36.069500    36.475498    35.750500    36.073002    36.031647   
75%      42.612999    42.858000    42.376249    42.598251    42.549417   
max      59.400002    60.074501    59.202999    59.377998    59.309929   

             Volume  
count  1.255000e+03  
mean   4.526105e+07  
std    2.864047e+07  
min    1.041200e+07  
25%    2.871700e+07  
50%    3.651000e+07  
75%    5.183300e+07  
max    4.621933e+08  
