In [1]:
import yfinance as yp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import plotly.graph_objs as go
from plotly.offline import plot, init_notebook_mode
init_notebook_mode(connected = True)

In [2]:
# Create the feature matrix X
def create_feature_matrix(data, lags=[1, 2]):
    X = pd.DataFrame()
    for lag in lags:
        X[f'lag_{lag}'] = data['Close'].shift(lag)
    X = X.dropna()
    return X

In [3]:
# Extend feature matrix X with new columns
def extend_feature_matrix(X, new_columns):
    for col_name, col_data in new_columns.items():
        X[col_name] = col_data
    return X

In [4]:
# Split data into training and testing sets
def split_data(X, y):
    mid_point = len(X) // 2
    X_train, X_test = X[:mid_point], X[mid_point:]
    y_train, y_test = y[:mid_point], y[mid_point:]
    return X_train, X_test, y_train, y_test

In [5]:
ticker = 'GOOGL'
start_date = '2010-06-29'
end_date = '2024-06-24'
stock_data = yp.download(ticker, start=start_date, end=end_date)

[*********************100%%**********************]  1 of 1 completed


In [6]:
# Creating the matrix X
X = create_feature_matrix(stock_data)
y = stock_data['Close'][X.index]

In [7]:
# Adding additional features to X (example: (n-1 - n-2)^2)
#X = extend_feature_matrix(X, {
 #   '(n-1 - n-2)^2': (X['lag_1'] - X['lag_2']) ** 2,
  #  'Volatility': (X['lag_1'] - X['lag_2']) / X['lag_2']
#})

In [8]:
X.head()

Unnamed: 0_level_0,lag_1,lag_2,(n-1 - n-2)^2,Volatility
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-07-01,11.134885,11.367868,0.054281,-0.020495
2010-07-02,10.998248,11.134885,0.01867,-0.012271
2010-07-06,10.924675,10.998248,0.005413,-0.00669
2010-07-07,10.912663,10.924675,0.000144,-0.001099
2010-07-08,11.266266,10.912663,0.125035,0.032403


In [9]:
# Splitting the data
X_train, X_test, y_train, y_test = split_data(X, y)

In [10]:
# Feature scaling
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
# Machine learning model
lm = LinearRegression()
lm.fit(X_train_scaled, y_train)

# Predict using the model
predicted_values = lm.predict(X_train_scaled)

In [12]:
# Plotting actual vs predicted values
trace0 = go.Scatter(
    x=X_train.index,
    y=y_train,
    mode='markers',
    name='Actual'
)
trace1 = go.Scatter(
    x=X_train.index,
    y=predicted_values,
    mode='lines',
    name='Predicted'
)

layout = go.Layout(
    title='Actual vs Predicted Stock Prices',
    xaxis=dict(title='Date'),
    yaxis=dict(title='Stock Price')
)

fig = go.Figure(data=[trace0, trace1], layout=layout)
plot(fig)

'temp-plot.html'

In [13]:
stock_data.describe() 
print(stock_data.describe())

              Open         High          Low        Close    Adj Close  \
count  3519.000000  3519.000000  3519.000000  3519.000000  3519.000000   
mean     58.894070    59.504379    58.301364    58.921968    58.921968   
std      42.391338    42.881369    41.943026    42.429693    42.429693   
min      10.968719    11.068068    10.851602    10.912663    10.912663   
25%      26.027750    26.361803    25.784034    26.039500    26.039500   
50%      47.275002    47.525002    46.917999    47.275002    47.275002   
75%      87.456749    88.478500    86.774502    87.740002    87.740002   
max     178.250000   180.850006   176.610001   179.630005   179.630005   

             Volume  
count  3.519000e+03  
mean   5.434959e+07  
std    4.456784e+07  
min    9.312000e+06  
25%    2.762400e+07  
50%    3.746800e+07  
75%    6.970608e+07  
max    5.923990e+08  
