In [1]:
import yfinance as yp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import plotly.graph_objs as go
from plotly.offline import plot, init_notebook_mode
init_notebook_mode(connected = True)

In [2]:
# Create the feature matrix X
def create_feature_matrix(data, lags=[1, 2]):
    X = pd.DataFrame()
    for lag in lags:
        X[f'lag_{lag}'] = data['Close'].shift(lag)
    X = X.dropna()
    return X

In [3]:
# Extend feature matrix X with new columns
def extend_feature_matrix(X, new_columns):
    for col_name, col_data in new_columns.items():
        X[col_name] = col_data
    return X

In [4]:
# Split data into training and testing sets
def split_data(X, y):
    mid_point = len(X) // 2
    X_train, X_test = X[:mid_point], X[mid_point:]
    y_train, y_test = y[:mid_point], y[mid_point:]
    return X_train, X_test, y_train, y_test

In [5]:
ticker = 'GOOGL'
start_date = '2010-06-29'
end_date = '2014-06-24'
stock_data = yp.download(ticker, start=start_date, end=end_date)

[*********************100%%**********************]  1 of 1 completed


In [6]:
# Creating the matrix X
X = create_feature_matrix(stock_data)
y = stock_data['Close'][X.index]

In [7]:
# Adding additional features to X (example: (n-1 - n-2)^2)
X = extend_feature_matrix(X, {
    '(n-1 - n-2)^2': (X['lag_1'] - X['lag_2']) ** 2,
    'Volatility': (X['lag_1'] - X['lag_2']) / X['lag_2']
})

In [8]:
y.head()

Date
2010-07-01    10.998248
2010-07-02    10.924675
2010-07-06    10.912663
2010-07-07    11.266266
2010-07-08    11.425425
Name: Close, dtype: float64

In [9]:
# Splitting the data
X_train, X_test, y_train, y_test = split_data(X, y)

In [10]:
# Feature scaling
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
# Machine learning model
lm = LinearRegression()
lm.fit(X_train_scaled, y_train)

# Predict using the model
predicted_values = lm.predict(X_train_scaled)

In [12]:
# Plotting actual vs predicted values
trace0 = go.Scatter(
    x=X_train.index,
    y=y_train,
    mode='markers',
    name='Actual'
)
trace1 = go.Scatter(
    x=X_train.index,
    y=predicted_values,
    mode='lines',
    name='Predicted'
)

layout = go.Layout(
    title='Actual vs Predicted Stock Prices',
    xaxis=dict(title='Date'),
    yaxis=dict(title='Stock Price')
)

fig = go.Figure(data=[trace0, trace1], layout=layout)
plot(fig)

'temp-plot.html'

In [13]:
stock_data.describe() 
print(stock_data.describe())

              Open         High          Low        Close    Adj Close  \
count  1003.000000  1003.000000  1003.000000  1003.000000  1003.000000   
mean     18.253879    18.398469    18.092760    18.249350    18.228429   
std       5.205511     5.229167     5.160075     5.197633     5.191675   
min      10.968719    11.068068    10.851602    10.912663    10.900153   
25%      14.613739    14.697072    14.450325    14.536912    14.520247   
50%      15.853854    15.966717    15.715465    15.838589    15.820432   
75%      21.879004    22.004004    21.746246    21.862864    21.837801   
max      30.700701    30.752752    30.495495    30.534784    30.499781   

             Volume  
count  1.003000e+03  
mean   1.008534e+08  
std    5.665283e+07  
min    2.258000e+07  
25%    7.017975e+07  
50%    8.832359e+07  
75%    1.164394e+08  
max    5.923990e+08  
