## Linear Logistics Models
### for 
## Predicting Stock Market Movements
The moves UP or Down are classified only

In [1]:
    """
    By Al Sabawi
    2023-03-11 
    """
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import yfinance as yf
import matplotlib.pyplot as plt

# Load SPY stock price data
# df = pd.read_csv('SPY.csv', parse_dates=['Date'], index_col='Date')
df_in = yf.download('SPY',period='max')


[*********************100%***********************]  1 of 1 completed


### Latest SPY ETF Dataset from Yahoo Finance  

In [2]:
# Create an isolated dataframe so we can manipulate without changing the original data
df = df_in.copy()

# Drop all the columns not relevant for the daily predictions
df = df.drop(columns=['Open','High','Low','Adj Close','Volume'])

# Create a 'Returns' column for the % changes in price and add it to Dataframe
df['ret'] = df['Close'].pct_change()   

# The 'Target' is what we will predict in LogisticRegression 
# The Target is 1 or 0, 1 meaning the stock went up TODAY from Yesterday's price
# However, since we need to predict it a day ahead so we can buy it, we need to shift() back in time!
# so we get the signal to buy before the day the price goes up
## The following line says: If tomorrow's return 'df['ret'].shift(-1)' is above 0, record a buy signal (1) 
# today so we buy it at the open tomorrow, else record 'no buy' signal (0)
df['Target'] = np.where(df['ret'].shift(-1) > 0, 1, 0)

### Creating LAGGED Dataset

In [3]:
# A lagged dataset in Timeseries is based on the assumption that the predicted value 'Target' 
# depends on the prices of 1 or more days before.  In this case I am taking into account 5 days before
# We will add 5 new columns recording the change in price for the past 5 days in each row

# Create lagged features for the past 5 days
def create_lagged_features(df, lag):
    features = df.copy()
    for i in range(1, lag+1):
        features[f'ret_lag{i}'] = features['ret'].shift(i)
    features.dropna(inplace=True)
    features.drop(columns=['Close'],inplace=True)
    features.drop(columns=['ret'],inplace=True)
    return features

df_lagged = create_lagged_features(df, 5)
df_lagged.tail(6)

Unnamed: 0_level_0,Target,ret_lag1,ret_lag2,ret_lag3,ret_lag4,ret_lag5
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-03-03,1,0.007777,-0.003836,-0.003696,0.003406,-0.010682
2023-03-06,0,0.016038,0.007777,-0.003836,-0.003696,0.003406
2023-03-07,1,0.000693,0.016038,0.007777,-0.003836,-0.003696
2023-03-08,0,-0.015329,0.000693,0.016038,0.007777,-0.003836
2023-03-09,0,0.001632,-0.015329,0.000693,0.016038,0.007777
2023-03-10,0,-0.01845,0.001632,-0.015329,0.000693,0.016038


### Training Set and Batches
##### We'll need to divide the historical data in smaller batches but we need to make sure each batch is balanced as much as possible

In [4]:
# Split data into train and test sets using a stratified 80-20 split
df_lagged.dropna(inplace=True)
train_df, test_df = train_test_split(df_lagged, test_size=0.2, random_state=42, stratify=df_lagged['Target'])

# ##############################################################
# About Batches:    For a LogisticRegression Model, we need to 
#                   balance the training data with rows that have 
#                   equal 'Target' of 1 (buy) and 0 (no buy). 
#                   Otherwise the model will become bias for the outcome
#                   that we feed it more of.  So for that we made each 
#                   row 'self-contained' with all the previous 
#                   data (last return plu 5 previous returns) so that 
#                   we can shuffle the rows and feed them into 
#                   the model as batches of rows. Each bach is an equal 
#                   mix of outcome 1 and 0.  This was the concentration 
#                   of 1's and 0's dont in a series (long up trends or 
#                   long down trends) don't bias the next outcome
# ###############################################################

# Split train data into batches with balanced target values
batches_count = 128  # We can start from 32 to go up then see the accuracy the effect of accuracy
batch_size = len(train_df) // batches_count
train_batches = []
for i in range(0, len(train_df), batch_size):
    batch = train_df.iloc[i:i+batch_size]
    num_positives = len(batch[batch['Target'] == 1])
    if num_positives == batch_size // 2:
        train_batches.append(batch)
    elif num_positives > batch_size // 2:
        excess_positives = num_positives - batch_size // 2
        batch = batch.drop(batch[batch['Target'] == 1].sample(excess_positives).index)
        train_batches.append(batch)
    else:
        missing_positives = batch_size // 2 - num_positives
        num_negatives = len(batch[batch['Target'] == 0])
        if missing_positives > num_negatives:
            batch = batch.drop(batch[batch['Target'] == 0].index)
            missing_positives -= num_negatives
            excess_positives = missing_positives - len(batch[batch['Target'] == 1])
            batch = pd.concat([batch, batch[batch['Target'] == 1].sample(excess_positives, replace=True)])
        else:
            batch = batch.drop(batch[batch['Target'] == 0].sample(missing_positives, replace=False).index)
        train_batches.append(batch)
        
print(f"Number of Batches = {batches_count}")
print(f"Rows in first batch = {len(train_batches[0])}")
train_batches[0].tail(len(train_batches[0]))


Number of Batches = 128
Rows in first batch = 42


Unnamed: 0_level_0,Target,ret_lag1,ret_lag2,ret_lag3,ret_lag4,ret_lag5
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-09-25,0,-0.003322,-0.005416,0.008104,0.001066,0.005426
2007-02-02,1,0.005983,0.006723,0.005209,-0.000563,-0.000914
2011-10-21,1,0.004376,-0.011829,0.019546,-0.019091,0.017094
2018-03-01,1,-0.01013,-0.012487,0.011612,0.015939,0.001296
1993-04-12,1,-0.001409,0.003536,-0.002821,0.004961,-0.020819
2013-04-02,0,-0.003957,0.003073,0.0,0.008003,-0.004177
2001-03-23,1,-0.010155,-0.016988,-0.026843,0.020346,-0.022689
2020-11-09,0,-0.000228,0.019503,0.022349,0.017656,0.011208
2018-05-03,1,-0.006717,0.001777,-0.007691,0.000939,0.010166
2010-12-13,1,0.005818,0.003894,0.003664,0.00057,-0.001058


In [5]:
# train_df dataframe is the unbatched dataset
train_df.tail(5)

Unnamed: 0_level_0,Target,ret_lag1,ret_lag2,ret_lag3,ret_lag4,ret_lag5
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1996-08-19,0,0.008487,-0.001882,0.003542,-0.007964,0.007315
2001-03-14,1,0.016429,-0.042802,-0.029578,0.001103,0.007138
2002-10-09,1,0.01567,-0.020668,-0.018345,-0.010102,-0.029981
2007-05-01,1,-0.008293,-0.000802,0.001137,0.009182,0.000405
2022-01-26,0,-0.012209,0.004247,-0.019631,-0.011068,-0.010384


### model_1 : Create the 1st Model

In [6]:
# Create logistic regression model
model_1 = LogisticRegression(class_weight='balanced')

### Testing the model (model_1)

In [7]:

# Train model on the first batch of the training data
X_train = train_batches[0].drop(columns=['Target'])
y_train = train_batches[0]['Target']
# print("X_train: \n",X_train.tail(5))
# print("y_train: \n",y_train.tail(5))
# ******************************************************
model_1.fit(X_train, y_train)

# Evaluate the model on the test set and print the test accuracy
X_test = test_df.drop(columns=['Target'])
y_test = test_df['Target']
y_pred = model_1.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f'Test accuracy: {test_accuracy:.3f}')

Test accuracy: 0.517


In [8]:
# Show predictions
y_pred

array([1, 1, 0, ..., 1, 1, 1])

### Train the remaining batches on model_1

In [9]:
print("BatchSize",batch_size)
for j in range(1,batches_count):
    # print("Batch#",j,train_batches[j])
    X_train = train_batches[j].drop(columns=['Target'])
    # print(X_train.columns)
    y_train = train_batches[j]['Target']
    model_1.fit(X_train, y_train)

# X_train = train_df.drop(columns=['Target'])
# y_train = train_df['Target']

# #***************************
# model.fit(X_train, y_train)
# X_train

BatchSize 47


### Predictions from model_1

In [10]:
# Make predictions for next 5 days
first_value = df_in['Close'][0]
df_pred = df_in.copy()
print(first_value)

43.9375


### Use the whole dataset and removing the Non-Feature columns
Non-feature columns are columns not used for training

In [11]:
# Generate the Retuen columns and the Target column to compare with later
df_pred['ret'] = df_pred['Close'].pct_change() # Daily return
df_pred['Target'] = np.where(df_pred['ret'].shift(-1) > 0.0, 1, 0) # Target column is 1 IF the next day close price is higher
df_pred = create_lagged_features(df_pred, 5) # Create the Lagged columns for the past 5 days

# df_pred.drop(columns=['Target'],inplace=True)
df_pred = df_pred.drop(columns=['Open','High','Low','Adj Close','Volume']) # Remove non-feature columns
if 'Predicted' in df_pred.columns:
    df_pred = df_pred.drop('Predicted') # Remove the predicted column in case its leftover from previous runs
df_pred.tail(5)

Unnamed: 0_level_0,Target,ret_lag1,ret_lag2,ret_lag3,ret_lag4,ret_lag5
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-03-06,0,0.016038,0.007777,-0.003836,-0.003696,0.003406
2023-03-07,1,0.000693,0.016038,0.007777,-0.003836,-0.003696
2023-03-08,0,-0.015329,0.000693,0.016038,0.007777,-0.003836
2023-03-09,0,0.001632,-0.015329,0.000693,0.016038,0.007777
2023-03-10,0,-0.01845,0.001632,-0.015329,0.000693,0.016038


In [12]:
# Create a separate dataframe WITHOUT target column for prediction only
data_no_target = df_pred.copy()
if('Target' in data_no_target.columns):
    data_no_target = data_no_target.drop(columns=['Target'])

# Check to see if we have the right no. of columns for the prediction call
print('column count =',len(data_no_target.columns),':',data_no_target.columns)
# Check that we have the Target data still available
df_pred['Target'].tail(5)

column count = 5 : Index(['ret_lag1', 'ret_lag2', 'ret_lag3', 'ret_lag4', 'ret_lag5'], dtype='object')


Date
2023-03-06    0
2023-03-07    1
2023-03-08    0
2023-03-09    0
2023-03-10    0
Name: Target, dtype: int64

In [13]:
predictions_1 = model_1.predict(data_no_target)

In [14]:
# Make predictions from model_1 into a DataFrame along with the actual Target column from before to compare
df_pred1 = pd.DataFrame(index=df_pred.index)
df_pred1['Predicted']=  predictions_1
df_pred1['Target'] = df_pred['Target'].copy()
df_pred1[['Predicted','Target']].tail(50)

Unnamed: 0_level_0,Predicted,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-12-28,0,1
2022-12-29,1,0
2022-12-30,0,0
2023-01-03,0,1
2023-01-04,1,0
2023-01-05,0,1
2023-01-06,1,0
2023-01-09,1,1
2023-01-10,0,1
2023-01-11,1,1


### Check Prediction Results for model_1

In [15]:
eq=neq=pup=tup=pdown=tdown=0
for i in range(len(df_pred1['Predicted'])):
    if df_pred1['Predicted'].iloc[i] == df_pred1['Target'].iloc[i]:
        eq+=1
        if df_pred1['Predicted'].iloc[i] == 1:
            pup+=1
            tup+=1
        else:
            pdown+=1
            tdown+=1
    else:
        neq+=1
        if df_pred1['Predicted'].iloc[i] == 1:
            pup+=1
            tdown+=1
        if df_pred1['Target'].iloc[i] == 1:
            pdown+=1
            tup+=1
      

print("----Results from Predictions using model_1----")  
print(f"Equal Values = {eq} ({round(100*eq/(eq+neq),2)}%) \n\
Not Equal = {neq} ({round(100*neq/(eq+neq),2)}%),  \n\
Total = {eq+neq} rows")
print(f"Predicted UPs : {round(100*pup/(eq+neq),2)}% vs Actual UPs : {round(100*tup/(eq+neq),2)}%  ")
print(f"Predicted Downs : {round(100*pdown/(eq+neq),2)}% vs Actual Downs : {round(100*tdown/(eq+neq),2)}%  ")

----Results from Predictions using model_1----
Equal Values = 3887 (51.3%) 
Not Equal = 3690 (48.7%),  
Total = 7577 rows
Predicted UPs : 54.01% vs Actual UPs : 53.27%  
Predicted Downs : 45.99% vs Actual Downs : 46.73%  


### model_2: Creating the second model
This model will be trained without batches or manual re-balancing of outcomes 

In [16]:
# Creating model_2 and training it on the whole dataset on one go. No batching or rebalancing 
model_2 = LogisticRegression(class_weight='balanced')
print('Check columns : ',data_no_target.columns)
model_2.fit(data_no_target, df_pred['Target'])
    
df_pred2 = pd.DataFrame(index=df_pred.index)
df_pred2['Predicted']=  model_2.predict(data_no_target)
df_pred2['Target'] = df_pred['Target'].copy()
df_pred2[['Predicted','Target']].tail(50)

Check columns :  Index(['ret_lag1', 'ret_lag2', 'ret_lag3', 'ret_lag4', 'ret_lag5'], dtype='object')


Unnamed: 0_level_0,Predicted,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-12-28,0,1
2022-12-29,1,0
2022-12-30,1,0
2023-01-03,0,1
2023-01-04,1,0
2023-01-05,0,1
2023-01-06,0,0
2023-01-09,1,1
2023-01-10,0,1
2023-01-11,0,1


In [17]:
# Reassembling the original dataset with the Predicted and Target columns added
df_in2 = df_in.copy()
df_in2['Predicted Buy'] = df_pred2['Predicted']
df_in2['Correct Buy'] = df_pred2['Target']
df_in2.dropna()
df_in2.tail(20)
# df_pred['Target'].tail(50)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Predicted Buy,Correct Buy
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2023-02-10,405.859985,408.440002,405.01001,408.040009,408.040009,70769700,1.0,1.0
2023-02-13,408.720001,412.970001,408.23999,412.829987,412.829987,64913500,1.0,0.0
2023-02-14,411.23999,415.049988,408.51001,412.640015,412.640015,88389300,1.0,1.0
2023-02-15,410.350006,414.059998,409.470001,413.980011,413.980011,61555700,1.0,0.0
2023-02-16,408.790009,412.910004,408.140015,408.279999,408.279999,76431500,0.0,0.0
2023-02-17,406.059998,407.51001,404.049988,407.26001,407.26001,89257800,0.0,0.0
2023-02-21,403.059998,404.160004,398.820007,399.089996,399.089996,82655900,1.0,0.0
2023-02-22,399.519989,401.130005,397.019989,398.540009,398.540009,83742300,1.0,1.0
2023-02-23,401.559998,402.200012,396.25,400.660004,400.660004,96242400,1.0,0.0
2023-02-24,395.420013,397.25,393.640015,396.380005,396.380005,108194400,1.0,1.0


In [18]:
df_in2.tail(5)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Predicted Buy,Correct Buy
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2023-03-06,405.049988,407.450012,404.01001,404.470001,404.470001,72795900,0.0,0.0
2023-03-07,404.420013,404.670013,397.630005,398.269989,398.269989,108310600,0.0,1.0
2023-03-08,398.390015,399.709991,396.589996,398.920013,398.920013,74746600,0.0,0.0
2023-03-09,399.73999,401.480011,390.529999,391.559998,391.559998,111945300,0.0,0.0
2023-03-10,390.98999,393.160004,384.320007,385.910004,385.910004,189105300,1.0,0.0


### Check Prediction Results for model_2

In [19]:
eq=neq=pup=tup=pdown=tdown=0
for i in range(len(df_pred1['Predicted'])):
    if df_pred1['Predicted'].iloc[i] == df_pred1['Target'].iloc[i]:
        eq+=1
        if df_pred1['Predicted'].iloc[i] == 1:
            pup+=1
            tup+=1
        else:
            pdown+=1
            tdown+=1
    else:
        neq+=1
        if df_pred1['Predicted'].iloc[i] == 1:
            pup+=1
            tdown+=1
        if df_pred1['Target'].iloc[i] == 1:
            pdown+=1
            tup+=1
      

print("----Results from Predictions using model_2----")  
print(f"Equal Values = {eq} ({round(100*eq/(eq+neq),2)}%) \n\
Not Equal = {neq} ({round(100*neq/(eq+neq),2)}%),  \n\
Total = {eq+neq} rows")
print(f"Predicted UPs : {round(100*pup/(eq+neq),2)}% vs Actual UPs : {round(100*tup/(eq+neq),2)}%  ")
print(f"Predicted Downs : {round(100*pdown/(eq+neq),2)}% vs Actual Downs : {round(100*tdown/(eq+neq),2)}%  ")

----Results from Predictions using model_2----
Equal Values = 3887 (51.3%) 
Not Equal = 3690 (48.7%),  
Total = 7577 rows
Predicted UPs : 54.01% vs Actual UPs : 53.27%  
Predicted Downs : 45.99% vs Actual Downs : 46.73%  


### Predicting the Stock Market for the next 5 Days 
We'll use model_2 and follow the same procedures of no batches or re-balancing 

In [20]:
df_pred = df_in.copy()
# df_pred['Close'].pct_change()
df_pred['ret'] = df_pred['Close'].pct_change()
df_pred['Target'] = np.where(df_pred['ret'].shift(-1) > 0.0, 1, 0)
df_pred = create_lagged_features(df_pred, 5)
df_pred

Unnamed: 0_level_0,Open,High,Low,Adj Close,Volume,Target,ret_lag1,ret_lag2,ret_lag3,ret_lag4,ret_lag5
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1993-02-08,44.968750,45.125000,44.906250,25.810114,596100,0,-0.000694,0.004184,0.010571,0.002119,0.007112
1993-02-09,44.812500,44.812500,44.562500,25.630751,122100,1,0.000000,-0.000694,0.004184,0.010571,0.002119
1993-02-10,44.656250,44.750000,44.531250,25.666628,379600,1,-0.006949,0.000000,-0.000694,0.004184,0.010571
1993-02-11,44.781250,45.125000,44.781250,25.792175,19500,0,0.001400,-0.006949,0.000000,-0.000694,0.004184
1993-02-12,44.875000,44.875000,44.593750,25.594883,42500,0,0.004892,0.001400,-0.006949,0.000000,-0.000694
...,...,...,...,...,...,...,...,...,...,...,...
2023-03-06,405.049988,407.450012,404.010010,404.470001,72795900,0,0.016038,0.007777,-0.003836,-0.003696,0.003406
2023-03-07,404.420013,404.670013,397.630005,398.269989,108310600,1,0.000693,0.016038,0.007777,-0.003836,-0.003696
2023-03-08,398.390015,399.709991,396.589996,398.920013,74746600,0,-0.015329,0.000693,0.016038,0.007777,-0.003836
2023-03-09,399.739990,401.480011,390.529999,391.559998,111945300,0,0.001632,-0.015329,0.000693,0.016038,0.007777


In [21]:
df_pred = df_pred.drop(columns=['Open','High','Low','Adj Close','Volume'])
df_pred

Unnamed: 0_level_0,Target,ret_lag1,ret_lag2,ret_lag3,ret_lag4,ret_lag5
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1993-02-08,0,-0.000694,0.004184,0.010571,0.002119,0.007112
1993-02-09,1,0.000000,-0.000694,0.004184,0.010571,0.002119
1993-02-10,1,-0.006949,0.000000,-0.000694,0.004184,0.010571
1993-02-11,0,0.001400,-0.006949,0.000000,-0.000694,0.004184
1993-02-12,0,0.004892,0.001400,-0.006949,0.000000,-0.000694
...,...,...,...,...,...,...
2023-03-06,0,0.016038,0.007777,-0.003836,-0.003696,0.003406
2023-03-07,1,0.000693,0.016038,0.007777,-0.003836,-0.003696
2023-03-08,0,-0.015329,0.000693,0.016038,0.007777,-0.003836
2023-03-09,0,0.001632,-0.015329,0.000693,0.016038,0.007777


### 5-Days in the future stock predictions

In [22]:
# We need al least 5 days from the past without the Target column
last_five_days = df_pred.iloc[-5:].copy()
last_five_days.drop('Target',inplace=True,axis=1)

# We need to add the Predicted column for future predictions
new_columns = last_five_days.columns[-5:].to_list()
new_columns.append('model_1_Predicted')
new_columns.append('model_2_Predicted')

# We need to prepare an empty dataframe to receive future data
next_five_days = pd.DataFrame(columns=new_columns)

# Now starting from the first of the last 5 days, predict tomorrow Up or Down market, then move forward one day
for i in range(1, 6):
    next_day_m1 = model_1.predict(last_five_days.iloc[[i-1]])
    next_day_m2 = model_2.predict(last_five_days.iloc[[i-1]])
    # next_day = model.predict(last_five_days.iloc[i-1, 1:].values.reshape(1, -1))
    arr_m1 = np.append(last_five_days.iloc[i-1, :].values, next_day_m1[0])
    arr_m2 = np.append(arr_m1,next_day_m2[0])
    arr_df = pd.DataFrame([arr_m2], columns=new_columns)
    next_five_days= pd.concat([next_five_days,arr_df])
    
# Create the next 5 working dates and make an index for the predicted 5 days
import datetime
from pandas.tseries.offsets import BDay
daysdates = [(datetime.datetime.today() + BDay(i)).strftime("%Y-%m-%d")  for i in range(1,6) ]
df_next5days = pd.DataFrame(next_five_days)
df_next5days.index = daysdates
print('\nPredictions for next 5 days:')
df_next5days[['model_1_Predicted','model_2_Predicted']]


Predictions for next 5 days:


Unnamed: 0,model_1_Predicted,model_2_Predicted
2023-03-13,0.0,0.0
2023-03-14,0.0,0.0
2023-03-15,1.0,0.0
2023-03-16,1.0,0.0
2023-03-17,1.0,1.0


In [23]:
# Based on the above, Model_1 is predicting a BUY signal at End of day 2023-03-15, which means to BUY the Open on 2023-03-16
# While Model_2 is predicting to BUY the morning Open the next trading day to 2023-03-17 which 2023-03-20