In [34]:
!pip install yfinance



In [35]:
import yfinance as yf
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
import numpy as np

In [36]:
# Step 0: Set variables
start_date = "2023-01-01"
end_date = "2024-07-05"

In [37]:
# Step 1: Download AAPL stock data
ticker = "AAPL"
data = yf.download(ticker, start=start_date, end=end_date, interval="1h")


[*********************100%%**********************]  1 of 1 completed


In [38]:
data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-01-03 09:30:00-05:00,130.279999,130.899994,125.230003,125.459999,125.459999,27775505
2023-01-03 10:30:00-05:00,125.459999,125.870003,124.730003,125.345001,125.345001,18104999
2023-01-03 11:30:00-05:00,125.350197,125.370003,124.349098,124.660004,124.660004,11248777
2023-01-03 12:30:00-05:00,124.660004,124.940002,124.190002,124.649902,124.649902,8860080
2023-01-03 13:30:00-05:00,124.669899,125.0,124.190002,124.57,124.57,8388062


In [39]:
data.shape

(2628, 6)

In [40]:
data['Date'] = data.index.date
data['Hour'] = data.index.hour
data['Minute'] = data.index.minute
data = data.pivot_table(values='Close', index='Date', columns=['Hour', 'Minute'])


In [41]:
# Flatten the multi-index columns
data.columns = [f'feature_{hour}_{minute}' for hour, minute in data.columns]
data = data.reset_index()

In [42]:
data.head()

Unnamed: 0,Date,feature_9_30,feature_10_30,feature_11_30,feature_12_30,feature_13_30,feature_14_30,feature_15_30
0,2023-01-03,125.459999,125.345001,124.660004,124.649902,124.57,124.620003,125.050003
1,2023-01-04,125.998299,127.379997,128.065002,127.260101,125.724998,125.830002,126.379997
2,2023-01-05,126.769997,126.559898,126.503197,126.480003,125.888702,125.160004,125.019997
3,2023-01-06,126.375,127.245003,128.329895,128.25,129.335007,129.809998,129.520004
4,2023-01-09,132.565002,133.085007,132.744995,132.494995,131.494995,130.449997,130.190002


In [43]:
# Step 2: Create target variable (next day's closing price)
target = data.shift(-1)
target

Unnamed: 0,Date,feature_9_30,feature_10_30,feature_11_30,feature_12_30,feature_13_30,feature_14_30,feature_15_30
0,2023-01-04,125.998299,127.379997,128.065002,127.260101,125.724998,125.830002,126.379997
1,2023-01-05,126.769997,126.559898,126.503197,126.480003,125.888702,125.160004,125.019997
2,2023-01-06,126.375000,127.245003,128.329895,128.250000,129.335007,129.809998,129.520004
3,2023-01-09,132.565002,133.085007,132.744995,132.494995,131.494995,130.449997,130.190002
4,2023-01-10,129.410004,129.029999,129.119995,129.659897,130.361206,130.149994,130.750000
...,...,...,...,...,...,...,...,...
372,2024-06-28,214.639999,214.339996,213.839996,213.830093,213.210007,213.054993,210.610001
373,2024-07-01,215.229996,215.013306,216.505005,217.254303,215.934998,215.535004,216.729996
374,2024-07-02,218.709900,219.510193,219.164993,219.399994,220.029999,219.619995,220.369995
375,2024-07-03,219.380005,220.115005,221.300003,221.089996,,,


In [44]:
# Drop the last row as it won't have a target
data = data[:-1]
target = target[:-1]

In [45]:
# Combine the features and target into one DataFrame
df = data.copy()
df['Target'] = target.iloc[:, -1].values

In [46]:
df

Unnamed: 0,Date,feature_9_30,feature_10_30,feature_11_30,feature_12_30,feature_13_30,feature_14_30,feature_15_30,Target
0,2023-01-03,125.459999,125.345001,124.660004,124.649902,124.570000,124.620003,125.050003,126.379997
1,2023-01-04,125.998299,127.379997,128.065002,127.260101,125.724998,125.830002,126.379997,125.019997
2,2023-01-05,126.769997,126.559898,126.503197,126.480003,125.888702,125.160004,125.019997,129.520004
3,2023-01-06,126.375000,127.245003,128.329895,128.250000,129.335007,129.809998,129.520004,130.190002
4,2023-01-09,132.565002,133.085007,132.744995,132.494995,131.494995,130.449997,130.190002,130.750000
...,...,...,...,...,...,...,...,...,...
371,2024-06-26,212.360001,213.210007,214.345001,213.559998,214.350006,213.261993,213.119995,214.110001
372,2024-06-27,213.809998,212.970505,213.854996,214.000000,213.419998,213.828903,214.110001,210.610001
373,2024-06-28,214.639999,214.339996,213.839996,213.830093,213.210007,213.054993,210.610001,216.729996
374,2024-07-01,215.229996,215.013306,216.505005,217.254303,215.934998,215.535004,216.729996,220.369995


In [47]:
# Convert index to datetime
df.Date = pd.to_datetime(df.Date)

In [48]:
# Step 3: Split the data into training and test sets
train_df = df.loc[df.Date < pd.to_datetime(end_date) - pd.DateOffset(months=1)]
test_df = df.loc[df.Date >= pd.to_datetime(end_date) - pd.DateOffset(months=1)]

train_df = train_df[train_df['Target'].notnull()]
test_df = test_df[test_df['Target'].notnull()]


X_train = train_df.drop(columns=['Target'])
y_train = train_df['Target']
X_test = test_df.drop(columns=['Target'])
y_test = test_df['Target']


In [49]:
del X_train['Date']
del X_test['Date']

In [50]:
X_train

Unnamed: 0,feature_9_30,feature_10_30,feature_11_30,feature_12_30,feature_13_30,feature_14_30,feature_15_30
0,125.459999,125.345001,124.660004,124.649902,124.570000,124.620003,125.050003
1,125.998299,127.379997,128.065002,127.260101,125.724998,125.830002,126.379997
2,126.769997,126.559898,126.503197,126.480003,125.888702,125.160004,125.019997
3,126.375000,127.245003,128.329895,128.250000,129.335007,129.809998,129.520004
4,132.565002,133.085007,132.744995,132.494995,131.494995,130.449997,130.190002
...,...,...,...,...,...,...,...
352,191.850006,191.483704,191.759995,191.794998,191.560104,191.139999,190.369995
353,191.445007,191.695007,191.259995,192.078705,191.535004,191.565002,191.289993
354,191.690002,190.410004,190.197205,190.570007,190.554993,191.029999,192.470001
355,194.835007,194.130005,193.289398,193.789993,193.100006,193.354004,194.089996


In [51]:
X_train = X_train.fillna(X_train.mean())
X_test = X_test.fillna(X_test.mean())

In [53]:
# Step 4: Apply MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [54]:
# Step 5: Train Linear Regression model
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Step 6: Print metrics
y_train_pred = model.predict(X_train_scaled)
y_test_pred = model.predict(X_test_scaled)

In [55]:
mae_train = mean_absolute_error(y_train, y_train_pred)
mse_train = mean_squared_error(y_train, y_train_pred)
mape_train = mean_absolute_percentage_error(y_train, y_train_pred)

mae_test = mean_absolute_error(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
mape_test = mean_absolute_percentage_error(y_test, y_test_pred)

train_metrics = {
    "MAE": mae_train,
    "MSE": mse_train,
    "MAPE": mape_train
}

test_metrics = {
    "MAE": mae_test,
    "MSE": mse_test,
    "MAPE": mape_test
}

(train_metrics, test_metrics)

({'MAE': 1.7161297120360592,
  'MSE': 5.165764289250772,
  'MAPE': 0.009912839128475844},
 {'MAE': 3.5859474134865,
  'MSE': 23.930987407144702,
  'MAPE': 0.017053063333957005})

In [56]:
for x, y in zip(y_test, y_test_pred):
  print(x, y)

194.47999572753906 195.7241127457856
196.89999389648438 194.44148847037002
193.1300048828125 196.20296532755975
207.13999938964844 192.77031301336314
213.1116943359375 205.51900513774598
214.1300048828125 212.06045280548298
212.52000427246094 212.4114406270656
216.7100067138672 211.78321855736988
214.3800048828125 215.43741426703895
209.55999755859375 213.64441546887565
207.11000061035156 209.43231986862799
208.14999389648438 208.36350551973504
209.0760040283203 207.82146791387737
213.1199951171875 208.80131584977045
214.11000061035156 213.17562736755332
210.61000061035156 213.44828570347153
216.72999572753906 210.4392679344851
220.3699951171875 214.98006370410525


### Classification

In [57]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [58]:
df

Unnamed: 0,Date,feature_9_30,feature_10_30,feature_11_30,feature_12_30,feature_13_30,feature_14_30,feature_15_30,Target
0,2023-01-03,125.459999,125.345001,124.660004,124.649902,124.570000,124.620003,125.050003,126.379997
1,2023-01-04,125.998299,127.379997,128.065002,127.260101,125.724998,125.830002,126.379997,125.019997
2,2023-01-05,126.769997,126.559898,126.503197,126.480003,125.888702,125.160004,125.019997,129.520004
3,2023-01-06,126.375000,127.245003,128.329895,128.250000,129.335007,129.809998,129.520004,130.190002
4,2023-01-09,132.565002,133.085007,132.744995,132.494995,131.494995,130.449997,130.190002,130.750000
...,...,...,...,...,...,...,...,...,...
371,2024-06-26,212.360001,213.210007,214.345001,213.559998,214.350006,213.261993,213.119995,214.110001
372,2024-06-27,213.809998,212.970505,213.854996,214.000000,213.419998,213.828903,214.110001,210.610001
373,2024-06-28,214.639999,214.339996,213.839996,213.830093,213.210007,213.054993,210.610001,216.729996
374,2024-07-01,215.229996,215.013306,216.505005,217.254303,215.934998,215.535004,216.729996,220.369995


In [59]:
df['Target'] = (df['Target']>df['feature_15_30']).astype(int)

In [60]:
df.head()

Unnamed: 0,Date,feature_9_30,feature_10_30,feature_11_30,feature_12_30,feature_13_30,feature_14_30,feature_15_30,Target
0,2023-01-03,125.459999,125.345001,124.660004,124.649902,124.57,124.620003,125.050003,1
1,2023-01-04,125.998299,127.379997,128.065002,127.260101,125.724998,125.830002,126.379997,0
2,2023-01-05,126.769997,126.559898,126.503197,126.480003,125.888702,125.160004,125.019997,1
3,2023-01-06,126.375,127.245003,128.329895,128.25,129.335007,129.809998,129.520004,1
4,2023-01-09,132.565002,133.085007,132.744995,132.494995,131.494995,130.449997,130.190002,1


In [61]:
# Step 3: Split the data into training and test sets
train_df = df.loc[df.Date < pd.to_datetime(end_date) - pd.DateOffset(months=1)]
test_df = df.loc[df.Date >= pd.to_datetime(end_date) - pd.DateOffset(months=1)]

train_df = train_df[train_df['Target'].notnull()]
test_df = test_df[test_df['Target'].notnull()]

X_train = train_df.drop(columns=['Target'])
y_train = train_df['Target']
X_test = test_df.drop(columns=['Target'])
y_test = test_df['Target']

In [62]:
del X_train['Date']
del X_test['Date']

In [63]:
X_train = X_train.fillna(X_train.mean())
X_test = X_test.fillna(X_test.mean())

In [64]:
# Step 4: Apply MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [65]:
# Step 5: Train Logistic Regression model
model = LogisticRegression()
model.fit(X_train_scaled, y_train)


# Step 6: Print metrics
y_train_pred = model.predict(X_train_scaled)
y_test_pred = model.predict(X_test_scaled)
y_train_prob = model.predict_proba(X_train_scaled)[:, 1]
y_test_prob = model.predict_proba(X_test_scaled)[:, 1]

In [66]:
accuracy_train = accuracy_score(y_train, y_train_pred)
precision_train = precision_score(y_train, y_train_pred)
recall_train = recall_score(y_train, y_train_pred)
f1_train = f1_score(y_train, y_train_pred)
auc_train = roc_auc_score(y_train, y_train_prob)

accuracy_test = accuracy_score(y_test, y_test_pred)
precision_test = precision_score(y_test, y_test_pred)
recall_test = recall_score(y_test, y_test_pred)
f1_test = f1_score(y_test, y_test_pred)
auc_test = roc_auc_score(y_test, y_test_prob)

train_metrics = {
    "Accuracy": accuracy_train,
    "Precision": precision_train,
    "Recall": recall_train,
    "F1 Score": f1_train,
    "AUC": auc_train
}

test_metrics = {
    "Accuracy": accuracy_test,
    "Precision": precision_test,
    "Recall": recall_test,
    "F1 Score": f1_test,
    "AUC": auc_test
}

print("Train Metrics:", train_metrics)
print("Test Metrics:", test_metrics)

Train Metrics: {'Accuracy': 0.5098039215686274, 'Precision': 0.5340909090909091, 'Recall': 0.7305699481865285, 'F1 Score': 0.6170678336980306, 'AUC': 0.556520914950082}
Test Metrics: {'Accuracy': 0.42105263157894735, 'Precision': 0.0, 'Recall': 0.0, 'F1 Score': 0.0, 'AUC': 0.5568181818181819}


  _warn_prf(average, modifier, msg_start, len(result))
