In [25]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pandas_ta as ta
import pandas as pd
import joblib

In [26]:
df_nflx_intraday = pd.read_csv('data_nflx_intraday.csv', index_col=0, parse_dates=True) # will be our working df
df = df_nflx_intraday.copy() # working df
df.columns = df.columns.str.lower()

In [27]:
df.head()

Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-07-27 04:00:00,423.01,425.61,423.01,425.0,1150.0
2023-07-27 04:01:00,425.43,427.26,425.0,427.26,677.0
2023-07-27 04:02:00,427.19,427.19,426.0,426.7,341.0
2023-07-27 04:03:00,426.76,427.01,426.65,426.84,140.0
2023-07-27 04:04:00,426.84,427.64,426.84,427.49,117.0


In [28]:
# adding a few indicators
df['rsi'] = ta.rsi(df['close'], length=30)
df['ad'] = ta.ad(df['high'], df['low'], df['close'], df['volume']) # Accumulation/Distribution Line
bbands = ta.bbands(df['close'], length=10, mamode="ema") #  Returns: pd.DataFrame: lower, mid, upper, bandwidth, and percent columns.
df[['bbandsl', 'bbandsm', 'bbandsu','bbandsb', 'bbandsp']] = bbands[['BBL_10_2.0', 'BBM_10_2.0', 'BBU_10_2.0', 'BBB_10_2.0', 'BBP_10_2.0']]

# getting rid of nan
df.dropna(inplace=True)

In [29]:
# dividing X and y
X = df.drop(columns=['close'])
y = df[['close']]

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [31]:
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

y_pred = linear_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 0.062063230273589044


In [32]:
joblib.dump(linear_model, 'linear_model.pkl')

['linear_model.pkl']