In [51]:
import os
import random
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [52]:
symbols_valid_meta = pd.read_csv('data/symbols_valid_meta.csv')
symbols_valid_meta = symbols_valid_meta[['Symbol', 'Security Name']]

In [53]:
etfs_files = os.listdir('data/etfs')
stocks_files = os.listdir('data/stocks')
columns = ['Symbol', 'Security Name', 'Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']

In [54]:
random.seed(42)
etfs_files = random.sample(etfs_files, 100)
stocks_files = random.sample(stocks_files, 100)

In [41]:
etf_df_list = []
for etf in etfs_files:
    etf_df = pd.read_csv(f'data/etfs/{etf}')
    symbol = etf.replace('.csv', '')
    etf_df['Symbol'] = symbol
    etf_merged_df = etf_df.merge(symbols_valid_meta, on='Symbol', how='left')
    etf_final_df = etf_merged_df[columns]
    etf_df_list.append(etf_final_df)

etfs_df = pd.concat(etf_df_list)

columns = ['Symbol', 'Security Name', 'Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
stock_df_list = []
stocks_df = pd.DataFrame()
for stock in stocks_files:
    stock_df = pd.read_csv(f'data/stocks/{stock}')
    symbol = stock.replace('.csv', '')
    stock_df['Symbol'] = symbol
    stock_merged_df = stock_df.merge(symbols_valid_meta, on='Symbol', how='left')
    stock_final_df = stock_merged_df[columns]
    stocks_df = stocks_df.append(stock_final_df)
    stock_df_list.append(stock_final_df)

stocks_df = pd.concat(stock_df_list)

data = pd.concat([stocks_df, etfs_df])

# Calculate the rolling average of the trading volume (Volume)
data['vol_moving_avg'] = data.groupby('Symbol')['Volume'].transform(lambda x: x.rolling(window=30).mean())

# Calculate the rolling median of the Adjusted Close (Adj Close)
data['adj_close_rolling_med'] = data.groupby('Symbol')['Adj Close'].transform(lambda x: x.rolling(window=30).median())


In [42]:
# Assume `data` is loaded as a Pandas DataFrame
data['Date'] = pd.to_datetime(data['Date'])
data.set_index('Date', inplace=True)

# Remove rows with NaN values
data.dropna(inplace=True)

# Select features and target
features = ['vol_moving_avg', 'adj_close_rolling_med']
target = 'Volume'

X = data[features]
y = data[target]

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [43]:
# Create a list of regression models
models = [LinearRegression(), Lasso(), Ridge(), DecisionTreeRegressor(), SVR(), KNeighborsRegressor(), RandomForestRegressor(n_estimators=100, random_state=42)]

report = []
# Train and evaluate each model
for model in models:
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions on test data
    y_pred = model.predict(X_test)
    
    # Calculate the Mean Absolute Error and Mean Squared Error
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    
    # Print the model name and the evaluation metrics
    report.append([type(model).__name__, mae, mse])

In [44]:
pd.DataFrame(report, columns=['model', 'MAE', 'MSE'])


Unnamed: 0,model,MAE,MSE
0,LinearRegression,455399.4,1388402000000.0
1,Lasso,455399.4,1388402000000.0
2,Ridge,455399.4,1388402000000.0
3,DecisionTreeRegressor,560043.3,1951927000000.0
4,SVR,1077131.0,5130811000000.0
5,KNeighborsRegressor,513288.7,1682489000000.0
6,RandomForestRegressor,449360.6,1444240000000.0


In [49]:
models = [
    RandomForestRegressor(n_estimators=50, random_state=42),
    RandomForestRegressor(n_estimators=100, random_state=42),
    RandomForestRegressor(n_estimators=500, random_state=42),
    RandomForestRegressor(n_estimators=1000, random_state=42),
]
report = []
# Train and evaluate each model
for model in models:
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions on test data
    y_pred = model.predict(X_test)
    
    # Calculate the Mean Absolute Error and Mean Squared Error
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    
    # Print the model name and the evaluation metrics
    report.append([f"{type(model).__name__}(n_estimators={len(model.estimators_)})", mae, mse])

pd.DataFrame(report, columns=['model', 'MAE', 'MSE'])

Unnamed: 0,model,MAE,MSE
0,RandomForestRegressor(n_estimators=50),450510.857447,1509658000000.0
1,RandomForestRegressor(n_estimators=100),449360.605636,1444240000000.0
2,RandomForestRegressor(n_estimators=500),449946.668335,1464033000000.0
3,RandomForestRegressor(n_estimators=1000),449528.192406,1469937000000.0


1000