In [8]:
import json
import pandas as pd
from pprint import pprint
from tqdm.auto import tqdm

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from datetime import datetime
from sklearn.model_selection import train_test_split, GridSearchCV

from src.stockmarket import StockHistory
from src.date_tools import days_from_date

# Show every column when a DataFrame is rendered (no truncation with "...").
pd.options.display.max_columns = None

In [9]:
def load_json_metrics():
    """Build the price-change table from the scoring-metrics JSON and save it as CSV.

    Reads ``./data/disclosures/scoring_metrics.json``, and for every row looks up
    the ticker's price on the row's ``date`` and 365 days later through
    ``StockHistory``. The ratio ``future_price / current_price`` (rounded to two
    decimals) is stored in a new ``price_change`` column. Rows for which the
    ratio cannot be computed are dropped, and the result is written to
    ``./data/disclosures/stock_metrics.csv``.

    Returns:
        pandas.DataFrame: the metrics with a ``price_change`` column and a
        fresh 0..n-1 index.
    """
    file_path = "./data/disclosures/scoring_metrics.json"
    with open(file_path, 'r') as file:
        data = json.load(file)

    df = pd.DataFrame(data['scoring_metrics'])
    stock_history = StockHistory(start_date="2012-01-01", end_date=datetime.now().date().strftime("%Y-%m-%d"))
    deltas = []

    for _, row in tqdm(df.iterrows(), total=len(df)):
        date = row['date']
        future_date = days_from_date(date_str=date, days=365)
        current_price = stock_history.price(ticker=row['ticker'], date_str=date)
        future_price = stock_history.price(ticker=row['ticker'], date_str=future_date)
        # Both quotes must be present and non-zero: a missing or zero current
        # price would otherwise raise (TypeError / ZeroDivisionError) and abort
        # the whole build instead of just skipping this row.
        # presumably StockHistory.price returns a falsy value when no quote
        # is available -- TODO confirm against src.stockmarket.
        if current_price and future_price:
            deltas.append(round(future_price / current_price, 2))
        else:
            deltas.append(None)

    df['price_change'] = deltas
    # Remove any rows that do not have price_change
    df = df.dropna(subset=['price_change']).reset_index(drop=True)
    # NOTE(review): the notebook reads ./data/training_data/stock_metrics.csv,
    # while this writes under ./data/disclosures/ -- confirm which is intended.
    df.to_csv('./data/disclosures/stock_metrics.csv', index=False)
    return df


# Load the previously exported metrics table and preview its size and first rows.
training_csv = './data/training_data/stock_metrics.csv'
df = pd.read_csv(training_csv)
print(df.shape)
df.head()

(10487, 19)


Unnamed: 0,ticker,purchase_volume,purchase_speculation,purchase_count,purchase_count_individual,purchase_days_ago,purchase_owner,purchase_confidence,sale_volume,sale_speculation,sale_count,sale_count_individual,sale_days_ago,sale_owner,sale_confidence,date,volume_net,score,price_change
0,DIS,18.0,0,16,8,44.06,"['Thomas R Carper', 'Ron L Wyden', 'Earl Blume...",1.828125,6.146707,0,6,6,31.83,"['Steve Cohen', 'K. Michael Conaway', 'Gilbert...",-0.043,2020-04-15,11.853293,43.23,1.8
1,MSFT,18.864858,125,14,5,39.86,"['Thomas H Tuberville', 'Josh Gottheimer', 'Da...",1.485,9.910082,0,8,4,52.62,"['Shelley M Capito', 'Thomas H Tuberville', 'K...",-0.024923,2022-12-01,8.954777,40.26,1.48
2,AMZN,17.0,0,14,12,47.21,"['John Curtis', 'Ron L Wyden', 'Dean Phillips'...",1.828125,4.0,0,4,4,56.0,"['Gilbert Cisneros', 'John Curtis', 'Cheri Bus...",-0.071,2020-04-15,13.0,39.35,1.46
3,AMZN,20.9016,0,18,10,62.89,"['John Curtis', 'Ron L Wyden', 'Dean Phillips'...",1.618,5.0,0,5,4,60.2,"['Gilbert Cisneros', 'Cheri Bustos', 'Donald S...",-0.071,2020-06-14,15.9016,33.95,1.33
4,AAPL,18.365672,0,18,9,55.5,"['Dean Phillips', 'Josh Gottheimer', 'Katherin...",1.618,13.0,0,12,11,61.92,"['Mikie Sherrill', 'John B. Larson', 'Kim Schr...",-0.064337,2020-04-15,5.365672,33.61,1.91


In [10]:


# Columns that are not model inputs: the date, the owner lists, the two
# speculation columns, and the target itself.
excluded_columns = [
    'date',
    'purchase_owner',
    'sale_owner',
    'sale_speculation',
    'purchase_speculation',
    'price_change',
]

# Feature matrix (still carrying 'ticker' for now) and regression target.
x = df.drop(columns=excluded_columns)
y = df['price_change']

# 80/20 random split with a fixed seed.
# NOTE(review): the same ticker appears at several nearby dates, so a purely
# random split may place overlapping look-ahead windows in both train and
# test -- confirm this is acceptable for the evaluation.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Remember which ticker each test row belongs to, then remove the string
# column from both feature frames before fitting.
tickers = x_test['ticker']
x_train, x_test = (frame.drop(columns=['ticker']) for frame in (x_train, x_test))

x_train.head()



Unnamed: 0,purchase_volume,purchase_count,purchase_count_individual,purchase_days_ago,purchase_confidence,sale_volume,sale_count,sale_count_individual,sale_days_ago,sale_confidence,volume_net,score
6341,0.0,0,0,,0.0,2.0,2,1,35.0,-0.153939,-2.0,0.47
4170,1.0,1,1,23.0,0.811818,0.0,0,0,,0.0,1.0,1.43
8299,0.0,0,0,,0.0,1.0,1,1,106.0,-0.478023,-1.0,0.12
5816,0.0,0,0,,0.0,1.0,1,1,7.0,-0.298182,-1.0,0.62
10001,0.0,0,0,,0.0,2.0,1,1,83.0,0.038,-2.0,-0.05


In [11]:
# Fit a 100-tree random forest on the training features (fixed seed).
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(x_train, y_train)

# Make predictions
y_pred = model.predict(x_test)

# Evaluate the model: this is the mean *absolute* error, so it is stored under
# a matching name.
mae = mean_absolute_error(y_test, y_pred)
# Legacy alias: the value used to be stored as `mse` even though it is the MAE.
# Kept so any later cell that still reads `mse` keeps working.
mse = mae
print(f'Mean Absolute Error: {mae}')

Mean Absolute Error: 0.29182347779734336


In [87]:
# Build the results table as a NEW frame. Plain `predictions = x_test` only
# aliases the test features, so adding columns would also add 'ticker',
# 'prediction' and 'actual' (the target) to x_test itself and break any re-run
# of model.predict(x_test).
# `tickers` and `y_test` are aligned on the index; `y_pred` is positional and
# in x_test row order.
predictions = (
    x_test
    .assign(ticker=tickers, prediction=y_pred, actual=y_test)
    .sort_values(by='prediction', ascending=False)
)

predictions.head(n=10)

Unnamed: 0,purchase_volume,purchase_count,purchase_count_individual,purchase_days_ago,purchase_confidence,sale_volume,sale_count,sale_count_individual,sale_days_ago,sale_confidence,volume_net,ticker,prediction,actual
1533,0.0,0,0,,0.0,0.25,1,1,15.0,-0.955802,-0.25,ALGT,3.138791,3.21
1793,0.0,0,0,,0.0,0.25,1,1,98.0,-0.57163,-0.25,CENT,2.834875,1.12
1775,0.0,0,0,,0.0,0.25,1,1,97.0,-0.57163,-0.25,CNI,2.82865,1.05
1175,0.0,0,0,,0.0,0.5,2,1,48.0,-0.955802,-0.5,NUE,2.38684,2.29
4232,0.25,1,1,24.0,1.107411,0.0,0,0,,0.0,0.25,CTLT,2.175083,0.86
1299,0.0,0,0,,0.0,0.5,2,1,120.0,-0.631053,-0.5,BP,2.0477,1.19
1429,0.0,0,0,,0.0,0.25,1,1,89.0,-0.955802,-0.25,ENTG,2.03583,2.44
1882,0.5,2,2,18.0,1.712563,0.0,0,0,,0.0,0.5,ADC,1.994246,1.18
1345,0.75,3,3,31.0,1.739138,0.25,1,1,28.0,-0.955802,0.5,MOS,1.965525,2.95
602,0.0,0,0,,0.0,1.566757,2,1,114.0,-0.051,-1.566757,PODD,1.943526,2.35
