In [1]:
import json
import pandas as pd
from pprint import pprint
from tqdm.auto import tqdm

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from datetime import datetime
from sklearn.model_selection import train_test_split, GridSearchCV
import joblib


import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

from src.stockmarket import StockHistory
from src.date_tools import days_from_date


# Never truncate columns when a DataFrame is displayed (None = no limit).
pd.set_option('display.max_columns', None)

In [2]:
def load_json_metrics(
    file_path="./data/training_data/trading_metrics.json",
    output_path="./data/disclosures/stock_metrics.csv",
    horizon_days=365,
    start_date="2012-01-01",
):
    """Build the training table: scoring metrics plus a forward price-change target.

    Reads the ``scoring_metrics`` records from the JSON file at ``file_path``,
    and for every row looks up the ticker's price on the row's ``date`` and on
    the date ``horizon_days`` later. The ratio ``future / current`` (rounded to
    two decimals) is stored in a new ``price_change`` column. Rows for which
    either price could not be used are dropped, and the result is written to
    ``output_path`` as CSV.

    Args:
        file_path: JSON file containing a top-level ``scoring_metrics`` list.
        output_path: Destination CSV. Pass ``None`` (or an empty string) to
            skip writing the file.
        horizon_days: Number of days between the current and the future price.
        start_date: First date (``YYYY-MM-DD``) requested from ``StockHistory``;
            the end date is always today.

    Returns:
        pandas.DataFrame with the original metric columns plus ``price_change``,
        re-indexed from 0, containing only rows with a computed target.
    """
    with open(file_path, 'r') as file:
        data = json.load(file)

    df = pd.DataFrame(data['scoring_metrics'])
    stock_history = StockHistory(
        start_date=start_date,
        end_date=datetime.now().date().strftime("%Y-%m-%d"),
    )

    deltas = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        date = row['date']
        future_date = days_from_date(date_str=date, days=horizon_days)
        current_price = stock_history.price(ticker=row['ticker'], date_str=date)
        future_price = stock_history.price(ticker=row['ticker'], date_str=future_date)

        # Truthiness check on purpose: it rejects a missing price (presumably
        # None -- confirm against StockHistory.price) and also a zero price,
        # which would otherwise divide by zero or yield a meaningless ratio.
        if future_price and current_price:
            deltas.append(round(future_price / current_price, 2))
        else:
            deltas.append(None)

    df['price_change'] = deltas
    # Remove any rows that do not have price_change
    df = df.dropna(subset=['price_change']).reset_index(drop=True)
    if output_path:
        df.to_csv(output_path, index=False)
    return df


# Rebuild the dataset from the raw JSON metrics. The trailing comment keeps the
# alternative of reloading the CSV that load_json_metrics() writes on each run.
df = load_json_metrics() #pd.read_csv('./data/disclosures/stock_metrics.csv')
# Report (rows, columns), then preview the first rows as the cell output.
print(df.shape)
df.head()

  0%|          | 0/10924 [00:00<?, ?it/s]

(10487, 19)


Unnamed: 0,ticker,purchase_volume,purchase_speculation,purchase_count,purchase_count_individual,purchase_days_ago,purchase_owner,purchase_confidence,sale_volume,sale_speculation,sale_count,sale_count_individual,sale_days_ago,sale_owner,sale_confidence,date,volume_net,score,price_change
0,DIS,18.0,0,16,8,44.06,"[Thomas R Carper, Ron L Wyden, Earl Blumenauer...",1.828125,6.146707,0,6,6,31.83,"[Steve Cohen, K. Michael Conaway, Gilbert Cisn...",-0.043,2020-04-15,11.853293,43.23,1.8
1,MSFT,18.864858,125,14,5,39.86,"[Thomas H Tuberville, Josh Gottheimer, Daniel ...",1.485,9.910082,0,8,4,52.62,"[Shelley M Capito, Thomas H Tuberville, Kathy ...",-0.024923,2022-12-01,8.954777,40.26,1.48
2,AMZN,17.0,0,14,12,47.21,"[John Curtis, Ron L Wyden, Dean Phillips, Susa...",1.828125,4.0,0,4,4,56.0,"[Gilbert Cisneros, John Curtis, Cheri Bustos, ...",-0.071,2020-04-15,13.0,39.35,1.46
3,AMZN,20.9016,0,18,10,62.89,"[John Curtis, Ron L Wyden, Dean Phillips, Susa...",1.618,5.0,0,5,4,60.2,"[Gilbert Cisneros, Cheri Bustos, Donald Sterno...",-0.071,2020-06-14,15.9016,33.95,1.33
4,AAPL,18.365672,0,18,9,55.5,"[Dean Phillips, Josh Gottheimer, Katherine M. ...",1.618,13.0,0,12,11,61.92,"[Mikie Sherrill, John B. Larson, Kim Schrier, ...",-0.064337,2020-04-15,5.365672,33.61,1.91


In [4]:
# Feature engineering: a missing "days ago" value is encoded with the sentinel -1.
for days_col in ('sale_days_ago', 'purchase_days_ago'):
    df[days_col] = df[days_col].fillna(-1)

# Indicator flags: 1 when a days-ago value was present, 0 when it was the sentinel.
df['purchase_occurred'] = df['purchase_days_ago'].ne(-1).astype(int)
df['sale_occurred'] = df['sale_days_ago'].ne(-1).astype(int)

# Target is the price-change ratio; the listed columns are excluded from the features.
non_feature_columns = ['date', 'purchase_owner', 'sale_owner', 'sale_speculation', 'purchase_speculation', 'price_change']
y = df['price_change']
x = df.drop(columns=non_feature_columns)

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Keep the test tickers for reporting, then remove the ticker label from both matrices.
tickers = x_test['ticker']
x_train, x_test = (frame.drop(columns=['ticker']) for frame in (x_train, x_test))

x_train.head()



Unnamed: 0,purchase_volume,purchase_count,purchase_count_individual,purchase_days_ago,purchase_confidence,sale_volume,sale_count,sale_count_individual,sale_days_ago,sale_confidence,volume_net,score,purchase_occurred,sale_occurred
6341,0.0,0,0,-1.0,0.0,2.0,2,1,35.0,-0.153939,-2.0,0.47,0,1
4170,1.0,1,1,23.0,0.811818,0.0,0,0,-1.0,0.0,1.0,1.43,1,0
8299,0.0,0,0,-1.0,0.0,1.0,1,1,106.0,-0.478023,-1.0,0.12,0,1
5816,0.0,0,0,-1.0,0.0,1.0,1,1,7.0,-0.298182,-1.0,0.62,0,1
10001,0.0,0,0,-1.0,0.0,2.0,1,1,83.0,0.038,-2.0,-0.05,0,1


In [5]:
# Build and train the gradient-boosting regressor in one step
# (fit returns the fitted estimator; the seed keeps training repeatable).
gbr = GradientBoostingRegressor(random_state=42).fit(x_train, y_train)

# Predict the held-out rows and score them with mean squared error.
y_pred = gbr.predict(x_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 0.18813464258146256


In [6]:
# Build the report on a NEW frame. The previous `predictions = x_test` was only
# an alias, so adding columns here also added them to x_test; re-running the
# model cell afterwards would then pass ticker/prediction/actual columns to
# gbr.predict. DataFrame.assign returns a copy and leaves x_test untouched.
top_n = 10  # number of highest-ranked predictions to summarise

predictions = x_test.assign(
    ticker=tickers,              # aligned on the shared row index
    prediction_growth=y_pred,    # array in x_test row order
    actual_growth=y_test,        # aligned on the shared row index
).sort_values(by='prediction_growth', ascending=False)

# Compare what the model expected from its top picks with what actually happened.
top_predictions = predictions.head(top_n)
avg_actual = top_predictions['actual_growth'].mean()
avg_prediction = top_predictions['prediction_growth'].mean()

print(f"Predicted Top 10 Growth: {avg_prediction}")
print(f"Actual Top 10 Growth: {avg_actual}")


print(predictions.head(n=15))

Predicted Top 10 Growth: 1.9675810141894048
Actual Top 10 Growth: 1.65
      purchase_volume  purchase_count  purchase_count_individual  \
6692         1.000000               1                          1   
483          6.000000               6                          3   
106          3.365672               3                          3   
5391         0.000000               0                          0   
5386         0.000000               0                          0   
1660         1.000000               1                          1   
3101         1.000000               1                          1   
5507         0.000000               0                          0   
7247         0.000000               0                          0   
4771         1.000000               1                          1   
216          6.146707               6                          3   
3014         1.000000               1                          1   
5538         1.000000               1        

In [7]:

# Persist the fitted regressor to disk with joblib.
joblib.dump(gbr,'./data/models/gradient_boosting_regressor.joblib')

['./data/models/gradient_boosting_regressor.joblib']