In [1]:
from datetime import datetime, timedelta
import joblib
import json

import pandas as pd
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import mean_absolute_error, mean_squared_error

from src.stockmarket import StockHistory
from src.date_tools import days_from_date
from src.models.models import model_predict


# Show every column when a DataFrame is displayed (None removes the column cap).
pd.options.display.max_columns = None

In [16]:
import random
# Backtest of the model's top-N picks.
#
# For each start date (2018-2023, days 1/6/11/16/21/26 of every month) an
# XGBoost regressor is trained on every row OUTSIDE a 120-day window and scored
# on the rows INSIDE it. Per year we report the mean actual `price_change` of
# the `top` highest-predicted rows versus the mean over all rows in the window.
#
# NOTE(review): the training mask keeps rows dated AFTER the test window, so the
# model is fitted on data from the future relative to the evaluated period.
# Confirm this look-ahead is intended before trusting the numbers.

DATA_PATH = './data/training_data/stock_metrics.csv'
WINDOW_DAYS = 120          # length of each held-out test window
top = 5                    # number of highest-predicted rows treated as "picks"
percentage = True          # report yearly figures as percentages
DROP_COLUMNS = ['date', 'purchase_owner', 'sale_owner', 'sale_speculation',
                'purchase_speculation', 'price_change', 'ticker']
DAYS_AGO_COLUMNS = ['sale_days_ago', 'purchase_days_ago']

benefits = []   # per-year benefit of the top picks over the window baseline
gains = []      # raw actual `price_change` of every top pick, across all windows

# Load and preprocess once: none of this depends on the window being evaluated.
df = pd.read_csv(DATA_PATH)
df['date'] = pd.to_datetime(df['date'])
df[DAYS_AGO_COLUMNS] = df[DAYS_AGO_COLUMNS].fillna(-1)  # -1 marks "no such event"

for year in range(2018, 2024):
    pred_count = []
    tracks = []
    tracks_yearly = []
    tracks_pred = []
    maes = []
    # 2023 is only evaluated for January-March
    # (presumably where the data ends - TODO confirm).
    stop_month = 4 if year == 2023 else 13

    for month in range(1, stop_month):
        for day in range(1, 28, 5):
            start_date = pd.Timestamp(year=year, month=month, day=day)
            end_date = start_date + timedelta(days=WINDOW_DAYS)

            # Split into the held-out window and everything else.
            train_df = df[(df['date'] < start_date) | (df['date'] > end_date)].copy()
            test_df = df[(df['date'] >= start_date) & (df['date'] <= end_date)].copy()

            # A window without rows on either side cannot be fitted or scored;
            # skip it instead of crashing in fit() / mean_absolute_error().
            if train_df.empty or test_df.empty:
                continue

            # Create Training Data
            x_train = train_df.drop(columns=DROP_COLUMNS)
            y_train = train_df['price_change']

            # Create Testing Data
            x_test = test_df.drop(columns=DROP_COLUMNS)
            y_test = test_df['price_change']

            xg_reg = xgb.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3, learning_rate=0.1, max_depth=5, alpha=10, n_estimators=100, random_state=42)
            xg_reg.fit(x_train, y_train)
            y_pred = xg_reg.predict(x_test)

            # Rank the window's rows by predicted growth, best first.
            predictions = test_df.copy()
            predictions['prediction_growth'] = y_pred
            predictions['actual_growth'] = y_test
            predictions = predictions.sort_values(by='prediction_growth', ascending=False)
            predictions = predictions.reset_index()

            top_picks = predictions.head(top)

            gains.extend(top_picks['actual_growth'].to_list())
            pred_count.append(len(predictions))

            avg_actual = top_picks['actual_growth'].mean()
            avg_prediction = top_picks['prediction_growth'].mean()
            avg_window = predictions['actual_growth'].mean()  # baseline: all rows in window
            mae = mean_absolute_error(top_picks['actual_growth'], top_picks['prediction_growth'])

            tracks.append(avg_actual)
            tracks_yearly.append(avg_window)
            tracks_pred.append(avg_prediction)
            maes.append(mae)

    if not tracks:
        print(f"Year: {year} - no window with both training and test data, skipped\n")
        continue

    avg = sum(tracks) / len(tracks)
    avg_pred = sum(tracks_pred) / len(tracks_pred)
    # Average the baseline over ALL windows of the year so it is comparable with
    # `avg`; previously only the last window's baseline was used.
    avg_yearly = sum(tracks_yearly) / len(tracks_yearly)
    benefit = avg - avg_yearly

    if percentage:
        # The "- 1" treats the values as growth ratios where 1.0 means no change.
        avg = round((avg - 1) * 100, 2)
        avg_pred = round((avg_pred - 1) * 100, 2)
        avg_yearly = round((avg_yearly - 1) * 100, 2)
        benefit = round(benefit * 100, 2)

    maes_avg = sum(maes) / len(maes)
    avg_num_pred = int(sum(pred_count) / len(pred_count))
    print(f"Year: {year}")
    print(f"Test Size: {avg_num_pred} samples")
    print(f"Average Actual (top-{top}): {avg}%")
    #print(f"Average Prediction (top-{top}): {avg_pred}%")
    print(f"Average Yearly: {avg_yearly}%")
    #print(f"Average MAE: {maes_avg} (top-{top})")
    print(f"Top {top} Benefit: {benefit}%\n")
    benefits.append(benefit)

print("- - FINAL RESULTS - -")
if benefits and gains:
    # Units differ: benefit is in percent when `percentage` is True, while gain
    # is the raw mean of the picks' `price_change` values.
    print(f"Average Benefit: {sum(benefits) / len(benefits)}")
    print(f"Average Gain: {sum(gains) / len(gains)}")
else:
    print("No results: no window contained both training and test data.")
    
    


In [360]:
# predictions = x_test
# predictions['ticker'] = tickers
# predictions['prediction_growth'] = y_pred
# predictions['actual_growth'] = y

# predictions = predictions.sort_values(by='prediction_growth', ascending=False)
# predictions = predictions.reset_index()

# top = 10
# top_ten = predictions.head(top)
# for i,item in top_ten.iterrows():
#     print(f"Rank {i+1}: {item['ticker']}")
#     print(f"Prediction: {round(item['prediction_growth'], 2)}")
#     print(f"Actual: {item['actual_growth']}")
#     print('')
    

# avg_actual = predictions['actual_growth'].head(top).mean()
# avg_prediction = predictions['prediction_growth'].head(top).mean()

# print(f"Predicted Top 10 Growth: {avg_prediction}")
# print(f"Actual Top 10 Growth: {avg_actual}")


# # print(predictions.head(n=15))