In [1]:
# Importing necessary packages
import os
import numpy as np
import pandas as pd
import logging
import warnings
import joblib
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import csv

from sklearn.metrics import classification_report, roc_curve, accuracy_score, confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score, precision_recall_curve
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import RobustScaler
import xgboost as xgb
import matplotlib.pyplot as plt

import sqlite3

# Setting up options and ignoring warnings
# Show every row/column and print floats in plain (non-scientific) notation.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:f}'.format)
# NOTE(review): this silences *all* warnings for the whole kernel session.
warnings.filterwarnings('ignore')

# Setting up logging with a FileHandler
log_file_path = 'classification_log.txt'
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S')

logger = logging.getLogger(__name__)

# Re-running this cell must not stack another FileHandler on the same logger,
# otherwise every record is written to the log file once per execution.
# FileHandler stores the absolute path of its target in ``baseFilename``.
_log_file_abspath = os.path.abspath(log_file_path)
_already_attached = any(
    isinstance(handler, logging.FileHandler)
    and handler.baseFilename == _log_file_abspath
    for handler in logger.handlers
)

if not _already_attached:
    file_handler = logging.FileHandler(log_file_path)
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
    logger.addHandler(file_handler)

In [2]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce its memory usage.

    Rules applied per column:

    * object (and pandas string) columns are converted to ``category``;
    * signed / unsigned integer columns are cast to the narrowest integer
      type of the same signedness whose range contains the column's
      min and max (bounds inclusive);
    * floating-point columns wider than 32 bits are cast to ``float32``
      when their min and max fit in the float32 range. float32 carries
      fewer significant digits than float64, so values may be rounded;
    * every other dtype (bool, datetime, categorical, nullable extension
      types, ...) is left untouched;
    * a column is never widened, and empty / all-NaN columns are left as is.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after the dtype changes.

    The memory usage before and after, and the relative reduction, are
    printed to stdout.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_candidates = (np.int8, np.int16, np.int32, np.int64)
    unsigned_candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
    float32_info = np.finfo(np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Text-like columns become categoricals.
        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer / float columns are downcast; min()/max()
        # comparisons against numeric limits are meaningless (or raise) for
        # bool, datetime, categorical and extension dtypes.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'u', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        # Empty or all-NaN column: nothing to base a decision on.
        if pd.isna(c_min) or pd.isna(c_max):
            continue

        if col_type.kind == 'f':
            fits_float32 = float32_info.min <= c_min and c_max <= float32_info.max
            # itemsize check: never widen (e.g. float16 -> float32).
            if fits_float32 and col_type.itemsize > np.dtype(np.float32).itemsize:
                df[col] = df[col].astype(np.float32)
            continue

        # Integer column: compare as Python ints so the bounds check cannot
        # overflow regardless of the column's own width.
        lowest, highest = int(c_min), int(c_max)
        candidates = signed_candidates if col_type.kind == 'i' else unsigned_candidates
        for candidate in candidates:
            limits = np.iinfo(candidate)
            if limits.min <= lowest and highest <= limits.max:
                if np.dtype(candidate).itemsize < col_type.itemsize:
                    df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a zero-sized starting frame.
    decreased_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decreased_pct))

    return df


In [3]:
def add_df_panel(df,timestamp_col,timestamp_back):
    """Attach to each row the row found ``timestamp_back`` earlier.

    For every row with timestamp ``t`` the frame is self-joined (inner join)
    against the row whose timestamp equals ``t - timestamp_back``. Rows with
    no such earlier row are dropped by the inner join.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    timestamp_col : str
        Name of the timestamp column in ``df``.
    timestamp_back : same type as a difference of ``df[timestamp_col]`` values
        Offset subtracted from each timestamp to find the earlier row.

    Returns
    -------
    pandas.DataFrame
        Sorted by ``timestamp_col``. Columns are ``timestamp_col`` (the
        current row's timestamp) followed by every column of ``df`` with a
        ``'_y'`` suffix holding the earlier row's values (including
        ``timestamp_col + '_y'``).
    """
    shifted_col = timestamp_col + '_A'

    # Left side: only the join key is needed. Working on a copy keeps the
    # caller's frame free of the temporary '<timestamp>_A' helper column.
    left = df[[timestamp_col]].copy()
    left[shifted_col] = left[timestamp_col] - timestamp_back

    # Right side: the earlier rows, explicitly suffixed so the timestamp
    # column is retained by name rather than by its position in ``df``.
    right = df.drop(columns=[shifted_col], errors='ignore').add_suffix('_y')

    final = pd.merge(left, right, left_on=shifted_col, right_on=timestamp_col + '_y')
    final = final.sort_values(by=[timestamp_col])

    return final.drop(columns=[shifted_col])

In [4]:
def create_df_panel_parquet(asset_id,start,back_window,increment,timestamp_col,source_df=None):
    """Build lagged feature panels for one asset and write them to parquet.

    For each lag index ``x`` in ``start..back_window`` (inclusive) the rows of
    the selected asset are joined, via ``add_df_panel``, with the rows found
    ``increment * x`` earlier. The lagged columns get a zero-padded
    ``_<x>`` suffix (6 digits) and are left-merged onto the asset's frame on
    ``timestamp_col``.

    Parameters
    ----------
    asset_id
        Value matched against the ``"Asset_ID"`` column.
    start, back_window : int
        First and last lag index (both inclusive).
    increment
        Timestamp distance of one lag step.
    timestamp_col : str
        Name of the timestamp column.
    source_df : pandas.DataFrame, optional
        Frame to read from. When omitted, the module-level ``train_df`` is
        used (it must already exist in the notebook's namespace).

    Returns
    -------
    str
        The output file name *without* the ``'.parquet.gzip'`` extension.
        The file itself is written to the current working directory.
    """
    if source_df is None:
        # Historical behaviour: read from the notebook-global frame.
        source_df = train_df

    df = source_df[source_df["Asset_ID"] == asset_id]
    col_names = df.columns

    for x in range(start,(back_window+1),1):

        time_back = increment * x

        print("Round:",x, "Time Back",time_back)

        new_panel_df = add_df_panel(df[col_names].copy(),timestamp_col,time_back)

        # Swap only the trailing '_y' merge suffix for the lag tag. A plain
        # substring replace would also rewrite '_y' occurring inside a column
        # name (e.g. 'daily_yield_y').
        lag_suffix = "_" + str(x).zfill(6)
        new_panel_df.columns = [
            name[:-len("_y")] + lag_suffix
            if name != timestamp_col and name.endswith("_y")
            else name
            for name in new_panel_df.columns
        ]

        df =  pd.merge(df, new_panel_df, how='left', left_on=timestamp_col, right_on=timestamp_col)

    df = df.sort_values(by=[timestamp_col])

    filename = 'AssetID_' + str(asset_id).zfill(6) + "_Start_" + str(start).zfill(6) + "_End_" + str(back_window).zfill(6)
    df.to_parquet(filename + '.parquet.gzip',compression='gzip')

    return filename

In [5]:
# Reading the dataset
dataset_path = "/kaggle/input/us-stock-market-2020-to-2024/US Stock Market Dataset.csv"

try:
    # Attempt to read the dataset
    df = pd.read_csv(dataset_path)
except FileNotFoundError:
    logger.error("Error: Dataset file not found. Please provide the correct file path.")
    # Re-raise: continuing would hit a NameError on `df` below, or silently
    # display a stale `df` left over from an earlier run of this cell.
    raise
except Exception as e:
    logger.error(f"An error occurred: {e}")
    raise
else:
    logger.info(f"Dataset loaded successfully from {dataset_path}")

# Preview the first rows as the cell's rich output.
df.head(5)

Unnamed: 0.1,Unnamed: 0,Date,Natural_Gas_Price,Natural_Gas_Vol.,Crude_oil_Price,Crude_oil_Vol.,Copper_Price,Copper_Vol.,Bitcoin_Price,Bitcoin_Vol.,Platinum_Price,Platinum_Vol.,Ethereum_Price,Ethereum_Vol.,S&P_500_Price,Nasdaq_100_Price,Nasdaq_100_Vol.,Apple_Price,Apple_Vol.,Tesla_Price,Tesla_Vol.,Microsoft_Price,Microsoft_Vol.,Silver_Price,Silver_Vol.,Google_Price,Google_Vol.,Nvidia_Price,Nvidia_Vol.,Berkshire_Price,Berkshire_Vol.,Netflix_Price,Netflix_Vol.,Amazon_Price,Amazon_Vol.,Meta_Price,Meta_Vol.,Gold_Price,Gold_Vol.
0,0,2/2/2024,2.079,,72.28,,3.8215,,43194.7,42650,901.6,,2309.28,246890,4958.61,17642.73,315620000.0,185.85,102550000,187.91,110610000,411.22,28260000,22.796,,142.38,62500000,661.6,47660000,589498,10580,564.64,4030000,171.81,117220000,474.99,84710000,2053.7,
1,1,1/2/2024,2.05,161340.0,73.82,577940.0,3.8535,,43081.4,47690,922.3,,2304.28,323610,4906.19,17344.71,240640000.0,186.86,53490000,188.86,90680000,403.78,29230000,23.236,85160.0,141.16,37120000,630.27,36020000,581600,9780,567.51,3150000,159.28,66360000,394.78,25140000,2071.1,260920.0
2,2,31-01-2024,2.1,142860.0,75.85,344490.0,3.906,,42580.5,56480,932.6,,2283.14,408790,4848.87,17137.24,366450000.0,184.4,54830000,187.29,102270000,397.58,46780000,23.169,66910.0,140.1,71370000,615.27,45070000,578020,9720,564.11,4830000,155.2,49690000,390.14,20010000,2067.4,238370.0
3,3,30-01-2024,2.077,139750.0,77.82,347240.0,3.911,,42946.2,55130,931.7,,2343.11,387120,4924.97,17476.71,236210000.0,188.04,55270000,191.59,105540000,408.59,29340000,23.225,53370.0,151.46,33060000,627.74,39600000,584680,9750,562.85,6120000,159.0,42290000,400.06,18610000,2050.9,214590.0
4,4,29-01-2024,2.49,3590.0,76.78,331930.0,3.879,,43299.8,45230,938.3,,2317.79,318840,4927.93,17596.27,238750000.0,191.73,46890000,190.93,123600000,409.72,23290000,23.134,330.0,153.51,27590000,624.65,33900000,578800,13850,575.79,6880000,161.26,42840000,401.02,17790000,2034.9,1780.0


In [6]:
# Disabled code: the loop below is wrapped in a string literal, so it is never
# executed. It would call create_df_panel_parquet once per window of
# `file_step_size` lags. The names start, back_window, file_step_size,
# asset_id, increment and timestamp_col are not defined in the visible cells.
# TODO: define them before re-enabling, or delete this cell.
"""
for x in range(start,back_window,file_step_size):
    print("======================================================================================================")
    print("Start",(x+1),"End",(x+file_step_size))
    save_filename = create_df_panel_parquet(asset_id,(x+1),(x+file_step_size),increment,timestamp_col)
"""



In [7]:
# Disabled code: this string literal is only echoed back as the cell's output;
# the loop inside never runs. It would call create_df_panel_parquet for every
# id in `investment_id_ls`. `tqdm` is not imported in the visible import cell
# and investment_id_ls / start / back_window / file_step_size / increment /
# timestamp_col are not defined in the visible cells.
# TODO: confirm before re-enabling, or delete this cell.
"""
for investment_id in tqdm(investment_id_ls, desc = 'Progress Bar: Creating Files'):    
    for x in range(start,back_window,file_step_size):        
        save_filename = create_df_panel_parquet(investment_id,(x+1),(x+file_step_size),increment,timestamp_col)
"""

"\nfor investment_id in tqdm(investment_id_ls, desc = 'Progress Bar: Creating Files'):    \n    for x in range(start,back_window,file_step_size):        \n        save_filename = create_df_panel_parquet(investment_id,(x+1),(x+file_step_size),increment,timestamp_col)\n"

In [8]:
# Disabled code: this string literal is only echoed back as the cell's output;
# nothing inside it runs. The loop would read each parquet file in `fileList`
# (not defined in the visible cells), keep the first frame whole, drop the
# listed base columns from the later ones, and inner-merge everything on
# 'timestamp'.
# NOTE(review): `%%time` is a cell magic and only works as the first line of a
# cell, so this text cannot be re-enabled by just removing the quotes.
"""
%%time
first_flag = 1
for filename in fileList:
    print(filename)
    df = pd.read_parquet(filename, engine='pyarrow')
    
    if first_flag == 1:
        final = df.copy()
        first_flag = 0
    else:
        df = df.drop(['Asset_ID','Count','Open','High','Low','Close','Volume','VWAP','Target'],axis=1,inplace=False)
        final = pd.merge(final, df, left_on='timestamp', right_on='timestamp')
final = final.sort_values(by=['timestamp'])
"""

"\n%%time\nfirst_flag = 1\nfor filename in fileList:\n    print(filename)\n    df = pd.read_parquet(filename, engine='pyarrow')\n    \n    if first_flag == 1:\n        final = df.copy()\n        first_flag = 0\n    else:\n        df = df.drop(['Asset_ID','Count','Open','High','Low','Close','Volume','VWAP','Target'],axis=1,inplace=False)\n        final = pd.merge(final, df, left_on='timestamp', right_on='timestamp')\nfinal = final.sort_values(by=['timestamp'])\n"