In [1]:
# Importing necessary packages
import os
import numpy as np
import pandas as pd
import logging
import warnings
import joblib
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import csv

from sklearn.metrics import classification_report, roc_curve, accuracy_score, confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score, precision_recall_curve
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import RobustScaler
import xgboost as xgb
import matplotlib.pyplot as plt


# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides deprecation warnings — confirm that is intended.
warnings.filterwarnings('ignore')

# Setting up logging: root configuration via basicConfig plus a FileHandler
# attached to this notebook's own logger.
log_file_path = 'logger_log.txt'
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S')

logger = logging.getLogger(__name__)

# Re-running this cell must not stack a second FileHandler on the same logger
# (every record would then be written to the file twice), so detach and close
# any handler that already targets log_file_path before attaching a fresh one.
for stale_handler in list(logger.handlers):
    if isinstance(stale_handler, logging.FileHandler) and stale_handler.baseFilename == os.path.abspath(log_file_path):
        logger.removeHandler(stale_handler)
        stale_handler.close()

file_handler = logging.FileHandler(log_file_path)
file_handler.setLevel(logging.INFO)
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logger.addHandler(file_handler)

In [2]:
def reduce_mem_usage(df):
    """Reduce a DataFrame's memory usage by narrowing column dtypes.

    The frame is modified in place and also returned.

    Per column:
      * signed / unsigned integer columns are cast to the narrowest integer
        type of the same signedness whose range contains the column's min and
        max (bounds are inclusive, and a column is never widened);
      * float columns wider than 32 bits are cast to float32 when their min
        and max fit the float32 range (this can lose precision), otherwise to
        float64;
      * object and pandas string columns are cast to 'category';
      * every other dtype (bool, datetime, timedelta, category, nullable
        extension types, ...) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_candidates = (np.int8, np.int16, np.int32, np.int64)
    unsigned_candidates = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Extension dtypes cannot be range-checked with iinfo/finfo and may
        # hold missing values that a plain numpy cast would reject.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind in ('i', 'u'):
            c_min = df[col].min()
            c_max = df[col].max()
            candidates = signed_candidates if col_type.kind == 'i' else unsigned_candidates
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # For an empty column min/max are NaN, every comparison is
                # False, and the column keeps its dtype.
                if limits.min <= c_min and c_max <= limits.max:
                    if np.dtype(candidate).itemsize < col_type.itemsize:
                        df[col] = df[col].astype(candidate)
                    break
        elif col_type.kind == 'f':
            if col_type.itemsize <= 4:
                continue  # already float32 or narrower; never widen
            c_min = df[col].min()
            c_max = df[col].max()
            limits = np.finfo(np.float32)
            # float32 is the narrowest float used here; float16 is
            # deliberately not a target.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio so a zero-byte frame does not print 'nan%'.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df


In [3]:
def add_df_panel(df,timestamp_col,timestamp_back):
    """Self-join a frame against a shifted copy of its own timestamp column.

    For every row, the value ``df[timestamp_col] - timestamp_back`` is looked
    up in ``timestamp_col`` of the same frame (inner join). The matching row's
    columns are attached with a ``'_y'`` suffix.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. It is not modified.
    timestamp_col : str
        Name of the column used as the join key. It may sit at any position
        in ``df``.
    timestamp_back : scalar
        Offset subtracted from ``timestamp_col`` to build the lookup key; it
        must support ``df[timestamp_col] - timestamp_back``.

    Returns
    -------
    pandas.DataFrame
        Columns are ``timestamp_col`` followed by every column of ``df`` with
        ``'_y'`` appended, in their original order. Rows are sorted by
        ``timestamp_col``; rows without a match are absent.
    """
    shift_col = timestamp_col + '_A'

    # Left side: only the key plus the shifted lookup value. Working on a copy
    # keeps the helper column out of the caller's frame.
    left = df[[timestamp_col]].copy()
    left[shift_col] = left[timestamp_col] - timestamp_back

    # Right side: every original column tagged with '_y'. A helper column left
    # behind by an earlier call is excluded so it never leaks into the result.
    right = df.drop(columns=[shift_col], errors='ignore').add_suffix('_y')

    final = pd.merge(left, right, left_on=shift_col, right_on=timestamp_col + '_y')
    final = final.sort_values(by=[timestamp_col])

    return final.drop(columns=[shift_col])

In [4]:
def create_df_panel_parquet(asset_id,start,back_window,increment,timestamp_col,source_df=None):
    """Build lagged panel columns for one asset and write them to a parquet file.

    For each ``x`` in ``start .. back_window`` (inclusive) the asset's rows are
    self-joined via ``add_df_panel`` with an offset of ``increment * x``. The
    joined columns get a zero-padded ``_<x>`` suffix (six digits) and are
    left-merged onto the asset frame on ``timestamp_col``.

    Parameters
    ----------
    asset_id
        Value matched against the ``"Asset_ID"`` column.
    start, back_window : int
        First and last lag multiplier (both inclusive).
    increment
        Size of one lag step; multiplied by the lag multiplier.
    timestamp_col : str
        Name of the join-key column.
    source_df : pandas.DataFrame, optional
        Frame to read from. When omitted, the notebook-level ``train_df`` is
        used; a NameError is raised if that global does not exist.

    Returns
    -------
    str
        The file name stem. The data is written to ``<stem>.parquet.gzip`` in
        the current working directory.
    """
    if source_df is None:
        # Backward-compatible default: the global frame earlier versions read.
        source_df = train_df

    df = source_df[source_df["Asset_ID"] == asset_id]
    col_names = df.columns

    for x in range(start,(back_window+1),1):

        time_back = increment * x

        print("Round:",x, "Time Back",time_back)

        new_panel_df = add_df_panel(df[col_names].copy(),timestamp_col,time_back)

        # Swap only a trailing '_y' for the lag tag, and never touch the join
        # key: replacing '_y' anywhere in a name would corrupt columns that
        # merely contain that substring.
        lag_suffix = "_" + str(x).zfill(6)
        new_panel_df = new_panel_df.rename(columns={
            name: name[:-len("_y")] + lag_suffix
            for name in new_panel_df.columns
            if isinstance(name, str) and name != timestamp_col and name.endswith("_y")
        })
        df = pd.merge(df, new_panel_df, how='left', on=timestamp_col)

    df = df.sort_values(by=[timestamp_col])

    filename = 'AssetID_' + str(asset_id).zfill(6) + "_Start_" + str(start).zfill(6) + "_End_" + str(back_window).zfill(6)
    df.to_parquet(filename + '.parquet.gzip',compression='gzip')

    return filename

In [5]:
# Reading the dataset
dataset_path = "/kaggle/input/us-stock-market-2020-to-2024/US Stock Market Dataset.csv"

try:
    # Attempt to read the dataset
    df = pd.read_csv(dataset_path)
except FileNotFoundError:
    logger.error("Error: Dataset file not found. Please provide the correct file path.")
    # Re-raise: continuing would either fail later with a NameError on `df`
    # or silently reuse a stale `df` from a previous run of this cell.
    raise
except Exception as e:
    logger.error(f"An error occurred: {e}")
    raise
else:
    logger.info(f"Dataset loaded successfully from {dataset_path}")

df

Unnamed: 0.1,Unnamed: 0,Date,Natural_Gas_Price,Natural_Gas_Vol.,Crude_oil_Price,Crude_oil_Vol.,Copper_Price,Copper_Vol.,Bitcoin_Price,Bitcoin_Vol.,...,Berkshire_Price,Berkshire_Vol.,Netflix_Price,Netflix_Vol.,Amazon_Price,Amazon_Vol.,Meta_Price,Meta_Vol.,Gold_Price,Gold_Vol.
0,0,2/2/2024,2.079,,72.28,,3.8215,,43194.70,42650,...,589498,10580,564.64,4030000,171.81,117220000,474.99,84710000,2053.70,
1,1,1/2/2024,2.050,161340.0,73.82,577940.0,3.8535,,43081.40,47690,...,581600,9780,567.51,3150000,159.28,66360000,394.78,25140000,2071.10,260920.0
2,2,31-01-2024,2.100,142860.0,75.85,344490.0,3.9060,,42580.50,56480,...,578020,9720,564.11,4830000,155.20,49690000,390.14,20010000,2067.40,238370.0
3,3,30-01-2024,2.077,139750.0,77.82,347240.0,3.9110,,42946.20,55130,...,584680,9750,562.85,6120000,159.00,42290000,400.06,18610000,2050.90,214590.0
4,4,29-01-2024,2.490,3590.0,76.78,331930.0,3.8790,,43299.80,45230,...,578800,13850,575.79,6880000,161.26,42840000,401.02,17790000,2034.90,1780.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1008,1008,8/1/2020,2.141,242560.0,59.61,1210000.0,2.8120,77700.0,8059.60,1190000,...,339188,190,339.26,7110000,94.60,70240000,215.22,13490000,1560.20,813410.0
1009,1009,7/1/2020,2.162,163010.0,62.70,582650.0,2.7935,59470.0,8155.70,1010000,...,338901,280,330.75,4740000,95.34,82680000,213.06,15110000,1574.30,435870.0
1010,1010,6/1/2020,2.135,154680.0,63.27,724240.0,2.7900,59570.0,7759.10,786750,...,340210,280,335.83,5670000,95.14,81310000,212.60,17070000,1568.80,558970.0
1011,1011,3/1/2020,2.130,144670.0,63.05,885860.0,2.7870,74750.0,7343.10,936290,...,339155,220,325.90,3800000,93.75,75330000,208.67,11200000,1552.40,436740.0


In [6]:
# The 'Date' column mixes formats (e.g. '2/2/2024' next to '31-01-2024').
# Earlier attempts to normalise it were left disabled: pd.to_datetime with
# errors='coerce', replacing '/' with '-', strftime('%m-%d-%Y') padding and an
# explicit format='%d-%m-%y'. The panel below is keyed on the row counter
# instead.
# TODO: parse 'Date' into a real datetime column.

# Give the first column (the row counter) a usable name
df = df.rename({df.columns[0]: 'date_index'}, axis='columns')

# Join every row with the row whose date_index is one smaller.
# NOTE(review): the displayed data appears to be ordered newest-first, so
# date_index - 1 would point at a later date — confirm the lag direction.
big_df = add_df_panel(df, timestamp_col='date_index', timestamp_back=1)
big_df

Unnamed: 0,date_index,date_index_y,Date_y,Natural_Gas_Price_y,Natural_Gas_Vol._y,Crude_oil_Price_y,Crude_oil_Vol._y,Copper_Price_y,Copper_Vol._y,Bitcoin_Price_y,...,Berkshire_Price_y,Berkshire_Vol._y,Netflix_Price_y,Netflix_Vol._y,Amazon_Price_y,Amazon_Vol._y,Meta_Price_y,Meta_Vol._y,Gold_Price_y,Gold_Vol._y
0,1,0,2/2/2024,2.079,,72.28,,3.8215,,43194.70,...,589498,10580,564.64,4030000,171.81,117220000,474.99,84710000,2053.70,
1,2,1,1/2/2024,2.050,161340.0,73.82,577940.0,3.8535,,43081.40,...,581600,9780,567.51,3150000,159.28,66360000,394.78,25140000,2071.10,260920.0
2,3,2,31-01-2024,2.100,142860.0,75.85,344490.0,3.9060,,42580.50,...,578020,9720,564.11,4830000,155.20,49690000,390.14,20010000,2067.40,238370.0
3,4,3,30-01-2024,2.077,139750.0,77.82,347240.0,3.9110,,42946.20,...,584680,9750,562.85,6120000,159.00,42290000,400.06,18610000,2050.90,214590.0
4,5,4,29-01-2024,2.490,3590.0,76.78,331930.0,3.8790,,43299.80,...,578800,13850,575.79,6880000,161.26,42840000,401.02,17790000,2034.90,1780.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1007,1008,1007,9/1/2020,2.166,230030.0,59.56,750930.0,2.8020,53120.0,7842.40,...,343263,280,335.66,4720000,95.05,63500000,218.30,12680000,1554.30,372880.0
1008,1009,1008,8/1/2020,2.141,242560.0,59.61,1210000.0,2.8120,77700.0,8059.60,...,339188,190,339.26,7110000,94.60,70240000,215.22,13490000,1560.20,813410.0
1009,1010,1009,7/1/2020,2.162,163010.0,62.70,582650.0,2.7935,59470.0,8155.70,...,338901,280,330.75,4740000,95.34,82680000,213.06,15110000,1574.30,435870.0
1010,1011,1010,6/1/2020,2.135,154680.0,63.27,724240.0,2.7900,59570.0,7759.10,...,340210,280,335.83,5670000,95.14,81310000,212.60,17070000,1568.80,558970.0


In [7]:
# List the column labels of big_df
big_df.columns

Index(['date_index', 'date_index_y', 'Date_y', 'Natural_Gas_Price_y',
       'Natural_Gas_Vol._y', 'Crude_oil_Price_y', 'Crude_oil_Vol._y',
       'Copper_Price_y', 'Copper_Vol._y', 'Bitcoin_Price_y', 'Bitcoin_Vol._y',
       'Platinum_Price_y', 'Platinum_Vol._y', 'Ethereum_Price_y',
       'Ethereum_Vol._y', 'S&P_500_Price_y', 'Nasdaq_100_Price_y',
       'Nasdaq_100_Vol._y', 'Apple_Price_y', 'Apple_Vol._y', 'Tesla_Price_y',
       'Tesla_Vol._y', 'Microsoft_Price_y', 'Microsoft_Vol._y',
       'Silver_Price_y', 'Silver_Vol._y', 'Google_Price_y', 'Google_Vol._y',
       'Nvidia_Price_y', 'Nvidia_Vol._y', 'Berkshire_Price_y',
       'Berkshire_Vol._y', 'Netflix_Price_y', 'Netflix_Vol._y',
       'Amazon_Price_y', 'Amazon_Vol._y', 'Meta_Price_y', 'Meta_Vol._y',
       'Gold_Price_y', 'Gold_Vol._y'],
      dtype='object')

In [8]:
"""
for x in range(start,back_window,file_step_size):
    print("======================================================================================================")
    print("Start",(x+1),"End",(x+file_step_size))
    save_filename = create_df_panel_parquet(asset_id,(x+1),(x+file_step_size),increment,timestamp_col)
"""



In [9]:
"""
for investment_id in tqdm(investment_id_ls, desc = 'Progress Bar: Creating Files'):    
    for x in range(start,back_window,file_step_size):        
        save_filename = create_df_panel_parquet(investment_id,(x+1),(x+file_step_size),increment,timestamp_col)
"""

"\nfor investment_id in tqdm(investment_id_ls, desc = 'Progress Bar: Creating Files'):    \n    for x in range(start,back_window,file_step_size):        \n        save_filename = create_df_panel_parquet(investment_id,(x+1),(x+file_step_size),increment,timestamp_col)\n"

In [10]:
"""
%%time
first_flag = 1
for filename in fileList:
    print(filename)
    df = pd.read_parquet(filename, engine='pyarrow')
    
    if first_flag == 1:
        final = df.copy()
        first_flag = 0
    else:
        df = df.drop(['Asset_ID','Count','Open','High','Low','Close','Volume','VWAP','Target'],axis=1,inplace=False)
        final = pd.merge(final, df, left_on='timestamp', right_on='timestamp')
final = final.sort_values(by=['timestamp'])
"""

"\n%%time\nfirst_flag = 1\nfor filename in fileList:\n    print(filename)\n    df = pd.read_parquet(filename, engine='pyarrow')\n    \n    if first_flag == 1:\n        final = df.copy()\n        first_flag = 0\n    else:\n        df = df.drop(['Asset_ID','Count','Open','High','Low','Close','Volume','VWAP','Target'],axis=1,inplace=False)\n        final = pd.merge(final, df, left_on='timestamp', right_on='timestamp')\nfinal = final.sort_values(by=['timestamp'])\n"

In [11]:
# Tear down file logging: detach the handler from the logger, then release
# the underlying log file.
logger.removeHandler(file_handler)
file_handler.close()