In [1]:
# Importing necessary packages
import os
import numpy as np
import pandas as pd
import logging
import warnings
import joblib
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import csv

from sklearn.metrics import classification_report, roc_curve, accuracy_score, confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score, precision_recall_curve
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import RobustScaler
import xgboost as xgb
import matplotlib.pyplot as plt


warnings.filterwarnings('ignore')

# Setting up logging with a FileHandler
log_file_path = 'logger_log.txt'
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S')

file_handler = logging.FileHandler(log_file_path)
file_handler.setLevel(logging.INFO)
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))

logger = logging.getLogger(__name__)
logger.addHandler(file_handler)

In [2]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float32)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df


In [3]:
def add_df_panel(df,timestamp_col,timestamp_back):
    
    # Subtract one day from the timestamp column
    df[timestamp_col + '_A'] = df[timestamp_col] - timestamp_back
    col_names_orig = df.columns
    
    col_join1 = timestamp_col
    col_join2 = timestamp_col +'_A'
    
    final =  pd.merge(df, df, left_on=col_join2, right_on=col_join1)
    final = final.sort_values(by=[timestamp_col+'_x'])
    
    col_names_new = final.columns
    
    col_names_update = []
    index = 0
    
    for name in col_names_new:
        if index < len(col_names_orig):
            col_names_update.append(col_names_orig[index])
        else:
            col_names_update.append(name)
        index = index + 1 
    
    final.columns = col_names_update
    final = final.drop([timestamp_col+'_A_y'], axis=1)
    
    col_names_orig = df.columns[1:]
    final = final.drop(col_names_orig, axis=1)
        
    return final

In [129]:
# Reading the dataset
dataset_path = "/kaggle/input/us-stock-market-2020-to-2024/US Stock Market Dataset.csv"

try:
    # Attempt to read the dataset
    df = pd.read_csv(dataset_path)
    logger.info(f"Dataset loaded successfully from {dataset_path}")
except FileNotFoundError:
    logger.error("Error: Dataset file not found. Please provide the correct file path.")
except Exception as e:
    logger.error(f"An error occurred: {e}")
    
df

Unnamed: 0.1,Unnamed: 0,Date,Natural_Gas_Price,Natural_Gas_Vol.,Crude_oil_Price,Crude_oil_Vol.,Copper_Price,Copper_Vol.,Bitcoin_Price,Bitcoin_Vol.,...,Berkshire_Price,Berkshire_Vol.,Netflix_Price,Netflix_Vol.,Amazon_Price,Amazon_Vol.,Meta_Price,Meta_Vol.,Gold_Price,Gold_Vol.
0,0,2/2/2024,2.079,,72.28,,3.8215,,43194.70,42650,...,589498,10580,564.64,4030000,171.81,117220000,474.99,84710000,2053.70,
1,1,1/2/2024,2.050,161340.0,73.82,577940.0,3.8535,,43081.40,47690,...,581600,9780,567.51,3150000,159.28,66360000,394.78,25140000,2071.10,260920.0
2,2,31-01-2024,2.100,142860.0,75.85,344490.0,3.9060,,42580.50,56480,...,578020,9720,564.11,4830000,155.20,49690000,390.14,20010000,2067.40,238370.0
3,3,30-01-2024,2.077,139750.0,77.82,347240.0,3.9110,,42946.20,55130,...,584680,9750,562.85,6120000,159.00,42290000,400.06,18610000,2050.90,214590.0
4,4,29-01-2024,2.490,3590.0,76.78,331930.0,3.8790,,43299.80,45230,...,578800,13850,575.79,6880000,161.26,42840000,401.02,17790000,2034.90,1780.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1008,1008,8/1/2020,2.141,242560.0,59.61,1210000.0,2.8120,77700.0,8059.60,1190000,...,339188,190,339.26,7110000,94.60,70240000,215.22,13490000,1560.20,813410.0
1009,1009,7/1/2020,2.162,163010.0,62.70,582650.0,2.7935,59470.0,8155.70,1010000,...,338901,280,330.75,4740000,95.34,82680000,213.06,15110000,1574.30,435870.0
1010,1010,6/1/2020,2.135,154680.0,63.27,724240.0,2.7900,59570.0,7759.10,786750,...,340210,280,335.83,5670000,95.14,81310000,212.60,17070000,1568.80,558970.0
1011,1011,3/1/2020,2.130,144670.0,63.05,885860.0,2.7870,74750.0,7343.10,936290,...,339155,220,325.90,3800000,93.75,75330000,208.67,11200000,1552.40,436740.0


In [130]:
# Assuming 'Date' is the name of the column with inconsistent date formats
#df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
#df.head(10)

# Assuming 'your_column' is the name of the column you want to modify
#df['Date'] = df['Date'].str.replace('/', '-')
#df.head(20)

# Pad the dates with leading zeros
#df['Date'] = df['Date'].dt.strftime('%m-%d-%Y')

# Assuming 'your_column' is the name of the column you want to convert to a date
#df['Date'] = pd.to_datetime(df['Date'], format='%d-%m-%y')

# Rename the first column
df = df.rename(columns={df.columns[0]: 'date_index'})

df = df[['date_index','Natural_Gas_Price']]
df['prev_index'] = df['date_index'] + 1
df.head(5)

Unnamed: 0,date_index,Natural_Gas_Price,prev_index
0,0,2.079,1
1,1,2.05,2
2,2,2.1,3
3,3,2.077,4
4,4,2.49,5


In [105]:
df['prev_index'] = df['date_index'] + 2
df.head(5)

Unnamed: 0,date_index,Natural_Gas_Price,prev_index
0,0,2.079,2
1,1,2.05,3
2,2,2.1,4
3,3,2.077,5
4,4,2.49,6


In [131]:
y = 1
df['prev_index'] = df['date_index'] + y
df = pd.merge(df, df, left_on='date_index', right_on='prev_index', how='inner', suffixes=(y,''))
df = df.drop(['date_index1','prev_index1','prev_index'],axis=1)
df

Unnamed: 0,Natural_Gas_Price1,date_index,Natural_Gas_Price
0,2.050,0,2.079
1,2.100,1,2.050
2,2.077,2,2.100
3,2.490,3,2.077
4,2.712,4,2.490
...,...,...,...
1007,2.141,1007,2.166
1008,2.162,1008,2.141
1009,2.135,1009,2.162
1010,2.130,1010,2.135


In [None]:
y = 1
#merged_df['prev_index'] = merged_df['date_index'] + y
#merged_df = pd.merge(df,merged_df, left_on='prev_index', right_on='date_index', how='inner', suffixes=('_src','_dst'))

# Define the specific suffix you want to rename
#old_suffix = '_src'
#new_suffix = ''
#columns_to_rename = [col for col in merged_df.columns if col.endswith(old_suffix)]
#merged_df = merged_df.rename(columns={col: col.replace(old_suffix, new_suffix) for col in columns_to_rename})

df

In [17]:
def create_df_panel(index_column_name,df,start,end,back_window):
    
    counter = 0
        
    for x in range(start,(end+1),1):    
        print("==================================")
        print("Round:",x)
        counter_in = 0    
        
        df1 = df[df[index_column_name] == x]
                           
        for x1 in range(x+1,(x+back_window+1),1):
            print("Window:",x1)
            
            df2 = df[df[index_column_name] == x1]
            df2 = df2.add_suffix('_'+str(x1))

            if counter_in == 0:
                final =  pd.concat([df1.reset_index(),df2.reset_index()], axis=1)
                counter_in = 1
            else:
                final =  pd.concat([final.reset_index(),df2.reset_index()], axis=1)
            
        if counter == 0:
            counter = 1
            final_df = final
        else:
            final_df = pd.concat([final_df, final], ignore_index=True)
        
    return final_df
            
index_column_name = 'date_index'
start = 0
end = 0
back_window = 3
final = create_df_panel(index_column_name,df,start,end,back_window)
print(final.columns)
final

Round: 0
Window: 1
Window: 2
Window: 3


ValueError: cannot insert level_0, already exists

In [7]:
import pandas as pd

# Create two example DataFrames
df1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
df2 = pd.DataFrame({'C': [7, 8, 9], 'D': [10, 11, 12]})

# Concatenate along columns (axis=1)
result = pd.concat([df1, df2], axis=1)

print(result)

   A  B  C   D
0  1  4  7  10
1  2  5  8  11
2  3  6  9  12


In [8]:
"""
for x in range(start,back_window,file_step_size):
    print("======================================================================================================")
    print("Start",(x+1),"End",(x+file_step_size))
    save_filename = create_df_panel_parquet(asset_id,(x+1),(x+file_step_size),increment,timestamp_col)
"""



In [9]:
"""
for investment_id in tqdm(investment_id_ls, desc = 'Progress Bar: Creating Files'):    
    for x in range(start,back_window,file_step_size):        
        save_filename = create_df_panel_parquet(investment_id,(x+1),(x+file_step_size),increment,timestamp_col)
"""

"\nfor investment_id in tqdm(investment_id_ls, desc = 'Progress Bar: Creating Files'):    \n    for x in range(start,back_window,file_step_size):        \n        save_filename = create_df_panel_parquet(investment_id,(x+1),(x+file_step_size),increment,timestamp_col)\n"

In [10]:
"""
%%time
first_flag = 1
for filename in fileList:
    print(filename)
    df = pd.read_parquet(filename, engine='pyarrow')
    
    if first_flag == 1:
        final = df.copy()
        first_flag = 0
    else:
        df = df.drop(['Asset_ID','Count','Open','High','Low','Close','Volume','VWAP','Target'],axis=1,inplace=False)
        final = pd.merge(final, df, left_on='timestamp', right_on='timestamp')
final = final.sort_values(by=['timestamp'])
"""

"\n%%time\nfirst_flag = 1\nfor filename in fileList:\n    print(filename)\n    df = pd.read_parquet(filename, engine='pyarrow')\n    \n    if first_flag == 1:\n        final = df.copy()\n        first_flag = 0\n    else:\n        df = df.drop(['Asset_ID','Count','Open','High','Low','Close','Volume','VWAP','Target'],axis=1,inplace=False)\n        final = pd.merge(final, df, left_on='timestamp', right_on='timestamp')\nfinal = final.sort_values(by=['timestamp'])\n"

In [None]:
index_column_name = 'date_index'
start = 0
end = 0
back_window = 3
final = create_df_panel(index_column_name,df,start,end,back_window)
print(final.columns)
final

In [35]:
import pandas as pd

# Define a function to create windowed features for all columns
def create_window_features(dataframe, window_size=1):
    for col in dataframe.columns:
        for i in range(1, window_size + 1):
            dataframe[f'{col}_{i}'] = dataframe[col].shift(i)

# Apply the window function with a window size of 3
window_size = 1
create_window_features(df, window_size)

# Drop rows with NaN values resulting from the shift
df = df.dropna()

# Display the resulting DataFrame
df

Unnamed: 0,date_index,Date,Natural_Gas_Price,Natural_Gas_Vol.,Crude_oil_Price,Crude_oil_Vol.,Copper_Price,Copper_Vol.,Bitcoin_Price,Bitcoin_Vol.,...,Gold_Price_3_3_1,Gold_Vol._1_1_1,Gold_Vol._1_2_1,Gold_Vol._1_3_1,Gold_Vol._2_1_1,Gold_Vol._2_2_1,Gold_Vol._2_3_1,Gold_Vol._3_1_1,Gold_Vol._3_2_1,Gold_Vol._3_3_1
40,40,5/12/2023,2.667,42490.0,72.32,358700.0,3.7840,96110.0,44076.20,96840,...,2035.20,237290.0,160070.0,570.0,160070.0,570.0,2220.0,570.0,2220.0,560.0
41,41,4/12/2023,2.651,53830.0,73.04,388830.0,3.8355,95060.0,41987.80,104210,...,1988.10,173980.0,237290.0,160070.0,237290.0,160070.0,570.0,160070.0,570.0,2220.0
42,42,1/12/2023,2.773,48470.0,74.07,358980.0,3.9315,106080.0,38688.20,62500,...,1983.90,154220.0,173980.0,237290.0,173980.0,237290.0,160070.0,237290.0,160070.0,570.0
43,43,30-11-2023,2.802,147750.0,75.96,559170.0,3.8505,76490.0,37712.90,33530,...,1993.70,209550.0,154220.0,173980.0,154220.0,173980.0,237290.0,173980.0,237290.0,160070.0
44,44,29-11-2023,2.804,141590.0,77.86,322170.0,3.8255,76390.0,37855.50,49340,...,2014.50,378220.0,209550.0,154220.0,209550.0,154220.0,173980.0,154220.0,173980.0,237290.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
863,863,7/8/2020,2.238,206250.0,41.22,399000.0,2.8020,310.0,11592.00,517000,...,2013.10,459840.0,349890.0,241800.0,349890.0,241800.0,289440.0,241800.0,289440.0,361480.0
864,864,6/8/2020,2.165,161990.0,41.95,359610.0,2.9190,290.0,11757.10,554850,...,1998.70,565000.0,459840.0,349890.0,459840.0,349890.0,241800.0,349890.0,241800.0,289440.0
865,865,5/8/2020,2.191,182430.0,42.19,491270.0,2.9250,30.0,11735.10,570830,...,1949.80,251310.0,565000.0,459840.0,565000.0,459840.0,349890.0,459840.0,349890.0,241800.0
866,866,4/8/2020,2.193,230890.0,41.70,451580.0,2.9030,50.0,11184.70,485790,...,1970.40,398130.0,251310.0,565000.0,251310.0,565000.0,459840.0,565000.0,459840.0,349890.0


In [11]:
# Save log messages to a file
# File handling: close the FileHandler
file_handler.close()
logger.removeHandler(file_handler)