In [1]:
try:
    import imblearn
except ImportError:
    print("imbalanced-learn not found. Installing...")
    !pip install imbalanced-learn
    print("imbalanced-learn installed successfully!")

# Now import and use imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

In [2]:
# Importing necessary packages
import csv
import logging
import os
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_curve, accuracy_score, confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score, precision_recall_curve
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import RobustScaler
from xgboost import XGBRegressor


# Silence library warnings so they do not clutter the cell outputs.
warnings.filterwarnings('ignore')

# Setting up logging: console output via basicConfig plus a FileHandler.
log_file_path = 'logger_log.txt'
log_format = '%(asctime)s - %(levelname)s - %(message)s'
log_datefmt = '%Y-%m-%d %H:%M:%S'
logging.basicConfig(level=logging.INFO, format=log_format, datefmt=log_datefmt)

logger = logging.getLogger(__name__)
# Set the level on this logger explicitly: basicConfig is a no-op when the
# root logger already has handlers, in which case INFO records would be dropped.
logger.setLevel(logging.INFO)

# Re-running this cell must not stack a second handler on the same file,
# otherwise every record is written to the log several times.
for existing_handler in list(logger.handlers):
    if (isinstance(existing_handler, logging.FileHandler)
            and existing_handler.baseFilename == os.path.abspath(log_file_path)):
        logger.removeHandler(existing_handler)
        existing_handler.close()

file_handler = logging.FileHandler(log_file_path)
file_handler.setLevel(logging.INFO)
# Same format and date format as the console output, so both logs line up.
file_handler.setFormatter(logging.Formatter(log_format, datefmt=log_datefmt))
logger.addHandler(file_handler)

In [3]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` to smaller dtypes to reduce memory usage.

    The frame is modified in place and also returned. Memory usage before and
    after the conversion is printed.

    Conversion rules, per column:
      * object / pandas string columns -> ``category``
      * signed integers   -> smallest of int8/int16/int32/int64 that holds
        the column's min and max (bounds inclusive)
      * unsigned integers -> smallest of uint8/uint16/uint32/uint64
      * floats            -> float32 when min and max lie strictly inside the
        float32 range, otherwise float64 (float16 is never used)
      * anything else (bool, datetime, timedelta, categorical, other
        extension dtypes) is left untouched

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are downcast in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after conversion.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns can be range-checked against
        # iinfo/finfo; bool, datetime and extension dtypes are left as they are.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # An empty column yields NaN for min/max; every comparison is
                # then False and the column keeps its dtype.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            float32_limits = np.finfo(np.float32)
            if c_min > float32_limits.min and c_max < float32_limits.max:
                df[col] = df[col].astype(np.float32)
            else:
                # Out-of-range, infinite or all-NaN columns keep full precision.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df


In [4]:
# Reading the dataset
dataset_path = "/kaggle/input/us-stock-market-2020-to-2024/US Stock Market Dataset.csv"

try:
    # Attempt to read the dataset
    df = pd.read_csv(dataset_path)
    logger.info(f"Dataset loaded successfully from {dataset_path}")
except FileNotFoundError:
    logger.error("Error: Dataset file not found. Please provide the correct file path.")
    # Re-raise: everything below needs `df`, and continuing would either fail
    # with a confusing NameError or silently reuse a stale `df` from the kernel.
    raise
except Exception as e:
    logger.error(f"An error occurred: {e}")
    raise

# Rename the first column so it can be used as a row index
df = df.rename(columns={df.columns[0]: 'date_index'})
print(df.columns)

# Keep only the natural-gas columns; .copy() makes this an independent frame
# so adding a column below does not write into a view of the full dataset.
df = df[['date_index','Natural_Gas_Price','Natural_Gas_Vol.']].copy()
df['prev_index'] = df['date_index'] + 1
df.head(5)

Index(['date_index', 'Date', 'Natural_Gas_Price', 'Natural_Gas_Vol.',
       'Crude_oil_Price', 'Crude_oil_Vol.', 'Copper_Price', 'Copper_Vol.',
       'Bitcoin_Price', 'Bitcoin_Vol.', 'Platinum_Price', 'Platinum_Vol.',
       'Ethereum_Price', 'Ethereum_Vol.', 'S&P_500_Price', 'Nasdaq_100_Price',
       'Nasdaq_100_Vol.', 'Apple_Price', 'Apple_Vol.', 'Tesla_Price',
       'Tesla_Vol.', 'Microsoft_Price', 'Microsoft_Vol.', 'Silver_Price',
       'Silver_Vol.', 'Google_Price', 'Google_Vol.', 'Nvidia_Price',
       'Nvidia_Vol.', 'Berkshire_Price', 'Berkshire_Vol.', 'Netflix_Price',
       'Netflix_Vol.', 'Amazon_Price', 'Amazon_Vol.', 'Meta_Price',
       'Meta_Vol.', 'Gold_Price', 'Gold_Vol.'],
      dtype='object')


Unnamed: 0,date_index,Natural_Gas_Price,Natural_Gas_Vol.,prev_index
0,0,2.079,,1
1,1,2.05,161340.0,2
2,2,2.1,142860.0,3
3,3,2.077,139750.0,4
4,4,2.49,3590.0,5


In [5]:
def get_prev_pane(df,y,index_col):
    """Return the columns of ``df`` taken from the row whose index is ``y`` higher.

    The frame is self-joined so that each row with ``index_col == k`` on the
    left is paired with the row with ``index_col == k - y`` on the right. Only
    the left-hand value columns are kept, renamed with the suffix ``_<y>``;
    the index and helper columns are dropped. Rows without a partner are
    removed by the inner join.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    y : int
        Offset added to ``index_col`` to build the join key.
    index_col : str
        Name of the column holding the row index.

    Returns
    -------
    pandas.DataFrame
        One column per non-index column of ``df``, named ``<column>_<y>``.
    """
    suffix = "_" + str(y)

    # Build the join key on a new frame instead of writing into the caller's
    # DataFrame, so repeated calls with different offsets do not interfere.
    keyed = df.assign(prev_index=df[index_col] + y)
    column_names = keyed.columns

    df_tmp = pd.merge(keyed, keyed, left_on=index_col, right_on='prev_index', how='inner', suffixes=(suffix,''))

    # Drop the un-suffixed right-hand columns plus the suffixed key columns.
    unwanted = list(column_names) + ['prev_index' + suffix, index_col + suffix]
    return df_tmp.drop(columns=unwanted)

In [6]:
# Build one wide frame holding the offset-0..4 panes side by side,
# joined on the positional row index.
final_df = None
for offset in range(5):
    pane = get_prev_pane(df, offset, 'date_index')
    if final_df is None:
        # the first pane seeds the result
        final_df = pane
    else:
        final_df = pd.merge(final_df, pane, left_index=True, right_index=True)

final_df.head(10)

Unnamed: 0,Natural_Gas_Price_0,Natural_Gas_Vol._0,Natural_Gas_Price_1,Natural_Gas_Vol._1,Natural_Gas_Price_2,Natural_Gas_Vol._2,Natural_Gas_Price_3,Natural_Gas_Vol._3,Natural_Gas_Price_4,Natural_Gas_Vol._4
0,2.079,,2.05,161340.0,2.1,142860.0,2.077,139750.0,2.49,3590.0
1,2.05,161340.0,2.1,142860.0,2.077,139750.0,2.49,3590.0,2.712,73020.0
2,2.1,142860.0,2.077,139750.0,2.49,3590.0,2.712,73020.0,2.571,44980.0
3,2.077,139750.0,2.49,3590.0,2.712,73020.0,2.571,44980.0,2.641,65500.0
4,2.49,3590.0,2.712,73020.0,2.571,44980.0,2.641,65500.0,2.45,69160.0
5,2.712,73020.0,2.571,44980.0,2.641,65500.0,2.45,69160.0,2.419,121580.0
6,2.571,44980.0,2.641,65500.0,2.45,69160.0,2.419,121580.0,2.519,138430.0
7,2.641,65500.0,2.45,69160.0,2.419,121580.0,2.519,138430.0,2.697,151820.0
8,2.45,69160.0,2.419,121580.0,2.519,138430.0,2.697,151820.0,2.87,150330.0
9,2.419,121580.0,2.519,138430.0,2.697,151820.0,2.87,150330.0,2.9,228160.0


In [7]:
# target column name
label_name = 'Natural_Gas_Price_0'

# new frame with missing values replaced by 0 (final_df itself is left as is)
df1 = final_df.fillna(0)

# defining the features and target
X = df1.drop(columns=[label_name])
y = df1[[label_name]]

# Every remaining column is a continuous feature. Deriving the list from X
# (rather than slicing final_df.columns positionally) keeps it correct even
# if the label is not the first column.
col_names = X.columns.tolist()

# instantiating the scaler
scaler = RobustScaler()

# scaling the continuous features
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# in the next cell, so test-set statistics influence the scaling — consider
# fitting on the training rows only.
X[col_names] = scaler.fit_transform(X[col_names])

# Get centering and scaling values for each feature
centering = scaler.center_
iqrs = scaler.scale_

X.head()

Unnamed: 0,Natural_Gas_Vol._0,Natural_Gas_Price_1,Natural_Gas_Vol._1,Natural_Gas_Price_2,Natural_Gas_Vol._2,Natural_Gas_Price_3,Natural_Gas_Vol._3,Natural_Gas_Price_4,Natural_Gas_Vol._4
0,-1.63269,-0.353441,0.470396,-0.332201,0.228859,-0.341971,0.188211,-0.166525,-1.591426
1,0.470282,-0.332201,0.228859,-0.341971,0.188211,-0.166525,-1.591426,-0.072218,-0.683963
2,0.229406,-0.341971,0.188211,-0.166525,-1.591426,-0.072218,-0.683963,-0.132116,-1.050451
3,0.188869,-0.166525,-1.591426,-0.072218,-0.683963,-0.132116,-1.050451,-0.102379,-0.782251
4,-1.585897,-0.072218,-0.683963,-0.132116,-1.050451,-0.102379,-0.782251,-0.183517,-0.734414


In [8]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Defining a dictionary of regression models, keyed by display name.
# Every estimator that accepts a seed gets a fixed random_state so that
# repeated runs of the notebook produce comparable scores (the gradient
# boosting model, for instance, subsamples rows and features).
regressors = {
    'AdaBoostRegressor': AdaBoostRegressor(random_state=0),
    'LinearRegression': LinearRegression(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'RandomForestRegressor': RandomForestRegressor(n_estimators=100, random_state=0),
    'GradientBoostingRegressor': GradientBoostingRegressor(n_estimators=300, max_depth=1, subsample=0.8, max_features=0.2, random_state=0),
    'XGBRegressor': XGBRegressor(objective='reg:squarederror', random_state=0),
}

In [10]:
# Model evaluation and cross-validation for the regression models.
# The target is a continuous price, so regression metrics (R^2, MAE, MSE,
# RMSE) are used; classification metrics such as accuracy or ROC AUC are not
# defined for continuous targets.
model_results = []

# Estimators expect a 1-D target; flatten the single-column frames once.
y_train_1d = y_train.values.ravel()
y_test_1d = y_test.values.ravel()

# Looping through models and evaluating their performance
for name, model in regressors.items():
    try:
        logger.info(f"Training and evaluating {name} model.")

        # 5-fold cross-validation on the training split, scored by R^2
        cv_scores = cross_val_score(model, X_train, y_train_1d, cv=5, scoring='r2')

        # Model training on the training split, prediction on the held-out split
        model.fit(X_train, y_train_1d)
        predict = model.predict(X_test)

        mae = mean_absolute_error(y_test_1d, predict)
        mse = mean_squared_error(y_test_1d, predict)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test_1d, predict)

        model_results.append({'Model': name, 'R2': r2, 'MAE': mae, 'MSE': mse, 'RMSE': rmse, 'CV Scores': cv_scores})

        print("=============================================================================")
        print(name, "CV Scores (R2):", cv_scores)
        print(name, "R2:", r2)
        print(name, "MAE:", mae)
        print(name, "MSE:", mse)
        print(name, "RMSE:", rmse)

        logger.info(f"{name} - CV Scores (R2): {cv_scores}")
        logger.info(f"{name} - R2: {r2}")
        logger.info(f"{name} - MAE: {mae}")
        logger.info(f"{name} - MSE: {mse}")
        logger.info(f"{name} - RMSE: {rmse}")

        # Predicted vs. actual values; points on the dashed diagonal are perfect predictions
        fig, ax = plt.subplots()
        lower = min(y_test_1d.min(), predict.min())
        upper = max(y_test_1d.max(), predict.max())
        ax.plot([lower, upper], [lower, upper], 'k--', label='Ideal')
        ax.scatter(y_test_1d, predict, alpha=0.6, label=name)
        ax.set_xlabel('Actual')
        ax.set_ylabel('Predicted')
        ax.set_title(f'Predicted vs. actual - {name}')
        ax.legend()
        plt.show()
        plt.close(fig)  # release the figure; one is created per model

        # Residuals (actual - predicted) against the predicted value
        fig, ax = plt.subplots()
        ax.axhline(0, color='k', linestyle='--')
        ax.scatter(predict, y_test_1d - predict, alpha=0.6, label=name)
        ax.set_xlabel('Predicted')
        ax.set_ylabel('Residual')
        ax.set_title(f'Residuals - {name}')
        ax.legend()
        plt.show()
        plt.close(fig)

    except Exception as ex:
        # Best effort: a failing model must not stop the remaining ones, but
        # the full traceback is logged so the failure is not silently hidden.
        logger.error(f"An error occurred while processing {name} model: {ex}", exc_info=True)

In [11]:
# Finish file logging: detach the FileHandler from the logger,
# then close it so the log file is released.
logger.removeHandler(file_handler)
file_handler.close()