In [None]:
import pandas as pd
import os

# Directory containing all the files
data_dir = 'D:/GUVI_Projects/My_Projects/new_horse/Horse'

# Function to load and check columns of horse and race data for a given year
def load_and_check_columns(year, data_dir):
    """Print the column index of one year's horse and race CSV files.

    Looks for ``horses_<year>.csv`` and ``races_<year>.csv`` inside
    ``data_dir``. When both files exist they are read with pandas and their
    columns are printed; otherwise a "not found" message is printed.

    Args:
        year: Year used to build the two file names.
        data_dir: Directory that holds the yearly CSV files.

    Returns:
        None. The function only prints.
    """
    horse_path = os.path.join(data_dir, f'horses_{year}.csv')
    race_path = os.path.join(data_dir, f'races_{year}.csv')

    # Guard clause: nothing is read unless both files are present.
    if not (os.path.exists(horse_path) and os.path.exists(race_path)):
        print(f"Files for year {year} not found.")
        return

    horse_frame = pd.read_csv(horse_path, low_memory=False)
    race_frame = pd.read_csv(race_path, low_memory=False)

    for label, frame in (('horses', horse_frame), ('races', race_frame)):
        print(f"Columns in {label}_{year}.csv: {frame.columns}")

# Check columns for each year from 1990 to 2020
# (range() excludes its stop value, so 2021 makes 2020 the last year checked).
for year in range(1990, 2021):
    load_and_check_columns(year, data_dir)


In [None]:
import pandas as pd
import os

# Directory containing all the files
data_dir = 'D:/GUVI_Projects/My_Projects/new_horse/Horse'

# Function to load and merge horse and race data for a given year
def load_and_merge_yearly_data(year, data_dir):
    """Return one year's horse rows joined to that year's race rows.

    Reads ``horses_<year>.csv`` and ``races_<year>.csv`` from ``data_dir``
    and merges them on the shared ``rid`` column (pandas' default inner
    join). A status message is printed for every outcome.

    Args:
        year: Year used to build the two file names.
        data_dir: Directory that holds the yearly CSV files.

    Returns:
        The merged DataFrame, or an empty DataFrame when either file is
        missing or either file lacks the ``rid`` column.
    """
    horse_path = os.path.join(data_dir, f'horses_{year}.csv')
    race_path = os.path.join(data_dir, f'races_{year}.csv')

    # Guard clause: both files must exist before anything is read.
    if not (os.path.exists(horse_path) and os.path.exists(race_path)):
        print(f"Files for year {year} not found. Skipping...")
        return pd.DataFrame()

    horse_rows = pd.read_csv(horse_path, low_memory=False)
    race_rows = pd.read_csv(race_path, low_memory=False)

    # Guard clause: the join key has to be present on both sides.
    join_key = 'rid'
    if join_key not in horse_rows.columns or join_key not in race_rows.columns:
        print(f"'rid' column not found in one of the files for year {year}. Skipping...")
        return pd.DataFrame()

    combined = horse_rows.merge(race_rows, on=join_key)
    print(f"Successfully merged data for year {year}")
    return combined

# Load and merge data from 1990 to 2020.
# The yearly frames are collected in a list and concatenated once: calling
# pd.concat inside the loop would re-copy everything accumulated so far on
# every iteration.
yearly_frames = []

for year in range(1990, 2021):
    yearly_data = load_and_merge_yearly_data(year, data_dir)
    if not yearly_data.empty:
        yearly_frames.append(yearly_data)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# year could be loaded.
all_data = pd.concat(yearly_frames, ignore_index=True) if yearly_frames else pd.DataFrame()

print("All yearly data loaded and merged.")
print("Columns in all_data:", all_data.columns)

# Load forward.csv
forward_file = os.path.join(data_dir, 'forward.csv')
forward_data = pd.read_csv(forward_file, low_memory=False)
print("Forward data loaded.")
print("Columns in forward_data:", forward_data.columns)

# Ensure necessary columns exist in both datasets before merging
merge_columns = ['course', 'title', 'rclass', 'horseName', 'trainerName', 'jockeyName']
if all(col in all_data.columns for col in merge_columns) and all(col in forward_data.columns for col in merge_columns):
    # Left join: every row of all_data is kept; rows without a match in
    # forward_data get NaN in the columns that come only from forward_data.
    # NOTE(review): columns present in both frames but not listed in
    # merge_columns receive pandas' default '_x'/'_y' suffixes and would then
    # no longer match the names in important_columns below - confirm against
    # the printed column lists.
    complete_data = pd.merge(all_data, forward_data, on=merge_columns, how='left')
    print("Forward data merged with all yearly data.")
else:
    print("One of the necessary columns not found in one of the datasets. Merging skipped.")
    complete_data = all_data

# Select important columns that are present in the DataFrame
important_columns = [
    'rid', 'res_win', 'horseName', 'trainerName', 'jockeyName', 'date', 'course', 'age', 'weight', 
    'distance', 'condition', 'position', 'decimalPrice', 'RPR', 'TR', 'OR'
]
# Columns missing from complete_data are silently left out of the selection.
existing_columns = [col for col in important_columns if col in complete_data.columns]
complete_data = complete_data.loc[:, existing_columns]
print("Selected important columns.")

# Handle missing values
# Forward fill copies the value from the previous row into each gap; gaps in
# the very first row stay NaN because there is nothing above them.
# NOTE(review): the previous row is simply the preceding record in file order,
# so a filled value may come from a different runner or race - confirm this
# is intended.
complete_data.ffill(inplace=True)
print("Handled missing values using forward fill.")

# Ensure date is in 'dd/mm/yy' format
if 'date' in complete_data.columns:
    # errors='coerce' turns unparseable dates into NaT; this runs after the
    # forward fill above, so those NaT values are not filled.
    # NOTE(review): no explicit format/dayfirst is given, so pandas infers the
    # input layout - confirm the source dates are unambiguous.
    complete_data['date'] = pd.to_datetime(complete_data['date'], errors='coerce')  # Handle parsing errors
    complete_data['date'] = complete_data['date'].dt.strftime('%d/%m/%y')
    print("Date column formatted to 'dd/mm/yy'.")
# Process distance column (convert to numeric meters if applicable)
if 'distance' in complete_data.columns:
    def convert_distance_to_meters(distance):
        """Convert a mile/furlong distance string such as ``"1m2f"`` to meters.

        Accepted string layouts are ``"<miles>m"``, ``"<furlongs>f"`` and
        ``"<miles>m<furlongs>f"``. The furlong count may end with the
        one-half sign (``"7\u00bdf"``, ``"1m\u00bdf"``), which adds half a
        furlong.

        A mile is taken as exactly 8 furlongs (1609.344 m) so that ``"1m"``
        and ``"8f"`` give the same result.

        Args:
            distance: Raw cell value from the ``distance`` column.

        Returns:
            The distance in meters as a float for a recognised string;
            ``None`` for a string that contains ``m``/``f`` but cannot be
            parsed; the input itself, unchanged, for non-string values and
            for strings without any ``m``/``f`` unit (those are left for the
            later ``pd.to_numeric`` call).
        """
        if not isinstance(distance, str):
            return distance  # Leave unchanged if already numeric or non-string
        if 'm' not in distance and 'f' not in distance:
            return distance  # No mile/furlong unit to interpret

        meters_per_furlong = 201.168
        meters_per_mile = 8 * meters_per_furlong  # 1609.344
        half_sign = '\u00bd'

        try:
            miles_text, has_miles, furlong_text = distance.strip().partition('m')
            if has_miles:
                miles = int(miles_text)
            else:
                # Furlongs only, e.g. "5f": the whole text is the furlong part.
                miles, furlong_text = 0, miles_text

            furlongs = 0.0
            furlong_text = furlong_text.strip()
            if furlong_text:
                # Anything after the miles must be a furlong count ending in
                # 'f'; a bare trailing number ("1m2") is rejected rather than
                # being glued onto the miles figure.
                if not furlong_text.endswith('f'):
                    raise ValueError(f"unrecognised distance: {distance!r}")
                whole, has_half, trailing = furlong_text[:-1].partition(half_sign)
                whole = whole.strip()
                if trailing.strip() or not (whole or has_half):
                    raise ValueError(f"unrecognised distance: {distance!r}")
                furlongs = (int(whole) if whole else 0) + (0.5 if has_half else 0.0)
        except ValueError:
            # Return None (becomes NaN downstream) if the format is invalid
            return None

        return miles * meters_per_mile + furlongs * meters_per_furlong

    complete_data['distance'] = complete_data['distance'].apply(convert_distance_to_meters)
    complete_data['distance'] = pd.to_numeric(complete_data['distance'], errors='coerce')
    print("Distance column processed and converted to numeric meters.")

# Convert categorical variables to numerical representations
# Each listed column is replaced by its pandas category code (an integer per
# distinct value; missing values get the code -1).
# NOTE(review): the code -> original name mapping is not stored anywhere, so
# the names cannot be recovered from the saved CSV.
categorical_columns = ['course', 'trainerName', 'jockeyName']
for column in categorical_columns:
    if column in complete_data.columns:
        complete_data[column] = complete_data[column].astype('category').cat.codes
print("Encoded categorical variables to numerical representations.")

# Save the final cleaned dataset
# Written next to the input files, without the DataFrame index.
output_file = os.path.join(data_dir, 'cleaned_final_dataset.csv')
complete_data.to_csv(output_file, index=False)

print("Dataset creation completed and saved as 'cleaned_final_dataset.csv'.")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Function to add value labels on bars
def add_value_labels(ax):
    """Write each bar's height above the bar.

    For every patch in ``ax.patches`` the height is formatted with zero
    decimals and annotated at the bar's horizontal centre, shifted 10 points
    upwards from the top of the bar.

    Args:
        ax: Axes-like object exposing ``patches`` and ``annotate``.
    """
    for bar in ax.patches:
        height = bar.get_height()
        center_x = bar.get_x() + bar.get_width() / 2.
        ax.annotate(
            f"{height:.0f}",
            (center_x, height),
            ha='center', va='center',
            xytext=(0, 10),
            textcoords='offset points',
        )

# Load the dataset
df = pd.read_csv('D:/GUVI_Projects/My_Projects/new_horse/Horse/cleaned_final_dataset.csv', low_memory=True)

# Ensure the columns are treated as strings
# After this cast the three columns are picked up by the 'object' selection
# below (categorical handling) instead of the numeric one.
df['course'] = df['course'].astype(str)
df['trainerName'] = df['trainerName'].astype(str)
df['jockeyName'] = df['jockeyName'].astype(str)

# Handle missing values
# Split the columns by dtype so each group gets its own imputation strategy.
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

# Impute missing values
# Numeric gaps are filled with the column mean, categorical gaps with the
# most frequent value of the column.
imputer_numeric = SimpleImputer(strategy='mean')
imputer_categorical = SimpleImputer(strategy='most_frequent')

df[numeric_cols] = imputer_numeric.fit_transform(df[numeric_cols])
df[categorical_cols] = imputer_categorical.fit_transform(df[categorical_cols])

# Count the occurrences of each unique value in the columns
# value_counts() sorts by descending count, so head(20) further down yields
# the 20 most frequent values.
course_counts = df['course'].value_counts().reset_index()
trainer_counts = df['trainerName'].value_counts().reset_index()
jockey_counts = df['jockeyName'].value_counts().reset_index()

# Rename the columns for clarity
course_counts.columns = ['course', 'count']
trainer_counts.columns = ['trainerName', 'count']
jockey_counts.columns = ['jockeyName', 'count']

# Example mapping dictionaries (replace with actual mappings if available)
# These are placeholder entries; IDs without an entry keep their original label.
trainer_name_mapping = {9907: "Trainer A", 15408: "Trainer B", 15396: "Trainer C"}
jockey_name_mapping = {4783: "Jockey A", 4740: "Jockey B", 15750: "Jockey C"}

# Replace IDs with names
# NOTE(review): astype(int) assumes every label is an integer written as text
# (as produced by the encoding step of the cleaning cell) - it raises on any
# other label.
trainer_counts['trainerName'] = trainer_counts['trainerName'].astype(int).map(trainer_name_mapping).fillna(trainer_counts['trainerName'])
jockey_counts['jockeyName'] = jockey_counts['jockeyName'].astype(int).map(jockey_name_mapping).fillna(jockey_counts['jockeyName'])

# Bar plots for categorical data
def bar_plot(data, x, y, title, xlabel, ylabel, rotation=90):
    """Draw and show a bar chart of ``data`` with bars sorted by descending ``y``.

    Args:
        data: DataFrame holding the values to plot.
        x: Column used for the bar categories.
        y: Column used for the bar heights (also the sort key).
        title: Figure title.
        xlabel: Label of the x axis.
        ylabel: Label of the y axis.
        rotation: Rotation of the x tick labels in degrees.
    """
    plt.figure(figsize=(12, 8))

    # Largest bar first.
    bar_order = data.sort_values(y, ascending=False)[x]
    axes = sns.barplot(data=data, x=x, y=y, order=bar_order)

    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.xticks(rotation=rotation, fontsize=10)

    # Print each bar's value above it before laying out and showing the figure.
    add_value_labels(axes)
    plt.tight_layout()
    plt.show()

# Course Plot
# Each section below plots the 20 most frequent values, or prints a notice
# when the corresponding count table is empty.
if not course_counts.empty:
    bar_plot(course_counts.head(20), 'course', 'count', 'Top 20 Courses by Count', 'Course', 'Count')
else:
    print("No data available for 'course'.")

# Trainer Plot
if not trainer_counts.empty:
    bar_plot(trainer_counts.head(20), 'trainerName', 'count', 'Top 20 Trainers', 'Trainer Name', 'Count')
else:
    print("No data available for 'trainerName'.")

# Jockey Plot
if not jockey_counts.empty:
    bar_plot(jockey_counts.head(20), 'jockeyName', 'count', 'Top 20 Jockeys', 'Jockey Name', 'Count')
else:
    print("No data available for 'jockeyName'.")

# Additional Visualizations: Horse Names and Positions
horse_counts = df['horseName'].value_counts().reset_index()
horse_counts.columns = ['horseName', 'count']
if not horse_counts.empty:
    bar_plot(horse_counts.head(20), 'horseName', 'count', 'Top 20 Horses by Count', 'Horse Name', 'Count')
else:
    print("No data available for 'horseName'.")

# Unlike the plots above, every distinct position is plotted (no head(20)).
position_counts = df['position'].value_counts().reset_index()
position_counts.columns = ['position', 'count']
if not position_counts.empty:
    bar_plot(position_counts, 'position', 'count', 'Race Positions by Count', 'Position', 'Count')
else:
    print("No data available for 'position'.")

# --------------------------------------
# Enhanced EDA: Correlation Analysis
# --------------------------------------

# Distribution plots for numeric columns
# One histogram (30 bins, with a KDE curve) per numeric column.
for col in numeric_cols:
    plt.figure(figsize=(10, 6))
    sns.histplot(df[col], kde=True, bins=30, color="blue")
    plt.title(f"Distribution of {col}")
    plt.xlabel(col)
    plt.ylabel("Frequency")
    plt.tight_layout()
    plt.show()

# Correlation Matrix and Heatmap
# DataFrame.corr() defaults to the Pearson coefficient; each cell of the
# heatmap is annotated with its value to two decimals.
correlation_matrix = df[numeric_cols].corr()
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.tight_layout()
plt.show()

# Scatter Plots for Numeric Relationships
# Reducing dataset for scatter plot to avoid legend issues.
# The sample size is capped at the number of rows because DataFrame.sample
# raises when asked for more rows than exist (sampling is without replacement).
subset_df = df.sample(min(1000, len(df)))  # Adjust the sample size as necessary

# One figure per unordered pair of numeric columns.
for i, x_col in enumerate(numeric_cols):
    for y_col in numeric_cols[i + 1:]:
        plt.figure(figsize=(10, 6))
        sns.scatterplot(data=subset_df, x=x_col, y=y_col, hue='position', palette="tab10", legend=False)
        plt.title(f"Scatter Plot: {x_col} vs {y_col}")
        plt.tight_layout()
        plt.show()

# --------------------------------------
# Feature Importance Analysis using Mutual Information
# --------------------------------------

# Encode categorical features for mutual information analysis
encoded_df = df.copy()
le = LabelEncoder()
for col in categorical_cols:
    encoded_df[col] = le.fit_transform(df[col])

# Specify a target column for MI analysis
target_column = 'res_win'  # Replace with your actual target column
if target_column in df.columns:
    # The target is itself a numeric column, so it must be removed from the
    # feature list; otherwise it is scored against itself and dominates the
    # ranking.
    feature_cols = [col for col in numeric_cols if col != target_column]

    mi_scores = mutual_info_classif(encoded_df[feature_cols], encoded_df[target_column], discrete_features=False)
    mi_df = pd.DataFrame({'Feature': feature_cols, 'Mutual Information': mi_scores})
    mi_df = mi_df.sort_values(by='Mutual Information', ascending=False)

    # Plot Mutual Information Scores
    plt.figure(figsize=(12, 8))
    sns.barplot(data=mi_df, x='Mutual Information', y='Feature', palette='viridis')
    plt.title("Feature Importance using Mutual Information")
    plt.xlabel("Mutual Information Score")
    plt.ylabel("Features")
    plt.tight_layout()
    plt.show()
else:
    print("Target column not found for mutual information analysis.")

# --------------------------------------
# Interactive Plots using Plotly
# --------------------------------------
# These repeat the three top-20 bar charts drawn earlier, this time as
# interactive Plotly figures built from the same count tables.

# Interactive plot for top 20 courses by count
fig = px.bar(course_counts.head(20), x='course', y='count', title='Top 20 Courses by Count')
fig.show()

# Interactive plot for top 20 trainers by count
fig = px.bar(trainer_counts.head(20), x='trainerName', y='count', title='Top 20 Trainers by Count')
fig.show()

# Interactive plot for top 20 jockeys by count
fig = px.bar(jockey_counts.head(20), x='jockeyName', y='count', title='Top 20 Jockeys by Count')
fig.show()


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
import pickle
import os

# Paths
dataset_path = 'D:/GUVI_Projects/My_Projects/new_horse/Horse/cleaned_final_dataset.csv'
save_path = 'D:/GUVI_Projects/My_Projects/new_horse/Horse'
model_save_path = os.path.join(save_path, 'horse_model.pkl')

# Function to optimize memory usage
def optimize_memory(df):
    """Downcast float and integer columns of ``df`` to smaller numeric dtypes.

    Columns of dtype float64/float32 are downcast with
    ``pd.to_numeric(..., downcast='float')`` and columns of dtype int64/int32
    with ``downcast='integer'``. All other columns are left untouched.

    Args:
        df: DataFrame to shrink; it is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    float_types = ('float64', 'float32')
    int_types = ('int64', 'int32')

    for name in df.columns:
        dtype = df[name].dtypes
        if any(dtype == label for label in float_types):
            target = 'float'
        elif any(dtype == label for label in int_types):
            target = 'integer'
        else:
            continue  # neither float nor integer: nothing to downcast
        df[name] = pd.to_numeric(df[name], downcast=target)
    return df

# Load data in chunks and handle mixed types
chunk_size = 100000  # Define chunk size

# Fit each encoder ONCE on the complete column so that a given trainer/jockey
# receives the same integer code in every chunk. Fitting a fresh encoder per
# chunk would number the values relative to each chunk's own contents, making
# the codes incomparable across chunks.
id_columns = pd.read_csv(dataset_path, usecols=['trainerName', 'jockeyName'], low_memory=False)
le_trainer = LabelEncoder().fit(id_columns['trainerName'])
le_jockey = LabelEncoder().fit(id_columns['jockeyName'])
del id_columns  # only needed for fitting

chunks = pd.read_csv(
    dataset_path,
    chunksize=chunk_size,
    low_memory=False
)

# Process and combine chunks
processed_chunks = []
for chunk in chunks:
    # Drop irrelevant columns
    chunk.drop(['horseName', 'date', 'rid'], axis=1, inplace=True)

    # Encode categorical columns with the globally fitted encoders
    chunk['trainerName'] = le_trainer.transform(chunk['trainerName'])
    chunk['jockeyName'] = le_jockey.transform(chunk['jockeyName'])

    # Feature Engineering
    # (No feature is derived from `position`: it is the race outcome, see the
    # leakage note where X is defined.)
    chunk['success_rate'] = chunk['RPR'] / (chunk['TR'] + 1)  # +1 avoids dividing by zero when TR is 0

    # Replace infinities and large values
    chunk.replace([np.inf, -np.inf], np.nan, inplace=True)
    # numeric_only=True keeps median() from failing on non-numeric columns.
    chunk.fillna(chunk.median(numeric_only=True), inplace=True)  # Handle missing values

    # Optimize memory
    chunk = optimize_memory(chunk)

    processed_chunks.append(chunk)

# Combine processed chunks
data = pd.concat(processed_chunks, ignore_index=True)

# Drop low-variance features
low_variance_cols = [col for col in data.columns if data[col].nunique() == 1]
data.drop(columns=low_variance_cols, inplace=True)

# Define target and features
# `position` is the finishing position, i.e. the result of the race. It is
# not available before the race and presumably determines `res_win` directly
# (the previously recorded 100% precision/recall is consistent with that), so
# it is excluded from the features to avoid target leakage.
leakage_cols = ['position']
X = data.drop(columns=['res_win'] + leakage_cols, errors='ignore')
y = data['res_win']

# Split the dataset (stratified so both parts keep the win/lose ratio)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
# The scaler is fitted on the training part only, so no statistics of the
# test rows reach the model.
# NOTE(review): only the classifier is pickled below; the fitted scaler is
# not saved, so it has to be re-created before the model can score new data.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Random Forest Classifier with Hyperparameter Tuning
rf = RandomForestClassifier(random_state=42, class_weight='balanced')
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist,
                                   n_iter=50, cv=3, verbose=2, random_state=42, n_jobs=-1)
random_search.fit(X_train, y_train)

# Best model
best_rf = random_search.best_estimator_

# Evaluate the model
y_pred = best_rf.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Save the model
with open(model_save_path, 'wb') as f:
    pickle.dump(best_rf, f)

print("Confusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)
print(f"Model saved to: {model_save_path}")
# Fitting 3 folds for each of 50 candidates, totalling 150 fits
# Confusion Matrix:
#  [[741781      0]
#  [     0  79684]]

# Classification Report:
#                precision    recall  f1-score   support

#          0.0       1.00      1.00      1.00    741781
#          1.0       1.00      1.00      1.00     79684

#     accuracy                           1.00    821465
#    macro avg       1.00      1.00      1.00    821465
# weighted avg       1.00      1.00      1.00    821465

# Model saved to: D:/GUVI_Projects/My_Projects/new_horse/Horse\horse_model.pkl


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
import pickle
import os
import logging
import traceback

# Set up logging
log_file = 'D:/GUVI_Projects/My_Projects/new_horse/Horse/script_log.txt'
logging.basicConfig(filename=log_file, level=logging.DEBUG,
                    format='%(asctime)s - %(levelname)s - %(message)s')

# Function to log exceptions
def log_exception(exc):
    """Log an exception's message and its formatted traceback at ERROR level.

    Two records are written through the root logger: first the message
    ``Exception occurred: <exc>``, then the full traceback text.

    Args:
        exc: The exception instance to report.
    """
    summary = f"Exception occurred: {exc}"
    logging.error(summary)

    trace_lines = traceback.format_exception(None, exc, exc.__traceback__)
    logging.error("".join(trace_lines))

# Paths
dataset_path = 'D:/GUVI_Projects/My_Projects/new_horse/Horse/cleaned_final_dataset.csv'
save_path = 'D:/GUVI_Projects/My_Projects/new_horse/Horse'
gb_model_save_path = os.path.join(save_path, 'horse_gb_model.pkl')
lr_model_save_path = os.path.join(save_path, 'horse_lr_model.pkl')
processed_chunks_path = os.path.join(save_path, 'processed_chunks')

os.makedirs(processed_chunks_path, exist_ok=True)

# Function to optimize memory usage
def optimize_memory(df):
    """Downcast float and integer columns of ``df``, logging the outcome.

    float64/float32 columns are passed through
    ``pd.to_numeric(..., downcast='float')`` and int64/int32 columns through
    ``downcast='integer'``; other columns are not touched. Any exception is
    logged and swallowed, so the frame is returned in whatever state it had
    reached.

    Args:
        df: DataFrame to shrink; it is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    downcast_for = {
        'float64': 'float',
        'float32': 'float',
        'int64': 'integer',
        'int32': 'integer',
    }
    try:
        for name in df.columns:
            dtype = df[name].dtypes
            # First dtype label that matches decides the downcast kind.
            target = next((kind for label, kind in downcast_for.items() if dtype == label), None)
            if target is not None:
                df[name] = pd.to_numeric(df[name], downcast=target)
        logging.info("Memory optimization completed.")
    except Exception as e:
        # Best effort: report the problem and hand the frame back regardless.
        log_exception(e)
        logging.error("Error during memory optimization.")
    return df

# Initialize LabelEncoders
le_trainer = LabelEncoder()
le_jockey = LabelEncoder()

chunk_size = 50000
processed_chunks = []

try:
    # Fit the encoders ONCE on the complete columns so that a given
    # trainer/jockey receives the same integer code in every chunk. Calling
    # fit_transform per chunk re-fits the encoder each time and numbers the
    # values relative to that chunk only.
    id_columns = pd.read_csv(dataset_path, usecols=['trainerName', 'jockeyName'], low_memory=False)
    le_trainer.fit(id_columns['trainerName'])
    le_jockey.fit(id_columns['jockeyName'])
    del id_columns  # only needed for fitting
    logging.info("Label encoders fitted on the full dataset.")

    for i, chunk in enumerate(pd.read_csv(dataset_path, chunksize=chunk_size, low_memory=False)):
        try:
            logging.info(f"Processing chunk {i}...")

            # Ensure the columns read by the processing below exist
            required_columns = ['trainerName', 'jockeyName', 'RPR', 'TR', 'res_win']
            if not all(col in chunk.columns for col in required_columns):
                raise ValueError(f"One or more required columns missing in chunk {i}")

            # Drop irrelevant columns
            chunk.drop(['horseName', 'date', 'rid'], axis=1, errors='ignore', inplace=True)

            # Encode categorical columns with the globally fitted encoders
            chunk['trainerName'] = le_trainer.transform(chunk['trainerName'])
            chunk['jockeyName'] = le_jockey.transform(chunk['jockeyName'])

            # Feature Engineering
            # (No feature is derived from `position`: it is the race outcome,
            # see the leakage note where X is defined.)
            chunk['success_rate'] = chunk['RPR'] / (chunk['TR'] + 1)

            # Replace infinities and handle missing values
            chunk.replace([np.inf, -np.inf], np.nan, inplace=True)
            # numeric_only=True keeps median() from failing on non-numeric columns.
            chunk.fillna(chunk.median(numeric_only=True), inplace=True)

            # Optimize memory
            chunk = optimize_memory(chunk)

            # Save processed chunk for future use
            processed_chunks.append(chunk)
            chunk.to_csv(os.path.join(processed_chunks_path, f'chunk_{i}.csv'), index=False)

            logging.info(f"Processed chunk {i} successfully.")

        except Exception as e:
            # A failing chunk is logged and left out; the remaining chunks
            # are still processed.
            log_exception(e)
            logging.warning(f"Failed to process chunk {i}. Skipping this chunk.")
            continue

    # Combine processed chunks
    data = pd.concat(processed_chunks, ignore_index=True)
    logging.info("Chunks combined successfully.")

    # Drop low-variance features
    low_variance_cols = [col for col in data.columns if data[col].nunique() == 1]
    data.drop(columns=low_variance_cols, inplace=True)

    # Define target and features
    # `position` is the finishing position, i.e. the result of the race. It is
    # not available before the race and presumably determines `res_win`
    # directly (the previously logged 100% scores are consistent with that),
    # so it is excluded from the features to avoid target leakage.
    leakage_cols = ['position']
    X = data.drop(columns=['res_win'] + leakage_cols, errors='ignore')
    y = data['res_win']

    logging.info("Data preparation completed.")

except Exception as e:
    log_exception(e)
    logging.error("Error during data preparation.")
# Train, save and evaluate a Gradient Boosting and a Logistic Regression
# pipeline on X / y. Results are written to the log file, not printed.
try:
    # Split the dataset
    # Stratified so the train and test parts keep the same class ratio.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    logging.info("Data split into training and testing sets.")

    # Define SMOTE and under-sampling
    # NOTE(review): both samplers use their default sampling strategy; if
    # SMOTE already balances the classes the under-sampler that follows has
    # nothing left to remove - confirm the intended ratios.
    smote = SMOTE(random_state=42)
    under_sampler = RandomUnderSampler(random_state=42)

    # Gradient Boosting Classifier with Hyperparameter Tuning
    # Scaling and resampling live inside the (imblearn) pipeline, so they are
    # re-fitted within each cross-validation fit of the search.
    gb = GradientBoostingClassifier(random_state=42)
    gb_pipeline = Pipeline(steps=[('scaler', StandardScaler()),
                                  ('smote', smote),
                                  ('under', under_sampler),
                                  ('classifier', gb)])

    # The 'classifier__' prefix routes each parameter to the pipeline step
    # named 'classifier'.
    gb_param_dist = {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__learning_rate': [0.01, 0.05, 0.1],
        'classifier__max_depth': [3, 5, 7],
        'classifier__subsample': [0.8, 1.0],
        'classifier__min_samples_split': [2, 5, 10]
    }

    # Gradient Boosting has its own try/except: a failure here is logged and
    # the script continues with Logistic Regression.
    try:
        gb_random_search = RandomizedSearchCV(estimator=gb_pipeline, param_distributions=gb_param_dist,
                                              n_iter=20, cv=3, verbose=2, random_state=42, n_jobs=-1)
        gb_random_search.fit(X_train, y_train)
        best_gb_pipeline = gb_random_search.best_estimator_

        with open(gb_model_save_path, 'wb') as f:
            pickle.dump(best_gb_pipeline, f)
        logging.info(f"Gradient Boosting model saved to: {gb_model_save_path}")

    except Exception as gb_e:
        log_exception(gb_e)
        logging.error("Gradient Boosting model training failed.")
        best_gb_pipeline = None  # marks the model as unavailable for the evaluation below

    # Logistic Regression with Hyperparameter Tuning
    lr = LogisticRegression(random_state=42, class_weight='balanced', max_iter=1000)
    lr_pipeline = Pipeline(steps=[('scaler', StandardScaler()),
                                  ('smote', smote),
                                  ('under', under_sampler),
                                  ('classifier', lr)])

    # NOTE(review): this grid holds only 4 * 2 * 1 = 8 combinations, fewer
    # than the n_iter=20 requested below.
    lr_param_dist = {
        'classifier__C': [0.01, 0.1, 1, 10],
        'classifier__solver': ['liblinear', 'lbfgs'],
        'classifier__penalty': ['l2']
    }

    # Unlike Gradient Boosting, this search has no try/except of its own: a
    # failure jumps to the outer handler and skips the evaluation of both models.
    lr_random_search = RandomizedSearchCV(estimator=lr_pipeline, param_distributions=lr_param_dist,
                                          n_iter=20, cv=3, verbose=2, random_state=42, n_jobs=-1)
    lr_random_search.fit(X_train, y_train)
    best_lr_pipeline = lr_random_search.best_estimator_

    with open(lr_model_save_path, 'wb') as f:
        pickle.dump(best_lr_pipeline, f)
    logging.info(f"Logistic Regression model saved to: {lr_model_save_path}")

    # Evaluate models
    # Gradient Boosting is evaluated only when its training succeeded.
    if best_gb_pipeline:
        logging.info("Evaluating Gradient Boosting model...")
        y_pred_gb = best_gb_pipeline.predict(X_test)
        conf_matrix_gb = confusion_matrix(y_test, y_pred_gb)
        class_report_gb = classification_report(y_test, y_pred_gb)
        logging.info("Gradient Boosting Confusion Matrix:\n" + str(conf_matrix_gb))
        logging.info("Gradient Boosting Classification Report:\n" + class_report_gb)

    logging.info("Evaluating Logistic Regression model...")
    y_pred_lr = best_lr_pipeline.predict(X_test)
    conf_matrix_lr = confusion_matrix(y_test, y_pred_lr)
    class_report_lr = classification_report(y_test, y_pred_lr)
    logging.info("Logistic Regression Confusion Matrix:\n" + str(conf_matrix_lr))
    logging.info("Logistic Regression Classification Report:\n" + class_report_lr)

except Exception as e:
    # Any other failure (including X / y being undefined because the data
    # preparation step failed) is logged here instead of being raised.
    log_exception(e)
    logging.error("Error during model training and evaluation.")



# Gradient Boosting Confusion Matrix:
# [[742297      0]
#  [     0  79168]]
# 2024-12-05 03:48:06,554 - INFO - Gradient Boosting Classification Report:
#               precision    recall  f1-score   support

#          0.0       1.00      1.00      1.00    742297
#          1.0       1.00      1.00      1.00     79168

#     accuracy                           1.00    821465
#    macro avg       1.00      1.00      1.00    821465
# weighted avg       1.00      1.00      1.00    821465

# 2024-12-05 03:48:06,554 - INFO - Evaluating Logistic Regression model...
# 2024-12-05 03:48:08,365 - INFO - Logistic Regression Confusion Matrix:
# [[742187    110]
#  [     0  79168]]
# 2024-12-05 03:48:08,365 - INFO - Logistic Regression Classification Report:
#               precision    recall  f1-score   support

#          0.0       1.00      1.00      1.00    742297
#          1.0       1.00      1.00      1.00     79168

#     accuracy                           1.00    821465
#    macro avg       1.00      1.00      1.00    821465
# weighted avg       1.00      1.00      1.00    821465

