In [None]:
import pandas as pd
import os

# Directory containing the yearly horses_<year>.csv and races_<year>.csv files
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
data_dir = 'D:/GUVI_Projects/My_Projects/new_horse/Horse'

# Function to report the column names of the horse and race files for a given year
def load_and_check_columns(year, data_dir):
    """Print the column names of ``horses_<year>.csv`` and ``races_<year>.csv``.

    Only the header row of each file is parsed (``nrows=0``); the column
    index is the same as for a full read, without loading every data row.

    Args:
        year: Year used to build the two file names.
        data_dir: Directory that is expected to contain both files.

    Returns:
        None. The column lists (or a "not found" message when either file
        is missing) are written to standard output.
    """
    horse_file = os.path.join(data_dir, f'horses_{year}.csv')
    race_file = os.path.join(data_dir, f'races_{year}.csv')

    if not (os.path.exists(horse_file) and os.path.exists(race_file)):
        print(f"Files for year {year} not found.")
        return

    # nrows=0 parses the header line only, which is all that is printed below.
    horses = pd.read_csv(horse_file, nrows=0)
    races = pd.read_csv(race_file, nrows=0)

    print(f"Columns in horses_{year}.csv: {horses.columns}")
    print(f"Columns in races_{year}.csv: {races.columns}")

# Inspect the header layout of every season's file pair, 1990 through 2020 inclusive
years_to_inspect = range(1990, 2021)
for year in years_to_inspect:
    load_and_check_columns(year, data_dir)


In [None]:
import pandas as pd
import os
import json

# Directory containing the yearly CSVs and forward.csv; the mappings folder and
# cleaned_final_dataset.csv are also written underneath it further down.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
data_dir = 'D:/GUVI_Projects/My_Projects/new_horse/Horse'

# Build one season's combined table by joining runners to their races on 'rid'
def load_and_merge_yearly_data(year, data_dir):
    """Return the horses/races join for one year, or an empty DataFrame.

    Args:
        year: Year used to build the ``horses_<year>.csv`` and
            ``races_<year>.csv`` file names.
        data_dir: Directory expected to hold both files.

    Returns:
        The inner join of the two files on ``rid``. An empty DataFrame is
        returned (and a message printed) when either file is missing or
        either file lacks an ``rid`` column.
    """
    paths = {
        'horses': os.path.join(data_dir, f'horses_{year}.csv'),
        'races': os.path.join(data_dir, f'races_{year}.csv'),
    }

    # Guard: both source files must be present.
    if not all(os.path.exists(path) for path in paths.values()):
        print(f"Files for year {year} not found. Skipping...")
        return pd.DataFrame()

    runners = pd.read_csv(paths['horses'], low_memory=False)
    meetings = pd.read_csv(paths['races'], low_memory=False)

    # Guard: the join key must exist on both sides.
    if 'rid' not in runners.columns or 'rid' not in meetings.columns:
        print(f"'rid' column not found in one of the files for year {year}. Skipping...")
        return pd.DataFrame()

    joined = runners.merge(meetings, on='rid')
    print(f"Successfully merged data for year {year}")
    return joined

# Load and merge data from 1990 to 2020
# The per-year frames are collected first and concatenated once; concatenating
# inside the loop would re-copy the growing frame on every iteration.
yearly_frames = []

for year in range(1990, 2021):
    yearly_data = load_and_merge_yearly_data(year, data_dir)
    if not yearly_data.empty:
        yearly_frames.append(yearly_data)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# year could be loaded.
all_data = pd.concat(yearly_frames, ignore_index=True) if yearly_frames else pd.DataFrame()

print("All yearly data loaded and merged.")
print("Columns in all_data:", all_data.columns)

# Read forward.csv from the data directory
forward_file = os.path.join(data_dir, 'forward.csv')
forward_data = pd.read_csv(forward_file, low_memory=False)
print("Forward data loaded.")
print("Columns in forward_data:", forward_data.columns)

# Left-join forward.csv onto the yearly data, but only when every join key is
# present on both sides; otherwise carry the yearly data through unchanged.
merge_columns = ['course', 'title', 'rclass', 'horseName', 'trainerName', 'jockeyName']
merge_keys_available = (
    set(merge_columns).issubset(all_data.columns)
    and set(merge_columns).issubset(forward_data.columns)
)
if not merge_keys_available:
    print("One of the necessary columns not found in one of the datasets. Merging skipped.")
    complete_data = all_data
else:
    complete_data = all_data.merge(forward_data, on=merge_columns, how='left')
    print("Forward data merged with all yearly data.")

# Keep only the columns of interest that actually exist in the frame
important_columns = [
    'rid', 'res_win', 'horseName', 'trainerName', 'jockeyName', 'date', 'course', 'age', 'weight',
    'distance', 'condition', 'position', 'decimalPrice', 'RPR', 'TR', 'OR'
]
available_columns = set(complete_data.columns)
existing_columns = [name for name in important_columns if name in available_columns]
complete_data = complete_data[existing_columns]
print("Selected important columns.")

# Fill gaps by carrying the previous row's value forward within each column
# NOTE(review): rows are not grouped by horse or race before filling, so a gap
# is filled from whichever row happens to precede it — confirm this is intended.
complete_data = complete_data.ffill()
print("Handled missing values using forward fill.")

# Re-render the date column as 'dd/mm/yy' text; unparseable values become missing
if 'date' in complete_data.columns:
    parsed_dates = pd.to_datetime(complete_data['date'], errors='coerce')
    complete_data['date'] = parsed_dates.dt.strftime('%d/%m/%y')
    print("Date column formatted to 'dd/mm/yy'.")

# Process distance column (convert "<miles>m<furlongs>f" text to numeric meters)
if 'distance' in complete_data.columns:
    def convert_distance_to_meters(distance):
        """Convert a distance such as "1m2f", "1m" or "5f" to meters.

        Args:
            distance: Raw cell value. Only strings are converted.

        Returns:
            Meters as a number for recognised strings; ``None`` when the
            miles/furlongs part is not a plain integer; the input unchanged
            for non-strings and for strings containing neither 'm' nor 'f'
            (those are handled by ``pd.to_numeric`` afterwards).
        """
        # One mile is exactly eight furlongs, so both factors come from the
        # same constant. (Previously a mile was 1609 m while eight furlongs
        # were 1609.344 m, making "1m" and "8f" disagree.)
        meters_per_furlong = 201.168
        meters_per_mile = 8 * meters_per_furlong

        if not isinstance(distance, str):
            return distance  # Leave unchanged if already numeric or non-string

        try:
            # Handle format like "1m2f" (1 mile 2 furlongs)
            if 'm' in distance and 'f' in distance:
                parts = distance.split('m')
                miles = int(parts[0])
                furlongs = int(parts[1].replace('f', ''))
                return miles * meters_per_mile + furlongs * meters_per_furlong
            # Handle format like "1m" (1 mile)
            if 'm' in distance:
                return int(distance.replace('m', '')) * meters_per_mile
            # Handle format like "2f" (2 furlongs)
            if 'f' in distance:
                return int(distance.replace('f', '')) * meters_per_furlong
        except ValueError:
            # Non-integer parts (e.g. fractional furlongs) cannot be parsed
            # by int(); None becomes NaN in the numeric column.
            return None
        return distance

    complete_data['distance'] = complete_data['distance'].apply(convert_distance_to_meters)
    # Anything still non-numeric at this point is coerced to NaN.
    complete_data['distance'] = pd.to_numeric(complete_data['distance'], errors='coerce')
    print("Distance column processed and converted to numeric meters.")

# Record a code -> label table for each categorical column and save it as JSON.
# The columns themselves are cast to pandas 'category' dtype; the CSV written
# at the end still contains the labels, not the integer codes.
categorical_columns = ['course', 'trainerName', 'jockeyName']
mappings = {}
mapping_dir = os.path.join(data_dir, 'mappings')
os.makedirs(mapping_dir, exist_ok=True)

for column in categorical_columns:
    if column not in complete_data.columns:
        continue

    complete_data[column] = complete_data[column].astype('category')
    category_labels = complete_data[column].cat.categories
    category_mapping = {code: label for code, label in enumerate(category_labels)}
    mappings[column] = category_mapping

    # One JSON file per column
    mapping_file = os.path.join(mapping_dir, f'{column}_mapping.json')
    with open(mapping_file, 'w') as f:
        f.write(json.dumps(category_mapping, indent=4))
    print(f"Mapping for '{column}' saved as '{mapping_file}'.")

# One combined JSON file holding every column's table
all_mappings_file = os.path.join(mapping_dir, 'all_mappings.json')
with open(all_mappings_file, 'w') as f:
    f.write(json.dumps(mappings, indent=4))
print(f"All mappings saved as '{all_mappings_file}'.")

# Write the cleaned dataset next to the source files
output_file = os.path.join(data_dir, 'cleaned_final_dataset.csv')
complete_data.to_csv(output_file, index=False)

print("Dataset creation completed and saved as 'cleaned_final_dataset.csv'.")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
import os
import json

# Annotate every bar of a bar chart with its height
def add_value_labels(ax):
    """Write each patch's height, rounded to a whole number, above the patch.

    Args:
        ax: Object exposing ``patches`` (items with ``get_x``, ``get_width``
            and ``get_height``) and an ``annotate`` method.
    """
    for bar in ax.patches:
        height = bar.get_height()
        center_x = bar.get_x() + bar.get_width() / 2.
        # The label is anchored at the bar top and shifted up by 10 points.
        ax.annotate(f"{height:.0f}",
                    (center_x, height),
                    ha='center', va='center',
                    xytext=(0, 10),
                    textcoords='offset points')

# Read the cleaned dataset produced earlier
df = pd.read_csv('D:/GUVI_Projects/My_Projects/new_horse/Horse/cleaned_final_dataset.csv', low_memory=True)

# Force the three name columns to text in a single cast
df = df.astype({'course': str, 'trainerName': str, 'jockeyName': str})

# Numeric columns: fill missing values with the column mean
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
imputer_numeric = SimpleImputer(strategy='mean')
numeric_filled = imputer_numeric.fit_transform(df[numeric_cols])

# Text columns: fill missing values with the most frequent value
categorical_cols = df.select_dtypes(include=['object']).columns
imputer_categorical = SimpleImputer(strategy='most_frequent')

df[numeric_cols] = numeric_filled
df[categorical_cols] = imputer_categorical.fit_transform(df[categorical_cols])

# Frequency tables (most frequent first) with explicit column names
course_counts = df['course'].value_counts().rename_axis('course').reset_index(name='count')
trainer_counts = df['trainerName'].value_counts().rename_axis('trainerName').reset_index(name='count')
jockey_counts = df['jockeyName'].value_counts().rename_axis('jockeyName').reset_index(name='count')

# Example mapping dictionaries (replace with actual mappings if available)
trainer_name_mapping = {9907: "Trainer A", 15408: "Trainer B", 15396: "Trainer C"}
jockey_name_mapping = {4783: "Jockey A", 4740: "Jockey B", 15750: "Jockey C"}


def _swap_id_for_name(raw_value, id_to_name):
    """Return the mapped name for an all-digit ID found in id_to_name, else raw_value."""
    if str(raw_value).isdigit() and int(raw_value) in id_to_name:
        return id_to_name[int(raw_value)]
    return raw_value


# Replace known numeric IDs with names; every other value is left as it is
trainer_counts['trainerName'] = trainer_counts['trainerName'].apply(
    lambda raw: _swap_id_for_name(raw, trainer_name_mapping)
)
jockey_counts['jockeyName'] = jockey_counts['jockeyName'].apply(
    lambda raw: _swap_id_for_name(raw, jockey_name_mapping)
)

# Save the example ID -> name mappings to a JSON file
mapping_dir = 'D:/GUVI_Projects/My_Projects/new_horse/Horse/mappings'
os.makedirs(mapping_dir, exist_ok=True)

# Prepare mappings for saving
mappings = {
    "trainerName": trainer_name_mapping,
    "jockeyName": jockey_name_mapping
}

# These are placeholder mappings, so they go to their own file. Writing them
# to 'all_mappings.json' would overwrite the category mappings generated when
# the dataset was built, which the model-training step reads back later.
mappings_file = os.path.join(mapping_dir, 'example_mappings.json')
with open(mappings_file, 'w') as f:
    json.dump(mappings, f, indent=4)
print(f"All mappings saved as '{mappings_file}'")

# Draw one labelled bar chart, bars ordered from largest to smallest value
def bar_plot(data, x, y, title, xlabel, ylabel, rotation=90):
    """Show a seaborn bar chart of ``y`` per ``x`` sorted by descending ``y``.

    Args:
        data: DataFrame holding the columns named by ``x`` and ``y``.
        x: Column used for the bar categories.
        y: Column used for the bar heights (also the sort key).
        title: Figure title.
        xlabel: Label for the x axis.
        ylabel: Label for the y axis.
        rotation: Rotation of the x tick labels in degrees.
    """
    ranked_categories = data.sort_values(y, ascending=False)[x]

    plt.figure(figsize=(12, 8))
    ax = sns.barplot(data=data, x=x, y=y, order=ranked_categories)

    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.xticks(rotation=rotation, fontsize=10)

    add_value_labels(ax)
    plt.tight_layout()
    plt.show()

# Top-20 courses
if course_counts.empty:
    print("No data available for 'course'.")
else:
    bar_plot(course_counts.head(20), 'course', 'count', 'Top 20 Courses by Count', 'Course', 'Count')

# Top-20 trainers
if trainer_counts.empty:
    print("No data available for 'trainerName'.")
else:
    bar_plot(trainer_counts.head(20), 'trainerName', 'count', 'Top 20 Trainers', 'Trainer Name', 'Count')

# Top-20 jockeys
if jockey_counts.empty:
    print("No data available for 'jockeyName'.")
else:
    bar_plot(jockey_counts.head(20), 'jockeyName', 'count', 'Top 20 Jockeys', 'Jockey Name', 'Count')

# Top-20 horses by number of rows
horse_counts = df['horseName'].value_counts().rename_axis('horseName').reset_index(name='count')
if horse_counts.empty:
    print("No data available for 'horseName'.")
else:
    bar_plot(horse_counts.head(20), 'horseName', 'count', 'Top 20 Horses by Count', 'Horse Name', 'Count')

# Every finishing position by number of rows
position_counts = df['position'].value_counts().rename_axis('position').reset_index(name='count')
if position_counts.empty:
    print("No data available for 'position'.")
else:
    bar_plot(position_counts, 'position', 'count', 'Race Positions by Count', 'Position', 'Count')

# --------------------------------------
# Enhanced EDA: Correlation Analysis
# --------------------------------------
from itertools import combinations

# Distribution plots for numeric columns
for col in numeric_cols:
    plt.figure(figsize=(10, 6))
    sns.histplot(df[col], kde=True, bins=30, color="blue")
    plt.title(f"Distribution of {col}")
    plt.xlabel(col)
    plt.ylabel("Frequency")
    plt.tight_layout()
    plt.show()

# Correlation Matrix and Heatmap
correlation_matrix = df[numeric_cols].corr()
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.tight_layout()
plt.show()

# Scatter Plots for Numeric Relationships
# A sample keeps the plots readable. The size is capped at the number of rows
# (DataFrame.sample raises when asked for more rows than exist) and the seed is
# fixed so the plots are reproducible between runs.
scatter_sample_size = min(1000, len(df))  # Adjust the sample size as necessary
subset_df = df.sample(n=scatter_sample_size, random_state=42)

# One figure per unordered pair of numeric columns
for x_col, y_col in combinations(numeric_cols, 2):
    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=subset_df, x=x_col, y=y_col, hue='position', palette="tab10", legend=False)
    plt.title(f"Scatter Plot: {x_col} vs {y_col}")
    plt.tight_layout()
    plt.show()

# --------------------------------------
# Feature Importance Analysis using Mutual Information
# --------------------------------------

# Encode categorical features for mutual information analysis
# NOTE(review): the encoded categorical columns are not passed to
# mutual_info_classif below (only numeric columns are scored) — confirm whether
# they were meant to be included. `le` is refitted per column, so after the
# loop it only holds the classes of the last column.
encoded_df = df.copy()
le = LabelEncoder()
for col in categorical_cols:
    encoded_df[col] = le.fit_transform(df[col])

# Specify a target column for MI analysis
target_column = 'res_win'  # Replace with your actual target column
if target_column in df.columns:
    # The target is itself a numeric column; it must not be scored as a
    # feature against itself.
    feature_cols = [col for col in numeric_cols if col != target_column]

    mi_scores = mutual_info_classif(encoded_df[feature_cols], encoded_df[target_column], discrete_features=False)
    mi_df = pd.DataFrame({'Feature': feature_cols, 'Mutual Information': mi_scores})
    mi_df = mi_df.sort_values(by='Mutual Information', ascending=False)

    # Plot Mutual Information Scores
    plt.figure(figsize=(12, 8))
    sns.barplot(data=mi_df, x='Mutual Information', y='Feature', palette='viridis')
    plt.title("Feature Importance using Mutual Information")
    plt.xlabel("Mutual Information Score")
    plt.ylabel("Features")
    plt.tight_layout()
    plt.show()
else:
    print("Target column not found for mutual information analysis.")

# --------------------------------------
# Interactive Plots using Plotly
# --------------------------------------

# One interactive top-20 bar chart per count table: courses, trainers, jockeys
for _counts, _column, _title in (
    (course_counts, 'course', 'Top 20 Courses by Count'),
    (trainer_counts, 'trainerName', 'Top 20 Trainers by Count'),
    (jockey_counts, 'jockeyName', 'Top 20 Jockeys by Count'),
):
    fig = px.bar(_counts.head(20), x=_column, y='count', title=_title)
    fig.show()


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
import pickle
import os
import json

# Paths
# NOTE(review): machine-specific absolute paths — adjust before running elsewhere.
dataset_path = 'D:/GUVI_Projects/My_Projects/new_horse/Horse/cleaned_final_dataset.csv'  # input CSV, read in chunks below
save_path = 'D:/GUVI_Projects/My_Projects/new_horse/Horse'  # base directory for all outputs
model_save_path = os.path.join(save_path, 'horse_model.pkl')  # pickled best estimator
best_params_path = os.path.join(save_path, 'best_params.json')  # best hyperparameters as JSON
evaluation_metrics_path = os.path.join(save_path, 'evaluation_metrics.txt')  # confusion matrix + classification report
mappings_file_path = os.path.join(save_path, 'mappings', 'all_mappings.json')  # category mappings loaded before encoding

# Shrink 32/64-bit float and integer columns to the smallest dtype that fits
def optimize_memory(df):
    """Downcast float and integer columns of ``df`` in place and return it.

    Columns whose dtype is float64/float32 are passed through
    ``pd.to_numeric(..., downcast='float')``; int64/int32 columns through
    ``downcast='integer'``. All other columns are left untouched.

    Args:
        df: DataFrame to modify.

    Returns:
        The same DataFrame object, for call chaining.
    """
    downcast_by_dtype = {
        'float64': 'float',
        'float32': 'float',
        'int64': 'integer',
        'int32': 'integer',
    }
    for column in df.columns:
        target_kind = downcast_by_dtype.get(str(df[column].dtype))
        if target_kind is not None:
            df[column] = pd.to_numeric(df[column], downcast=target_kind)
    return df

# Read the saved category mappings back from JSON
with open(mappings_file_path, 'r') as f:
    mappings = json.loads(f.read())

# Open the dataset as a lazy chunk iterator (rows are read when it is iterated)
chunk_size = 100000
chunks = pd.read_csv(dataset_path, chunksize=chunk_size, low_memory=False)

# Peek at the first five rows: numeric name columns mean they are already encoded
sample_chunk = pd.read_csv(dataset_path, nrows=5)
encoded_dtype_names = ['int64', 'float64']
is_trainer_encoded = sample_chunk['trainerName'].dtype in encoded_dtype_names
is_jockey_encoded = sample_chunk['jockeyName'].dtype in encoded_dtype_names

if not (is_trainer_encoded and is_jockey_encoded):
    print("Trainer and Jockey mappings are not applied. Encoding now.")
else:
    print("Trainer and Jockey mappings are already applied in the dataset. Skipping encoding step.")
# Process and combine chunks
def _name_to_code_lookup(mapping):
    """Return a ``name -> integer code`` dict for a mapping stored in either direction.

    A mapping whose keys are all digit strings is treated as ``code -> name``
    (JSON turns integer keys into digit strings) and is inverted. Any other
    mapping is assumed to already be ``name -> code``.
    """
    if mapping and all(str(key).isdigit() for key in mapping):
        return {str(name): int(code) for code, name in mapping.items()}
    return {str(name): code for name, code in mapping.items()}


def _encode_names(series, lookup):
    """Replace each value of ``series`` by its code in ``lookup``; unknown values become -1."""
    return series.astype(str).map(lookup).fillna(-1).astype('int64')


# Built once, outside the loop. Looking a *name* up directly in a code -> name
# table never matches, which turned every trainer/jockey/course into -1.
category_lookups = {
    column: _name_to_code_lookup(mappings.get(column, {}))
    for column in ('trainerName', 'jockeyName', 'course')
}

processed_chunks = []
for chunk in chunks:
    # Drop irrelevant columns
    chunk.drop(['horseName', 'date', 'rid'], axis=1, inplace=True)

    # Apply mappings only if not already encoded
    if not is_trainer_encoded:
        chunk['trainerName'] = _encode_names(chunk['trainerName'], category_lookups['trainerName'])
    if not is_jockey_encoded:
        chunk['jockeyName'] = _encode_names(chunk['jockeyName'], category_lookups['jockeyName'])

    # Encode course the same way, skipping it when it is already numeric
    if 'course' in chunk.columns and not pd.api.types.is_numeric_dtype(chunk['course']):
        chunk['course'] = _encode_names(chunk['course'], category_lookups['course'])

    # Feature Engineering
    # NOTE(review): 'speed_ratio' is derived from the finishing position, which
    # is only known after the race — verify it is excluded from model features.
    chunk['speed_ratio'] = chunk['distance'] / (chunk['position'] + 1)  # Example feature
    chunk['success_rate'] = chunk['RPR'] / (chunk['TR'] + 1)  # +1 avoids dividing by TR == 0

    # Replace infinities so they are treated as missing values below
    chunk.replace([np.inf, -np.inf], np.nan, inplace=True)

    # Separate numeric and non-numeric columns
    numeric_cols = chunk.select_dtypes(include=[np.number]).columns
    non_numeric_cols = chunk.select_dtypes(exclude=[np.number]).columns

    # Handle missing values separately (the median is computed per chunk)
    chunk[numeric_cols] = chunk[numeric_cols].fillna(chunk[numeric_cols].median())
    chunk[non_numeric_cols] = chunk[non_numeric_cols].fillna("Unknown")

    # Optimize memory
    chunk = optimize_memory(chunk)

    processed_chunks.append(chunk)

# Combine processed chunks
data = pd.concat(processed_chunks, ignore_index=True)

# Drop constant columns (a single distinct value carries no information)
low_variance_cols = [col for col in data.columns if data[col].nunique() == 1]
data.drop(columns=low_variance_cols, inplace=True)

# Define target and features
# 'position' is the race outcome itself and 'speed_ratio' is computed from it,
# so keeping either as a feature lets the model read the answer (the recorded
# run shows a perfect confusion matrix). Both are removed from the features.
# NOTE(review): 'RPR' and 'TR' may also be post-race ratings — confirm.
leakage_columns = ['position', 'speed_ratio']
X = data.drop(['res_win'], axis=1).drop(columns=leakage_columns, errors='ignore')
y = data['res_win']

# Split first so that the scaler never sees test rows; stratify keeps the
# win/no-win ratio equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train = X_train.copy()
X_test = X_test.copy()

# Scale features (numeric columns only): fit on train, apply to both parts
scaler = StandardScaler()
numeric_features = X.select_dtypes(include=[np.number]).columns
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])

# Random Forest with a randomized hyperparameter search (50 candidates, 3-fold CV)
rf = RandomForestClassifier(random_state=42, class_weight='balanced')
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

search_settings = dict(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=50,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
random_search = RandomizedSearchCV(**search_settings)
random_search.fit(X_train, y_train)

# Winner of the search: refitted model and its parameter set
best_params = random_search.best_params_
best_rf = random_search.best_estimator_

# Report the parameters and persist them as JSON
print("Best Parameters:", best_params)
with open(best_params_path, 'w') as f:
    f.write(json.dumps(best_params, indent=4))
print(f"Best parameters saved to: {best_params_path}")

# Score the winner on the held-out split
y_pred = best_rf.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Confusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)

# Persist the same metrics as plain text
metrics_text = "".join([
    "Confusion Matrix:\n",
    np.array2string(conf_matrix),
    "\n\nClassification Report:\n",
    class_report,
])
with open(evaluation_metrics_path, 'w') as f:
    f.write(metrics_text)
print(f"Evaluation metrics saved to: {evaluation_metrics_path}")

# Pickle the fitted model
with open(model_save_path, 'wb') as f:
    f.write(pickle.dumps(best_rf))

print("Model saved to:", model_save_path)

# Fitting 3 folds for each of 50 candidates, totalling 150 fits
# Best Parameters: {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_depth': 30, 'bootstrap': False}
# Best parameters saved to: D:/GUVI_Projects/My_Projects/new_horse/Horse\best_params.json
# Confusion Matrix:
#  [[741781      0]
#  [     0  79684]]

# Classification Report:
#                precision    recall  f1-score   support

#          0.0       1.00      1.00      1.00    741781
#          1.0       1.00      1.00      1.00     79684

#     accuracy                           1.00    821465
#    macro avg       1.00      1.00      1.00    821465
# weighted avg       1.00      1.00      1.00    821465

# Evaluation metrics saved to: D:/GUVI_Projects/My_Projects/new_horse/Horse\evaluation_metrics.txt
# Model saved to: D:/GUVI_Projects/My_Projects/new_horse/Horse\horse_model.pkl

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
import pickle
import os
import logging
import traceback
import json

# Set up logging: DEBUG and above go to script_log.txt inside log_dir
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
log_dir = 'D:/GUVI_Projects/My_Projects/new_horse/Horse'
os.makedirs(log_dir, exist_ok=True)
log_file = os.path.join(log_dir, 'script_log.txt')
logging.basicConfig(filename=log_file, level=logging.DEBUG,
                    format='%(asctime)s - %(levelname)s - %(message)s')

# Paths (all inside log_dir)
dataset_path = os.path.join(log_dir, 'cleaned_final_dataset.csv')  # read as input and rewritten by clean_dataset
cleaned_dataset_path = os.path.join(log_dir, 'cleaned_dataset_with_mappings.csv')  # dataset after categorical encoding
mappings_file_path = os.path.join(log_dir, 'mappings.json')  # value -> code tables written by create_mappings
gb_model_save_path = os.path.join(log_dir, 'horse_gb_model.pkl')  # pickled gradient boosting model
lr_model_save_path = os.path.join(log_dir, 'horse_lr_model.pkl')  # pickled logistic regression model
linear_model_save_path = os.path.join(log_dir, 'horse_linear_model.pkl')  # pickled ridge regression model
scaler_save_path = os.path.join(log_dir, 'scaler.pkl')  # pickled StandardScaler fitted on the training split

# Write an exception and its traceback to the log at ERROR level
def log_exception(exc):
    """Log ``exc`` as two ERROR records: a summary line, then the full traceback.

    Args:
        exc: The exception instance to record.
    """
    trace_text = "".join(traceback.format_exception(None, exc, exc.__traceback__))
    logging.error("Exception occurred: %s", exc)
    logging.error("%s", trace_text)

# Function to clean and save the dataset
def _parse_date_column(series):
    """Parse a column of dates, trying ISO, then dd/mm/yy, then auto-detection.

    With ``errors='coerce'`` a non-matching format never raises; it silently
    produces NaT. Each fallback is therefore applied to the values that are
    still unparsed, instead of relying on an exception handler.
    """
    if pd.api.types.is_datetime64_any_dtype(series):
        return series

    # Work on text, keeping genuinely missing cells missing.
    text = series.astype(str).where(series.notna())

    parsed = pd.to_datetime(text, format='%Y-%m-%d', errors='coerce')

    unresolved = parsed.isna() & text.notna()
    if unresolved.any():
        # 'dd/mm/yy' is the layout written by the dataset-creation step.
        parsed = parsed.fillna(
            pd.to_datetime(text[unresolved], format='%d/%m/%y', errors='coerce')
        )
        unresolved = parsed.isna() & text.notna()

    if unresolved.any():
        # Last resort: let pandas infer the format.
        parsed = parsed.fillna(
            pd.to_datetime(text[unresolved], errors='coerce', dayfirst=True)
        )
    return parsed


def clean_dataset(data):
    """Fill missing values, add engineered features and save the dataset.

    Date columns (datetime dtype, or 'date' in the name) are parsed and gaps
    are filled with the earliest date; text columns are filled with
    'Unknown'; every other column is coerced to numeric and filled with its
    median. 'speed_ratio' and 'success_rate' are added when their source
    columns exist. The result is written to the module-level ``dataset_path``.

    Args:
        data: DataFrame to clean; it is modified in place.

    Returns:
        The cleaned DataFrame (the same object that was passed in).

    Raises:
        Exception: Any error is logged through ``log_exception`` and re-raised.
    """
    try:
        print("Cleaning dataset...")
        for col in data.columns:
            if pd.api.types.is_datetime64_any_dtype(data[col]) or 'date' in col.lower():
                print(f"Cleaning date column: {col}")
                data[col] = _parse_date_column(data[col])
                min_date = data[col].min()
                data[col] = data[col].fillna(min_date)
            elif pd.api.types.is_object_dtype(data[col]) or pd.api.types.is_string_dtype(data[col]):
                # Covers both object and dedicated string dtypes.
                print(f"Cleaning categorical column: {col}")
                data[col] = data[col].fillna('Unknown')
            else:
                print(f"Cleaning numeric column: {col}")
                data[col] = pd.to_numeric(data[col], errors='coerce')
                median_value = data[col].median()
                data[col] = data[col].fillna(median_value)

        # Feature Engineering
        # NOTE(review): 'speed_ratio' depends on the finishing position, which
        # is only known after the race — verify it is not used as a model input.
        if 'distance' in data.columns and 'position' in data.columns:
            data['speed_ratio'] = data['distance'] / (data['position'] + 1)
        if 'RPR' in data.columns and 'TR' in data.columns:
            data['success_rate'] = data['RPR'] / (data['TR'] + 1)

        print("Dataset cleaned successfully. Saving...")
        # NOTE(review): dataset_path is also the file the data is loaded from,
        # so this overwrites the input — confirm that is intended.
        data.to_csv(dataset_path, index=False)
        logging.info("Dataset cleaned and saved successfully.")
        return data
    except Exception as e:
        log_exception(e)
        raise

# Function to create mappings for categorical columns
def create_mappings(data, categorical_columns):
    """Integer-encode the given columns and save the value -> code tables as JSON.

    Each column is converted to text, every distinct text value gets a code
    in order of first appearance, and the column is replaced by those codes.
    All tables are written to the module-level ``mappings_file_path``.

    Args:
        data: DataFrame to encode; it is modified in place.
        categorical_columns: Names of the columns to encode.

    Returns:
        The encoded DataFrame (the same object that was passed in).

    Raises:
        Exception: Any error is logged through ``log_exception`` and re-raised.
    """
    try:
        mappings = {}
        for col in categorical_columns:
            print(f"Creating mappings for column: {col}")
            # The same text form is used for building the table and for the
            # lookup; mapping the raw values would turn every non-string
            # value (numbers, NaN) into NaN because the keys are strings.
            as_text = data[col].astype(str)
            unique_mapping = {val: idx for idx, val in enumerate(as_text.unique())}
            mappings[col] = unique_mapping
            data[col] = as_text.map(unique_mapping)
        with open(mappings_file_path, 'w') as f:
            json.dump(mappings, f, indent=4)
        logging.info("Mappings created and saved successfully.")
        return data
    except Exception as e:
        log_exception(e)
        raise

# Read the dataset, clean it, encode the categorical columns and save the result
try:
    raw_data = pd.read_csv(dataset_path)
    cleaned_data = clean_dataset(raw_data)
    categorical_columns = ['horseName', 'trainerName', 'jockeyName', 'course']
    data = create_mappings(cleaned_data, categorical_columns)
    data.to_csv(cleaned_dataset_path, index=False)
except Exception as e:
    # Record the failure in the log file, then let it propagate
    log_exception(e)
    raise

# Model training and evaluation
try:
    # Define target and features
    # 'position' is the race outcome itself and 'speed_ratio' is computed from
    # it, so keeping either as a feature lets the models read the answer (the
    # recorded run shows perfect scores). Both are removed from the features.
    # NOTE(review): 'RPR' and 'TR' may also be post-race ratings — confirm.
    leakage_columns = ['position', 'speed_ratio']
    X = data.drop(['res_win'], axis=1).drop(columns=leakage_columns, errors='ignore')
    y = data['res_win']

    # Identify numeric columns
    numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
    X_numeric = X[numeric_features]

    # Log sample sizes
    sample_summary = (
        f"Dataset samples: {len(data)}, Training samples: {int(len(X_numeric) * 0.8)}, "
        f"Testing samples: {int(len(X_numeric) * 0.2)}"
    )
    print(sample_summary)
    logging.info(sample_summary)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X_numeric, y, test_size=0.2, random_state=42, stratify=y
    )

    # Scaling: fitted on the training split only, then saved for reuse
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    with open(scaler_save_path, 'wb') as f:
        pickle.dump(scaler, f)

    # Balancing the dataset with undersampling followed by SMOTETomek
    smote_tomek = SMOTETomek(random_state=42)
    undersampler = RandomUnderSampler(random_state=42)

    # The hyperparameter search wraps the whole resampling pipeline, so
    # resampling is redone on the training part of every CV fold and each
    # validation fold keeps only real, unresampled rows. With the search as
    # the last pipeline step, the folds were scored on resampled/synthetic
    # data. This costs one resampling pass per fit.

    # Gradient Boosting Classifier
    gb = GradientBoostingClassifier(random_state=42)
    gb_param_grid = {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5],
        'subsample': [0.8, 1.0],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 3]
    }
    gb_pipeline = Pipeline(steps=[
        ('undersample', undersampler),
        ('smote', smote_tomek),
        ('classifier', gb)
    ])
    gb_random_search = RandomizedSearchCV(
        gb_pipeline,
        # Pipeline parameters are addressed as '<step>__<parameter>'.
        {f'classifier__{name}': values for name, values in gb_param_grid.items()},
        n_iter=10, cv=5, random_state=42, n_jobs=-1, verbose=2
    )
    gb_random_search.fit(X_train_scaled, y_train)
    # Keep (and pickle) the bare classifier; the samplers are training-only.
    best_gb_model = gb_random_search.best_estimator_.named_steps['classifier']
    print("\nBest Parameters for Gradient Boosting:")
    print({name.split('__', 1)[1]: value
           for name, value in gb_random_search.best_params_.items()})
    with open(gb_model_save_path, 'wb') as f:
        pickle.dump(best_gb_model, f)

    # Logistic Regression
    lr = LogisticRegression(random_state=42, max_iter=1000)
    lr_param_grid = {
        'C': [0.01, 0.1, 1, 10],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga']
    }
    lr_pipeline = Pipeline(steps=[
        ('undersample', undersampler),
        ('smote', smote_tomek),
        ('classifier', lr)
    ])
    lr_random_search = RandomizedSearchCV(
        lr_pipeline,
        {f'classifier__{name}': values for name, values in lr_param_grid.items()},
        n_iter=10, cv=5, random_state=42, n_jobs=-1, verbose=2
    )
    lr_random_search.fit(X_train_scaled, y_train)
    best_lr_model = lr_random_search.best_estimator_.named_steps['classifier']
    print("\nBest Parameters for Logistic Regression:")
    print({name.split('__', 1)[1]: value
           for name, value in lr_random_search.best_params_.items()})
    with open(lr_model_save_path, 'wb') as f:
        pickle.dump(best_lr_model, f)

    # Ridge Regression (L2 regularization), fitted on the unresampled training split
    ridge_model = Ridge(alpha=1.0)  # Regularization strength
    ridge_model.fit(X_train_scaled, y_train)
    with open(linear_model_save_path, 'wb') as f:
        pickle.dump(ridge_model, f)

    logging.info("All models trained and saved successfully.")
except Exception as e:
    log_exception(e)
    raise

# Score the three fitted models on the held-out test split
try:
    # Gradient Boosting: confusion matrix and per-class report
    y_pred_gb = best_gb_model.predict(X_test_scaled)
    print("\nConfusion Matrix for Gradient Boosting:")
    gb_confusion = confusion_matrix(y_test, y_pred_gb)
    print(gb_confusion)
    print("\nClassification Report for Gradient Boosting:")
    gb_report = classification_report(y_test, y_pred_gb)
    print(gb_report)

    # Logistic Regression: confusion matrix and per-class report
    y_pred_lr = best_lr_model.predict(X_test_scaled)
    print("\nConfusion Matrix for Logistic Regression:")
    lr_confusion = confusion_matrix(y_test, y_pred_lr)
    print(lr_confusion)
    print("\nClassification Report for Logistic Regression:")
    lr_report = classification_report(y_test, y_pred_lr)
    print(lr_report)

    # Ridge Regression: regression metrics against the same target
    y_pred_ridge = ridge_model.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred_ridge)
    r2 = r2_score(y_test, y_pred_ridge)
    print("\nRidge Regression Performance:")
    print(f"Mean Squared Error: {mse:.4f}")
    print(f"R² Score: {r2:.4f}")
except Exception as e:
    # Record the failure in the log file, then let it propagate
    log_exception(e)
    raise
# Best Parameters for Gradient Boosting:
# {'subsample': 0.8, 'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 5, 'learning_rate': 0.1}
# Fitting 5 folds for each of 10 candidates, totalling 50 fits

# Best Parameters for Logistic Regression:
# {'solver': 'liblinear', 'penalty': 'l1', 'C': 0.01}

# Confusion Matrix for Gradient Boosting:
# [[742297      0]
#  [     0  79168]]

# Classification Report for Gradient Boosting:
#               precision    recall  f1-score   support

#          0.0       1.00      1.00      1.00    742297
#          1.0       1.00      1.00      1.00     79168

#     accuracy                           1.00    821465
#    macro avg       1.00      1.00      1.00    821465
# weighted avg       1.00      1.00      1.00    821465


# Confusion Matrix for Logistic Regression:
# [[742187    110]
#  [     0  79168]]

# Classification Report for Logistic Regression:
#               precision    recall  f1-score   support

#          0.0       1.00      1.00      1.00    742297
#          1.0       1.00      1.00      1.00     79168

#     accuracy                           1.00    821465
#    macro avg       1.00      1.00      1.00    821465
# weighted avg       1.00      1.00      1.00    821465


# Ridge Regression Performance:
# Mean Squared Error: 0.0406
# R² Score: 0.5340
