In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

In [None]:
df_train = pd.read_csv("/content/drive/MyDrive/AmEx/processed_train_f.csv")
test_data = pd.read_csv("/content/drive/MyDrive/AmEx/processed_test_f.csv")
round2_data = pd.read_csv("/content/drive/MyDrive/AmEx/round2_processed_f.csv")
print(df_train.shape, test_data.shape, round2_data.shape)

(948, 197) (271, 196) (207, 196)


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Assuming df_train is already defined

# Columns to exclude from normalization
exclude_columns = ['team1_id', 'match_id', 'team2_id', 'ground_id', 'winner']

# Identify numerical columns
numerical_columns = df_train.select_dtypes(include=[np.number]).columns.tolist()

# Select columns to normalize (excluding specified columns)
columns_to_normalize = [col for col in numerical_columns if col not in exclude_columns]

# Initialize MinMaxScaler
scaler = MinMaxScalar()

# Fit and transform the data for normalization
df_train[columns_to_normalize] = scaler.fit_transform(df_train[columns_to_normalize])

# Apply log transform to the normalized columns
df_train[columns_to_normalize] = np.log1p(df_train[columns_to_normalize])

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Columns to exclude from normalization
exclude_columns = ['team1_id', 'match_id', 'team2_id', 'ground_id', 'winner']

# Identify numerical columns
numerical_columns = test_data.select_dtypes(include=[np.number]).columns.tolist()

# Select columns to normalize (excluding specified columns)
columns_to_normalize = [col for col in numerical_columns if col not in exclude_columns]

# Initialize MinMaxScaler
scaler = MinMaxScaler()

# Fit and transform the data for normalization
test_data[columns_to_normalize] = scaler.fit_transform(test_data[columns_to_normalize])

# Apply log transform to the normalized columns
test_data[columns_to_normalize] = np.log1p(test_data[columns_to_normalize])


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ks_2samp

def check_distribution(train_df, test_df, features):
    similar_features = []
    different_features = []

    for feature in features:
        if feature == 'winner':
            continue  # Skip the target column
        train_data = train_df[feature]
        test_data = test_df[feature]
        stat, p_value = ks_2samp(train_data, test_data)

        if p_value < 0.05:  # Statistical significance threshold
            print(f"Feature {feature} has different distributions in train and test sets (p-value: {p_value:.5f})")
            different_features.append(feature)
        else:
            similar_features.append(feature)

    return similar_features, different_features

similar_features, different_features = check_distribution(df_train, test_data, df_train.columns)

Feature match id has different distributions in train and test sets (p-value: 0.00000)
Feature toss winner has different distributions in train and test sets (p-value: 0.00000)
Feature toss decision has different distributions in train and test sets (p-value: 0.00000)
Feature lighting has different distributions in train and test sets (p-value: 0.00000)
Feature team_count_50runs_last15 has different distributions in train and test sets (p-value: 0.00000)
Feature team_winp_last5 has different distributions in train and test sets (p-value: 0.00000)
Feature team1only_avg_runs_last15 has different distributions in train and test sets (p-value: 0.00000)
Feature ground_avg_runs_last15 has different distributions in train and test sets (p-value: 0.00000)
Feature avg_inning1_runs_venue has different distributions in train and test sets (p-value: 0.00001)
Feature avg_inning1_runs_venue_last5 has different distributions in train and test sets (p-value: 0.00003)
Feature avg_inning2_runs_venue has

In [None]:
print(similar_features)
print(len(similar_features))
print(different_features)
print(len(different_features))

['team1', 'team1_id', 'team1_roster_ids', 'team2', 'team2_id', 'team2_roster_ids', 'venue', 'city', 'match_dt', 'series_name', 'season', 'ground_id', 'team1_winp_team2_last15', 'avg_inning1_wickets_venue', 'avg_inning1_wickets_venue_last5', 'avg_inning2_wickets_venue', 'avg_inning2_wickets_venue_last5', 'victory_by_wickets_team1', 'victory_by_wickets_team1_last5', 'inning1_avg_wickets_team1', 'inning1_avg_wickets_team2', 'inning1_avg_wickets_team1_last5', 'inning1_avg_wickets_team2_last5', 'inning2_avg_wickets_team1', 'inning2_avg_wickets_team1_last5', 'inning2_avg_wickets_team2_last5', 'team1_won_in_past', 'team1_won_in_past_last5', 'team2_won_in_past_last5', 'team2_win_lighting1', 'team1_win_lighting1_last5', 'team2_win_lighting1_last5', 'team1_win_lighting2', 'team2_win_lighting2', 'team1_win_lighting2_last5', 'team2_win_lighting2_last5', 'team1_day match_wickets_avg', 'team1_night match_wickets_avg', 'team1_day/night match_wickets_avg', 'team2_night match_wickets_avg', 'team2_day/n

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import entropy

def calculate_kl_divergence(train_feature, test_feature):
    # Compute value counts and normalize
    train_counts = train_feature.value_counts(normalize=True)
    test_counts = test_feature.value_counts(normalize=True)

    # Align the indices to handle missing values if any
    all_index = train_counts.index.union(test_counts.index)
    train_counts = train_counts.reindex(all_index, fill_value=0)
    test_counts = test_counts.reindex(all_index, fill_value=0)

    # Calculate KL divergence
    kl_div = entropy(train_counts, test_counts)

    return kl_div

def remove_different_distribution_features(df_train, df_test, threshold=0.1):
    features_to_remove = []

    # Iterate through each feature
    for feature in df_train.columns:
        # Calculate KL divergence for the feature
        if feature == 'winner':
          continue
        kl_div = calculate_kl_divergence(df_train[feature], df_test[feature])

        # Compare with threshold
        if kl_div > threshold:
            features_to_remove.append(feature)

    # Remove features from both dataframes
    df_train_filtered = df_train.drop(columns=features_to_remove)
    df_test_filtered = df_test.drop(columns=features_to_remove)

    return df_train_filtered, df_test_filtered, features_to_remove

# Example usage:
df_train_filtered, df_test_filtered, removed_features = remove_different_distribution_features(df_train, test_data, threshold=0.1)

# Printing removed features
print("Removed features with significantly different distributions:")
print(removed_features)
print(len(removed_features))

Removed features with significantly different distributions:
['match id', 'team1', 'team1_id', 'team1_roster_ids', 'team2', 'team2_id', 'team2_roster_ids', 'toss winner', 'toss decision', 'venue', 'city', 'match_dt', 'lighting', 'series_name', 'ground_id', 'team_count_50runs_last15', 'team_winp_last5', 'team1only_avg_runs_last15', 'ground_avg_runs_last15', 'avg_inning1_runs_venue', 'avg_inning1_runs_venue_last5', 'avg_inning2_runs_venue', 'avg_inning2_runs_venue_last5', 'avg_inning1_wickets_venue', 'avg_inning1_wickets_venue_last5', 'avg_inning2_wickets_venue', 'avg_inning2_wickets_venue_last5', 'victory_by_runs_team1', 'victory_by_runs_team2', 'victory_by_wickets_team1', 'victory_by_wickets_team2', 'victory_by_runs_team1_last5', 'victory_by_runs_team2_last5', 'victory_by_wickets_team1_last5', 'victory_by_wickets_team2_last5', 'inning1_avg_runs_team1', 'inning1_avg_runs_team2', 'inning1_avg_runs_team1_last5', 'inning1_avg_runs_team2_last5', 'inning2_avg_runs_team1', 'inning2_avg_runs_t

In [None]:
!pip install catboost

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier

# Assuming df_train and 'winner' column exist in your dataset

# Select numerical columns
numerical_cols = df_train.select_dtypes(include=['float64', 'int64']).columns

# Exclude the target column from numerical_cols if needed
if 'winner' in numerical_cols:
    numerical_cols = numerical_cols.drop('winner')

# Split the data into train, test, and validation sets with stratification
X = df_train[numerical_cols].drop(columns=['team1_id', 'match id', 'team2_id', 'ground_id'])
y = df_train['winner']

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Initialize the CatBoost classifier with default parameters
model = CatBoostClassifier(verbose=False)

# Train the model on the training set
model.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=False)

# Perform cross-validation on the validation set and report min, max, and average scores
cv_scores = cross_val_score(model, X_val, y_val, cv=5, scoring='accuracy')
print(f"CV Scores - Avg: {cv_scores.mean()}, Min: {cv_scores.min()}, Max: {cv_scores.max()}")

# Evaluate the model on the test set
y_pred_test = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred_test)
print(f"Test Accuracy: {test_accuracy}")

feature_importances = model.get_feature_importance()
ft_imp_cat = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})

CV Scores - Avg: 0.8236453201970443, Min: 0.75, Max: 0.9310344827586207
Test Accuracy: 0.8741258741258742


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

# Assuming df_train and 'winner' column exist in your dataset

# Select numerical columns
numerical_cols = df_train.select_dtypes(include=['float64', 'int64']).columns

# Exclude the target column from numerical_cols if needed
if 'winner' in numerical_cols:
    numerical_cols = numerical_cols.drop('winner')

# Exclude additional columns if necessary
numerical_cols = numerical_cols.drop(['team1_id', 'team2_id', 'ground_id', 'match id'])

# Split the data into train, test, and validation sets with stratification
X = df_train[numerical_cols]
y = df_train['winner']

# Split the data with stratification
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Initialize the XGBoost classifier with default parameters
model = XGBClassifier()

# Train the model on the training set
model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

# Perform cross-validation on the validation set and report min, max, and average scores
cv_scores = cross_val_score(model, X_val, y_val, cv=5, scoring='accuracy')
print(f"CV Scores - Avg: {cv_scores.mean()}, Min: {cv_scores.min()}, Max: {cv_scores.max()}")

# Evaluate the model on the test set
y_pred_test = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred_test)
print(f"Test Accuracy: {test_accuracy}")

# Get feature importances
feature_importances = model.feature_importances_
ft_imp_xg = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})

CV Scores - Avg: 0.810344827586207, Min: 0.7586206896551724, Max: 0.8928571428571429
Test Accuracy: 0.8811188811188811


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
import lightgbm as lgb

# Assuming df_train and 'winner' column exist in your dataset

# Select numerical columns
numerical_cols = df_train.select_dtypes(include=['float64', 'int64']).columns

# Exclude the target column from numerical_cols if needed
if 'winner' in numerical_cols:
    numerical_cols = numerical_cols.drop('winner')

# Exclude additional columns if necessary
numerical_cols = numerical_cols.drop(['team1_id', 'team2_id', 'ground_id', 'match id'])

# Split the data into train, test, and validation sets with stratification
X = df_train[numerical_cols]
y = df_train['winner']

# Split the data with stratification
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Initialize the LightGBM classifier with default parameters
model = lgb.LGBMClassifier()

# Train the model on the training set
model.fit(X_train, y_train, eval_set=[(X_val, y_val)])

# Perform cross-validation on the validation set and report min, max, and average scores
cv_scores = cross_val_score(model, X_val, y_val, cv=5, scoring='accuracy')
print(f"CV Scores - Avg: {cv_scores.mean()}, Min: {cv_scores.min()}, Max: {cv_scores.max()}")

# Evaluate the model on the test set
y_pred_test = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred_test)
print(f"Test Accuracy : {test_accuracy}")

# Get feature importances
feature_importances_light = model.feature_importances_
ft_imp_light = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances_light})

[LightGBM] [Info] Number of positive: 329, number of negative: 334
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007276 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22919
[LightGBM] [Info] Number of data points in the train set: 663, number of used features: 178
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.496229 -> initscore=-0.015083
[LightGBM] [Info] Start training from score -0.015083
[LightGBM] [Info] Number of positive: 57, number of negative: 56
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000668 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4895
[LightGBM] [Info] Number of data points in the train set: 113, number of used features: 178
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.504425 -> initscore=0.017700
[LightGBM] [Info] Start training from score 0.017700
[LightGBM] [Info] Number 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier


# Select numerical columns
numerical_cols = df_train.select_dtypes(include=['float64', 'int64']).columns

# Exclude the target column from numerical_cols if needed
if 'winner' in numerical_cols:
    numerical_cols = numerical_cols.drop('winner')

# Exclude additional columns if necessary
numerical_cols = numerical_cols.drop(['team1_id', 'team2_id', 'ground_id', 'match id'])

# Split the data into train, test, and validation sets with stratification
X = df_train[numerical_cols]
y = df_train['winner']

# Split the data with stratification
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Initialize the GBM classifier with default parameters
model = GradientBoostingClassifier()

# Train the model on the training set
model.fit(X_train, y_train)

# Perform cross-validation on the validation set and report min, max, and average scores
cv_scores = cross_val_score(model, X_val, y_val, cv=5, scoring='accuracy')
print(f"CV Scores - Avg: {cv_scores.mean()}, Min: {cv_scores.min()}, Max: {cv_scores.max()}")

# Evaluate the model on the test set
y_pred_test = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred_test)
print(f"Test Accuracy  : {test_accuracy}")

# Get feature importances
feature_importances_gbm = model.feature_importances_
ft_imp_gbm = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances_gbm})

CV Scores - Avg: 0.5637931034482758, Min: 0.5, Max: 0.6785714285714286
Test Accuracy  : 0.4825174825174825


In [None]:
from sklearn.preprocessing import MinMaxScaler
def ft_imp_avg(ft_imp_cat, ft_imp_xg, ft_imp_light, ft_imp_gbm):
    # Create a MinMaxScaler instance
    scaler = MinMaxScaler()

    # Normalize the importance values in each dataframe
    ft_imp_cat['Normalized_Importance_cat'] = scaler.fit_transform(ft_imp_cat[['Importance']])
    ft_imp_xg['Normalized_Importance_xg'] = scaler.fit_transform(ft_imp_xg[['Importance']])
    ft_imp_light['Normalized_Importance_light'] = scaler.fit_transform(ft_imp_light[['Importance']])
    ft_imp_gbm['Normalized_Importance_gbm'] = scaler.fit_transform(ft_imp_gbm[['Importance']])

    # Merge the dataframes on the 'Feature' column
    merged_df = ft_imp_cat[['Feature', 'Normalized_Importance_cat']].merge(
        ft_imp_xg[['Feature', 'Normalized_Importance_xg']], on='Feature'
    ).merge(
        ft_imp_light[['Feature', 'Normalized_Importance_light']], on='Feature'
    ).merge(
        ft_imp_gbm[['Feature', 'Normalized_Importance_gbm']], on='Feature'
    )

    # Calculate the average normalized importance for each feature
    merged_df['Avg_Importance'] = merged_df[['Normalized_Importance_cat', 'Normalized_Importance_xg', 'Normalized_Importance_light', 'Normalized_Importance_gbm']].mean(axis=1)

    # Return a dataframe with the average feature importance
    avg_ft_imp = merged_df[['Feature', 'Avg_Importance']].sort_values(by='Avg_Importance', ascending=False)

    return avg_ft_imp

feat_imp_average = ft_imp_avg(ft_imp_cat, ft_imp_xg, ft_imp_light, ft_imp_gbm)
print(feat_imp_average)

                                               Feature  Avg_Importance
148                          Sum_batting-bowling_team1        0.641454
132                       Sum_of_BowlerAbilities_team1        0.610273
107  team1_day/night match_runs_avg - team2_day/nig...        0.550650
24                              inning1_avg_runs_team1        0.541103
155                performance_under_pressure_relative        0.495763
..                                                 ...             ...
131                        team1_toss_based_win_chance        0.013864
78               team1_wickets_avg_inning2_venue_last5        0.012145
79               team2_wickets_avg_inning2_venue_last5        0.007213
1                                        toss decision        0.000000
157                          team2_wins_vs_team1_venue        0.000000

[173 rows x 2 columns]


In [None]:
def find_redundant_features(df_train, avg_feature_importance):
    # Create a correlation matrix for df_train
    numerical_cols = df_train.select_dtypes(include=['float64', 'int64']).columns.tolist()

    # Drop columns that are not features
    features = df_train[numerical_cols].drop(columns=['match id', 'team1_id', 'team2_id', 'ground_id', 'winner'])
    corr_matrix = features.corr().abs()

    # Initialize a set to track features to keep
    features_to_keep = set(features.columns)

    # Iterate through the correlation matrix to find highly correlated pairs
    correlated_pairs = []
    cols = corr_matrix.columns
    for i in range(len(cols)):
        for j in range(i+1, len(cols)):
            if corr_matrix.iloc[i, j] > 0.9:  # Adjust threshold as needed
                correlated_pairs.append((cols[i], cols[j]))

    # Remove less important features from each correlated pair
    for pair in correlated_pairs:
        feature1, feature2 = pair

        # Find importance values from avg_feature_importance
        importance1 = avg_feature_importance.loc[avg_feature_importance['Feature'] == feature1, 'Avg_Importance']
        importance2 = avg_feature_importance.loc[avg_feature_importance['Feature'] == feature2, 'Avg_Importance']

        # Check if both features exist in avg_feature_importance
        if not importance1.empty and not importance2.empty:
            importance1 = importance1.values[0]
            importance2 = importance2.values[0]

            # Remove the less important feature from features_to_keep
            if importance1 < importance2:
                features_to_keep.discard(feature1)
            else:
                features_to_keep.discard(feature2)

    return list(features_to_keep)

In [None]:
features_to_keep = find_redundant_features(df_train, feat_imp_average)
print(features_to_keep)
print(len(features_to_keep))

['victory_by_runs_team2_last5', 'team1_runs_avg_inning2_venue - team2_runs_avg_inning2_venue', 'team2_won_in_past_last5', 'team1_vs_team2_avg_inning1_wickets', 'team1_vs_team2_avg_inning2_wickets', 'team1_win_lighting1', 'Sum_of_BowlerAbilities_team2', 'team1_day match_wickets_avg - team2_day match_wickets_avg', 'team2_batting_chance_winning_VenueVise', 'victory_by_wickets_team2_last5', 'team1_win_lighting2_last5 - team2_win_lighting2_last5', 'team2_vs_team1_avg_inning1_wickets', 'team1_bowling_chance_winning_VenueVise', 'inning2_avg_runs_team1_last5 - inning2_avg_runs_team2_last5', 'team1_day match_runs_avg', 'top_bowler3_team2', 'relative_batting_chance_formvise', 'inning1_avg_runs_team2', 'top_batsman1_team1', 'team1_win_lighting2_last5', 'team1_won_in_past', 'team1_won_in_past_last5', 'past_head_on_vs_bowling', 'team1_performance_under_Pressure', 'top_bowler2_team2', 'victory_by_wickets_team1_last5 - victory_by_wickets_team2_last5', 'victory_by_wickets_team1', 'lighting', 'team1_da

In [None]:
def get_top_features_cumulative_importance(avg_feature_importance, cumulative_threshold=0.95):
    # Sort the DataFrame by importance in descending order
    avg_feature_importance_sorted = avg_feature_importance.sort_values(by='Avg_Importance', ascending=False)

    # Calculate the cumulative importance
    avg_feature_importance_sorted['Cumulative_importance'] = avg_feature_importance_sorted['Avg_Importance'].cumsum()

    # Find the threshold where cumulative importance reaches the desired level (e.g., 95%)
    threshold_value = cumulative_threshold * avg_feature_importance_sorted['Avg_Importance'].sum()

    # Select the features that contribute to the cumulative importance threshold
    top_features_df = avg_feature_importance_sorted[avg_feature_importance_sorted['Cumulative_importance'] <= threshold_value]

    # Extract the list of feature names
    top_features_list = top_features_df['Feature'].tolist()

    return top_features_list

# Ensure features_to_keep only contains features present in feat_imp_average
valid_features = [feature for feature in features_to_keep if feature in feat_imp_average['Feature'].values]

# Filter the avg_feature_importance DataFrame
filtered_feat_imp_average = feat_imp_average[feat_imp_average['Feature'].isin(valid_features)]

# Get top features based on cumulative importance
top_features_cumulative = get_top_features_cumulative_importance(filtered_feat_imp_average, cumulative_threshold=0.95)

print(top_features_cumulative)
print(len(top_features_cumulative))

['Sum_batting-bowling_team1', 'Sum_of_BowlerAbilities_team1', 'team1_day/night match_runs_avg - team2_day/night match_runs_avg', 'inning1_avg_runs_team1', 'performance_under_pressure_relative', 'top_batsman3_team1', 'team_count_50runs_last15', 'victory_by_wickets_team2', 'top_bowler1_team1', 'nrr_team1 - nrr_team2', 'team1_day match_wickets_avg - team2_day match_wickets_avg', 'inning2_avg_wickets_team2', 'nrr_team1_last5', 'victory_by_wickets_team2_last5', 'top_batsman2_team1', 'Bowling_ability_diffrence', 'team1_night match_wickets_avg', 'victory_by_runs_team1_last5', 'team2_won_in_past_last5', 'team1_vs_team2_avg_inning2_runs', 'nrr_team2_last5', 'nrr_team2', 'nrr_team1', 'team1_win_lighting1 - team2_win_lighting1', 'team1_won_in_past_last5 - team2_won_in_past_last5', 'inning2_avg_runs_team1', 'Sum_of_BatsmanAbilities_team2', 'top_batsman1_team1', 'top_bowler2_team1', 'team1_day match_runs_avg - team2_day match_runs_avg', 'top_bowler2_team2', 'top_batsman1_team2', 'top_bowler3_team2'

In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.5-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.5 alembic-1.13.1 colorlog-6.8.2 optuna-3.6.1


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
import optuna
from optuna.samplers import TPESampler

# Split the data into train, test, and validation sets with stratification
X = df_train[top_features_cumulative]
y = df_train['winner']

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Optuna for all hyperparameters
def objective(trial):
    params = {
        'depth': trial.suggest_int('depth', 4, 10),
        'iterations': trial.suggest_int('iterations', 100, 500),
        'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 1, 9),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'border_count': trial.suggest_int('border_count', 1, 255),
        'one_hot_max_size': trial.suggest_int('one_hot_max_size', 2, 20),
        'eval_metric': 'Accuracy',
        'early_stopping_rounds': 50,
        'verbose': False
    }

    model = CatBoostClassifier(**params)
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
    return cv_scores.mean()

study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=42))
study.optimize(objective, n_trials=50, n_jobs=-1)

best_params = study.best_params
print(f"Best hyperparameters: {best_params}")

# Train and evaluate the final model with combined best hyperparameters
final_model = CatBoostClassifier(**best_params)
final_model.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=False)

# Perform cross-validation on the validation set
cv_scores = cross_val_score(final_model, X_val, y_val, cv=5, scoring='accuracy', n_jobs=-1)
print(f"CV Scores - Avg: {cv_scores.mean()}, Min: {cv_scores.min()}, Max: {cv_scores.max()}")

# Evaluate the model on the test set
y_pred_test = final_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred_test)
print(f"Test Accuracy: {test_accuracy}")

# Print final results
print(f"Best Parameters: {best_params}")
print(f"CV Scores - Avg: {cv_scores.mean()}, Min: {cv_scores.min()}, Max: {cv_scores.max()}")
print(f"Test Accuracy: {test_accuracy}")

[I 2024-06-22 12:51:21,906] A new study created in memory with name: no-name-6334e69d-6dd7-4b3d-b9f8-d29ac8cd436e
[I 2024-06-22 13:00:56,834] Trial 0 finished with value: 0.841660970608339 and parameters: {'depth': 10, 'iterations': 322, 'l2_leaf_reg': 5, 'learning_rate': 0.28401054903414746, 'subsample': 0.6468835912920555, 'bagging_temperature': 0.2964335175303001, 'border_count': 169, 'one_hot_max_size': 17}. Best is trial 0 with value: 0.841660970608339.
[I 2024-06-22 13:02:50,603] Trial 2 finished with value: 0.8401458190931874 and parameters: {'depth': 8, 'iterations': 205, 'l2_leaf_reg': 1, 'learning_rate': 0.1404060069003133, 'subsample': 0.6247895881370579, 'bagging_temperature': 0.4595844708702308, 'border_count': 238, 'one_hot_max_size': 12}. Best is trial 0 with value: 0.841660970608339.
[I 2024-06-22 13:02:51,782] Trial 1 finished with value: 0.8446684894053315 and parameters: {'depth': 10, 'iterations': 438, 'l2_leaf_reg': 2, 'learning_rate': 0.08285329884599107, 'subsamp

KeyboardInterrupt: 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
import optuna
from optuna.samplers import TPESampler

# Split the data into train, test, and validation sets with stratification
X = df_train[top_features_cumulative]
y = df_train['winner']

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Optuna for all hyperparameters
def objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
        'eval_metric': 'logloss',
        'use_label_encoder': False
    }

    model = XGBClassifier(**params)
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
    return cv_scores.mean()

study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=42))
study.optimize(objective, n_trials=50, n_jobs=-1)

best_params = study.best_params
print(f"Best hyperparameters: {best_params}")

# Train and evaluate the final model with combined best hyperparameters
final_model = XGBClassifier(**best_params, use_label_encoder=False)
final_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=50, verbose=False)

# Perform cross-validation on the validation set
cv_scores = cross_val_score(final_model, X_val, y_val, cv=5, scoring='accuracy', n_jobs=-1)
print(f"CV Scores - Avg: {cv_scores.mean()}, Min: {cv_scores.min()}, Max: {cv_scores.max()}")

# Evaluate the model on the test set
y_pred_test = final_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred_test)
print(f"Test Accuracy: {test_accuracy}")

# Print final results
print(f"Best Parameters: {best_params}")
print(f"CV Scores - Avg: {cv_scores.mean()}, Min: {cv_scores.min()}, Max: {cv_scores.max()}")
print(f"Test Accuracy: {test_accuracy}")

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
import lightgbm as lgb
import optuna
from optuna.samplers import TPESampler

# Split the data into train, test, and validation sets with stratification
X = df_train[top_features_cumulative]
y = df_train['winner']

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Optuna for all hyperparameters
def objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'num_leaves': trial.suggest_int('num_leaves', 10, 200),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'metric': 'accuracy',
        'verbosity': -1
    }

    model = lgb.LGBMClassifier(**params)
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
    return cv_scores.mean()

study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=42))
study.optimize(objective, n_trials=50, n_jobs=-1)

best_params = study.best_params
print(f"Best hyperparameters: {best_params}")

# Train and evaluate the final model with combined best hyperparameters
final_model = lgb.LGBMClassifier(**best_params)
final_model.fit(X_train, y_train, eval_set=(X_val, y_val))

# Perform cross-validation on the validation set
cv_scores = cross_val_score(final_model, X_val, y_val, cv=5, scoring='accuracy', n_jobs=-1)
print(f"CV Scores - Avg: {cv_scores.mean()}, Min: {cv_scores.min()}, Max: {cv_scores.max()}")

# Evaluate the model on the test set
y_pred_test = final_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred_test)
print(f"Test Accuracy: {test_accuracy}")

# Print final results
print(f"Best Parameters: {best_params}")
print(f"CV Scores - Avg: {cv_scores.mean()}, Min: {cv_scores.min()}, Max: {cv_scores.max()}")
print(f"Test Accuracy: {test_accuracy}")

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
import optuna
from optuna.samplers import TPESampler

# Assuming df_train, top_features_cumulative are defined appropriately

# Split the data into train, test, and validation sets with stratification
X = df_train[top_features_cumulative]
y = df_train['winner']

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Define the objective function for Optuna optimization
def objective(trial):
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_float('max_features', 0.6, 1.0),
    }

    model = GradientBoostingClassifier(**params)
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
    return cv_scores.mean()

# Create an Optuna study and optimize the objective function
study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=42))
study.optimize(objective, n_trials=50, n_jobs=-1)

# Get the best hyperparameters from the study
best_params = study.best_params
print(f"Best hyperparameters: {best_params}")

# Train the final model with the best hyperparameters
final_model = GradientBoostingClassifier(**best_params)
final_model.fit(X_train, y_train)

# Perform cross-validation on the validation set
cv_scores = cross_val_score(final_model, X_val, y_val, cv=5, scoring='accuracy', n_jobs=-1)
print(f"CV Scores - Avg: {cv_scores.mean()}, Min: {cv_scores.min()}, Max: {cv_scores.max()}")

# Evaluate the model on the test set
y_pred_test = final_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred_test)
print(f"Test Accuracy: {test_accuracy}")

# Print final results
print(f"Best Parameters: {best_params}")
print(f"CV Scores - Avg: {cv_scores.mean()}, Min: {cv_scores.min()}, Max: {cv_scores.max()}")
print(f"Test Accuracy: {test_accuracy}")

In [None]:
# catboost_params = {'depth': 4, 'iterations': 131, 'l2_leaf_reg': 9, 'learning_rate': 0.026746858922480903, 'subsample': 0.9806922046593372, 'bagging_temperature': 0.863851083829466, 'border_count': 79, 'one_hot_max_size': 10}
# xgboost_params = {'max_depth': 5, 'n_estimators': 269, 'learning_rate': 0.1497059900832021, 'subsample': 0.6134373870090926, 'colsample_bytree': 0.8655799077621021, 'gamma': 3.39838763588673, 'min_child_weight': 10, 'reg_alpha': 5.895160858400561, 'reg_lambda': 2.0436192434364866}
# lightgbm_params = {'max_depth': 3, 'num_leaves': 165, 'learning_rate': 0.15387125194824516, 'subsample': 0.7701573425754059, 'colsample_bytree': 0.8740741726764223, 'reg_alpha': 5.967689395391898, 'reg_lambda': 5.552505199267917, 'n_estimators': 294}
# gbm_params = {'learning_rate': 0.013177244157007226, 'n_estimators': 223, 'max_depth': 3, 'min_samples_split': 7, 'min_samples_leaf': 10, 'max_features': 0.677756768922895}

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier
import optuna
from optuna.samplers import TPESampler

# Assuming df_train, top_features_cumulative are defined appropriately

# Split the data into train, test, and validation sets with stratification
X = df_train[top_features_cumulative]
y = df_train['winner']

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Define the individual models with their best parameters
catboost_params = {'depth': 4, 'iterations': 131, 'l2_leaf_reg': 9, 'learning_rate': 0.026746858922480903, 'subsample': 0.9806922046593372, 'bagging_temperature': 0.863851083829466, 'border_count': 79, 'one_hot_max_size': 10}
xgboost_params = {'max_depth': 5, 'n_estimators': 269, 'learning_rate': 0.1497059900832021, 'subsample': 0.6134373870090926, 'colsample_bytree': 0.8655799077621021, 'gamma': 3.39838763588673, 'min_child_weight': 10, 'reg_alpha': 5.895160858400561, 'reg_lambda': 2.0436192434364866}
lightgbm_params = {'max_depth': 3, 'num_leaves': 165, 'learning_rate': 0.15387125194824516, 'subsample': 0.7701573425754059, 'colsample_bytree': 0.8740741726764223, 'reg_alpha': 5.967689395391898, 'reg_lambda': 5.552505199267917, 'n_estimators': 294}
gbm_params = {'learning_rate': 0.013177244157007226, 'n_estimators': 223, 'max_depth': 3, 'min_samples_split': 7, 'min_samples_leaf': 10, 'max_features': 0.677756768922895}

cat_model = CatBoostClassifier(**catboost_params, verbose=False)
xgb_model = XGBClassifier(**xgboost_params)
lgb_model = LGBMClassifier(**lightgbm_params)
gbm_model = GradientBoostingClassifier(**gbm_params)

# Train the individual models on the training set
cat_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)
lgb_model.fit(X_train, y_train)
gbm_model.fit(X_train, y_train)

# Get the predictions on the validation set for the meta-model
X_val_meta = pd.DataFrame({
    'cat_pred': cat_model.predict(X_val),
    'xgb_pred': xgb_model.predict(X_val),
    'lgb_pred': lgb_model.predict(X_val),
    'gbm_pred': gbm_model.predict(X_val)
})

# Get the predictions on the test set for the meta-model
X_test_meta = pd.DataFrame({
    'cat_pred': cat_model.predict(X_test),
    'xgb_pred': xgb_model.predict(X_test),
    'lgb_pred': lgb_model.predict(X_test),
    'gbm_pred': gbm_model.predict(X_test)
})

# Define the objective function for tuning the meta-model using Optuna
def objective(trial):
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        'max_features': trial.suggest_float('max_features', 0.6, 1.0),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0)
    }

    model = GradientBoostingClassifier(**params)
    cv_scores = cross_val_score(model, X_val_meta, y_val, cv=5, scoring='accuracy', n_jobs=-1)
    return cv_scores.mean()

# Create an Optuna study and optimize the objective function
study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=42))
study.optimize(objective, n_trials=100, n_jobs=-1)

# Get the best hyperparameters from the study
best_params = study.best_params
print(f"Best hyperparameters for meta-model: {best_params}")

# Train the final meta-model with the best hyperparameters
final_meta_model = GradientBoostingClassifier(**best_params)
final_meta_model.fit(X_val_meta, y_val)

# Perform cross-validation on the validation set
cv_scores = cross_val_score(final_meta_model, X_val_meta, y_val, cv=5, scoring='accuracy', n_jobs=-1)
print(f"CV Scores - Avg: {cv_scores.mean()}, Min: {cv_scores.min()}, Max: {cv_scores.max()}")

# Evaluate the meta-model on the test set
y_pred_test = final_meta_model.predict(X_test_meta)
test_accuracy = accuracy_score(y_test, y_pred_test)
print(f"Test Accuracy: {test_accuracy}")

# Print final results
print(f"Best Parameters for Meta-Model: {best_params}")
print(f"CV Scores - Avg: {cv_scores.mean()}, Min: {cv_scores.min()}, Max: {cv_scores.max()}")
print(f"Test Accuracy: {test_accuracy}")

[LightGBM] [Info] Number of positive: 329, number of negative: 334
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000641 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3503
[LightGBM] [Info] Number of data points in the train set: 663, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.496229 -> initscore=-0.015083
[LightGBM] [Info] Start training from score -0.015083


[I 2024-06-22 13:46:46,774] A new study created in memory with name: no-name-600099d9-4a6e-49c0-bf3f-6caf4e27af03
[I 2024-06-22 13:46:49,643] Trial 0 finished with value: 0.8593596059113301 and parameters: {'learning_rate': 0.052892365923112986, 'n_estimators': 112, 'max_depth': 5, 'min_samples_split': 3, 'min_samples_leaf': 13, 'max_features': 0.655036768928282, 'subsample': 0.6157874159191865}. Best is trial 0 with value: 0.8593596059113301.
[I 2024-06-22 13:46:49,722] Trial 1 finished with value: 0.8527093596059114 and parameters: {'learning_rate': 0.034649237717983235, 'n_estimators': 172, 'max_depth': 3, 'min_samples_split': 18, 'min_samples_leaf': 3, 'max_features': 0.7722099886799071, 'subsample': 0.8649414211533175}. Best is trial 0 with value: 0.8593596059113301.
[I 2024-06-22 13:46:52,578] Trial 2 finished with value: 0.8527093596059114 and parameters: {'learning_rate': 0.01907461976400857, 'n_estimators': 357, 'max_depth': 6, 'min_samples_split': 4, 'min_samples_leaf': 11, '

KeyboardInterrupt: 

In [None]:
# catboost for meta
# Best Parameters for Meta-Model: {'depth': 10, 'iterations': 170, 'l2_leaf_reg': 8, 'learning_rate': 0.19754009744506684, 'subsample': 0.693889750789011, 'bagging_temperature': 0.5974660864197161, 'border_count': 72, 'one_hot_max_size': 16}
# CV Scores - Avg: 0.8381773399014778, Min: 0.7586206896551724, Max: 0.896551724137931
# Test Accuracy: 0.8671328671328671

# xgboost for meta
# Best Parameters for Meta-Model: {'max_depth': 9, 'n_estimators': 165, 'learning_rate': 0.21323373869865603, 'subsample': 0.8856962475019171, 'colsample_bytree': 0.6529010969767698, 'gamma': 8.739047749346323, 'min_child_weight': 5, 'reg_alpha': 6.982480429419232, 'reg_lambda': 2.141380812210051}
# CV Scores - Avg: 0.845320197044335, Min: 0.7586206896551724, Max: 0.896551724137931
# Test Accuracy: 0.8671328671328671

# lightgbm for meta
# Best Parameters for Meta-Model: {'max_depth': 9, 'num_leaves': 99, 'learning_rate': 0.02596036767057364, 'n_estimators': 364, 'subsample': 0.7214202982512115, 'colsample_bytree': 0.7404036334273929, 'reg_alpha': 4.7369115877798835, 'reg_lambda': 4.512847778182012, 'min_split_gain': 0.07197691915767544, 'min_child_weight': 9.019499870586479}
# CV Scores - Avg: 0.8238916256157636, Min: 0.7586206896551724, Max: 0.896551724137931
# Test Accuracy: 0.8671328671328671

# gbm for meta
# Best Parameters for Meta-Model: {'learning_rate': 0.018022156863544952, 'n_estimators': 227, 'max_depth': 10, 'min_samples_split': 13, 'min_samples_leaf': 9, 'max_features': 0.7353486667006071, 'subsample': 0.7140887887708872}
# CV Scores - Avg: 0.8238916256157636, Min: 0.7586206896551724, Max: 0.896551724137931
# Test Accuracy: 0.8671328671328671

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Assuming df_train, top_features_cumulative are defined appropriately

# Split the data into train, test, and validation sets with stratification
# X = df_train[top_features_cumulative]
# y = df_train['winner']
# Select numerical columns
numerical_cols = df_train.select_dtypes(include=['float64', 'int64']).columns

# Exclude the target column from numerical_cols if needed
if 'winner' in numerical_cols:
    numerical_cols = numerical_cols.drop('winner')

# Exclude additional columns if necessary
numerical_cols = numerical_cols.drop(['team1_id', 'team2_id', 'ground_id', 'match id'])

# Split the data into train, test, and validation sets with stratification
X = df_train[numerical_cols]
y = df_train['winner']

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Define the individual models with their best parameters
catboost_params = {'depth': 4, 'iterations': 131, 'l2_leaf_reg': 9, 'learning_rate': 0.026746858922480903, 'subsample': 0.9806922046593372, 'bagging_temperature': 0.863851083829466, 'border_count': 79, 'one_hot_max_size': 10}
xgboost_params = {'max_depth': 5, 'n_estimators': 269, 'learning_rate': 0.1497059900832021, 'subsample': 0.6134373870090926, 'colsample_bytree': 0.8655799077621021, 'gamma': 3.39838763588673, 'min_child_weight': 10, 'reg_alpha': 5.895160858400561, 'reg_lambda': 2.0436192434364866}
lightgbm_params = {'max_depth': 3, 'num_leaves': 165, 'learning_rate': 0.15387125194824516, 'subsample': 0.7701573425754059, 'colsample_bytree': 0.8740741726764223, 'reg_alpha': 5.967689395391898, 'reg_lambda': 5.552505199267917, 'n_estimators': 294}
gbm_params = {'learning_rate': 0.013177244157007226, 'n_estimators': 223, 'max_depth': 3, 'min_samples_split': 7, 'min_samples_leaf': 10, 'max_features': 0.677756768922895}

cat_model = CatBoostClassifier(**catboost_params, verbose=False)
xgb_model = XGBClassifier(**xgboost_params)
lgb_model = LGBMClassifier(**lightgbm_params)
gbm_model = GradientBoostingClassifier(**gbm_params)

# Train the individual models on the training set
cat_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)
lgb_model.fit(X_train, y_train)
gbm_model.fit(X_train, y_train)

# Get the predictions on the validation set for the meta-model
X_val_meta = pd.DataFrame({
    'cat_pred': cat_model.predict(X_val),
    'xgb_pred': xgb_model.predict(X_val),
    'lgb_pred': lgb_model.predict(X_val),
    'gbm_pred': gbm_model.predict(X_val)
})

# Get the predictions on the test set for the meta-model
X_test_meta = pd.DataFrame({
    'cat_pred': cat_model.predict(X_test),
    'xgb_pred': xgb_model.predict(X_test),
    'lgb_pred': lgb_model.predict(X_test),
    'gbm_pred': gbm_model.predict(X_test)
})

# Best Parameters for Meta-Model
meta_model_params = {
    'max_depth': 9,
    'n_estimators': 165,
    'learning_rate': 0.21323373869865603,
    'subsample': 0.8856962475019171,
    'colsample_bytree': 0.6529010969767698,
    'gamma': 8.739047749346323,
    'min_child_weight': 5,
    'reg_alpha': 6.982480429419232,
    'reg_lambda': 2.141380812210051
}

# Train the final meta-model with the best hyperparameters
final_meta_model = XGBClassifier(**meta_model_params)
final_meta_model.fit(X_val_meta, y_val)

# Perform cross-validation on the validation set
cv_scores = cross_val_score(final_meta_model, X_val_meta, y_val, cv=5, scoring='accuracy', n_jobs=-1)
print(f"CV Scores - Avg: {cv_scores.mean()}, Min: {cv_scores.min()}, Max: {cv_scores.max()}")

# Evaluate the meta-model on the test set
y_pred_test = final_meta_model.predict(X_test_meta)
test_accuracy = accuracy_score(y_test, y_pred_test)
test_f1 = f1_score(y_test, y_pred_test)
test_precision = precision_score(y_test, y_pred_test)
test_recall = recall_score(y_test, y_pred_test)
conf_matrix = confusion_matrix(y_test, y_pred_test)

# Print final results
print(f"Best Parameters for Meta-Model: {meta_model_params}")
print(f"CV Scores - Avg: {cv_scores.mean()}, Min: {cv_scores.min()}, Max: {cv_scores.max()}")
print(f"Test Accuracy: {test_accuracy}")
print(f"Test F1 Score: {test_f1}")
print(f"Test Precision: {test_precision}")
print(f"Test Recall: {test_recall}")
print(f"Confusion Matrix:\n{conf_matrix}")

[LightGBM] [Info] Number of positive: 282, number of negative: 286
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002886 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19615
[LightGBM] [Info] Number of data points in the train set: 568, number of used features: 178
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.496479 -> initscore=-0.014085
[LightGBM] [Info] Start training from score -0.014085
CV Scores - Avg: 0.8789473684210527, Min: 0.8157894736842105, Max: 0.9210526315789473
Best Parameters for Meta-Model: {'max_depth': 9, 'n_estimators': 165, 'learning_rate': 0.21323373869865603, 'subsample': 0.8856962475019171, 'colsample_bytree': 0.6529010969767698, 'gamma': 8.739047749346323, 'min_child_weight': 5, 'reg_alpha': 6.982480429419232, 'reg_lambda': 2.141380812210051}
CV Scores - Avg: 0.8789473684210527, Min: 0.8157894736842105, Max: 0.9210526315789473
Test Accuracy: 0.9
Test F1 Score: 0.89839

In [None]:
# CV Scores - Avg: 0.8596059113300493, Min: 0.7931034482758621, Max: 0.9285714285714286
# Test Accuracy: 0.8671328671328671

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Assuming df_train, top_features_cumulative are defined appropriately

# Split the data into train and test sets with stratification
X = df_train[top_features_cumulative]
y = df_train['winner']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Define the individual models with their best parameters
catboost_params = {'depth': 4, 'iterations': 131, 'l2_leaf_reg': 9, 'learning_rate': 0.026746858922480903, 'subsample': 0.9806922046593372, 'bagging_temperature': 0.863851083829466, 'border_count': 79, 'one_hot_max_size': 10}
xgboost_params = {'max_depth': 5, 'n_estimators': 269, 'learning_rate': 0.1497059900832021, 'subsample': 0.6134373870090926, 'colsample_bytree': 0.8655799077621021, 'gamma': 3.39838763588673, 'min_child_weight': 10, 'reg_alpha': 5.895160858400561, 'reg_lambda': 2.0436192434364866}
lightgbm_params = {'max_depth': 3, 'num_leaves': 165, 'learning_rate': 0.15387125194824516, 'subsample': 0.7701573425754059, 'colsample_bytree': 0.8740741726764223, 'reg_alpha': 5.967689395391898, 'reg_lambda': 5.552505199267917, 'n_estimators': 294}
gbm_params = {'learning_rate': 0.013177244157007226, 'n_estimators': 223, 'max_depth': 3, 'min_samples_split': 7, 'min_samples_leaf': 10, 'max_features': 0.677756768922895}

cat_model = CatBoostClassifier(**catboost_params, verbose=False)
xgb_model = XGBClassifier(**xgboost_params)
lgb_model = LGBMClassifier(**lightgbm_params)
gbm_model = GradientBoostingClassifier(**gbm_params)

# Train the individual models on the training set
cat_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)
lgb_model.fit(X_train, y_train)
gbm_model.fit(X_train, y_train)

# Get the predictions on the test set for the meta-model
X_test_meta = pd.DataFrame({
    'cat_pred': cat_model.predict(X_test),
    'xgb_pred': xgb_model.predict(X_test),
    'lgb_pred': lgb_model.predict(X_test),
    'gbm_pred': gbm_model.predict(X_test)
})

# Best Parameters for Meta-Model
meta_model_params = {
    'max_depth': 9,
    'n_estimators': 165,
    'learning_rate': 0.21323373869865603,
    'subsample': 0.8856962475019171,
    'colsample_bytree': 0.6529010969767698,
    'gamma': 8.739047749346323,
    'min_child_weight': 5,
    'reg_alpha': 6.982480429419232,
    'reg_lambda': 2.141380812210051
}

# Train the final meta-model with the best hyperparameters
final_meta_model = XGBClassifier(**meta_model_params)
final_meta_model.fit(X_test_meta, y_test)

# Evaluate the meta-model on the test set
y_pred_test = final_meta_model.predict(X_test_meta)
test_accuracy = accuracy_score(y_test, y_pred_test)
test_f1 = f1_score(y_test, y_pred_test)
test_precision = precision_score(y_test, y_pred_test)
test_recall = recall_score(y_test, y_pred_test)
conf_matrix = confusion_matrix(y_test, y_pred_test)

# Print final results
print(f"Best Parameters for Meta-Model: {meta_model_params}")
print(f"Test Accuracy: {test_accuracy}")
print(f"Test F1 Score: {test_f1}")
print(f"Test Precision: {test_precision}")
print(f"Test Recall: {test_recall}")
print(f"Confusion Matrix:\n{conf_matrix}")

[LightGBM] [Info] Number of positive: 329, number of negative: 334
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005789 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14682
[LightGBM] [Info] Number of data points in the train set: 663, number of used features: 95
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.496229 -> initscore=-0.015083
[LightGBM] [Info] Start training from score -0.015083
Best Parameters for Meta-Model: {'max_depth': 9, 'n_estimators': 165, 'learning_rate': 0.21323373869865603, 'subsample': 0.8856962475019171, 'colsample_bytree': 0.6529010969767698, 'gamma': 8.739047749346323, 'min_child_weight': 5, 'reg_alpha': 6.982480429419232, 'reg_lambda': 2.141380812210051}
Test Accuracy: 0.5017543859649123
Test F1 Score: 0.0
Test Precision: 0.0
Test Recall: 0.0
Confusion Matrix:
[[143   0]
 [142   0]]


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
df_train['pred_winner'] = final_meta_model.predict(pd.DataFrame({
    'cat_pred': cat_model.predict_proba(df_train[top_features_cumulative])[:, 1],
    'xgb_pred': xgb_model.predict_proba(df_train[top_features_cumulative])[:, 1],
    'lgb_pred': lgb_model.predict_proba(df_train[top_features_cumulative])[:, 1],
    'gbm_pred': gbm_model.predict_proba(df_train[top_features_cumulative])[:, 1]
}))

df_train['pred_winner_score'] = final_meta_model.predict_proba(pd.DataFrame({
    'cat_pred': cat_model.predict_proba(df_train[top_features_cumulative])[:, 1],
    'xgb_pred': xgb_model.predict_proba(df_train[top_features_cumulative])[:, 1],
    'lgb_pred': lgb_model.predict_proba(df_train[top_features_cumulative])[:, 1],
    'gbm_pred': gbm_model.predict_proba(df_train[top_features_cumulative])[:, 1]
}))[:, 1]

df_train['pred_winner_id'] = df_train.apply(
    lambda row: row['team1_id'] if row['pred_winner'] == 1 else row['team2_id'], axis=1
)

print(df_train.shape)

(948, 194)


In [None]:
test_data['pred_winner'] = final_meta_model.predict(pd.DataFrame({
    'cat_pred': cat_model.predict_proba(test_data[top_features_cumulative])[:, 1],
    'xgb_pred': xgb_model.predict_proba(test_data[top_features_cumulative])[:, 1],
    'lgb_pred': lgb_model.predict_proba(test_data[top_features_cumulative])[:, 1],
    'gbm_pred': gbm_model.predict_proba(test_data[top_features_cumulative])[:, 1]
}))

test_data['pred_winner_score'] = final_meta_model.predict_proba(pd.DataFrame({
    'cat_pred': cat_model.predict_proba(test_data[top_features_cumulative])[:, 1],
    'xgb_pred': xgb_model.predict_proba(test_data[top_features_cumulative])[:, 1],
    'lgb_pred': lgb_model.predict_proba(test_data[top_features_cumulative])[:, 1],
    'gbm_pred': gbm_model.predict_proba(test_data[top_features_cumulative])[:, 1]
}))[:, 1]

test_data['pred_winner_id'] = test_data.apply(
    lambda row: row['team1_id'] if row['pred_winner'] == 1 else row['team2_id'], axis=1
)

print(test_data.shape)

(271, 193)


In [None]:
# catboost_params = {'depth': 4, 'iterations': 131, 'l2_leaf_reg': 9, 'learning_rate': 0.026746858922480903, 'subsample': 0.9806922046593372, 'bagging_temperature': 0.863851083829466, 'border_count': 79, 'one_hot_max_size': 10}
# xgboost_params = {'max_depth': 5, 'n_estimators': 269, 'learning_rate': 0.1497059900832021, 'subsample': 0.6134373870090926, 'colsample_bytree': 0.8655799077621021, 'gamma': 3.39838763588673, 'min_child_weight': 10, 'reg_alpha': 5.895160858400561, 'reg_lambda': 2.0436192434364866}
# lightgbm_params = {'max_depth': 3, 'num_leaves': 165, 'learning_rate': 0.15387125194824516, 'subsample': 0.7701573425754059, 'colsample_bytree': 0.8740741726764223, 'reg_alpha': 5.967689395391898, 'reg_lambda': 5.552505199267917, 'n_estimators': 294}
# gbm_params = {'learning_rate': 0.013177244157007226, 'n_estimators': 223, 'max_depth': 3, 'min_samples_split': 7, 'min_samples_leaf': 10, 'max_features': 0.677756768922895}
# Best Parameters for Meta-Model: {'max_depth': 9, 'n_estimators': 165, 'learning_rate': 0.21323373869865603, 'subsample': 0.8856962475019171, 'colsample_bytree': 0.6529010969767698, 'gamma': 8.739047749346323, 'min_child_weight': 5, 'reg_alpha': 6.982480429419232, 'reg_lambda': 2.141380812210051}


In [None]:
df_train['dataset_type'] = 'train'
test_data['dataset_type'] = 'r1'
algo_name = 'CatBoost;XGBoost;LightGBM;GBM;CatBoost'
is_ensemble = 'yes'
n_trees = '131;269;294;223;165'
depth = '4;5;3;3;9'
lr = '0.026746858922480903;0.1497059900832021;0.15387125194824516;0.013177244157007226;0.21323373869865603'

In [None]:
# Get feature importances from each model
ft_imp_cat = pd.DataFrame({'Feature': top_features_cumulative, 'Importance': cat_model.get_feature_importance()})
ft_imp_xgb = pd.DataFrame({'Feature': top_features_cumulative, 'Importance': xgb_model.feature_importances_})
ft_imp_lgbm = pd.DataFrame({'Feature': top_features_cumulative, 'Importance': lgb_model.feature_importances_})
ft_imp_gbm = pd.DataFrame({'Feature': top_features_cumulative, 'Importance': gbm_model.feature_importances_})

# Concatenate all feature importance DataFrames
all_importances = pd.concat([ft_imp_cat, ft_imp_xgb, ft_imp_lgbm, ft_imp_gbm])

# Calculate average importance across models
df_avg_importance = all_importances.groupby('Feature')['Importance'].mean().reset_index()

# Sort by average importance in descending order and select top 10 features
df_avg_importance = df_avg_importance.sort_values(by='Importance', ascending=False).reset_index(drop=True).head(10)

# Rename columns as per requirement
df_feat_importance = df_avg_importance.rename(columns={'Feature': 'feat_name', 'Importance': 'model_feat_imp_train'}).head(10)

# Display the final DataFrame with top 10 features and their average importance
# print(ft_imp_cat)
# print(ft_imp_lgbm)
print(df_feat_importance)

                                           feat_name  model_feat_imp_train
0  team1_day/night match_runs_avg - team2_day/nig...              6.203185
1                           victory_by_wickets_team2              4.759981
2                              nrr_team1 - nrr_team2              4.621072
3                             inning1_avg_runs_team1              4.463710
4                          inning1_avg_wickets_team1              4.417557
5                       Sum_of_BowlerAbilities_team1              4.319368
6                          inning2_avg_wickets_team2              4.268900
7                           team_count_50runs_last15              3.877238
8                       avg_inning2_runs_venue_last5              3.806339
9                      Sum_of_BatsmanAbilities_team2              3.805220


In [None]:
df_file1 = pd.concat([test_data[['match id','dataset_type','pred_winner_id','pred_winner_score',] + list(df_feat_importance['feat_name'].head(10))], \
                     df_train[['match id','dataset_type','pred_winner_id','pred_winner_score',] + list(df_feat_importance['feat_name'].head(10))]])

renaming_dict = {}
for i,col in enumerate(list(df_feat_importance['feat_name'].head(10))):
    renaming_dict[col] = f'indep_feat_id{i+1}'
df_file1.rename(columns=renaming_dict, inplace=True)

for i in range(1,11):
    if f'indep_feat_id{i}' not in df_file1.columns:
        df_file1[f'indep_feat_id{i}'] = np.nan

df_file1['train_algorithm'] = algo_name
df_file1['is_ensemble'] = is_ensemble
df_file1['train_hps_trees'] = n_trees
df_file1['train_hps_depth'] = depth
df_file1['train_hps_lr'] = lr

In [None]:
df_file1['pred_winner_id'] = df_file1['pred_winner_id'].astype('int64')
print(df_file1.shape)
df_file1.head()

(1219, 19)


Unnamed: 0,match id,dataset_type,pred_winner_id,pred_winner_score,indep_feat_id1,indep_feat_id2,indep_feat_id3,indep_feat_id4,indep_feat_id5,indep_feat_id6,indep_feat_id7,indep_feat_id8,indep_feat_id9,indep_feat_id10,train_algorithm,is_ensemble,train_hps_trees,train_hps_depth,train_hps_lr
0,9250275,r1,90,0.499036,0.0,0.0,0.358514,0.0,0.0,0.0,6.0,0.0,0.0,554.272445,CatBoost;XGBoost;LightGBM;GBM;CatBoost,yes,131;269;294;223;165,4;5;3;3;9,0.026746858922480903;0.1497059900832021;0.1538...
1,9262189,r1,36098,0.499036,0.0,5.0,0.336467,154.5,7.666667,245.860811,6.777778,0.615385,150.0,718.410559,CatBoost;XGBoost;LightGBM;GBM;CatBoost,yes,131;269;294;223;165,4;5;3;3;9,0.026746858922480903;0.1497059900832021;0.1538...
2,9128776,r1,48334,0.499036,-0.75,6.0,0.219583,173.142857,6.111111,220.499792,8.6,0.842105,161.2,713.790583,CatBoost;XGBoost;LightGBM;GBM;CatBoost,yes,131;269;294;223;165,4;5;3;3;9,0.026746858922480903;0.1497059900832021;0.1538...
3,9586919,r1,36112,0.499036,0.0,4.0,0.437987,179.454545,6.0,219.782624,7.0,0.285714,142.6,701.656972,CatBoost;XGBoost;LightGBM;GBM;CatBoost,yes,131;269;294;223;165,4;5;3;3;9,0.026746858922480903;0.1497059900832021;0.1538...
4,9128538,r1,48341,0.499036,86.25,0.0,0.123146,180.214286,6.142857,204.304778,7.0,2.375,155.0,675.124302,CatBoost;XGBoost;LightGBM;GBM;CatBoost,yes,131;269;294;223;165,4;5;3;3;9,0.026746858922480903;0.1497059900832021;0.1538...


In [None]:
print(df_feat_importance)

                                           feat_name  model_feat_imp_train
0  team1_day/night match_runs_avg - team2_day/nig...              6.203185
1                           victory_by_wickets_team2              4.759981
2                              nrr_team1 - nrr_team2              4.621072
3                             inning1_avg_runs_team1              4.463710
4                          inning1_avg_wickets_team1              4.417557
5                       Sum_of_BowlerAbilities_team1              4.319368
6                          inning2_avg_wickets_team2              4.268900
7                           team_count_50runs_last15              3.877238
8                       avg_inning2_runs_venue_last5              3.806339
9                      Sum_of_BatsmanAbilities_team2              3.805220


In [None]:
feature_desc = {
    'team1_day/night match_runs_avg - team2_day/night match_runs_avg': 'Difference between the average runs scored by team1 and team2 in day/night matches',
    'victory_by_wickets_team2': 'Number of matches won by team2 by wickets',
    'nrr_team1 - nrr_team2': 'Difference in net run rate (NRR) between team1 and team2',
    'inning1_avg_runs_team1': 'Average runs scored by team1 in the first innings',
    'inning1_avg_wickets_team1': 'Average wickets lost by team1 in the first innings',
    'Sum_of_BowlerAbilities_team1': 'Sum of bowler abilities for team1',
    'inning2_avg_wickets_team2': 'Average wickets lost by team2 in the second innings',
    'team_count_50runs_last15': 'Number of 50+ runs scored by the team in the last 15 matches',
    'avg_inning2_runs_venue_last5': 'Average runs scored in the second innings at the venue in the last 5 matches',
    'Sum_of_BatsmanAbilities_team2': 'Sum of batsman abilities for team2'
}

In [None]:
# df_feat_importance.rename(index={0:'feat_id'}, inplace=True)
df_file2 = df_feat_importance
df_file2['feat_id'] = [i+1 for i in df_file2.index]
df_file2['feat_rank_train'] = [i+1 for i in df_file2.index]
df_file2 = df_file2.set_index('feat_id')
df_file2['feat_description'] = df_file2['feat_name'].map(feature_desc)
df_file2

Unnamed: 0_level_0,feat_name,model_feat_imp_train,feat_rank_train,feat_description
feat_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,team1_day/night match_runs_avg - team2_day/nig...,6.203185,1,Difference between the average runs scored by ...
2,victory_by_wickets_team2,4.759981,2,Number of matches won by team2 by wickets
3,nrr_team1 - nrr_team2,4.621072,3,Difference in net run rate (NRR) between team1...
4,inning1_avg_runs_team1,4.46371,4,Average runs scored by team1 in the first innings
5,inning1_avg_wickets_team1,4.417557,5,Average wickets lost by team1 in the first inn...
6,Sum_of_BowlerAbilities_team1,4.319368,6,Sum of bowler abilities for team1
7,inning2_avg_wickets_team2,4.2689,7,Average wickets lost by team2 in the second in...
8,team_count_50runs_last15,3.877238,8,Number of 50+ runs scored by the team in the l...
9,avg_inning2_runs_venue_last5,3.806339,9,Average runs scored in the second innings at t...
10,Sum_of_BatsmanAbilities_team2,3.80522,10,Sum of batsman abilities for team2


In [None]:
df_file1.rename(columns={'pred_winner_id': 'win_pred_team_id'}, inplace=True)
df_file1.rename(columns={'pred_winner_score': 'win_pred_score'}, inplace=True)

In [None]:
df_file1.to_csv('sub17 file1.csv', index=False)
df_file2.to_csv('sub17 file2.csv', index=False)

In [None]:
df_train = pd.read_csv('/content/processed_train_f (2).csv')
test_data = pd.read_csv('/content/processed_test_f (2).csv')

In [None]:
print(df_train.shape, test_data.shape)

(948, 197) (271, 196)


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import torch
import torch.nn as nn
import torch.optim as optim

# Load the data
file_path = "/content/processed_train_f (2).csv"
data = pd.read_csv(file_path)

# Dropping irrelevant columns
irrelevant_columns = [
    'match id', 'team1', 'team1_id', 'team1_roster_ids',
    'team2', 'team2_id', 'team2_roster_ids', 'venue', 'city',
    'match_dt', 'series_name', 'season'
]
data_cleaned = data.drop(columns=irrelevant_columns)

# Select only numeric columns
data_numeric = data_cleaned.select_dtypes(include=['number'])

# Fill missing values with the mean of their respective columns
data_filled = data_numeric.fillna(data_numeric.mean())

# Split the data into features and target variable
X = data_filled.drop(columns=['winner'])
y = data_filled['winner']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize the features using the training data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

# Define the neural network
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(X.shape[1], 100)
        self.fc2 = nn.Linear(100, 50)
        self.fc3 = nn.Linear(50, 2)  # Output layer

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        embeddings = x  # Extract embeddings here
        x = self.fc3(x)
        return x, embeddings

model = Net()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    outputs, _ = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')

# Extract embeddings from the trained model
model.eval()
with torch.no_grad():
    _, train_embeddings = model(X_train_tensor)
    test_outputs, test_embeddings = model(X_test_tensor)

# Convert embeddings and outputs to numpy
train_embeddings = train_embeddings.numpy()
test_embeddings = test_embeddings.numpy()
test_outputs = test_outputs.numpy()

# Compute ROC AUC score on test set
y_test_pred_proba = torch.softmax(torch.tensor(test_outputs), dim=1).numpy()[:, 1]
roc_auc = roc_auc_score(y_test, y_test_pred_proba)
print(f"ROC AUC score on test set: {roc_auc}")




Epoch 1/500, Loss: 0.6967885494232178
Epoch 2/500, Loss: 0.683638334274292
Epoch 3/500, Loss: 0.6717560887336731
Epoch 4/500, Loss: 0.6605300903320312
Epoch 5/500, Loss: 0.6497259140014648
Epoch 6/500, Loss: 0.638935923576355
Epoch 7/500, Loss: 0.6280350685119629
Epoch 8/500, Loss: 0.6169381737709045
Epoch 9/500, Loss: 0.6055351495742798
Epoch 10/500, Loss: 0.5938081741333008
Epoch 11/500, Loss: 0.5815461874008179
Epoch 12/500, Loss: 0.5687105059623718
Epoch 13/500, Loss: 0.5552439093589783
Epoch 14/500, Loss: 0.5411531329154968
Epoch 15/500, Loss: 0.526512622833252
Epoch 16/500, Loss: 0.5113442540168762
Epoch 17/500, Loss: 0.495613694190979
Epoch 18/500, Loss: 0.4793899953365326
Epoch 19/500, Loss: 0.4627162218093872
Epoch 20/500, Loss: 0.4455198049545288
Epoch 21/500, Loss: 0.42787015438079834
Epoch 22/500, Loss: 0.40994885563850403
Epoch 23/500, Loss: 0.3918302059173584
Epoch 24/500, Loss: 0.37365472316741943
Epoch 25/500, Loss: 0.355507493019104
Epoch 26/500, Loss: 0.33745467662811

In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras.losses import mse
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
import pandas as pd
from catboost import CatBoostClassifier

# Load your data
# Example: X, y = ...

# Split data into training and testing sets
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Define the VAE architecture
input_dim = X_train.shape[1]
latent_dim = 42  # Dimension of the latent space

# Encoder
inputs = Input(shape=(input_dim,))
h = Dense(64, activation='relu')(inputs)
h = Dense(32, activation='relu')(h)
z_mean = Dense(latent_dim)(h)
z_log_var = Dense(latent_dim)(h)

def sampling(args):
    z_mean, z_log_var = args
    batch = tf.shape(z_mean)[0]
    dim = tf.shape(z_mean)[1]
    epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
    return z_mean + tf.exp(0.5 * z_log_var) * epsilon

z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])

# Decoder
decoder_h = Dense(32, activation='relu')
decoder_h2 = Dense(64, activation='relu')
decoder_mean = Dense(input_dim, activation='sigmoid')
h_decoded = decoder_h(z)
h_decoded = decoder_h2(h_decoded)
x_decoded_mean = decoder_mean(h_decoded)

# VAE model
vae = Model(inputs, x_decoded_mean)

# VAE loss
reconstruction_loss = mse(inputs, x_decoded_mean)
reconstruction_loss *= input_dim
kl_loss = 1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var)
kl_loss = tf.reduce_sum(kl_loss, axis=-1)
kl_loss *= -0.5
vae_loss = tf.reduce_mean(reconstruction_loss + kl_loss)
vae.add_loss(vae_loss)
vae.compile(optimizer='adam')

# Early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the VAE
vae.fit(X_train, X_train, epochs=500, batch_size=16, validation_split=0.2, shuffle=True, callbacks=[early_stopping])

# Encoder model to get latent features
encoder = Model(inputs, z)

# Get the latent representations
train_embeddings = encoder.predict(X_train)
test_embeddings = encoder.predict(X_test)

# Convert embeddings to DataFrames
train_embeddings_df = pd.DataFrame(train_embeddings, columns=[f"emb_{i}" for i in range(train_embeddings.shape[1])])
test_embeddings_df = pd.DataFrame(test_embeddings, columns=[f"emb_{i}" for i in range(test_embeddings.shape[1])])

# Define parameter grid for RandomizedSearchCV for CatBoost
param_grid = {
    'iterations': sp_randint(100, 500),  # reduce range of number of trees
    'learning_rate': sp_uniform(0.01, 0.1),  # reduce range of learning rate
    'depth': sp_randint(4, 10),  # reduce range of tree depth
    'l2_leaf_reg': sp_uniform(1, 10),  # reduce range of L2 regularization coefficient
}

# Initialize CatBoostClassifier
model = CatBoostClassifier(random_state=42)

# Setup RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=20,  # Increase number of iterations
    scoring='accuracy',  # evaluation metric
    cv=5,  # cross-validation folds
    verbose=3,  # verbosity
    random_state=42,  # random state for reproducibility
    n_jobs=-1  # use all available CPU cores
)

# Fit RandomizedSearchCV on training embeddings
random_search.fit(train_embeddings_df, y_train)

# Print best parameters and best score
print("Best Parameters found: ", random_search.best_params_)
print("Best CV Score: ", random_search.best_score_)

# Predict using the best estimator found by RandomizedSearchCV
best_model = random_search.best_estimator_
predictions = best_model.predict(test_embeddings_df)

# Evaluate performance on test set
accuracy = accuracy_score(y_test, predictions)
print("Test Accuracy with latent features from VAE and CatBoost: ", accuracy)


Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Fitting 5 folds for each of 20 candidates, totalling 100 fits


  pid = os.fork()


0:	learn: 0.6722419	total: 143ms	remaining: 44.7s
1:	learn: 0.6574567	total: 249ms	remaining: 38.9s
2:	learn: 0.6374073	total: 378ms	remaining: 39.2s
3:	learn: 0.6195285	total: 504ms	remaining: 39s
4:	learn: 0.6078932	total: 605ms	remaining: 37.4s
5:	learn: 0.5914764	total: 710ms	remaining: 36.4s
6:	learn: 0.5733615	total: 834ms	remaining: 36.6s
7:	learn: 0.5595896	total: 955ms	remaining: 36.5s
8:	learn: 0.5437916	total: 1.07s	remaining: 36.2s
9:	learn: 0.5251252	total: 1.19s	remaining: 36s
10:	learn: 0.5092472	total: 1.3s	remaining: 35.9s
11:	learn: 0.4953226	total: 1.43s	remaining: 35.9s
12:	learn: 0.4879105	total: 1.55s	remaining: 35.8s
13:	learn: 0.4744490	total: 1.67s	remaining: 35.7s
14:	learn: 0.4673250	total: 1.79s	remaining: 35.6s
15:	learn: 0.4570514	total: 1.9s	remaining: 35.4s
16:	learn: 0.4494470	total: 2.03s	remaining: 35.5s
17:	learn: 0.4416987	total: 2.14s	remaining: 35.2s
18:	learn: 0.4315549	total: 2.27s	remaining: 35.2s
19:	learn: 0.4238394	total: 2.39s	remaining: 35

In [None]:

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

param_grid = {
    'iterations': sp_randint(100, 1000),
    'learning_rate': sp_uniform(0.01, 0.3),
    'depth': sp_randint(4, 10),
    'l2_leaf_reg': sp_uniform(1, 10),
    'border_count': sp_randint(32, 255),
}

model = CatBoostClassifier()

random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=20,
    scoring='accuracy',
    cv=5,
    verbose=3,
    random_state=42,
    n_jobs=-1
)

random_search.fit(train_embeddings_df, y_train)

print("Best Parameters found: ", random_search.best_params_)
print("Best CV Score: ", random_search.best_score_)

best_model = random_search.best_estimator_
predictions = best_model.predict(test_embeddings_df)
accuracy = accuracy_score(y_test, predictions)
print("Test Accuracy with best model: ", accuracy)


  pid = os.fork()


Fitting 5 folds for each of 20 candidates, totalling 100 fits


  pid = os.fork()


KeyboardInterrupt: 

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, roc_auc_score

# Load the data
file_path = "/content/processed_train_f (2).csv"
data = pd.read_csv(file_path)

# Dropping irrelevant columns
irrelevant_columns = [
    'match id', 'team1', 'team1_id', 'team1_roster_ids',
    'team2', 'team2_id', 'team2_roster_ids', 'venue', 'city',
    'match_dt', 'series_name', 'season'
]
data_cleaned = data.drop(columns=irrelevant_columns)

# Select only numeric columns
data_numeric = data_cleaned.select_dtypes(include=['number'])

# Fill missing values with the mean of their respective columns
data_filled = data_numeric.fillna(data_numeric.mean())

# Split the data into features and target variable
X = data_filled.drop(columns=['winner'])
y = data_filled['winner']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Normalize the data using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define a neural network with an embedding layer
input_dim = X_train_scaled.shape[1]
embedding_dim = 45

input_layer = Input(shape=(input_dim,))
x = Dense(64, activation='relu', kernel_regularizer='l2')(input_layer)
x = Dropout(0.5)(x)
x = Dense(embedding_dim, activation='relu', kernel_regularizer='l2')(x)  # Embedding layer
x = Dropout(0.5)(x)
output_layer = Dense(1, activation='sigmoid')(x)

model = Model(inputs=input_layer, outputs=output_layer)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model
model.fit(X_train_scaled, y_train, epochs=100, batch_size=16, validation_split=0.2, callbacks=[early_stopping], verbose=1)

# Evaluate the model on the test set
y_pred = model.predict(X_test_scaled)
print(f"Test AUC score: {roc_auc_score(y_test, y_pred)}")
print(classification_report(y_test, (y_pred > 0.5).astype(int)))

# Define a new model to extract embeddings
embedding_model = Model(inputs=model.input, outputs=model.layers[-3].output)

# Extract embeddings for the train and test sets
train_embeddings = embedding_model.predict(X_train_scaled)
test_embeddings = embedding_model.predict(X_test_scaled)


# Now train_embeddings and test_embeddings can be used for further tasks


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Test AUC score: 0.9175878405053296
              precision    recall  f1-score   support

         0.0       0.85      0.79      0.82       136
         1.0       0.82      0.87      0.84       149

    accuracy                           0.83       285
   macro avg       0.83      0.83      0.83       285
weighted avg       0.83      0.83      0.83       285



In [None]:
!pip install keras-tuner --upgrade


Collecting keras-tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/129.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m122.9/129.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.7 kt-legacy-1.0.5


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout, Input
from sklearn.metrics import classification_report, roc_auc_score
import keras_tuner as kt

# Load the data
file_path = "/content/processed_train_f (2).csv"
data = pd.read_csv(file_path)

# Dropping irrelevant columns
irrelevant_columns = [
    'match id', 'team1', 'team1_id', 'team1_roster_ids',
    'team2', 'team2_id', 'team2_roster_ids', 'venue', 'city',
    'match_dt', 'series_name', 'season'
]
data_cleaned = data.drop(columns=irrelevant_columns)

# Select only numeric columns
data_numeric = data_cleaned.select_dtypes(include=['number'])

# Fill missing values with the mean of their respective columns
data_filled = data_numeric.fillna(data_numeric.mean())

# Split the data into features and target variable
X = data_filled.drop(columns=['winner'])
y = data_filled['winner']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Normalize the data using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hyperparameter tuning function
def build_model(hp):
    input_dim = X_train_scaled.shape[1]
    embedding_dim = 45

    input_layer = Input(shape=(input_dim,))
    x = Dense(embedding_dim, activation='relu', kernel_regularizer='l2')(input_layer)
    x = Dropout(hp.Float('dropout', 0.2, 0.5, step=0.1, default=0.3))(x)
    output_layer = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=input_layer, outputs=output_layer)

    model.compile(
        optimizer=hp.Choice('optimizer', ['adam', 'rmsprop']),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    return model

tuner = kt.RandomSearch(
    build_model,
    objective='val_loss',
    max_trials=5,
    executions_per_trial=3,
    directory='my_dir',
    project_name='hyperparameter_tuning'
)

# Perform hyperparameter tuning
tuner.search(X_train_scaled, y_train, epochs=100, batch_size=16, validation_split=0.2, verbose=1)

# Get the best model architecture and hyperparameters
best_hp = tuner.get_best_hyperparameters()[0]
best_model = tuner.hypermodel.build(best_hp)

# Train the best model without early stopping
best_model.fit(X_train_scaled, y_train, epochs=100, batch_size=16, validation_split=0.2, verbose=1)

# Evaluate the model on the test set
y_pred = best_model.predict(X_test_scaled).ravel()
print(f"Test AUC score: {roc_auc_score(y_test, y_pred)}")
print(classification_report(y_test, (y_pred > 0.5).astype(int)))

# Define a new model to extract embeddings
embedding_model = Model(inputs=best_model.input, outputs=best_model.layers[-2].output)

# Extract embeddings for the train and test sets
train_embeddings = embedding_model.predict(X_train_scaled)
test_embeddings = embedding_model.predict(X_test_scaled)


Reloading Tuner from my_dir/hyperparameter_tuning/tuner0.json
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/10

In [None]:
data_test=pd.read_csv("/content/processed_test_f (2).csv")

TypeError: read_csv() missing 1 required positional argument: 'filepath_or_buffer'

In [None]:
!pip install keras_tuner

Collecting keras_tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/129.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━[0m [32m92.2/129.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy (from keras_tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras_tuner
Successfully installed keras_tuner-1.4.7 kt-legacy-1.0.5


In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


In [None]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

# Assuming you have your embeddings already defined as train_embeddings and test_embeddings
# and your target labels as y_train and y_test

# Example data (replace with your actual data)
# train_embeddings = ...
# test_embeddings = ...
# y_train = ...
# y_test = ...

# Convert embeddings to DataFrames
train_embeddings_df = pd.DataFrame(train_embeddings, columns=[f"emb_{i}" for i in range(train_embeddings.shape[1])])
test_embeddings_df = pd.DataFrame(test_embeddings, columns=[f"emb_{i}" for i in range(test_embeddings.shape[1])])

# Define parameter grid for RandomizedSearchCV with reduced ranges
param_grid = {
    'iterations': sp_randint(100, 500),  # reduce range of number of trees
    'learning_rate': sp_uniform(0.01, 0.1),  # reduce range of learning rate
    'depth': sp_randint(3, 7),  # reduce range of tree depth
    'l2_leaf_reg': sp_uniform(1, 5),  # reduce range of L2 regularization coefficient
    'border_count': sp_randint(32, 128),  # reduce range of number of splits
}

# Initialize CatBoostClassifier
model = CatBoostClassifier()

# Setup RandomizedSearchCV with adjusted parameters
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=10,  # Reduce number of iterations
    scoring='accuracy',  # evaluation metric
    cv=5,  # cross-validation folds
    verbose=3,  # verbosity
    random_state=42,  # random state for reproducibility
    n_jobs=-1  # use all available CPU cores
)

# Fit RandomizedSearchCV on training embeddings
random_search.fit(train_embeddings_df, y_train)

# Print best parameters and best score
print("Best Parameters found: ", random_search.best_params_)
print("Best CV Score: ", random_search.best_score_)

# Predict using the best estimator found by RandomizedSearchCV
best_model = random_search.best_estimator_
predictions = best_model.predict(test_embeddings_df)

# Evaluate performance on test set if y_test is available
accuracy = accuracy_score(y_test, predictions)
print("Test Accuracy with best model: ", accuracy)


Fitting 5 folds for each of 10 candidates, totalling 50 fits


  pid = os.fork()


0:	learn: 0.6294660	total: 2.4ms	remaining: 551ms
1:	learn: 0.5604569	total: 4.38ms	remaining: 499ms
2:	learn: 0.5004323	total: 6.11ms	remaining: 463ms
3:	learn: 0.4531936	total: 7.78ms	remaining: 439ms
4:	learn: 0.4158572	total: 9.44ms	remaining: 425ms
5:	learn: 0.3834883	total: 11.2ms	remaining: 419ms
6:	learn: 0.3504011	total: 12.8ms	remaining: 408ms
7:	learn: 0.3347190	total: 14.4ms	remaining: 401ms
8:	learn: 0.3084265	total: 16.2ms	remaining: 397ms
9:	learn: 0.2879910	total: 17.8ms	remaining: 392ms
10:	learn: 0.2698159	total: 19.4ms	remaining: 385ms
11:	learn: 0.2543472	total: 21ms	remaining: 381ms
12:	learn: 0.2417293	total: 22.7ms	remaining: 379ms
13:	learn: 0.2319396	total: 24.4ms	remaining: 377ms
14:	learn: 0.2209422	total: 26ms	remaining: 372ms
15:	learn: 0.2119370	total: 27.7ms	remaining: 370ms
16:	learn: 0.2048887	total: 29.3ms	remaining: 367ms
17:	learn: 0.1962780	total: 31ms	remaining: 365ms
18:	learn: 0.1881716	total: 32.6ms	remaining: 362ms
19:	learn: 0.1829362	total: 3

In [None]:
import pandas as pd
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

# Assuming you have your embeddings already defined as train_embeddings and test_embeddings
# and your target labels as y_train and y_test

# Example data (replace with your actual data)
# train_embeddings = ...
# test_embeddings = ...
# y_train = ...
# y_test = ...

# Convert embeddings to DataFrames
train_embeddings_df = pd.DataFrame(train_embeddings, columns=[f"emb_{i}" for i in range(train_embeddings.shape[1])])
test_embeddings_df = pd.DataFrame(test_embeddings, columns=[f"emb_{i}" for i in range(test_embeddings.shape[1])])

# Define base models
base_models = [
    ('catboost', CatBoostClassifier(random_state=42)),
    ('xgboost', XGBClassifier(random_state=42)),
    ('lightgbm', LGBMClassifier(random_state=42))
]

# Meta-model
meta_model = CatBoostClassifier(random_state=42)

# Initialize predictions DataFrame for train and test
train_predictions = pd.DataFrame()
test_predictions = pd.DataFrame()

# Initialize KFold
kf = KFold(n_splits=5, random_state=42, shuffle=True)

# Generate predictions for each base model
for name, model in base_models:
    print(f"Training and predicting with {name}...")
    train_preds = []
    test_preds = []
    for train_index, val_index in kf.split(train_embeddings_df):
        X_train_fold, X_val_fold = train_embeddings_df.iloc[train_index], train_embeddings_df.iloc[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

        model.fit(X_train_fold, y_train_fold)
        train_fold_pred = model.predict(X_val_fold)
        train_preds.extend(train_fold_pred)

        test_fold_pred = model.predict(test_embeddings_df)
        test_preds.append(test_fold_pred)

    train_predictions[name] = train_preds
    test_predictions[name] = sum(test_preds) / len(test_preds)  # Average predictions across folds

# Train meta-model on train predictions
meta_model.fit(train_predictions, y_train)

# Predict with meta-model on test predictions
meta_predictions = meta_model.predict(test_predictions)

# Evaluate performance on test set if y_test is available
accuracy = accuracy_score(y_test, meta_predictions)
print("Test Accuracy with blending classifier: ", accuracy)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
449:	learn: 0.0553363	total: 4.92s	remaining: 6.02s
450:	learn: 0.0552634	total: 4.93s	remaining: 6s
451:	learn: 0.0551025	total: 4.94s	remaining: 5.99s
452:	learn: 0.0549430	total: 4.95s	remaining: 5.98s
453:	learn: 0.0548785	total: 4.97s	remaining: 5.97s
454:	learn: 0.0547346	total: 4.98s	remaining: 5.96s
455:	learn: 0.0546247	total: 4.99s	remaining: 5.95s
456:	learn: 0.0545049	total: 5s	remaining: 5.94s
457:	learn: 0.0543898	total: 5.01s	remaining: 5.93s
458:	learn: 0.0542503	total: 5.02s	remaining: 5.92s
459:	learn: 0.0541391	total: 5.03s	remaining: 5.9s
460:	learn: 0.0539435	total: 5.04s	remaining: 5.89s
461:	learn: 0.0538040	total: 5.05s	remaining: 5.88s
462:	learn: 0.0537234	total: 5.06s	remaining: 5.87s
463:	learn: 0.0536478	total: 5.07s	remaining: 5.86s
464:	learn: 0.0535584	total: 5.08s	remaining: 5.85s
465:	learn: 0.0534268	total: 5.09s	remaining: 5.84s
466:	learn: 0.0533120	total: 5.11s	remaining: 5.83s
467:	l

In [None]:
embeddings_test = embedding_model.predict(X_log_transformed_test)



In [None]:
df_test=pd.read_csv("/content/processed_test_f (2).csv")

In [None]:
# Predict the winner and winner score using the CatBoost model
df_test['pred_winner'] = model.predict(embeddings_test)
df_test['pred_winner_score'] = model.predict_proba(embeddings_test)[:, 1]

# Determine the predicted winner ID
df_test['pred_winner_id'] = df_test.apply(
    lambda row: row['team1_id'] if row['pred_winner'] == 1 else row['team2_id'], axis=1
)

In [None]:
df_train.to_csv("train_embeddings.csv",index=False)
df_test.to_csv("test_embeddings.csv",index=False)

In [None]:
# Predict the winner and winner score using the CatBoost model
df_train['pred_winner'] = model.predict(X)
df_train['pred_winner_score'] = model.predict_proba(X)[:, 1]

# Determine the predicted winner ID
df_train['pred_winner_id'] = df_train.apply(
    lambda row: row['team1_id'] if row['pred_winner'] == 1 else row['team2_id'], axis=1
)

In [None]:
X=embeddings_df
y

0      0.0
1      1.0
2      1.0
3      1.0
4      0.0
      ... 
943    1.0
944    0.0
945    0.0
946    0.0
947    0.0
Name: winner, Length: 948, dtype: float64

In [None]:
# Standardize the entire dataset
X_scaled = scaler.transform(X)

# Convert to PyTorch tensor
X_tensor = torch.tensor(X_scaled, dtype=torch.float32)

# Extract embeddings for the entire dataset
model.eval()
_, embeddings = model(X_tensor)
embeddings = embeddings.detach().numpy()

# Predict the winner and winner score using the CatBoost model
df_train['pred_winner'] = catboost_model.predict(embeddings)
df_train['pred_winner_score'] = catboost_model.predict_proba(embeddings)[:, 1]

# Determine the predicted winner ID
df_train['pred_winner_id'] = df_train.apply(
    lambda row: row['team1_id'] if row['pred_winner'] == 1 else row['team2_id'], axis=1
)

# Print the shape of the dataframe
print(df_train.shape)

(948, 200)


In [None]:
import pandas as pd
import torch

# Load the test data
# test_file_path = "/content/drive/MyDrive/AmEx/processed_test_f.csv"
# test_data = pd.read_csv(test_file_path)

# Dropping irrelevant columns
irrelevant_columns = [
    'match id', 'team1', 'team1_id', 'team1_roster_ids',
    'team2', 'team2_id', 'team2_roster_ids', 'venue', 'city',
    'match_dt', 'series_name', 'season'
]
test_data_cleaned = test_data.drop(columns=irrelevant_columns)

# Select only numeric columns
test_data_numeric = test_data_cleaned.select_dtypes(include=['number'])

# Fill missing values with the mean of their respective columns
test_data_filled = test_data_numeric.fillna(test_data_numeric.mean())

# Standardize the features in the test data
scaler = StandardScaler()
test_data_scaled = scaler.fit_transform(test_data_filled)

# Convert to PyTorch tensor
test_data_tensor = torch.tensor(test_data_scaled, dtype=torch.float32)

# Extract embeddings for the test dataset
model.eval()
_, test_data_embeddings = model(test_data_tensor)
test_data_embeddings = test_data_embeddings.detach().numpy()

# Predict the winner and winner score using the CatBoost model
test_data['pred_winner'] = catboost_model.predict(test_data_embeddings)
test_data['pred_winner_score'] = catboost_model.predict_proba(test_data_embeddings)[:, 1]

# Determine the predicted winner ID
test_data['pred_winner_id'] = test_data.apply(
    lambda row: row['team1_id'] if row['pred_winner'] == 1 else row['team2_id'], axis=1
)

# Print the shape of the test dataframe
print(test_data.shape)

(271, 199)


In [None]:
print(df_train.shape, test_data.shape)

(948, 200) (271, 199)


In [None]:
df_train['dataset_type'] = 'train'
test_data['dataset_type'] = 'r1'
algo_name = 'CatBoost;XGBoost;LightGBM;GBM;CatBoost'
is_ensemble = 'yes'
n_trees = '131;269;294;223;165'
depth = '4;5;3;3;9'
lr = '0.026746858922480903;0.1497059900832021;0.15387125194824516;0.013177244157007226;0.21323373869865603'

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, classification_report

# Train XGBoost model
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(train_embeddings, y_train)

# Predict on the test set
y_pred_xgb = xgb_model.predict(test_embeddings)

# Evaluate the XGBoost model
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
report_xgb = classification_report(y_test, y_pred_xgb)

# Perform cross-validation on the test set
cv_scores_xgb = cross_val_score(xgb_model, test_embeddings, y_test, cv=5, scoring='accuracy', n_jobs=-1)

# Print the results
print(f"XGBoost Accuracy: {accuracy_xgb}")
print(f"XGBoost Classification Report:\n{report_xgb}")
print(f"Cross-Validation Scores - Avg: {cv_scores_xgb.mean()}, Min: {cv_scores_xgb.min()}, Max: {cv_scores_xgb.max()}")

XGBoost Accuracy: 0.8245614035087719
XGBoost Classification Report:
              precision    recall  f1-score   support

         0.0       0.81      0.82      0.82       136
         1.0       0.84      0.83      0.83       149

    accuracy                           0.82       285
   macro avg       0.82      0.82      0.82       285
weighted avg       0.82      0.82      0.82       285

Cross-Validation Scores - Avg: 0.8035087719298245, Min: 0.7719298245614035, Max: 0.8421052631578947


In [None]:
df_train.to_csv('sub19 train.csv', index=False)
test_data.to_csv('sub19 test.csv', index=False)

In [None]:
# ROUND 2 SUBMISSION FILES

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim

# Load the data
data = df_train

# Dropping irrelevant columns
irrelevant_columns = [
    'match id', 'team1', 'team1_id', 'team1_roster_ids',
    'team2', 'team2_id', 'team2_roster_ids', 'venue', 'city',
    'match_dt', 'series_name', 'season'
]
data_cleaned = data.drop(columns=irrelevant_columns)

# Select only numeric columns
data_numeric = data_cleaned.select_dtypes(include=['number'])

# Fill missing values with the mean of their respective columns
data_filled = data_numeric.fillna(data_numeric.mean())

# Split the data into features and target variable
X = data_filled.drop(columns=['winner'])
y = data_filled['winner']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Convert to PyTorch tensors
X_tensor = torch.tensor(X_scaled, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.long)

# Define the neural network
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(X.shape[1], 100)
        self.fc2 = nn.Linear(100, 35)
        self.fc3 = nn.Linear(35, 2)  # Output layer

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        embeddings = x  # Extract embeddings here
        x = self.fc3(x)
        return x, embeddings

model = Net()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    outputs, _ = model(X_tensor)
    loss = criterion(outputs, y_tensor)
    loss.backward()
    optimizer.step()
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')

# Extract embeddings from the trained model
model.eval()
_, embeddings = model(X_tensor)

embeddings = embeddings.detach().numpy()

print(f"Shape of embeddings: {embeddings.shape}")

Epoch 1/100, Loss: 0.7033334970474243
Epoch 2/100, Loss: 0.6897017955780029
Epoch 3/100, Loss: 0.6779980063438416
Epoch 4/100, Loss: 0.6676099300384521
Epoch 5/100, Loss: 0.6578643321990967
Epoch 6/100, Loss: 0.6484081149101257
Epoch 7/100, Loss: 0.6389741897583008
Epoch 8/100, Loss: 0.6293782591819763
Epoch 9/100, Loss: 0.6195287108421326
Epoch 10/100, Loss: 0.609406590461731
Epoch 11/100, Loss: 0.5989533066749573
Epoch 12/100, Loss: 0.5881728529930115
Epoch 13/100, Loss: 0.5770559906959534
Epoch 14/100, Loss: 0.5655280351638794
Epoch 15/100, Loss: 0.5534976720809937
Epoch 16/100, Loss: 0.5410261154174805
Epoch 17/100, Loss: 0.5280237197875977
Epoch 18/100, Loss: 0.5145041942596436
Epoch 19/100, Loss: 0.5004948973655701
Epoch 20/100, Loss: 0.4859408140182495
Epoch 21/100, Loss: 0.47097092866897583
Epoch 22/100, Loss: 0.455764502286911
Epoch 23/100, Loss: 0.4403248429298401
Epoch 24/100, Loss: 0.424640953540802
Epoch 25/100, Loss: 0.40875089168548584
Epoch 26/100, Loss: 0.3926490843296

In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, classification_report

# Train CatBoost model on the entire dataset embeddings
catboost_model = CatBoostClassifier(verbose=0)
catboost_model.fit(embeddings, y)

# Predict on the entire dataset
y_pred_catboost = catboost_model.predict(embeddings)

# Evaluate the CatBoost model
accuracy_catboost = accuracy_score(y, y_pred_catboost)
report_catboost = classification_report(y, y_pred_catboost)

# Perform cross-validation on the entire dataset
cv_scores = cross_val_score(catboost_model, embeddings, y, cv=5, scoring='accuracy', n_jobs=-1)

# Print the results
print(f"CatBoost Accuracy: {accuracy_catboost}")
print(f"CatBoost Classification Report:\n{report_catboost}")
print(f"Cross-Validation Scores - Avg: {cv_scores.mean()}, Min: {cv_scores.min()}, Max: {cv_scores.max()}")

CatBoost Accuracy: 1.0
CatBoost Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       477
         1.0       1.00      1.00      1.00       471

    accuracy                           1.00       948
   macro avg       1.00      1.00      1.00       948
weighted avg       1.00      1.00      1.00       948

Cross-Validation Scores - Avg: 1.0, Min: 1.0, Max: 1.0


In [None]:
data_r2 = round2_data
data_r2 = data_r2.drop(columns = irrelevant_columns)
data_r2_nums = data_r2.select_dtypes(include=['number'])
data_r2_filled = data_r2_nums.fillna(data_r2_nums.mean())

scaler = StandardScaler()
X_r2_scaled = scaler.fit_transform(data_r2_filled)

X_r2_tensor = torch.tensor(X_r2_scaled, dtype=torch.float32)

_, embeddings_r2 = model(X_r2_tensor)
embeddings_r2 = embeddings_r2.detach().numpy()
print(embeddings_r2.shape)

(207, 35)


In [None]:
round2_data['pred_winner'] = catboost_model.predict(embeddings_r2)
round2_data['pred_winner_score'] = catboost_model.predict_proba(embeddings_r2)[:, 1]
round2_data['pred_winner_id'] = round2_data.apply(
    lambda row: row['team1_id'] if row['pred_winner'] == 1 else row['team2_id'], axis=1
)
print(round2_data.shape)

(207, 199)


In [None]:
round2_data.to_csv('r2.csv', index=False)

In [None]:
feature_importances = catboost_model.get_feature_importance()

sorted_indices = np.argsort(-feature_importances)
sorted_embeddings_r2 = embeddings_r2[:, sorted_indices]
sorted_embeddings_r2_df = pd.DataFrame(sorted_embeddings_r2)
sorted_embeddings_r2_df.to_csv('sorted_embeddings_r2.csv', index=False)