In [None]:
# Install the third-party packages this notebook needs into the environment.
# NOTE(review): xgboost, structureboost, ml_insights and pytz are imported by
# later cells but are not installed here - confirm they are already available.
!python3.11 -m pip install --upgrade pip
!pip install --upgrade lightgbm
!pip install MLB-StatsAPI
!pip install tensorflow scikit-learn
!pip install matplotlib scikit-learn

In [None]:
import numpy as np
import pandas as pd
import statsapi
import matplotlib.pyplot as plt
import csv
import sys

import lightgbm as lgbm
import structureboost as stb
import ml_insights as mli
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from structureboost import log_loss
pd.set_option('display.max_columns',1000)
pd.set_option('display.max_rows',1000)

In [None]:
# Load the game-level dataset; every later cell reads from and adds columns to `df`.
# The path is relative to the notebook's working directory.
df = pd.read_csv('df_bp9_new.csv')

In [None]:
# One-hot encode the home team code as `stadium_<TEAM>` indicator columns and
# append them to `df` (the home team is used as a stand-in for the ballpark).
df = pd.concat([df, pd.get_dummies(df['team_h'], prefix='stadium')], axis=1)

In [None]:
# Running streak per team code: +n after n straight wins, -n after n straight
# losses. Mutated by get_win_streak() as games are fed to it one at a time.
team_win_streak = {

}


def _next_streak(previous_streak, won):
    """Return the streak after one more game, given the streak before it (0 = no history)."""
    if won:
        return previous_streak + 1 if previous_streak > 0 else 1
    return previous_streak - 1 if previous_streak < 0 else -1


def get_win_streak(home_team, visiting_team, home_team_win, ret_home_win_streak):
    """Record one game in `team_win_streak` and return a streak as it stood BEFORE the game.

    Parameters:
    home_team: key identifying the home team.
    visiting_team: key identifying the visiting team.
    home_team_win: 1 when the home team won; any other value counts as a visitor win.
    ret_home_win_streak: truthy to return the home team's prior streak,
        falsy to return the visiting team's prior streak.

    Returns:
    int: the requested team's streak before this game (positive = winning
    streak, negative = losing streak, 0 = team not seen before).

    The function must be called once per game in chronological order, because
    each call updates the shared `team_win_streak` dictionary.
    """
    prev_home_win_rate = team_win_streak.get(home_team, 0)
    prev_visitor_win_rate = team_win_streak.get(visiting_team, 0)

    # The outcome flag - not the team identifier - decides who won. Each team is
    # updated from its own history, so meeting a first-time opponent never
    # resets an existing streak.
    home_won = home_team_win == 1
    team_win_streak[home_team] = _next_streak(prev_home_win_rate, home_won)
    team_win_streak[visiting_team] = _next_streak(prev_visitor_win_rate, not home_won)

    if ret_home_win_streak:
        return prev_home_win_rate
    return prev_visitor_win_rate

def _streak_before_game(row, want_home_streak):
    """Feed one game row to get_win_streak and return the requested side's prior streak."""
    return get_win_streak(row['team_h'], row['team_v'], row['home_victory'], want_home_streak)


# First replay of every game: the home team's streak entering each game.
df['home_win_streak'] = df.apply(_streak_before_game, axis=1, args=(True,))

# Start again from an empty tracker so the second replay sees the same history.
team_win_streak = {}

# Second replay: the visiting team's streak entering each game.
df['visitor_win_streak'] = df.apply(_streak_before_game, axis=1, args=(False,))

In [None]:
# `date` is treated as YYYYMMDD: slice its text form into year / month / day
# strings (month and day keep their leading zero).
date_text = df['date'].apply(str)
df['month'] = date_text.str[4:6]
df['day'] = date_text.str[6:8]
df['year'] = date_text.str[0:4]

df.month.unique()

# season -> {team -> runs accumulated so far} and season -> {team -> games counted}.
cumulative_runs = {}
game_counts = {}

def calculate_avg(team, runs, month, season):
    """Record one game's runs for `team` and return the team's average for this game.

    Parameters:
    team: team key.
    runs: runs credited to `team` in this game.
    month: zero-padded two-character month string (compared lexically with '06').
    season: season key (for example an int year); the previous season is looked
        up under the same type.

    Returns:
    For games through June: the team's final average from the previous season,
    or None when that season or team has no recorded games.
    For later months: the team's current-season average before this game, or
    None when the team has no earlier game this season.

    Raises:
    RuntimeError: a game after June arrives for a season with no earlier games.

    Must be called once per game in chronological order; it mutates the
    module-level `cumulative_runs` and `game_counts`.
    """
    if month <= '06':
        season_runs = cumulative_runs.setdefault(season, {})
        season_games = game_counts.setdefault(season, {})
        season_runs[team] = season_runs.get(team, 0) + runs
        season_games[team] = season_games.get(team, 0) + 1

        # Keep the key type of `season` (int, numpy integer, str, ...) so the
        # lookup matches the keys stored above.
        previous_season = type(season)(int(season) - 1)

        previous_runs = cumulative_runs.get(previous_season)
        if previous_runs is None or team not in previous_runs:
            return None

        return previous_runs[team] / game_counts[previous_season][team]

    if season not in cumulative_runs:
        # Stop with a traceable error instead of sys.exit(), which would tear
        # down the notebook cell without saying where the data went wrong.
        raise RuntimeError(
            f"no games through June recorded for season {season!r} (month {month!r})"
        )

    if team not in cumulative_runs[season]:
        print("ERROR: SOME HOW")
        return None

    avg = cumulative_runs[season][team] / game_counts[season][team]

    cumulative_runs[season][team] += runs
    game_counts[season][team] += 1

    return avg


# Replay the games ONCE, in row order, feeding both clubs of each game to
# calculate_avg. Two separate full passes over the same accumulator would let
# the second pass see every game (including future ones) recorded by the first.
# NOTE(review): assumes `df` rows are in chronological order - confirm.
_avg_runs_pairs = [
    (
        calculate_avg(team_v, runs_v, month, season),
        calculate_avg(team_h, runs_h, month, season),
    )
    for team_v, runs_v, team_h, runs_h, month, season in zip(
        df['team_v'], df['runs_v'], df['team_h'], df['runs_h'], df['month'], df['season']
    )
]

# Missing averages (None) become NaN.
df['avg_runs_v'] = pd.Series([pair[0] for pair in _avg_runs_pairs], index=df.index, dtype='float64')
df['avg_runs_h'] = pd.Series([pair[1] for pair in _avg_runs_pairs], index=df.index, dtype='float64')

In [None]:

# Reset the accumulators: from here on they hold runs ALLOWED, not runs scored.
# season -> {team -> runs allowed so far} and season -> {team -> games counted}.
cumulative_runs = {}
game_counts = {}

def calc_runs_allowed(runs_scored_on_team, team, month, season):
    """Record the runs `team` allowed in one game and return its average for this game.

    Parameters:
    runs_scored_on_team: runs the opponent scored against `team` in this game.
    team: team key.
    month: zero-padded two-character month string (compared lexically with '06').
    season: season key (for example an int year); the previous season is looked
        up under the same type.

    Returns:
    For games through June: the team's final runs-allowed average from the
    previous season, or None when that season or team has no recorded games.
    For later months: the team's current-season average before this game, or
    None when the team has no earlier game this season.

    Raises:
    RuntimeError: a game after June arrives for a season with no earlier games.

    Must be called once per game in chronological order; it mutates the
    module-level `cumulative_runs` and `game_counts`.
    """
    if month <= '06':
        season_runs = cumulative_runs.setdefault(season, {})
        season_games = game_counts.setdefault(season, {})
        season_runs[team] = season_runs.get(team, 0) + runs_scored_on_team
        season_games[team] = season_games.get(team, 0) + 1

        # Keep the key type of `season` (int, numpy integer, str, ...) so the
        # lookup matches the keys stored above.
        previous_season = type(season)(int(season) - 1)

        previous_runs = cumulative_runs.get(previous_season)
        if previous_runs is None or team not in previous_runs:
            return None

        return previous_runs[team] / game_counts[previous_season][team]

    if season not in cumulative_runs:
        # Stop with a traceable error instead of sys.exit(), which would tear
        # down the notebook cell without saying where the data went wrong.
        raise RuntimeError(
            f"no games through June recorded for season {season!r} (month {month!r})"
        )

    if team not in cumulative_runs[season]:
        print("ERROR: SOME HOW")
        return None

    avg = cumulative_runs[season][team] / game_counts[season][team]

    cumulative_runs[season][team] += runs_scored_on_team
    game_counts[season][team] += 1

    return avg

# Replay the games ONCE, in row order, feeding both clubs of each game to
# calc_runs_allowed. Two separate full passes over the same accumulator would
# let the second pass see every game (including future ones) recorded by the first.
# The visitor allowed the home side's runs and vice versa.
# NOTE(review): assumes `df` rows are in chronological order - confirm.
_runs_allowed_pairs = [
    (
        calc_runs_allowed(runs_h, team_v, month, season),
        calc_runs_allowed(runs_v, team_h, month, season),
    )
    for team_v, runs_v, team_h, runs_h, month, season in zip(
        df['team_v'], df['runs_v'], df['team_h'], df['runs_h'], df['month'], df['season']
    )
]

# Missing averages (None) become NaN.
df['avg_runs_allowed_v'] = pd.Series([pair[0] for pair in _runs_allowed_pairs], index=df.index, dtype='float64')
df['avg_runs_allowed_h'] = pd.Series([pair[1] for pair in _runs_allowed_pairs], index=df.index, dtype='float64')

In [None]:
# Home time zone for each team code, using the shorthand names that
# hour_difference() accepts ("Eastern", "Central", "Mountain", "Pacific").
# Each code maps to a single fixed zone, including codes of relocated or
# renamed franchises.
team_time_zones = {
    "TOR": "Eastern",  # Toronto Blue Jays
    "ATL": "Eastern",  # Atlanta Braves
    "BAL": "Eastern",  # Baltimore Orioles
    "DET": "Eastern",  # Detroit Tigers
    "BOS": "Eastern",  # Boston Red Sox
    "MIN": "Central",  # Minnesota Twins
    "NYA": "Eastern",  # New York Yankees
    "LAN": "Pacific",  # Los Angeles Dodgers
    "CHN": "Central",  # Chicago Cubs
    "SFN": "Pacific",  # San Francisco Giants
    "PIT": "Eastern",  # Pittsburgh Pirates
    "CLE": "Eastern",  # Cleveland Guardians
    "MON": "Eastern",  # Montreal Expos (defunct, historically Eastern)
    "SEA": "Pacific",  # Seattle Mariners
    "KCA": "Central",  # Kansas City Royals
    "PHI": "Eastern",  # Philadelphia Phillies
    "MIL": "Central",  # Milwaukee Brewers
    "CIN": "Eastern",  # Cincinnati Reds
    "NYN": "Eastern",  # New York Mets
    "HOU": "Central",  # Houston Astros
    "SLN": "Central",  # St. Louis Cardinals
    "SDN": "Pacific",  # San Diego Padres
    "CHA": "Central",  # Chicago White Sox
    "TEX": "Central",  # Texas Rangers
    "CAL": "Pacific",  # California Angels (now Los Angeles Angels)
    "OAK": "Pacific",  # Oakland Athletics
    "COL": "Mountain", # Colorado Rockies
    "FLO": "Eastern",  # Florida Marlins (now Miami Marlins)
    "ANA": "Pacific",  # Anaheim Angels (now Los Angeles Angels)
    "TBA": "Eastern",  # Tampa Bay Rays
    "ARI": "Mountain", # Arizona Diamondbacks - NOTE(review): Arizona does not observe DST, so in-season clocks match Pacific; confirm "Mountain" is intended
    "WAS": "Eastern",  # Washington Nationals
    "MIA": "Eastern",  # Miami Marlins
}


from datetime import datetime
import pytz

def hour_difference(timezone1, timezone2):
    """
    Returns the hour difference between two time zones.

    Parameters:
    timezone1 (str): The name of the first time zone (e.g., 'Eastern', 'Pacific').
    timezone2 (str): The name of the second time zone (e.g., 'Central', 'Mountain').

    Returns:
    int: The absolute hour difference between the two time zones.

    Raises:
    ValueError: If either name is not 'Eastern', 'Central', 'Mountain' or 'Pacific'.
    """
    # Standard-time UTC offsets in hours. The US/Eastern, US/Central,
    # US/Mountain and US/Pacific zones switch to daylight time together, so the
    # gap between any two of them is the same all year. A fixed table gives
    # that answer without depending on the clock at the moment the cell runs,
    # and without rebuilding time zone objects for every row.
    utc_offset_hours = {
        "Eastern": -5,
        "Central": -6,
        "Mountain": -7,
        "Pacific": -8,
    }

    try:
        offset1 = utc_offset_hours[timezone1]
        offset2 = utc_offset_hours[timezone2]
    except KeyError as exc:
        raise ValueError(f"unknown time zone name: {exc.args[0]!r}") from None

    # Calculate and return the hour difference
    return abs(offset1 - offset2)


# Hours separating the two clubs' home time zones, one value per game.
df['hour_diff'] = [
    hour_difference(team_time_zones[home_code], team_time_zones[visitor_code])
    for home_code, visitor_code in zip(df['team_h'], df['team_v'])
]

In [None]:
# Step 1: rebuild `year` as an integer taken from the first four characters of the date.
df['year'] = df['date'].astype('str').apply(lambda date_text: int(date_text[:4]))

# Step 2: share of home games won, per home team and year.
home_win_rate = (
    df.groupby(['team_h', 'year'])['home_victory']
      .mean()
      .rename('home_win_rate')
      .reset_index()
)

# Step 3: within each team, carry the rate forward one row. groupby sorts by
# (team_h, year), so this is the team's previous recorded year - which is not
# necessarily year - 1 if a year is missing for that team.
home_win_rate['prev_year_home_win_rate'] = (
    home_win_rate.groupby('team_h')['home_win_rate'].shift(1)
)

# Step 4: attach the prior-year rate to every game of that home team and year.
df = df.merge(
    home_win_rate[['team_h', 'year', 'prev_year_home_win_rate']],
    on=['team_h', 'year'],
    how='left',
)

In [None]:
# Share of road games WON by each visiting team, per year. `home_victory`
# marks a home win, so the visitor's result is its complement; averaging
# `home_victory` directly would give the visitor's loss rate instead.
# NOTE(review): rows with run_diff == 0 are still present at this point and
# count as visitor wins here if their home_victory is 0 - confirm.
visitor_win_rate = (
    df.assign(visitor_victory=1 - df['home_victory'])
      .groupby(['team_v', 'year'])['visitor_victory']
      .mean()
      .rename('visitor_win_rate')
      .reset_index()
)

# Within each team, carry the rate forward one row: the team's previous
# recorded year (not necessarily year - 1 if a year is missing).
visitor_win_rate['prev_year_visitor_win_rate'] = visitor_win_rate.groupby('team_v')['visitor_win_rate'].shift(1)

# Attach the prior-year rate to every game of that visiting team and year.
df = pd.merge(df, visitor_win_rate[['team_v', 'year', 'prev_year_visitor_win_rate']],
              on=['team_v', 'year'],
              how='left')

In [None]:
# Keep a game when AT LEAST ONE of the two prior-year win rates is known;
# only games missing both are dropped.
# NOTE(review): if both rates are meant to be required, combine with `&` instead.
has_home_rate = df.prev_year_home_win_rate.notnull()
has_visitor_rate = df.prev_year_visitor_win_rate.notnull()
df = df[has_home_rate | has_visitor_rate]

## Begin Modeling

In [None]:
# Drop games whose run difference is zero (presumably ties, which have no winner).
df = df[df.run_diff!=0]
# Training seasons: 1981-2016, restricted to rows where OBP_162_h is present.
df_train = df[(df.season>1980) & (df.season<=2016) & ~(df.OBP_162_h.isnull())]
# df_train = df[(df.season>2000) & (df.season<=2016) & ~(df.OBP_162_h.isnull())]
# Validation seasons: 2017-2018.
df_valid = df[(df.season>=2017) & (df.season<=2018)]
# Test seasons: 2021 onward. Seasons 2019-2020 fall into none of the three splits.
df_test = df[df.season>=2021]

## Let's add in some lineup features

In [None]:
# Name of the label column (presumably 1 when the home team won, 0 otherwise).
target = 'home_victory'

# Label vectors aligned row-for-row with df_train / df_valid / df_test as they
# exist when this cell runs; they are not refreshed when later cells rebuild
# the splits.
y_train = df_train[target].to_numpy()
y_valid = df_valid[target].to_numpy()
y_test = df_test[target].to_numpy()


In [None]:

class Blending:
    """Two-model blend: a meta-model learns from the base models' hold-out probabilities.

    Both base models must implement ``fit(X, y)`` and ``predict_proba(X)``;
    the meta-model must implement ``fit(X, y)`` and ``predict(X)`` (and
    ``predict_proba(X)`` if :meth:`predict_proba` is used).
    """

    def __init__(self, base_model_one, base_model_two, meta_model):
        self.base_model_one = base_model_one
        self.base_model_two = base_model_two
        self.meta_model = meta_model

    def _meta_features(self, X):
        """Stack each base model's positive-class probability into an (n_rows, 2) array."""
        proba_one = self.base_model_one.predict_proba(X)[:, 1]
        proba_two = self.base_model_two.predict_proba(X)[:, 1]
        return np.column_stack((proba_one, proba_two))

    def fit(self, X_train, y_train, X_val, y_val):
        """Fit the base models on the training set, then the meta-model on the validation set.

        The base models are (re)fitted with plain ``fit(X_train, y_train)``, so
        any fit-time options used earlier (for example early stopping) do not apply.

        Returns:
        self, to allow call chaining.
        """
        self.base_model_one.fit(X_train, y_train)
        self.base_model_two.fit(X_train, y_train)

        # The meta-model only ever sees predictions for rows the base models
        # were not trained on.
        self.meta_model.fit(self._meta_features(X_val), y_val)

        return self

    def predict(self, X_test):
        """Return the meta-model's ``predict`` output (class labels for a classifier)."""
        return self.meta_model.predict(self._meta_features(X_test))

    def predict_proba(self, X_test):
        """Return the meta-model's ``predict_proba`` output for the blended features."""
        return self.meta_model.predict_proba(self._meta_features(X_test))



In [None]:
def res(pred, implied_prob):
    """Return `implied_prob` when `pred` is 0.1 or more away from it, otherwise `pred`."""
    gap = abs(pred - implied_prob)
    return implied_prob if gap >= 0.1 else pred

def try_features(feat_set, max_depth=2):
    """Train the LightGBM baseline and the blended model on `feat_set` and score both on the test split.

    Reads the module-level `df_train`, `df_valid` and `df_test` frames. The
    labels are taken from those same frames, so features and labels stay in
    step even after the splits have been rebuilt.

    Parameters:
    feat_set: list of feature column names.
    max_depth: tree depth for the LightGBM and random-forest models.

    Returns:
    tuple: (yt_accuracy_rate, vegas_accuracy_rate, our_accuracy_rate,
    difference_in_accuracy_yt, difference_in_accuracy), all in percent rounded
    to 2 decimals. "yt" is the LightGBM baseline, "vegas" picks the home team
    when `implied_prob_h_mid` > 0.5, "our" is the blended model; the two
    differences are measured against the vegas accuracy.
    """
    target = 'home_victory'

    y_train = df_train[target].to_numpy()
    y_valid = df_valid[target].to_numpy()
    y_test = df_test[target].to_numpy()

    # Impute missing values with means learned from the training rows only.
    imputer = SimpleImputer(strategy='mean')  # Or 'median', 'most_frequent', etc.
    X_train = imputer.fit_transform(df_train.loc[:, feat_set])
    X_valid = imputer.transform(df_valid.loc[:, feat_set])
    X_test = imputer.transform(df_test.loc[:, feat_set])

    # Baseline: LightGBM with early stopping on the validation split.
    lgbm1 = lgbm.LGBMClassifier(n_estimators=1000, learning_rate=.02, max_depth=max_depth)
    lgbm1.fit(X_train, y_train, eval_set=(X_valid, y_valid), eval_metric='logloss',
              callbacks=[lgbm.early_stopping(stopping_rounds=50)])

    # Score the baseline now: Blending.fit below refits lgbm1 without early stopping.
    lgbm_picks = (lgbm1.predict_proba(X_test)[:, 1] > 0.5).astype(int)

    stacking_clf = StackingClassifier(
        estimators=[
            ('random_forest', RandomForestClassifier(max_depth=max_depth)),
            ('log_reg', LogisticRegression(max_iter=1000)),
        ],
        final_estimator=LogisticRegression(max_iter=1000))

    # The meta-model gets its own estimator instance rather than sharing the
    # one handed to the stack.
    blending_model = Blending(lgbm1, stacking_clf, LogisticRegression(max_iter=1000))
    blending_model.fit(X_train, y_train, X_valid, y_valid)

    # Blending.predict returns the meta-model's class labels.
    blended_picks = blending_model.predict(X_test)

    # Bookmaker pick, taken positionally so it lines up with y_test regardless
    # of df_test's index.
    vegas_picks = (df_test['implied_prob_h_mid'] > .5).astype(int).to_numpy()

    lgbm_accuracy = accuracy_score(y_test, lgbm_picks) * 100
    vegas_accuracy = accuracy_score(y_test, vegas_picks) * 100
    blended_accuracy = accuracy_score(y_test, blended_picks) * 100

    yt_accuracy_rate = round(lgbm_accuracy, 2)
    vegas_accuracy_rate = round(vegas_accuracy, 2)
    our_accuracy_rate = round(blended_accuracy, 2)
    difference_in_accuracy_yt = round(lgbm_accuracy - vegas_accuracy, 2)
    difference_in_accuracy = round(blended_accuracy - vegas_accuracy, 2)

    print("YT model's accuracy score: ", yt_accuracy_rate)
    print("Vegas model's accuracy score: ", vegas_accuracy_rate)
    print("Our model's accuracy score: ", our_accuracy_rate)
    print("Difference in accuracy (YT): ", difference_in_accuracy_yt)
    print("Difference in accuracy (Our):", difference_in_accuracy)

    return yt_accuracy_rate, vegas_accuracy_rate, our_accuracy_rate, difference_in_accuracy_yt, difference_in_accuracy


In [None]:
# Feature columns fed to every model below. Suffix _h / _v marks the home /
# visiting side of the same statistic.
best_features = [
        "Strt_WHIP_35_v",
        "Strt_TB_BB_perc_35_h",
        "Strt_TB_BB_perc_35_v",
        "Strt_H_BB_perc_35_h",
        "Strt_H_BB_perc_35_v",
        "Strt_SO_perc_10_h",
        "Strt_SO_perc_10_v",
        "Bpen_WHIP_75_h",
        "Bpen_WHIP_75_v",
        "Bpen_TB_BB_perc_75_h",
        "Bpen_TB_BB_perc_75_v",
        "Bpen_H_BB_perc_75_h",
        "Bpen_H_BB_perc_75_v",
        "Bpen_SO_perc_75_h",
        "Bpen_SO_perc_75_v",
        "Bpen_WHIP_35_h",
        "Bpen_WHIP_35_v",
        "Bpen_TB_BB_perc_35_h",
        "Bpen_TB_BB_perc_35_v",
        "Bpen_H_BB_perc_35_h",
        "Bpen_H_BB_perc_35_v",
        "prev_year_home_win_rate",
        "avg_runs_allowed_h",
        "avg_runs_allowed_v",
        "avg_runs_h",
        "avg_runs_v",
        "prev_year_visitor_win_rate",
        "days_between_games_h",
        # The one-hot stadium_<TEAM> columns are built earlier in the notebook
        # but are currently switched off:
        # "stadium_ANA",
        # "stadium_ARI",
        # "stadium_ATL",
        # "stadium_BAL",
        # "stadium_BOS",
        # "stadium_CAL",
        # "stadium_CHA",
        # "stadium_CHN",
        # "stadium_CIN",
        # "stadium_CLE",
        # "stadium_COL",
        # "stadium_DET",
        # "stadium_FLO",
        # "stadium_HOU",
        # "stadium_KCA",
        # "stadium_LAN",
        # "stadium_MIA",
        # "stadium_MIL",
        # "stadium_MIN",
        # "stadium_MON",
        # "stadium_NYA",
        # "stadium_NYN",
        # "stadium_OAK",
        # "stadium_PHI",
        # "stadium_PIT",
        # "stadium_SDN",
        # "stadium_SEA",
        # "stadium_SFN",
        # "stadium_SLN",
        # "stadium_TBA",
        # "stadium_TEX",
        # "stadium_TOR",
        # "stadium_WAS",
        "days_between_games_v",
        "hour_diff",
        "implied_prob_h",
        "implied_prob_v",
        "implied_prob_h_mid",
        "Bpen_SO_perc_35_h",
        "Bpen_SO_perc_35_v",
        "Bpen_WHIP_10_h",
        "Bpen_WHIP_10_v",
        "Bpen_TB_BB_perc_10_h",
        "Bpen_TB_BB_perc_10_v",
        "Bpen_H_BB_perc_10_h",
        "Bpen_H_BB_perc_10_v"
    ]

### First, let's revisit our best model from our last modeling session

In [None]:
# Train and score the baseline and blended models on the current feature list;
# the five returned percentages are kept as notebook-level variables.
yt_accuracy_rate, vegas_accuracy_rate, our_accuracy_rate, difference_in_accuracy_yt, difference_in_accuracy = try_features(best_features)

In [None]:
# Drop zero-run-difference games and keep only the model features, the label
# and the season. `run_diff` is not among the kept columns, so this cell
# cannot be re-run without reloading `df`.
df = df.loc[df.run_diff != 0, best_features + ['home_victory', 'season']]

In [None]:
# Total number of missing cells across all feature columns (before imputation).
print("df null value count: ", df[best_features].isnull().sum().sum())

In [None]:
# Number of feature columns in use.
print(len(best_features))

In [None]:
# Snapshot of the un-imputed frame, used below to compare summary statistics.
df_before = df.copy()

In [None]:
# Fill missing feature values with the column mean, overwriting the columns in `df`.
# NOTE(review): the imputer is fitted on every season in `df`, so validation
# and test rows contribute to the fill values used for training rows -
# consider fitting on the training seasons only.
imputer = SimpleImputer(strategy='mean')  # Or 'median', 'most_frequent', etc.
df[best_features] = imputer.fit_transform(df[best_features])  # Ensure that you're only imputing relevant columns

In [None]:

# Change in each summary statistic caused by the imputation (after minus before).
print(df[best_features].describe() - df_before[best_features].describe())

In [None]:
# Summary statistics of the features before imputation.
df_before[best_features].describe()

In [None]:
# Summary statistics of every numeric column after imputation.
df.describe()

In [None]:
from sklearn.cluster import KMeans

# Partition every game into 9 groups by feature similarity and store the group
# id in `bin`.
# NOTE(review): the model is fitted on all seasons at once (training,
# validation and test rows) and on features that have been imputed but not
# scaled - confirm both are intended.
kmeans = KMeans(n_clusters=9, random_state=42, n_init=10, max_iter=5000)

kmeans.fit(df[best_features])  # Fit the model

df['bin'] = kmeans.predict(df[best_features])  # Assign bins to the data

# Rebuild the season splits so they carry the new `bin` column. Unlike the
# earlier split, the training slice no longer filters on OBP_162_h.
df_train = df[(df.season>1980) & (df.season<=2016)]
df_valid = df[(df.season>=2017) & (df.season<=2018)]
df_test = df[df.season>=2021]

In [None]:
from sklearn.cluster import DBSCAN

# Density-based clustering of every game; the label goes into `cluster`.
# DBSCAN labels points that belong to no cluster as -1.
# NOTE(review): eps=0.5 is applied to unscaled features and the fit covers all
# seasons (training, validation and test rows) - confirm both are intended.
dbscan = DBSCAN(eps=0.5, min_samples=5)
clusters = dbscan.fit_predict(df[best_features])
df['cluster'] = clusters

# Rebuild the season splits so they carry the new `cluster` column.
df_train = df[(df.season>1980) & (df.season<=2016)]
df_valid = df[(df.season>=2017) & (df.season<=2018)]
df_test = df[df.season>=2021]

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Feature matrix for the elbow scan: training seasons only.
X = df_train[best_features]

# Candidate cluster counts: 2 through 14.
cluster_range = range(2, 15)

# Fit one throw-away model per candidate K and keep only its inertia. The
# models are deliberately not bound to the name `kmeans`, so the fitted
# 9-cluster model that produced the `bin` column is left intact.
inertia = [
    KMeans(n_clusters=k, random_state=42, n_init=10).fit(X).inertia_
    for k in cluster_range
]

# Plot the Elbow Curve
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(cluster_range, inertia, marker='o')
ax.set_xlabel("Number of Clusters (K)")
ax.set_ylabel("Inertia (Sum of Squared Distances)")
ax.set_title("Elbow Method for Optimal K")
plt.show()

In [None]:
# Number of distinct DBSCAN cluster labels present in each split. The printed
# headings say "distribution", but the value shown is a count of unique labels.

# Disabled experiment: merge a sparsely populated bin into a neighbour.
# df_train['bin'] = df_train['bin'].replace({2: 3})
print("Training set distribution:")
print(df_train['cluster'].nunique())
print("Validation set distribution:")
print(df_valid['cluster'].nunique())
print("Test set distribution:")
print(df_test['cluster'].nunique())

In [None]:
# Check for duplicate column names
duplicate_columns = df_train.columns[df_train.columns.duplicated()]
print(f"Duplicate columns: {duplicate_columns}")

In [None]:
# Compare the KMeans bin labelled 1 across the three splits by printing the
# mean of every numeric column.
train_bin_one = df_train[df_train['bin'] == 1]
val_bin_one = df_valid[df_valid['bin'] == 1]
test_bin_one = df_test[df_test['bin'] == 1]

print("Train Bin 1 Mean Values:\n", train_bin_one.mean(numeric_only=True))
print("Validation Bin 1 Mean Values:\n", val_bin_one.mean(numeric_only=True))
print("Test Bin 1 Mean Values:\n", test_bin_one.mean(numeric_only=True))

In [None]:
from itertools import combinations
from sklearn.svm import SVC
import numpy as np
from sklearn.ensemble import StackingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score

def get_model(model_name):
    """Build a fresh, unfitted estimator for the given short name.

    Known names: 'log_reg', 'dec_tree', 'random_forest', 'xgb', 'lgbm', 'SVC',
    'neural_net'. For any other name the name and "Model not found" are
    printed and None is returned.
    """
    # Each entry is a zero-argument factory, so only the requested estimator
    # is actually constructed.
    factories = {
        'log_reg': lambda: LogisticRegression(max_iter=1000),
        'dec_tree': lambda: DecisionTreeClassifier(max_depth=2),
        'random_forest': lambda: RandomForestClassifier(max_depth=2),
        'xgb': lambda: XGBClassifier(n_estimators=1000, learning_rate=.02, max_depth=2),
        'lgbm': lambda: lgbm.LGBMClassifier(n_estimators=1000, learning_rate=.02, max_depth=2),
        'SVC': lambda: SVC(probability=True),
        # Two hidden layers (64 and 32 units), ReLU activation, Adam solver,
        # at most 500 iterations.
        'neural_net': lambda: MLPClassifier(hidden_layer_sizes=(64, 32),
                                            activation='relu',
                                            solver='adam',
                                            max_iter=500,
                                            random_state=42),
    }

    build = factories.get(model_name)
    if build is None:
        print(model_name)
        print("Model not found")
        return None

    return build()


# Short names (see get_model) of the candidate base learners.
models = ['log_reg', 'dec_tree', 'random_forest', 'xgb', 'lgbm', 'SVC']

# Every non-empty subset of `models`, smallest subsets first, each as a list.
model_combinations = [
    list(subset)
    for subset_size in range(1, len(models) + 1)
    for subset in combinations(models, subset_size)
]


In [None]:
# List every candidate base-learner combination, one per line.
for combo in model_combinations:
    print(combo)

In [None]:
# Upper bound on the number of stacking fits in the search below:
# clusters present in the test split times candidate combinations.
print(df_test['cluster'].nunique() * len(model_combinations))

In [None]:
import copy

# Per-cluster search for the best stacking ensemble (logistic-regression
# meta-learner). For every cluster each combination of base learners is fitted
# on the cluster's training rows; the combination with the best VALIDATION
# accuracy is kept, and only that one's TEST accuracy is reported. Choosing by
# test accuracy and then reporting the same number would overstate performance.

right_preds = 0
total_preds = 0

max_depth = 2

best_models = {

}

for bin_label in df_train['cluster'].unique():

    # The slices depend only on the cluster, so build them once per cluster.
    train_bin = df_train[df_train['cluster'] == bin_label]
    valid_bin = df_valid[df_valid['cluster'] == bin_label]
    test_bin = df_test[df_test['cluster'] == bin_label]

    X_train_bin, y_train_bin = train_bin[best_features], train_bin[target]
    X_val_bin, y_val_bin = valid_bin[best_features], valid_bin[target]
    X_test_bin, y_test_bin = test_bin[best_features], test_bin[target]

    best_models[bin_label] = None

    # A cluster needs test rows to score, validation rows to select on, and
    # both outcomes in its training rows to fit a classifier.
    if X_test_bin.empty or X_val_bin.empty or y_train_bin.nunique() < 2:
        print("Skipping cluster", bin_label, "(cannot be fitted, selected or scored)")
        continue

    best_val_accuracy = -1.0
    best_test_accuracy = 0
    best_estimators = None

    for combo in model_combinations:

        # StackingClassifier expects a list of (name, estimator) tuples.
        estimators = [(model, get_model(model)) for model in combo]

        stacking_clf = StackingClassifier(estimators=estimators,
                                        final_estimator=get_model('log_reg'), cv=5)

        stacking_clf.fit(X_train_bin, y_train_bin)

        val_accuracy = accuracy_score(y_val_bin, stacking_clf.predict(X_val_bin))

        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            best_test_accuracy = accuracy_score(y_test_bin, stacking_clf.predict(X_test_bin))
            best_estimators = copy.deepcopy(estimators)

    right_preds += best_test_accuracy * len(y_test_bin)
    best_models[bin_label] = best_estimators
    total_preds += len(y_test_bin)

if total_preds:
    print("Accuracy Rate: ", round((right_preds / total_preds) * 100, 2))
else:
    print("Accuracy Rate: ", "n/a (no cluster could be evaluated)")

for label in best_models:
    print("Cluster: ", label)
    print("Best Models: ", best_models[label])




In [None]:
import copy
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score

# Per-cluster search for the best stacking ensemble, this time on scaled
# features and with a neural-network meta-learner. For every cluster the
# combination with the best VALIDATION accuracy is kept, and only that one's
# TEST accuracy is reported. Choosing by test accuracy and then reporting the
# same number would overstate performance.

right_preds = 0
total_preds = 0

max_depth = 2

best_models = {}

# Split best_features by dtype: int64/float64 columns are treated as numerical,
# object columns as categorical.
numerical_features = [feature for feature in best_features if df_train[feature].dtype in ['int64', 'float64']]
categorical_features = [feature for feature in best_features if df_train[feature].dtype == 'object']

# Numerical columns: mean-impute, then standardise.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill gaps with the literal 'missing', then one-hot
# encode (categories unseen during fit are ignored at transform time).
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine the transformations into a preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

for bin_label in df_train['cluster'].unique():

    # The slices depend only on the cluster, so build them once per cluster.
    train_bin = df_train[df_train['cluster'] == bin_label]
    valid_bin = df_valid[df_valid['cluster'] == bin_label]
    test_bin = df_test[df_test['cluster'] == bin_label]

    y_train_bin = train_bin[target]
    y_val_bin = valid_bin[target]
    y_test_bin = test_bin[target]

    best_models[bin_label] = None

    # A cluster needs test rows to score, validation rows to select on, and
    # both outcomes in its training rows to fit a classifier.
    if test_bin.empty or valid_bin.empty or y_train_bin.nunique() < 2:
        print("Skipping cluster", bin_label, "(cannot be fitted, selected or scored)")
        continue

    # Fit the preprocessor on this cluster's training rows once, then reuse the
    # transformed matrices for every combination.
    X_train_bin = preprocessor.fit_transform(train_bin[best_features])
    X_val_bin = preprocessor.transform(valid_bin[best_features])
    X_test_bin = preprocessor.transform(test_bin[best_features])

    best_val_accuracy = -1.0
    best_test_accuracy = 0
    best_estimators = None

    for combo in model_combinations:

        estimators = [(model, get_model(model)) for model in combo]

        stacking_clf = StackingClassifier(estimators=estimators,
                                          final_estimator=get_model('neural_net'), cv=5)

        stacking_clf.fit(X_train_bin, y_train_bin)

        val_accuracy = accuracy_score(y_val_bin, stacking_clf.predict(X_val_bin))

        if val_accuracy > best_val_accuracy:

            best_val_accuracy = val_accuracy
            best_test_accuracy = accuracy_score(y_test_bin, stacking_clf.predict(X_test_bin))
            best_estimators = copy.deepcopy(estimators)

    right_preds += best_test_accuracy * len(y_test_bin)
    best_models[bin_label] = best_estimators
    total_preds += len(y_test_bin)

if total_preds:
    print("Accuracy Rate: ", round((right_preds / total_preds) * 100, 2))
else:
    print("Accuracy Rate: ", "n/a (no cluster could be evaluated)")

for label in best_models:

    print("Cluster: ", label)
    print("Best Models: ", best_models[label])
