In [64]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
import time
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [65]:
import pandas as pd

# Import MTeams.csv
teams_df = pd.read_csv("MTeams.csv")

# Import MSeasons.csv
seasons_df = pd.read_csv("MSeasons.csv")

# Import MNCAATourneySeeds.csv
seeds_df = pd.read_csv("MNCAATourneySeeds.csv")

# Import MRegularSeasonCompactResults.csv
compact_results_df = pd.read_csv("MRegularSeasonCompactResults.csv")

# Import MRegularSeasonDetailedResults.csv
detailed_results_regular_df = pd.read_csv("MRegularSeasonDetailedResults.csv")

detailed_results_tourney_df = pd.read_csv("MNCAATourneyDetailedResults.csv")
# Import MMasseyOrdinals.csv
massey_ordinals_df = pd.read_csv("MMasseyOrdinals.csv")

# Import MTeamCoaches.csv
team_coaches_df = pd.read_csv("MTeamCoaches.csv")

In [66]:
detailed_results_tourney_df[detailed_results_tourney_df.Season == 2023]

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
1248,2023,134,1338,60,1280,59,N,0,21,54,...,23,5,7,14,28,14,14,1,4,18
1249,2023,134,1394,75,1369,71,N,0,22,57,...,26,9,20,6,22,17,12,11,2,31
1250,2023,135,1113,98,1305,73,N,0,35,55,...,21,12,14,5,18,19,11,3,0,18
1251,2023,135,1192,84,1411,61,N,0,23,46,...,17,8,9,8,19,16,14,8,0,24
1252,2023,136,1104,96,1394,75,N,0,33,70,...,20,16,18,11,23,8,6,4,3,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1310,2023,146,1274,88,1400,81,N,0,29,49,...,25,11,15,9,14,20,13,6,1,23
1311,2023,146,1361,57,1166,56,N,0,25,66,...,17,10,11,8,24,9,9,3,2,11
1312,2023,152,1163,72,1274,59,N,0,28,57,...,20,12,12,12,17,10,9,8,1,12
1313,2023,152,1361,72,1194,71,N,0,25,57,...,22,16,21,7,24,6,9,6,2,17


Let us understand the impact of well renowned coaches on the wins of the team. 

In [67]:
indiana_team_id = teams_df.loc[
    teams_df["TeamName"].str.contains("Indiana"), "TeamID"
].values[0]


print(indiana_team_id)

1231


In [68]:
team_name = teams_df.loc[teams_df["TeamID"] == indiana_team_id, "TeamName"].values[0]
print(team_name)

Indiana


In [69]:
indiana_coaches_df = team_coaches_df[team_coaches_df["TeamID"] == 1231]

In [70]:
indiana_results_df = detailed_results_regular_df[
    (detailed_results_regular_df["WTeamID"] == 1231)
    | (detailed_results_regular_df["LTeamID"] == 1231)
]

In [71]:
combined_df = indiana_results_df.merge(
    indiana_coaches_df[["Season", "CoachName"]], on="Season", how="left"
)

## Understanding the Coach Winning Percentage for Team Indiana

In [72]:
combined_df["CoachName"].value_counts()

CoachName
tom_crean         289
archie_miller     122
mike_davis        118
mike_woodson       99
kelvin_sampson     62
dan_dakich         32
Name: count, dtype: int64

In [73]:
combined_df[combined_df["WTeamID"] == 1231]["CoachName"].value_counts()

CoachName
tom_crean         159
mike_davis         67
archie_miller      65
mike_woodson       61
kelvin_sampson     45
dan_dakich         25
Name: count, dtype: int64

In [74]:
combined_df[combined_df["LTeamID"] == 1231]["CoachName"].value_counts()

CoachName
tom_crean         130
archie_miller      57
mike_davis         51
mike_woodson       38
kelvin_sampson     17
dan_dakich          7
Name: count, dtype: int64

In [75]:
coach_stats = combined_df.groupby("CoachName").apply(
    lambda x: (x["WTeamID"] == 1231).mean()
)

In [76]:
coach_stats

CoachName
archie_miller     0.532787
dan_dakich        0.781250
kelvin_sampson    0.725806
mike_davis        0.567797
mike_woodson      0.616162
tom_crean         0.550173
dtype: float64

We see that Coach Dan Dakich has more success rate. 

## Classifying playing statistics with Coaches

In [77]:
import pandas as pd

# Assuming combined_df is your existing DataFrame
# Create an empty list to store dictionaries
data = []

# Filter rows where WTeamID or LTeamID is 1231
team_1231_df = combined_df[
    (combined_df["WTeamID"] == 1231) | (combined_df["LTeamID"] == 1231)
]

# Iterate over each row
for index, row in team_1231_df.iterrows():
    if row["WTeamID"] == 1231:
        # Store values of columns starting with 'W' and rename them
        values = {
            col[1:]: row[col] for col in team_1231_df.columns if col.startswith("W")
        }
        # Add coach name column
        values["CoachName"] = row["CoachName"]
    else:
        # Store values of columns starting with 'L' and rename them
        values = {
            col[1:]: row[col] for col in team_1231_df.columns if col.startswith("L")
        }
        # Add coach name column
        values["CoachName"] = row["CoachName"]

    # Append the values to the list
    data.append(values)

# Create a new DataFrame from the list of dictionaries
new_df = pd.DataFrame(data)

new_df.head()

Unnamed: 0,TeamID,Score,Loc,FGM,FGA,FGM3,FGA3,FTM,FTA,OR,DR,Ast,TO,Stl,Blk,PF,CoachName
0,1231,84,N,28,64,12,26,16,25,18,33,18,19,3,8,20,mike_davis
1,1231,76,N,27,63,12,27,10,16,9,27,18,7,3,6,27,mike_davis
2,1231,70,N,20,49,8,25,22,34,10,34,10,14,5,4,23,mike_davis
3,1231,84,H,30,63,9,23,15,22,11,32,17,15,9,3,12,mike_davis
4,1231,80,N,23,77,7,29,27,39,20,29,19,4,11,10,13,mike_davis


In [78]:
from sklearn.neural_network import MLPClassifier

X = new_df.select_dtypes(include="number")

# Assign the 'CoachName' column as the target variable
y = new_df["CoachName"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


# Create a neural network classifier
# You can adjust the hidden_layer_sizes, activation, solver, alpha, etc., as needed
model = MLPClassifier(
    hidden_layer_sizes=(100,),
    activation="relu",
    solver="adam",
    alpha=0.0001,
    batch_size="auto",
    learning_rate="constant",
    learning_rate_init=0.001,
    max_iter=200,
    shuffle=True,
    random_state=42,
)

# Fit the model to the training data
model.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.33793103448275863


In [79]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
import time

X = new_df.select_dtypes(include="number")

# Assign the 'CoachName' column as the target variable
y = new_df["CoachName"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Define the parameter distributions to sample from
param_dist = {
    "hidden_layer_sizes": [(50,), (100,), (150,)],
    "alpha": uniform(loc=1e-6, scale=1e-3 - 1e-6),  # alpha between 1e-6 and 1e-3
    "learning_rate_init": uniform(
        loc=1e-4, scale=1e-1 - 1e-4
    ),  # learning_rate_init between 1e-4 and 1e-1
}

# Create a randomized search object
random_search = RandomizedSearchCV(
    estimator=MLPClassifier(
        max_iter=200, solver="adam", activation="relu", random_state=42
    ),
    param_distributions=param_dist,
    n_iter=10,  # Number of parameter settings that are sampled
    cv=5,  # 5-fold cross-validation
    n_jobs=-1,  # Use all available CPU cores
    random_state=42,
)

# Start timing
start_time = time.time()

# Perform randomized search to find the best parameters
random_search.fit(X_train, y_train)

# End timing
end_time = time.time()
total_time = end_time - start_time

# Print the best parameters found
print("Best parameters:", random_search.best_params_)

# Get the best model
best_model = random_search.best_estimator_

# Make predictions on the testing data using the best model
y_pred = best_model.predict(X_test)

# Calculate the accuracy of the best model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Total time for tuning and modeling:", total_time, "seconds")

Best parameters: {'alpha': 0.0009699399423098324, 'hidden_layer_sizes': (100,), 'learning_rate_init': 0.021312677156759788}
Accuracy: 0.2
Total time for tuning and modeling: 1.1401238441467285 seconds


In [80]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

### Using interpretable model to find important features

In [81]:
from sklearn.tree import DecisionTreeClassifier

# Create a decision tree classifier
dt_model = DecisionTreeClassifier(random_state=42)

param_dist = {
    "max_depth": [3, 5, 7, 9],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "criterion": ["gini", "entropy"],
}


# Create a randomized search object
random_search_dt = RandomizedSearchCV(
    estimator=dt_model,
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    n_jobs=-1,
    random_state=42,
)

# Start timing
start_time = time.time()

# Perform randomized search to find the best parameters
with tqdm(total=random_search_dt.n_iter, desc="Parameter optimization") as pbar:
    random_search_dt.fit(X_train, y_encoded)
    pbar.update(1)

# End timing
end_time = time.time()
total_time = end_time - start_time

# Print the best parameters found
print("Best parameters:", random_search_dt.best_params_)

# Get the best model
best_model_dt = random_search_dt.best_estimator_

# Make predictions on the testing data using the best model
y_pred_dt = best_model_dt.predict(X_test)

# Decode the predicted labels back to their original form
y_pred_decoded_dt = label_encoder.inverse_transform(y_pred_dt)

# Calculate the accuracy of the best model
accuracy_dt = accuracy_score(y_test, y_pred_decoded_dt)
print("Accuracy:", accuracy_dt)
print("Total time for tuning and modeling:", total_time, "seconds")

importance_scores = best_model_dt.feature_importances_

Parameter optimization:   1%|          | 1/100 [00:00<00:48,  2.04it/s]

Best parameters: {'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 3, 'criterion': 'gini'}
Accuracy: 0.45517241379310347
Total time for tuning and modeling: 0.49660325050354004 seconds





In [82]:
from sklearn.tree import DecisionTreeClassifier

importance_scores
# Create a DecisionTreeClassifier object
clf = DecisionTreeClassifier()

# Fit the model on your data
clf.fit(X, y)

# Get the feature importances
importances = clf.feature_importances_

# Create a dictionary of feature names and their importances
feature_importances = dict(zip(X.columns, importance_scores))

# Sort the features by importance in descending order
sorted_features = sorted(feature_importances.items(), key=lambda x: x[1], reverse=True)

# Print the sorted features
for feature, importance in sorted_features:
    print(f"{feature}: {importance}")

PF: 0.4055444021353306
OR: 0.30719637875996464
FGA: 0.15989586133851472
Blk: 0.12736335776619015
TeamID: 0.0
Score: 0.0
FGM: 0.0
FGM3: 0.0
FGA3: 0.0
FTM: 0.0
FTA: 0.0
DR: 0.0
Ast: 0.0
TO: 0.0
Stl: 0.0


> There is a 45% chance of prediction given the parameters

> The most important features are PF, OR FGA and Blk that changes w.r.t each coach for team Indiana.

In [83]:
# Import the necessary libraries
from sklearn.preprocessing import OneHotEncoder

# Create an instance of the OneHotEncoder
encoder = OneHotEncoder(sparse=False)

# Reshape the CoachName column to a 2D array
coach_names = new_df["CoachName"].values.reshape(-1, 1)

# Encode the CoachName column using one-hot encoding
coach_names_encoded = encoder.fit_transform(coach_names)

# Create a DataFrame from the encoded CoachName column
coach_names_encoded_df = pd.DataFrame(
    coach_names_encoded, columns=encoder.categories_[0]
)

# Concatenate the encoded CoachName DataFrame with the original combined_df
combined_df_encoded = pd.concat([new_df, coach_names_encoded_df], axis=1)

# Remove the original CoachName column from the combined_df_encoded
combined_df_encoded.drop("CoachName", axis=1, inplace=True)



# Home Field Advantage

In [84]:
detailed_results_regular_df.WLoc.value_counts()

WLoc
H    66598
A    34979
N    11664
Name: count, dtype: int64

In [85]:
H_count = detailed_results_regular_df[detailed_results_regular_df["WLoc"] == "H"].shape[
    0
]


A_count = detailed_results_regular_df[detailed_results_regular_df["WLoc"] == "A"].shape[
    0
]


total_count = H_count + A_count


percentage_H = (H_count / total_count) * 100
percentage_H

65.56405485493764

There is a 65% chance of winning at home court. 

In [86]:
percentage_A = (A_count / total_count) * 100
percentage_A

34.43594514506237

There is a 35 % chance of losing in the home court. 

In [87]:
unc_team_id = teams_df.loc[
    teams_df["TeamName"].str.contains("UNC Asheville"), "TeamID"
].values[0]

print(unc_team_id)

1421


In [88]:
teams_df[teams_df["TeamID"] == 1421]

Unnamed: 0,TeamID,TeamName,FirstD1Season,LastD1Season
320,1421,UNC Asheville,1987,2024


In [89]:
unc_coaches_df = team_coaches_df[team_coaches_df["TeamID"] == unc_team_id]

In [90]:
unc_coaches_df["CoachName"].value_counts()

CoachName
eddie_biedenbach     17
mike_morell           6
don_doucette          5
nicholas_mcdevitt     5
randy_wiel            3
jerry_green           2
Name: count, dtype: int64

In [91]:
unc_results_df = detailed_results_regular_df[
    (detailed_results_regular_df["WTeamID"] == 1421)
    | (detailed_results_regular_df["LTeamID"] == 1421)
]

In [92]:
combined_df = unc_results_df.merge(
    unc_coaches_df[["Season", "CoachName"]], on="Season", how="left"
)


combined_df.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF,CoachName
0,2003,18,1277,66,1421,52,H,0,25,51,...,6,6,7,17,13,16,11,1,21,eddie_biedenbach
1,2003,20,1278,87,1421,81,H,0,27,57,...,17,24,18,24,12,15,4,4,27,eddie_biedenbach
2,2003,28,1421,77,1441,72,H,0,26,59,...,17,23,12,26,10,12,5,3,22,eddie_biedenbach
3,2003,33,1190,87,1421,69,H,0,33,70,...,14,17,11,22,12,20,6,5,17,eddie_biedenbach
4,2003,40,1421,99,1144,89,H,1,33,68,...,13,22,11,23,21,17,7,4,21,eddie_biedenbach


In [93]:
combined_df["CoachName"].value_counts()

CoachName
eddie_biedenbach     307
mike_morell          167
nicholas_mcdevitt    149
Name: count, dtype: int64

In [94]:
import pandas as pd

# Assuming combined_df is your existing DataFrame
# Create an empty list to store dictionaries
data = []

# Filter rows where WTeamID or LTeamID is 1421
team_1231_df = combined_df[
    (combined_df["WTeamID"] == 1421) | (combined_df["LTeamID"] == 1421)
]

# Iterate over each row
for index, row in team_1231_df.iterrows():
    if row["WTeamID"] == 1421:
        # Store values of columns starting with 'W' and rename them
        values = {
            col[1:]: row[col] for col in team_1231_df.columns if col.startswith("W")
        }
        # Add coach name column
        values["CoachName"] = row["CoachName"]
    else:
        # Store values of columns starting with 'L' and rename them
        values = {
            col[1:]: row[col] for col in team_1231_df.columns if col.startswith("L")
        }
        # Add coach name column
        values["CoachName"] = row["CoachName"]

    # Append the values to the list
    data.append(values)

# Create a new DataFrame from the list of dictionaries
new_df = pd.DataFrame(data)

In [95]:
# Import the necessary libraries
from sklearn.preprocessing import OneHotEncoder

# Create an instance of the OneHotEncoder
encoder = OneHotEncoder(sparse=False)

# Reshape the CoachName column to a 2D array
coach_names = new_df["CoachName"].values.reshape(-1, 1)

# Encode the CoachName column using one-hot encoding
coach_names_encoded = encoder.fit_transform(coach_names)

# Create a DataFrame from the encoded CoachName column
coach_names_encoded_df = pd.DataFrame(
    coach_names_encoded, columns=encoder.categories_[0]
)

# Concatenate the encoded CoachName DataFrame with the original combined_df
combined_df_encoded = pd.concat([new_df, coach_names_encoded_df], axis=1)

# Remove the original CoachName column from the combined_df_encoded
combined_df_encoded.drop("CoachName", axis=1, inplace=True)



In [96]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
import time
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

X = new_df.select_dtypes(include="number")

# Assign the 'CoachName' column as the target variable
y = new_df["CoachName"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_train)

# Create a decision tree classifier
dt_model = DecisionTreeClassifier(random_state=42)

param_dist = {
    "max_depth": [3, 5, 7, 9],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "criterion": ["gini", "entropy"],
}


# Create a randomized search object
random_search_dt = RandomizedSearchCV(
    estimator=dt_model,
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    n_jobs=-1,
    random_state=42,
)

# Start timing
start_time = time.time()

# Perform randomized search to find the best parameters
with tqdm(total=random_search_dt.n_iter, desc="Parameter optimization") as pbar:
    random_search_dt.fit(X_train, y_encoded)
    pbar.update(1)

# End timing
end_time = time.time()
total_time = end_time - start_time

# Print the best parameters found
print("Best parameters:", random_search_dt.best_params_)

# Get the best model
best_model_dt = random_search_dt.best_estimator_

# Make predictions on the testing data using the best model
y_pred_dt = best_model_dt.predict(X_test)

# Decode the predicted labels back to their original form
y_pred_decoded_dt = label_encoder.inverse_transform(y_pred_dt)

# Calculate the accuracy of the best model
accuracy_dt = accuracy_score(y_test, y_pred_decoded_dt)
print("Accuracy:", accuracy_dt)
print("Total time for tuning and modeling:", total_time, "seconds")

importance_scores = best_model_dt.feature_importances_

Parameter optimization:   1%|          | 1/100 [00:00<00:44,  2.21it/s]

Best parameters: {'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 3, 'criterion': 'gini'}
Accuracy: 0.568
Total time for tuning and modeling: 0.4530632495880127 seconds





In [97]:
from sklearn.tree import DecisionTreeClassifier

importance_scores
# Create a DecisionTreeClassifier object
clf = DecisionTreeClassifier()

# Fit the model on your data
clf.fit(X, y)

# Get the feature importances
importances = clf.feature_importances_

# Create a dictionary of feature names and their importances
feature_importances = dict(zip(X.columns, importance_scores))

# Sort the features by importance in descending order
sorted_features = sorted(feature_importances.items(), key=lambda x: x[1], reverse=True)

# Print the sorted features
for feature, importance in sorted_features:
    print(f"{feature}: {importance}")

OR: 0.4355721670710489
FGA3: 0.17613633547514979
Ast: 0.14369185837027748
PF: 0.11253461708543075
Score: 0.08033691818975232
FTA: 0.05172810380834078
TeamID: 0.0
FGM: 0.0
FGA: 0.0
FGM3: 0.0
FTM: 0.0
DR: 0.0
TO: 0.0
Stl: 0.0
Blk: 0.0


### Purdue

In [98]:
team_id = teams_df.loc[teams_df["TeamName"].str.contains("Purdue"), "TeamID"].values[0]

print(team_id)

1345


In [99]:
teams_df[teams_df["TeamID"] == team_id]

Unnamed: 0,TeamID,TeamName,FirstD1Season,LastD1Season
244,1345,Purdue,1985,2024


In [100]:
results_df = detailed_results_regular_df[
    (detailed_results_regular_df["WTeamID"] == team_id)
    | (detailed_results_regular_df["LTeamID"] == team_id)
]

In [101]:
coaches_df = team_coaches_df[team_coaches_df["TeamID"] == team_id]

In [102]:
combined_df = results_df.merge(
    coaches_df[["Season", "CoachName"]], on="Season", how="left"
)

combined_df

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF,CoachName
0,2003,18,1345,73,1275,46,H,0,26,51,...,8,14,10,13,9,16,2,1,18,gene_keady
1,2003,26,1345,86,1257,84,N,0,27,61,...,28,38,18,25,16,11,5,7,29,gene_keady
2,2003,29,1462,74,1345,59,H,0,27,56,...,5,7,9,19,15,15,4,0,15,gene_keady
3,2003,32,1345,85,1292,56,H,0,27,57,...,7,16,12,26,13,21,4,2,29,gene_keady
4,2003,33,1345,95,1360,65,H,0,34,65,...,11,18,21,24,14,23,8,3,18,gene_keady
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
692,2024,117,1345,80,1277,74,H,0,25,49,...,15,17,11,21,11,7,6,3,25,matt_painter
693,2024,120,1345,77,1228,71,A,0,29,60,...,17,21,10,17,9,12,7,1,15,matt_painter
694,2024,125,1345,78,1458,70,H,0,26,53,...,11,14,7,15,10,8,11,0,19,matt_painter
695,2024,130,1345,67,1277,62,N,0,22,53,...,10,12,13,22,15,13,4,8,29,matt_painter


In [103]:
import pandas as pd

# Assuming combined_df is your existing DataFrame
# Create an empty list to store dictionaries
data = []

# Filter rows where WTeamID or LTeamID is team_id
team_id_df = combined_df[
    (combined_df["WTeamID"] == team_id) | (combined_df["LTeamID"] == team_id)
]

# Iterate over each row
for index, row in team_id_df.iterrows():
    if row["WTeamID"] == 1421:
        # Store values of columns starting with 'W' and rename them
        values = {
            col[1:]: row[col] for col in team_id_df.columns if col.startswith("W")
        }
        # Add coach name column
        values["CoachName"] = row["CoachName"]
    else:
        # Store values of columns starting with 'L' and rename them
        values = {
            col[1:]: row[col] for col in team_id_df.columns if col.startswith("L")
        }
        # Add coach name column
        values["CoachName"] = row["CoachName"]

    # Append the values to the list
    data.append(values)

# Create a new DataFrame from the list of dictionaries
new_df = pd.DataFrame(data)

In [104]:
# Import the necessary libraries
from sklearn.preprocessing import OneHotEncoder

# Create an instance of the OneHotEncoder
encoder = OneHotEncoder(sparse=False)

# Reshape the CoachName column to a 2D array
coach_names = new_df["CoachName"].values.reshape(-1, 1)

# Encode the CoachName column using one-hot encoding
coach_names_encoded = encoder.fit_transform(coach_names)

# Create a DataFrame from the encoded CoachName column
coach_names_encoded_df = pd.DataFrame(
    coach_names_encoded, columns=encoder.categories_[0]
)

# Concatenate the encoded CoachName DataFrame with the original combined_df
combined_df_encoded = pd.concat([new_df, coach_names_encoded_df], axis=1)

# Remove the original CoachName column from the combined_df_encoded
combined_df_encoded.drop("CoachName", axis=1, inplace=True)



In [105]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
import time
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

X = new_df.select_dtypes(include="number")

# Assign the 'CoachName' column as the target variable
y = new_df["CoachName"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_train)

# Create a decision tree classifier
dt_model = DecisionTreeClassifier(random_state=42)

param_dist = {
    "max_depth": [3, 5, 7, 9],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "criterion": ["gini", "entropy"],
}


# Create a randomized search object
random_search_dt = RandomizedSearchCV(
    estimator=dt_model,
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    n_jobs=-1,
    random_state=42,
)

# Start timing
start_time = time.time()

# Perform randomized search to find the best parameters
with tqdm(total=random_search_dt.n_iter, desc="Parameter optimization") as pbar:
    random_search_dt.fit(X_train, y_encoded)
    pbar.update(1)

# End timing
end_time = time.time()
total_time = end_time - start_time

# Print the best parameters found
print("Best parameters:", random_search_dt.best_params_)

# Get the best model
best_model_dt = random_search_dt.best_estimator_

# Make predictions on the testing data using the best model
y_pred_dt = best_model_dt.predict(X_test)

# Decode the predicted labels back to their original form
y_pred_decoded_dt = label_encoder.inverse_transform(y_pred_dt)

# Calculate the accuracy of the best model
accuracy_dt = accuracy_score(y_test, y_pred_decoded_dt)
print("Accuracy:", accuracy_dt)
print("Total time for tuning and modeling:", total_time, "seconds")

importance_scores = best_model_dt.feature_importances_

Parameter optimization:   1%|          | 1/100 [00:00<01:05,  1.51it/s]

Best parameters: {'min_samples_split': 10, 'min_samples_leaf': 4, 'max_depth': 3, 'criterion': 'entropy'}
Accuracy: 0.8214285714285714
Total time for tuning and modeling: 0.6628599166870117 seconds





In [106]:
from sklearn.tree import DecisionTreeClassifier

importance_scores
# Create a DecisionTreeClassifier object
clf = DecisionTreeClassifier()

# Fit the model on your data
clf.fit(X, y)

# Get the feature importances
importances = clf.feature_importances_

# Create a dictionary of feature names and their importances
feature_importances = dict(zip(X.columns, importance_scores))

# Sort the features by importance in descending order
sorted_features = sorted(feature_importances.items(), key=lambda x: x[1], reverse=True)

# Print the sorted features
for feature, importance in sorted_features:
    print(f"{feature}: {importance}")

FGA3: 0.4613343579031037
TO: 0.31744726360432834
Blk: 0.1529393434825646
Score: 0.06827903501000342
TeamID: 0.0
FGM: 0.0
FGA: 0.0
FGM3: 0.0
FTM: 0.0
FTA: 0.0
OR: 0.0
DR: 0.0
Ast: 0.0
Stl: 0.0
PF: 0.0


#### Arizona


In [107]:
team_id = teams_df.loc[teams_df["TeamName"].str.contains("Arizona"), "TeamID"].values[0]

print(team_id)

1112


In [108]:
teams_df[teams_df["TeamID"] == team_id]

Unnamed: 0,TeamID,TeamName,FirstD1Season,LastD1Season
11,1112,Arizona,1985,2024


In [109]:
results_df = detailed_results_regular_df[
    (detailed_results_regular_df["WTeamID"] == team_id)
    | (detailed_results_regular_df["LTeamID"] == team_id)
]

In [110]:
coaches_df = team_coaches_df[team_coaches_df["TeamID"] == team_id]

In [111]:
combined_df = results_df.merge(
    coaches_df[["Season", "CoachName"]], on="Season", how="left"
)
combined_df

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF,CoachName
0,2003,19,1112,107,1443,68,H,0,41,73,...,9,20,7,18,17,28,6,2,25,lute_olson
1,2003,23,1112,101,1319,66,H,0,38,75,...,2,7,10,24,18,26,2,2,21,lute_olson
2,2003,29,1112,91,1387,58,H,0,31,66,...,10,21,20,21,11,16,5,2,23,lute_olson
3,2003,33,1112,89,1361,81,A,0,30,67,...,9,13,18,23,18,18,9,4,25,lute_olson
4,2003,41,1112,73,1400,70,H,0,26,66,...,16,23,17,24,16,14,2,1,24,lute_olson
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
701,2024,117,1112,103,1332,83,H,0,39,64,...,7,9,10,17,14,12,8,3,17,tommy_lloyd
702,2024,122,1112,88,1417,65,A,0,28,54,...,13,18,4,19,7,9,6,5,21,tommy_lloyd
703,2024,124,1425,78,1112,65,H,0,28,57,...,11,16,11,22,18,17,3,2,18,tommy_lloyd
704,2024,129,1112,70,1425,49,N,0,26,57,...,3,6,8,20,9,17,6,7,19,tommy_lloyd


In [112]:
import pandas as pd

# Assuming combined_df is your existing DataFrame
# Create an empty list to store dictionaries
data = []

# Filter rows where WTeamID or LTeamID is team_id
team_id_df = combined_df[
    (combined_df["WTeamID"] == team_id) | (combined_df["LTeamID"] == team_id)
]

# Iterate over each row
for index, row in team_id_df.iterrows():
    if row["WTeamID"] == 1421:
        # Store values of columns starting with 'W' and rename them
        values = {
            col[1:]: row[col] for col in team_id_df.columns if col.startswith("W")
        }
        # Add coach name column
        values["CoachName"] = row["CoachName"]
    else:
        # Store values of columns starting with 'L' and rename them
        values = {
            col[1:]: row[col] for col in team_id_df.columns if col.startswith("L")
        }
        # Add coach name column
        values["CoachName"] = row["CoachName"]

    # Append the values to the list
    data.append(values)

# Create a new DataFrame from the list of dictionaries
new_df = pd.DataFrame(data)

In [113]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
import time
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

X = new_df.select_dtypes(include="number")

# Assign the 'CoachName' column as the target variable
y = new_df["CoachName"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_train)

# Create a decision tree classifier
dt_model = DecisionTreeClassifier(random_state=42)

param_dist = {
    "max_depth": [3, 5, 7, 9],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "criterion": ["gini", "entropy"],
}


# Create a randomized search object
random_search_dt = RandomizedSearchCV(
    estimator=dt_model,
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    n_jobs=-1,
    random_state=42,
)

# Start timing
start_time = time.time()

# Perform randomized search to find the best parameters
with tqdm(total=random_search_dt.n_iter, desc="Parameter optimization") as pbar:
    random_search_dt.fit(X_train, y_encoded)
    pbar.update(1)

# End timing
end_time = time.time()
total_time = end_time - start_time

# Print the best parameters found
print("Best parameters:", random_search_dt.best_params_)

# Get the best model
best_model_dt = random_search_dt.best_estimator_

# Make predictions on the testing data using the best model
y_pred_dt = best_model_dt.predict(X_test)

# Decode the predicted labels back to their original form
y_pred_decoded_dt = label_encoder.inverse_transform(y_pred_dt)

# Calculate the accuracy of the best model
accuracy_dt = accuracy_score(y_test, y_pred_decoded_dt)
print("Accuracy:", accuracy_dt)
print("Total time for tuning and modeling:", total_time, "seconds")

importance_scores = best_model_dt.feature_importances_

Parameter optimization:   1%|          | 1/100 [00:00<00:57,  1.71it/s]

Best parameters: {'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 3, 'criterion': 'gini'}
Accuracy: 0.5352112676056338
Total time for tuning and modeling: 0.5885875225067139 seconds





In [114]:
from sklearn.tree import DecisionTreeClassifier

importance_scores
# Create a DecisionTreeClassifier object
clf = DecisionTreeClassifier()

# Fit the model on your data
clf.fit(X, y)

# Get the feature importances
importances = clf.feature_importances_

# Create a dictionary of feature names and their importances
feature_importances = dict(zip(X.columns, importance_scores))

# Sort the features by importance in descending order
sorted_features = sorted(feature_importances.items(), key=lambda x: x[1], reverse=True)

# Print the sorted features
for feature, importance in sorted_features:
    print(f"{feature}: {importance}")

FGA: 0.3170211492405361
Ast: 0.24933279214193704
TO: 0.2395260756645188
OR: 0.1270256911104015
TeamID: 0.06709429184260651
Score: 0.0
FGM: 0.0
FGM3: 0.0
FGA3: 0.0
FTM: 0.0
FTA: 0.0
DR: 0.0
Stl: 0.0
Blk: 0.0
PF: 0.0
