## Model Training

#### 1.1 Import Data and Required Packages
##### Importing Pandas, Numpy, Matplotlib, Seaborn and Warings Library.

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

#### Import the CSV Data as Pandas DataFrame

In [112]:
df = pd.read_csv('data/nhl_data_new_features.csv')

#### Show Top 5 Records

In [113]:
df.head()

Unnamed: 0,awayTeamCode,game_id,homeTeamCode,homeTeamWon,isPlayoffGame,season,total_games_played_by_home,total_games_played_by_away,total_wins_home,total_losses_home,total_wins_away,total_losses_away,last_10_games_win_home,last_10_games_win_away,last_meeting_result,last_game_result_home,last_game_result_away
0,ANA,20001,LAK,1,0,2007,0,0,0,0,0,0,0,0,1,,
1,LAK,20002,ANA,1,0,2007,1,1,0,1,1,0,0,1,1,0.0,1.0
2,MTL,20003,CAR,0,0,2007,0,0,0,0,0,0,0,0,1,,
3,ANA,20004,DET,1,0,2007,0,2,0,0,1,1,0,1,0,,1.0
4,OTT,20005,TOR,0,0,2007,0,0,0,0,0,0,0,0,0,,


In [114]:
# fill null values 
df.fillna(2,inplace=True)

In [115]:
df.drop('game_id',axis=1, inplace=True)

In [116]:
df.head()

Unnamed: 0,awayTeamCode,homeTeamCode,homeTeamWon,isPlayoffGame,season,total_games_played_by_home,total_games_played_by_away,total_wins_home,total_losses_home,total_wins_away,total_losses_away,last_10_games_win_home,last_10_games_win_away,last_meeting_result,last_game_result_home,last_game_result_away
0,ANA,LAK,1,0,2007,0,0,0,0,0,0,0,0,1,2.0,2.0
1,LAK,ANA,1,0,2007,1,1,0,1,1,0,0,1,1,0.0,1.0
2,MTL,CAR,0,0,2007,0,0,0,0,0,0,0,0,1,2.0,2.0
3,ANA,DET,1,0,2007,0,2,0,0,1,1,0,1,0,2.0,1.0
4,OTT,TOR,0,0,2007,0,0,0,0,0,0,0,0,0,2.0,2.0


#### Preparing X and Y variables

In [117]:
# Define features (X) and target variable (y)
X = df.drop(columns=['homeTeamWon'])
y = df['homeTeamWon']

In [118]:
X.tail()

Unnamed: 0,awayTeamCode,homeTeamCode,isPlayoffGame,season,total_games_played_by_home,total_games_played_by_away,total_wins_home,total_losses_home,total_wins_away,total_losses_away,last_10_games_win_home,last_10_games_win_away,last_meeting_result,last_game_result_home,last_game_result_away
21739,FLA,EDM,1,2023,102,101,61,41,66,35,6,7,1,0.0,1.0
21740,FLA,EDM,1,2023,103,102,61,42,67,35,5,8,1,0.0,1.0
21741,EDM,FLA,1,2023,103,104,67,36,62,42,7,5,1,0.0,1.0
21742,FLA,EDM,1,2023,105,104,63,42,67,37,5,6,1,1.0,0.0
21743,EDM,FLA,1,2023,105,106,67,38,64,42,6,6,1,0.0,1.0


In [119]:
X.shape

(21744, 15)

#### Training and testing

In [120]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [121]:
# Define the column transformer for preprocessing
column_transformer = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), ['awayTeamCode', 'homeTeamCode']),
    ],
    remainder='passthrough'  # Leave the other columns unchanged
)

In [17]:
# Initialize models
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest Tuned": RandomForestClassifier(
        n_estimators=50,        # Number of trees
        max_depth=5,            # Maximum depth
        min_samples_split=2,    # Minimum samples to split a node
        min_samples_leaf=10,    # Minimum samples in each leaf
        random_state=42)
}

# Store the results
results = {}

for model_name, model in models.items():
    # Create a pipeline with the ColumnTransformer and the model
    pipeline = Pipeline(steps=[
        ('preprocessor', column_transformer),
        ('classifier', model)
    ])

    # Train the model
    pipeline.fit(X_train, y_train)
    
    # Make predictions
    y_pred = pipeline.predict(X_test)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    # Store the results
    results[model_name] = {
        "accuracy": accuracy,
        "f1_score": f1,
        "precision": precision,
        "recall": recall,
        "confusion_matrix": cm
    }

# Print the results
for model_name, metrics in results.items():
    print(f"Model: {model_name}")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"F1 Score: {metrics['f1_score']:.4f}")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall: {metrics['recall']:.4f}")
    print(f"Confusion Matrix:\n{metrics['confusion_matrix']}\n")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model: Random Forest
Accuracy: 0.5509
F1 Score: 0.5471
Precision: 0.5838
Recall: 0.6406
Confusion Matrix:
[[ 863 1093]
 [ 860 1533]]

Model: Logistic Regression
Accuracy: 0.5670
F1 Score: 0.5473
Precision: 0.5824
Recall: 0.7534
Confusion Matrix:
[[ 663 1293]
 [ 590 1803]]

Model: Random Forest Tuned
Accuracy: 0.5721
F1 Score: 0.4767
Precision: 0.5665
Recall: 0.9469
Confusion Matrix:
[[ 222 1734]
 [ 127 2266]]



### Initial Results

None of these look really good, so let's try other ways

### Further attempts

In [125]:
df.head()

Unnamed: 0,awayTeamCode,homeTeamCode,homeTeamWon,isPlayoffGame,season,total_games_played_by_home,total_games_played_by_away,total_wins_home,total_losses_home,total_wins_away,total_losses_away,last_10_games_win_home,last_10_games_win_away,last_meeting_result,last_game_result_home,last_game_result_away
0,ANA,LAK,1,0,2007,0,0,0,0,0,0,0,0,1,2.0,2.0
1,LAK,ANA,1,0,2007,1,1,0,1,1,0,0,1,1,0.0,1.0
2,MTL,CAR,0,0,2007,0,0,0,0,0,0,0,0,1,2.0,2.0
3,ANA,DET,1,0,2007,0,2,0,0,1,1,0,1,0,2.0,1.0
4,OTT,TOR,0,0,2007,0,0,0,0,0,0,0,0,0,2.0,2.0


In [126]:
# Calculate total wins and total games played per team (home and away)
home_stats = df.groupby(['homeTeamCode']).agg(
    total_wins_home=('total_wins_home', 'sum'),
    total_games_home=('total_games_played_by_home', 'sum')
).reset_index()

away_stats = df.groupby(['awayTeamCode']).agg(
    total_wins_away=('total_wins_away', 'sum'),
    total_games_away=('total_games_played_by_away', 'sum')
).reset_index()

# Merge home and away stats
team_stats = pd.merge(home_stats, away_stats, left_on='homeTeamCode', right_on='awayTeamCode', how='outer')

# Calculate total wins and total games
team_stats['total_wins'] = team_stats['total_wins_home'].fillna(0) + team_stats['total_wins_away'].fillna(0)
team_stats['total_games'] = team_stats['total_games_home'].fillna(0) + team_stats['total_games_away'].fillna(0)

# Calculate average wins per season
team_stats['average_wins'] = team_stats['total_wins'] / team_stats['total_games'].replace(0, pd.NA)  # Avoid division by zero
team_stats['average_wins'] = team_stats['average_wins'].fillna(0)  # Fill NaN values

# Rank teams based on average wins (higher average wins get lower rank numbers)
team_stats['rank'] = team_stats['average_wins'].rank(method='min', ascending=False).astype(int)

# Label encoding (lower rank = better performance)
team_stats['encoded_team'] = team_stats['rank']

# Final output
final_stats = team_stats[['homeTeamCode', 'total_wins', 'total_games', 'average_wins', 'rank', 'encoded_team']].drop_duplicates()
print(final_stats)

   homeTeamCode  total_wins  total_games  average_wins  rank  encoded_team
0           ANA       28820        59197      0.486849    25            25
1           ARI       23708        54523      0.434826    31            31
2           BOS       39816        67151      0.592932     2             2
3           BUF       22267        52529      0.423899    32            32
4           CAR       29635        59097      0.501464    20            20
5           CBJ       24892        54955      0.452952    28            28
6           CGY       28282        56098      0.504153    17            17
7           CHI       32678        63064      0.518172    13            13
8           COL       32149        60243      0.533655     8             8
9           DAL       32517        62199      0.522790    12            12
10          DET       30591        60342      0.506960    15            15
11          EDM       26928        58143      0.463134    27            27
12          FLA       289

In [127]:
# Assuming 'final_stats' is your DataFrame containing the rankings
rank_dict = final_stats.set_index('homeTeamCode')['rank'].to_dict()

# Display the dictionary
print(rank_dict)


{'ANA': 25, 'ARI': 31, 'BOS': 2, 'BUF': 32, 'CAR': 20, 'CBJ': 28, 'CGY': 17, 'CHI': 13, 'COL': 8, 'DAL': 12, 'DET': 15, 'EDM': 27, 'FLA': 21, 'LAK': 16, 'MIN': 11, 'MTL': 23, 'NJD': 24, 'NSH': 9, 'NYI': 26, 'NYR': 6, 'OTT': 29, 'PHI': 22, 'PIT': 3, 'SEA': 30, 'SJS': 10, 'STL': 7, 'TBL': 5, 'TOR': 19, 'VAN': 14, 'VGK': 1, 'WPG': 18, 'WSH': 4}


In [128]:
df.head()

Unnamed: 0,awayTeamCode,homeTeamCode,homeTeamWon,isPlayoffGame,season,total_games_played_by_home,total_games_played_by_away,total_wins_home,total_losses_home,total_wins_away,total_losses_away,last_10_games_win_home,last_10_games_win_away,last_meeting_result,last_game_result_home,last_game_result_away
0,ANA,LAK,1,0,2007,0,0,0,0,0,0,0,0,1,2.0,2.0
1,LAK,ANA,1,0,2007,1,1,0,1,1,0,0,1,1,0.0,1.0
2,MTL,CAR,0,0,2007,0,0,0,0,0,0,0,0,1,2.0,2.0
3,ANA,DET,1,0,2007,0,2,0,0,1,1,0,1,0,2.0,1.0
4,OTT,TOR,0,0,2007,0,0,0,0,0,0,0,0,0,2.0,2.0


In [129]:
# Assuming 'df' is your original DataFrame and 'rank_dict' is your dictionary

# Replace team codes with their corresponding ranks
df['homeTeamCode'] = df['homeTeamCode'].map(rank_dict)
df['awayTeamCode'] = df['awayTeamCode'].map(rank_dict)

df


Unnamed: 0,awayTeamCode,homeTeamCode,homeTeamWon,isPlayoffGame,season,total_games_played_by_home,total_games_played_by_away,total_wins_home,total_losses_home,total_wins_away,total_losses_away,last_10_games_win_home,last_10_games_win_away,last_meeting_result,last_game_result_home,last_game_result_away
0,25,16,1,0,2007,0,0,0,0,0,0,0,0,1,2.0,2.0
1,16,25,1,0,2007,1,1,0,1,1,0,0,1,1,0.0,1.0
2,23,20,0,0,2007,0,0,0,0,0,0,0,0,1,2.0,2.0
3,25,15,1,0,2007,0,2,0,0,1,1,0,1,0,2.0,1.0
4,29,19,0,0,2007,0,0,0,0,0,0,0,0,0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21739,21,27,0,1,2023,102,101,61,41,66,35,6,7,1,0.0,1.0
21740,21,27,1,1,2023,103,102,61,42,67,35,5,8,1,0.0,1.0
21741,27,21,0,1,2023,103,104,67,36,62,42,7,5,1,0.0,1.0
21742,21,27,1,1,2023,105,104,63,42,67,37,5,6,1,1.0,0.0


### Training and Testing for this 

In [176]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

# Prepare your features and target
X = df.drop('homeTeamWon',axis=1)
y = df['homeTeamWon']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'XGBoost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

# Evaluate models
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

     # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Print results
    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print("Confusion Matrix:")
    print(conf_matrix)
    print()


Model: Random Forest
Accuracy: 0.5585
F1 Score: 0.5559
Precision: 0.5552
Recall: 0.5585
Confusion Matrix:
[[ 907 1049]
 [ 871 1522]]



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Parameters: { "use_label_encoder" } are not used.



Model: Logistic Regression
Accuracy: 0.5675
F1 Score: 0.5474
Precision: 0.5589
Recall: 0.5675
Confusion Matrix:
[[ 660 1296]
 [ 585 1808]]

Model: XGBoost
Accuracy: 0.5562
F1 Score: 0.5526
Precision: 0.5520
Recall: 0.5562
Confusion Matrix:
[[ 877 1079]
 [ 851 1542]]



In [131]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, make_scorer

# Prepare your features and target
X = df.drop('homeTeamWon', axis=1)
y = df['homeTeamWon']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models and hyperparameters for tuning
models = {
    'Random Forest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'class_weight': ['balanced', None]
        }
    },
    'Logistic Regression': {
        'model': LogisticRegression(max_iter=1000),
        'params': {
            'C': [0.01, 0.1, 1, 10],
            'solver': ['liblinear', 'saga'],
            'class_weight': ['balanced', None]
        }
    },
    'XGBoost': {
        'model': xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [3, 5, 7],
            'learning_rate': [0.01, 0.1, 0.2],
            'class_weight': ['balanced', None]
        }
    }
}

# Evaluate models using GridSearchCV
for model_name, config in models.items():
    grid_search = GridSearchCV(
        config['model'],
        config['params'],
        scoring=make_scorer(f1_score, average='weighted'),
        cv=5,
        n_jobs=-1,
        verbose=1
    )
    
    grid_search.fit(X_train, y_train)
    
    # Best model and parameters
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    
    # Predictions
    y_pred = best_model.predict(X_test)
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Print results
    print(f"Model: {model_name}")
    print(f"Best Parameters: {best_params}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print("Confusion Matrix:")
    print(conf_matrix)
    print()


Fitting 5 folds for each of 72 candidates, totalling 360 fits
Model: Random Forest
Best Parameters: {'class_weight': 'balanced', 'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 200}
Accuracy: 0.5624
F1 Score: 0.5634
Precision: 0.5655
Recall: 0.5624
Confusion Matrix:
[[1073  883]
 [1020 1373]]

Fitting 5 folds for each of 16 candidates, totalling 80 fits




Model: Logistic Regression
Best Parameters: {'C': 10, 'class_weight': 'balanced', 'solver': 'liblinear'}
Accuracy: 0.5620
F1 Score: 0.5629
Precision: 0.5648
Recall: 0.5620
Confusion Matrix:
[[1069  887]
 [1018 1375]]

Fitting 5 folds for each of 54 candidates, totalling 270 fits


Parameters: { "class_weight", "use_label_encoder" } are not used.

Parameters: { "class_weight", "use_label_encoder" } are not used.

Parameters: { "class_weight", "use_label_encoder" } are not used.

Parameters: { "class_weight", "use_label_encoder" } are not used.

Parameters: { "class_weight", "use_label_encoder" } are not used.

Parameters: { "class_weight", "use_label_encoder" } are not used.

Parameters: { "class_weight", "use_label_encoder" } are not used.

Parameters: { "class_weight", "use_label_encoder" } are not used.

Parameters: { "class_weight", "use_label_encoder" } are not used.

Parameters: { "class_weight", "use_label_encoder" } are not used.

Parameters: { "class_weight", "use_label_encoder" } are not used.

Parameters: { "class_weight", "use_label_encoder" } are not used.

Parameters: { "class_weight", "use_label_encoder" } are not used.

Parameters: { "class_weight", "use_label_encoder" } are not used.

Parameters: { "class_weight", "use_label_encoder" } are not us

Model: XGBoost
Best Parameters: {'class_weight': 'balanced', 'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 200}
Accuracy: 0.5746
F1 Score: 0.5647
Precision: 0.5682
Recall: 0.5746
Confusion Matrix:
[[ 797 1159]
 [ 691 1702]]



In [147]:
# Looks like the random forest performs the best which is balanced