# CSGO Win Probability Model Tuning
##### Peter Xenopoulos
##### January 31, 2020
This Jupyter notebook contains the code for model tuning. You will find the model fitting and tuning procedures for (1) Logistic Regression, (2) XGBoost and (3) CatBoost. Final models are saved to the `models` directory. The XGBoost and CatBoost models can be loaded with their `.load_model()` methods; the logistic regression model is pickled and must be loaded with `pickle.load`. The data in the `data/` directory is an example of a few matches.

In [None]:
import numpy as np
import pandas as pd
import pickle

from sklearn.metrics import roc_auc_score, brier_score_loss, log_loss, accuracy_score
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

from xgboost import XGBClassifier

from catboost import Pool, CatBoost, CatBoostClassifier, cv

First we define our random seed and our cutoff date. Matches before the cutoff date comprise the training set. Matches on or after the cutoff date comprise the test set.

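We also create two lists of columns. The first, without `_CB` in the name, uses the one-hot encoded versions of the categorical variables, which is necessary for XGBoost and Logistic Regression. The second, with `_CB` in the name, keeps the categorical variables unencoded for CatBoost.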

Finally, we read in our data.

In [None]:
# Set constants: random seed and the date that separates train from test
RANDOM_STATE = 2020
CUTOFF_DATE = "2019-06-01"

# Features that appear unchanged in both column lists below
_SHARED_COLS = [
    "TicksSinceStart",
    "CTEqVal",
    "TEqVal",
    "TRemaining",
    "CTRemaining",
    "THpRemaining",
    "CTHpRemaining",
    "BombPlanted",
    "CTDistBombsiteA",
    "CTDistBombsiteB",
    "TDistBombsiteA",
    "TDistBombsiteB",
]

# One Hot Encoded columns (dummy columns for MapName / BombSite come first)
COLS_ALL = [
    "MapName_de_dust2",
    "MapName_de_inferno",
    "MapName_de_mirage",
    "MapName_de_nuke",
    "MapName_de_overpass",
    "MapName_de_train",
    "MapName_de_vertigo",
    "BombSite_A",
    "BombSite_B",
    "BombSite_NotPlanted",
] + _SHARED_COLS

# Non-encoded columns (raw MapName / BombSite occupy positions 0 and 1)
COLS_CB_ALL = ["MapName", "BombSite"] + _SHARED_COLS

# Read data
df = pd.read_csv("data/example_matches.csv")

Then, we preprocess the data, creating some one hot encoded columns. Next, we split into the test and train sets.

In [None]:
# Preprocess columns
df["GameDate"] = pd.to_datetime(df["GameDate"])

# get_dummies replaces the encoded column with its dummy columns, so the raw
# column is saved beforehand and re-attached afterwards: both the dummies
# (COLS_ALL) and the raw values (COLS_CB_ALL) are needed later.
map_names = df["MapName"]
df = pd.get_dummies(df, columns=["MapName"], drop_first=False)
df["MapName"] = map_names
bombsites = df["BombSite"]
df = pd.get_dummies(df, columns=["BombSite"], drop_first=False)
df["BombSite"] = bombsites

# Train/Test split: rows dated strictly before CUTOFF_DATE train, rows dated
# on or after it test.
# .copy() makes each split an independent frame. Later cells drop rows in
# place and add columns to these frames; doing that on a bare boolean-mask
# selection of `df` triggers pandas' SettingWithCopyWarning.
train_df = df[df["GameDate"] < CUTOFF_DATE].copy()
test_df = df[df["GameDate"] >= CUTOFF_DATE].copy()

Now, we split the data into the various design matrices and drop rows with missing values.

In [None]:
# Data used in logistic regression and XGBoost
# Drop rows with a missing value in any one-hot/feature column, then slice.
for split_df in (train_df, test_df):
    split_df.dropna(subset=COLS_ALL, inplace=True)
X_train_ALL, y_train = train_df[COLS_ALL], train_df["CTWin"]
X_test_ALL, y_test = test_df[COLS_ALL], test_df["CTWin"]

# Data used in CatBoost
# This second drop also considers the raw MapName / BombSite columns, so the
# CatBoost matrices can hold fewer rows than the ones extracted above; the
# matching labels are therefore extracted again.
for split_df in (train_df, test_df):
    split_df.dropna(subset=COLS_CB_ALL, inplace=True)
X_train_cb_ALL, y_train_cb = train_df[COLS_CB_ALL], train_df["CTWin"]
X_test_cb_ALL, y_test_cb = test_df[COLS_CB_ALL], test_df["CTWin"]

###### Print Function
We use this print function to print the train/test results of our best models.

In [None]:
def print_results(y_true_labels, y_pred_probs):
    """ Print log loss, Brier score, AUC and accuracy for a set of predictions

    y_true_labels: the true labels.
    y_pred_probs: two-column array of predicted probabilities; column 1 is
        used as the positive-class probability for every metric except log
        loss, which receives the full array.
    """
    def positive_probs():
        # Column 1 holds the probability of the positive class
        return y_pred_probs[:, 1]

    # Each metric is computed lazily, right after its header is printed
    reports = (
        ("---------- LOG LOSS",
         lambda: log_loss(y_true_labels, y_pred_probs)),
        ("---------- BRIER SCORE",
         lambda: brier_score_loss(y_true_labels, positive_probs())),
        ("---------- AUC",
         lambda: roc_auc_score(y_true_labels, positive_probs())),
        ("---------- ACCURACY",
         lambda: accuracy_score(y_true_labels, positive_probs() >= 0.5)),
    )
    for header, compute in reports:
        print(header)
        print(compute())

### Baseline Win Rate Results

This model uses the mean CT win percentage as its prediction.

In [None]:
# Constant prediction for every test row: the mean of CTWin in the training set
test_df["BaselinePred"] = train_df["CTWin"].mean()

# Two-column probability matrix: column 0 = 1 - p, column 1 = p
baseline_ct = test_df["BaselinePred"]
baseline_probs = np.column_stack((1 - baseline_ct, baseline_ct))
print_results(test_df["CTWin"], baseline_probs)

### Map Average Results

This model uses the mean CT win percentage, by map, as its prediction.

In [None]:
def baseline_map(train, test):
    """ Print the performance of a per-map win rate baseline

    Each test row is predicted with the mean of CTWin over the training rows
    that share its MapName. Test rows whose map does not occur in the
    training data are predicted with 0.5.

    Args:
        train: DataFrame with "MapName" and "CTWin" columns, used to
            estimate the per-map win rates.
        test: DataFrame with "MapName" and "CTWin" columns to score.
            It is not modified.

    Returns:
        None. The metrics are printed by print_results.
    """
    # Series indexed by map name holding the mean of CTWin for that map
    map_win_rate = train.groupby("MapName")["CTWin"].mean()

    # Element-wise lookup instead of a join or per-map concat: it keeps the
    # row order of `test`, needs no intermediate frames, and yields NaN
    # (replaced by 0.5) for maps that were never seen in training.
    pred_ct_win = test["MapName"].map(map_win_rate).fillna(0.5).to_numpy(dtype=float)

    print_results(test["CTWin"], np.column_stack((1 - pred_ct_win, pred_ct_win)))

In [None]:
# Score the per-map baseline: win rates come from train_df, rows scored are test_df
baseline_map(train=train_df, test=test_df)

### Logistic Regression

Here we present the logistic regression results using all available features; a variant without spatial features is not fit in this notebook.

In [None]:
# Columns that are standardized; the one-hot dummy columns are left as they are
cols_scaled = ["TicksSinceStart", "CTEqVal", "TEqVal", "TRemaining", "CTRemaining", "THpRemaining", "CTHpRemaining", "BombPlanted", "CTDistBombsiteA", "CTDistBombsiteB", "TDistBombsiteA", "TDistBombsiteB"]

# The scaler is fit on the training rows only and then reused for the test rows
scaler = StandardScaler().fit(X_train_ALL[cols_scaled].values)

# Train
X_train_ALL_scaled = X_train_ALL.copy()
features_scaled = scaler.transform(X_train_ALL[cols_scaled].values)
X_train_ALL_scaled[cols_scaled] = features_scaled

# Test
X_test_ALL_scaled = X_test_ALL.copy()
features_scaled_test = scaler.transform(X_test_ALL[cols_scaled].values)
X_test_ALL_scaled[cols_scaled] = features_scaled_test

# Seed taken from the notebook-wide constant rather than a repeated literal.
# NOTE(review): newer scikit-learn releases deprecate the string "none" in
# favor of penalty=None - confirm against the installed version.
lr_all = LogisticRegression(random_state=RANDOM_STATE, penalty="none", solver="saga").fit(X_train_ALL_scaled, y_train)
lr_all_test_probs = lr_all.predict_proba(X_test_ALL_scaled)

print_results(y_test, lr_all_test_probs)

# Persist the fitted model; the context manager closes the file handle.
# NOTE: only the model is saved. It was trained on standardized inputs, so
# `scaler` has to be persisted or re-fit separately before reusing the model.
lr_filename = "models/logreg.model"
with open(lr_filename, "wb") as model_file:
    pickle.dump(lr_all, model_file)

### XGBoost

Here we present the XGBoost results using all available features; a variant without spatial features is not fit in this notebook.

We search over the parameter space below, using grid search with 5-fold cross validation and a log loss scoring metric.

In [None]:
# Hyperparameter grid: 5 * 4 * 4 * 4 = 320 candidate settings
xgb_params = dict(
    max_depth=[6, 8, 10, 12, 14],
    colsample_bytree=[0.2, 0.4, 0.6, 0.8],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    min_child_weight=[1, 3, 5, 7],
)

# Shuffled 5-fold split, seeded with the notebook-wide constant
xgb_kfold = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Base estimator; the grid above overrides the searched parameters
xgb = XGBClassifier(
    n_estimators=100,
    objective="binary:logistic",
    tree_method="gpu_hist",
    gpu_id=0,
)

# Exhaustive search scored by negative log loss
xgb_cv = GridSearchCV(
    estimator=xgb,
    param_grid=xgb_params,
    scoring="neg_log_loss",
    cv=xgb_kfold,
    verbose=True,
)

In [None]:
# Run the grid search and keep the best estimator it found
xgb_all = xgb_cv.fit(X_train_ALL, y_train).best_estimator_
xgb_all.save_model("models/xgboost.model")
print(xgb_cv.best_params_)

# Evaluate the selected model on the test set
xgb_all_test_probs = xgb_all.predict_proba(X_test_ALL)
print_results(y_true_labels=y_test, y_pred_probs=xgb_all_test_probs)

### CatBoost

Here we present the CatBoost results using all available features; a variant without spatial features is not fit in this notebook.

We search over the parameter space below, using grid search with 5-fold cross validation and a log loss scoring metric.

In [None]:
# Hyperparameter grid: 4 * 5 * 5 = 100 candidate settings
cb_params = {
    "learning_rate": [0.05, 0.1, 0.5, 1],
    "depth": [6, 8, 10, 12, 14],
    "l2_leaf_reg": [1, 3, 5, 7, 9]
}

cb_kfold = KFold(n_splits=5, random_state=RANDOM_STATE, shuffle=True)

# Columns 0 and 1 of COLS_CB_ALL (MapName, BombSite) are the categorical features.
# The former "no spatial" pool was removed: it referenced
# X_train_cb_NO_SPATIAL, which is never defined, so this cell raised a
# NameError before the search could be configured.
cb_all_pool = Pool(data = X_train_cb_ALL, label = y_train_cb, cat_features = [0, 1])

cb = CatBoostClassifier(iterations=100, task_type="GPU", devices="0:1", custom_metric=["Logloss"])

cb_cv = GridSearchCV(cb, param_grid=cb_params, cv=cb_kfold, scoring="neg_log_loss", verbose=True)

In [None]:
# Fit on the CatBoost design matrix with its own labels. y_train / y_test were
# extracted before the CatBoost-specific dropna, so they can contain rows that
# X_train_cb_ALL / X_test_cb_ALL no longer have; y_train_cb / y_test_cb are
# the labels that line up with these matrices.
cb_cv.fit(X_train_cb_ALL, y_train_cb, cat_features = [0, 1])
cb_all = cb_cv.best_estimator_
cb_all.save_model("models/catboost.model", "cbm")
print(cb_cv.best_params_)

# Evaluate the selected model on the test set
cb_all_test_probs = cb_all.predict_proba(X_test_cb_ALL)
print_results(y_test_cb, cb_all_test_probs)