# Notebook setup

<img src="https://cdn.tiebreaker.com/wp-content/uploads/2019/10/butt-fumble-768x444.png" width="650px"/>

In [None]:
pip install nb-black -qq

In [None]:
%load_ext nb_black

In [None]:
pip install parfit -qq

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import feather
import lightgbm as lgb
import shap
import warnings
import seaborn as sns
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import category_encoders as ce
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import average_precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.compose import make_column_selector as selector
from sklearn.pipeline import FeatureUnion
from lightgbm import LGBMClassifier
import parfit.parfit as pf
import math

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
pd.set_option("display.max_rows", 100)


def display_all(df):
    """Display `df` with the row and column display limits temporarily set to 1000."""
    # One option_context call can carry several option/value pairs.
    with pd.option_context("display.max_rows", 1000, "display.max_columns", 1000):
        display(df)


def rmse(x, y):
    """Return the root-mean-square difference between `x` and `y`.

    Both arguments must support element-wise subtraction and expose `.mean()`
    on the squared result (as NumPy arrays and pandas Series do).
    """
    residual = x - y
    mean_squared_error = (residual ** 2).mean()
    return math.sqrt(mean_squared_error)


def print_score(m, X_train, y_train, X_test, y_test, model_name):
    """Score a fitted model and return the results as one labelled row.

    Parameters
    ----------
    m : fitted model exposing `predict`.
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels.
    model_name : label stored in the "model" field of the result.

    Returns
    -------
    pandas.Series indexed by
    ["model", "Precision train", "Precision test", "ROC test"], where the
    precision entries are `average_precision_score` and the ROC entry is
    `roc_auc_score`, all computed from the output of `m.predict`.
    """
    columns = ["model", "Precision train", "Precision test", "ROC test"]
    # Predict each split once; the test predictions feed two metrics.
    train_pred = m.predict(X_train)
    test_pred = m.predict(X_test)
    res = pd.Series(
        [
            model_name,
            average_precision_score(y_train, train_pred),
            average_precision_score(y_test, test_pred),
            roc_auc_score(y_test, test_pred),
        ],
        index=columns,
    )
    if hasattr(m, "oob_score_"):
        # scikit-learn's oob_score_ is an accuracy / R^2 style score, not an
        # RMSE, so it is reported under a neutral label.
        print("OOB score", m.oob_score_)
    return res


def rf_feat_importance(m, df):
    """Pair the columns of `df` with `m.feature_importances_`, most important first.

    Returns a DataFrame with columns "cols" and "imp", sorted by "imp" descending.
    """
    importance = pd.DataFrame({"cols": df.columns, "imp": m.feature_importances_})
    return importance.sort_values(by="imp", ascending=False)


def features_type(dataframe, number=0):
    """Return the column labels of `dataframe` split by object dtype.

    With `number == 0` (the default) the non-object columns are returned;
    any other value returns the object-dtype columns.
    """
    object_dtypes = ["object"]
    if number == 0:
        return dataframe.select_dtypes(exclude=object_dtypes).columns
    return dataframe.select_dtypes(include=object_dtypes).columns


def ifnone(a, b):
    """Return `a`, falling back to `b` only when `a` is None."""
    if a is None:
        return b
    return a


def make_date(df, date_field):
    """Ensure column `date_field` of `df` holds datetimes, converting in place.

    Columns that already have a datetime dtype (naive or timezone-aware) are
    left untouched; anything else is parsed with `pd.to_datetime`.
    Nothing is returned: `df` is modified directly.
    """
    # is_datetime64_any_dtype covers tz-aware columns and, unlike
    # np.issubdtype, does not raise on pandas extension dtypes.
    if not pd.api.types.is_datetime64_any_dtype(df[date_field]):
        # `infer_datetime_format` is deprecated in pandas 2.0, where format
        # inference is the default behaviour, so it is no longer passed.
        df[date_field] = pd.to_datetime(df[date_field])


def add_datepart(df, field_name, prefix, drop=True, time=False):
    """Add columns derived from the date column `field_name` to `df`.

    `df` is modified in place and also returned.

    Parameters
    ----------
    df : DataFrame holding the date column.
    field_name : name of the date column; converted to datetime if needed.
    prefix : string prepended to every generated column name.
    drop : when True, remove `field_name` from `df` afterwards.
    time : when True, also add Hour, Minute and Second columns.

    Generated columns are `prefix` + Year, Month, Day, Dayofweek, Dayofyear,
    the Is_* calendar flags, Week (inserted at column position 3) and
    Elapsed (seconds since the Unix epoch, None where the date is missing).
    """
    make_date(df, field_name)
    field = df[field_name]
    attr = [
        "Year",
        "Month",
        "Day",
        "Dayofweek",
        "Dayofyear",
        "Is_month_end",
        "Is_month_start",
        "Is_quarter_end",
        "Is_quarter_start",
        "Is_year_end",
        "Is_year_start",
    ]
    if time:
        attr = attr + ["Hour", "Minute", "Second"]
    for n in attr:
        df[prefix + n] = getattr(field.dt, n.lower())
    # `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0;
    # use isocalendar() when it is available.
    week = (
        field.dt.isocalendar().week
        if hasattr(field.dt, "isocalendar")
        else field.dt.week
    )
    df.insert(3, prefix + "Week", week)
    mask = ~field.isna()
    # Cast to second resolution explicitly instead of dividing the raw integer
    # by 10**9: that division is only right for nanosecond-resolution data,
    # and newer pandas versions may store other resolutions.
    epoch_seconds = field.values.astype("datetime64[s]").astype(np.int64)
    df[prefix + "Elapsed"] = np.where(mask, epoch_seconds, None)
    if drop:
        df.drop(field_name, axis=1, inplace=True)
    return df


def z_score(df):
    """Return a copy of `df` with every column whose maximum exceeds 1 standardised.

    Standardised columns become (value - column mean) / column std; columns
    whose maximum is not greater than 1 are copied through unchanged.
    The input frame is not modified.
    """
    df_std = df.copy()
    for name in df_std.columns:
        values = df_std[name]
        if not values.max() > 1:
            # Leave this column as it is.
            continue
        df_std[name] = (values - values.mean()) / values.std()
    return df_std


def feature_plot(df, col):
    """Plot the win/lose counts per value of `col` with the win percentage overlaid.

    `df` must contain a "win" column holding 0/1 outcomes. Bars show the row
    counts per (`col`, win) pair; the red points on the secondary y-axis show
    the percentage of rows with win == 1 for each value of `col`.
    """
    # Share of each outcome within every value of `col`, in percent.
    perc_amt = pd.DataFrame(
        (
            df.groupby([col, "win"])["win"].count()
            / df.groupby([col])["win"].count()
            * 100
        ).unstack("win")
    )
    perc_amt = perc_amt.reset_index()
    perc_amt.rename(columns={0: "Lose", 1: "Win"}, inplace=True)

    plt.figure(figsize=(12, 4))
    # fontsize belongs to plt.title; it used to be passed to str.format,
    # where it was silently ignored.
    plt.title("Win % and distribution against {}".format(col), fontsize=15)
    order0 = df[col].unique()
    p1 = sns.countplot(data=df, hue="win", x=col, palette="rocket", order=order0)
    p1.set_xticklabels(p1.get_xticklabels(), rotation=45)
    # Draw the percentage line on a twin axis so it gets its own y scale.
    p2 = p1.twinx()
    p2 = sns.pointplot(
        x=col, y="Win", data=perc_amt, order=order0, color="red", ax=p2
    )

    plt.show()

# Data preprocessing

Games and plays data are imported together with the win/lose outcome of each game. The data covers the 2018 NFL season: 16 teams, 221 games. The goal is to forecast whether the home team will win.

In [None]:
# Load the raw inputs: game metadata, plays, 2009-2018 play-by-play history
# and a hand-made sheet of game outcomes (no header row).
# NOTE(review): with no index_col set, `parse_dates=True` appears to leave date
# columns such as gameDate as plain strings — confirm against pandas docs.
games_df = pd.read_csv("../input/nfl-big-data-bowl-2021/games.csv", parse_dates=True)
# players_df = pd.read_csv('../input/nfl-big-data-bowl-2021/players.csv', parse_dates = True)
plays_df = pd.read_csv("../input/nfl-big-data-bowl-2021/plays.csv", parse_dates=True)
stats_df = pd.read_csv(
    "../input/nfl-play-2009-2018/NFL Play by Play 2009-2018 (v5).csv", parse_dates=True
)
# header=None: column names for this sheet are assigned in a later cell.
missing_df = pd.read_csv(
    "../input/missing-win/Untitled spreadsheet - Sheet1.csv",
    parse_dates=True,
    header=None,
)

## Play dataset
It is the main dataset used for modelling. Attributes explanation:

- **gameId**: Game identifier, unique (numeric)
- **playId**: Play identifier, not unique across games (numeric)
- **playDescription**: Description of play (text)
- **quarter**: Game quarter (numeric)
- **down**: Down (numeric)
- **yardsToGo**: Distance needed for a first down (numeric)
- **possessionTeam**: Team on offense (text), with a possession of a ball
- **playType**: Outcome of dropback: sack or pass (text)
- **yardlineSide**: 3-letter team code corresponding to line-of-scrimmage (text)
- **yardlineNumber**: Yard line at line-of-scrimmage (numeric)
- **offenseFormation**: Formation used by possession team (text)
- **personnelO**: Personnel used by offensive team (text)
- **defendersInTheBox**: Number of defenders in close proximity to line-of-scrimmage (numeric)
- **numberOfPassRushers**: Number of pass rushers (numeric)
- **personnelD**: Personnel used by defensive team (text)
- **typeDropback**: Dropback categorization of quarterback (text)
- **preSnapHomeScore**: Home score prior to the play (numeric)
- **preSnapVisitorScore**: Visiting team score prior to the play (numeric)
- **gameClock**: Time on clock of play (MM:SS)
- **absoluteYardlineNumber**: Distance from end zone for possession team (numeric)
- **penaltyCodes**: NFL categorization of the penalties that occurred on the play. For purposes of this contest, the most important penalties are Defensive Pass Interference (DPI), Offensive Pass Interference (OPI), Illegal Contact (ICT), and Defensive Holding (DH). Multiple penalties on a play are separated by a ; (text)
- **penaltyJerseyNumber**: Jersey number and team code of the player committing each penalty. Multiple penalties on a play are separated by a ; (text)
- **passResult**: Outcome of the passing play (C: Complete pass, I: Incomplete pass, S: Quarterback sack, IN: Intercepted pass, text)
- **offensePlayResult**: Yards gained by the offense, excluding penalty yardage (numeric)
- **playResult**: Net yards gained by the offense, including penalty yardage (numeric)
- **epa**: Expected points added on the play, relative to the offensive team. Expected points is a metric that estimates the average of every next scoring outcome given the play's down, distance, yardline, and time remaining (numeric)
- **isDefensivePI**: An indicator variable for whether or not a DPI penalty occurred on a given play (TRUE/FALSE)

In [None]:
# Show the last rows of the plays table without truncating rows or columns.
display_all(plays_df.tail())

In [None]:
# Keep only play-by-play rows dated after 2018-09-01, ordered by game and play.
stats_df = stats_df[stats_df["game_date"] > "2018-09-01"].sort_values(
    ["game_id", "play_id"]
)
# The highest play_id per game identifies that game's final recorded play.
max_play = stats_df.groupby(["game_id"])["play_id"].max()
max_play2 = pd.DataFrame(max_play).reset_index()
# Inner-merge back so that only the final play of every game remains.
stats_df1 = pd.merge(
    stats_df.reset_index(), max_play2, how="inner", on=["game_id", "play_id"]
)
# win = 1 when the home score is strictly higher on that final play
# (ties therefore count as 0).
stats_df1["win"] = np.where(
    stats_df1["total_home_score"] > stats_df1["total_away_score"], 1, 0
)
# Attach the label to the games table by game id; games with no match get NaN.
games_df = games_df.set_index("gameId").join(
    stats_df1[["game_id", "win"]].set_index("game_id")
)

In [None]:
# The outcome sheet was read without a header, so name its columns here and
# index it by gameId.
missing_df.columns = [
    "gameId",
    "gameDate",
    "gameTimeEastern",
    "homeTeamAbbr",
    "visitorTeamAbbr",
    "week",
    "win",
]
missing_df.set_index("gameId", inplace=True)
missing_df.head(1)

In [None]:
# Games that received a win label from the play-by-play join.
games_df2 = games_df.dropna().copy()
# Add the manually labelled games. DataFrame.append was removed in pandas 2.0;
# pd.concat is the supported equivalent.
games_df3 = pd.concat([games_df2, missing_df])

# Attach game-level information (including the win label) to every play.
plays_full = pd.merge(
    games_df3.reset_index().rename(columns={"index": "gameId"}), plays_df, on="gameId"
)
plays_full.set_index(["gameId", "playId"], inplace=True)
plays_full.head(2)

In [None]:
# (rows, columns) of the merged modelling table.
plays_full.shape

Final dataset for modelling with 31 attributes and 20258 rows.

# Data modelling
The goal of the model is to predict which team will win an NFL game.
## Baseline models
Decision tree, logistic regression and random forest models without feature engineering. The models are built on just a sample of attributes, such as home team, away team and pre-snap score. Missing values are replaced with 0 because, in most cases, values are missing when they are effectively zero. For example, the penalty type is missing from the dataset when there was no penalty.

In [None]:
# Running table of scores, one row per evaluated model.
model_score = pd.DataFrame(
    [], columns=["model", "Precision train", "Precision test", "ROC test"]
)
plays_full.sort_index(axis=0, inplace=True)

y = plays_full["win"]

keys = [
    "homeTeamAbbr",
    "visitorTeamAbbr",
    "passResult",
    "offensePlayResult",
    "preSnapHomeScore",
    "preSnapVisitorScore",
    "absoluteYardlineNumber",
    "yardsToGo",
    "possessionTeam",
]
X = plays_full[keys].copy()

# Numeric columns: fill gaps with 0, then standardise.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value=0)),
        ("scaler", StandardScaler()),
    ]
)

# Categorical columns: fill gaps with a "missing" level, then one-hot encode.
# handle_unknown="ignore" keeps predict/CV from raising when a category shows
# up that was absent from the data the encoder was fitted on.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, features_type(X, number=0)),
        ("cat", categorical_transformer, features_type(X, number=1)),
    ]
)

clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", DecisionTreeClassifier(max_depth=2)),
    ]
)

# shuffle=False keeps the (sorted) row order: the last 20% of rows are the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

clf.fit(X_train, y_train)
score_row = print_score(
    clf, X_train, y_train, X_test, y_test, model_name="Base DecissionTree"
)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
model_score = pd.concat([model_score, pd.DataFrame([score_row])], ignore_index=True)
model_score

In [None]:
# Baseline logistic regression on the same preprocessing as the decision tree.
clf_lg = Pipeline(
    steps=[("preprocessor", preprocessor), ("model", LogisticRegression(n_jobs=-1))]
)

clf_lg.fit(X_train, y_train)
score_row = print_score(
    clf_lg, X_train, y_train, X_test, y_test, model_name="Base LogReg"
)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
model_score = pd.concat([model_score, pd.DataFrame([score_row])], ignore_index=True)
model_score

In [None]:
# Baseline random forest on the same preprocessing as the other baselines.
clf_rf = Pipeline(
    steps=[("preprocessor", preprocessor), ("model", RandomForestClassifier(n_jobs=-1))]
)

clf_rf.fit(X_train, y_train)
score_row = print_score(
    clf_rf, X_train, y_train, X_test, y_test, model_name="Base Random Forest"
)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
model_score = pd.concat([model_score, pd.DataFrame([score_row])], ignore_index=True)
model_score

In [None]:
# Search over the numeric imputation strategy and the regularisation strength
# of the logistic-regression pipeline with 10-fold cross-validation.
param_grid = {
    "preprocessor__num__imputer__strategy": ["mean", "median"],
    "model__C": [0.1, 1.0, 10, 100],
}

grid_search = GridSearchCV(clf_lg, param_grid, cv=10)
grid_search.fit(X_train, y_train)

score_row = print_score(
    grid_search,
    X_train,
    y_train,
    X_test,
    y_test,
    model_name="LogReg with GridSearch",
)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
model_score = pd.concat([model_score, pd.DataFrame([score_row])], ignore_index=True)
model_score

## Feature engineering

Some attributes, like *'homeTeamAbbr'*, contain **NFL team** abbreviations. Instead of **OneHotEncoding**, these attributes are encoded by setting 2 if a team plays at home and 1 if a team plays away. This encoding reduces the number of attributes compared to **OneHotEncoding**. I consider it a possible advantage since the dataset is quite small and the attributes containing **NFL teams** are connected. Other attributes containing a team abbreviation (like the possession team) get 1 if it is the home team.   
Other categorical variables are encoded with the **CatBoost** encoder, since some attributes have high cardinality and this method is reported to work well for forest-type models.  
Custom transformers are used in pipelines to transform and to model the data.

In [None]:
plays_full.reset_index(inplace=True)
# sort_values returns a new frame, so the result must be assigned for the sort
# to take effect. The index is rebuilt afterwards because the team indicator
# frames below carry a positional RangeIndex and are joined on the index.
plays_full = plays_full.sort_values(["gameId", "playId", "gameClock"]).reset_index(
    drop=True
)
y = plays_full["win"]
X0 = plays_full.drop(["win", "playDescription", "gameTimeEastern"], axis=1)
X0 = add_datepart(X0, "gameDate", "Date")

# One column per team: 2 where the team is the home side, 1 where it is the
# visitor, 0 otherwise. Only teams that appear as a home team get a column.
teams = X0["homeTeamAbbr"].unique()
homeTeam = pd.DataFrame(
    {team: np.where(X0["homeTeamAbbr"] == team, 2, 0) for team in teams}
)
visitorTeam = pd.DataFrame(
    {team: np.where(X0["visitorTeamAbbr"] == team, 1, 0) for team in teams}
)

playTeam = homeTeam + visitorTeam
# Flags telling whether the possession team / yardline side is the home team
# (missing values compare unequal and therefore give 0).
playTeam["possessionTeam"] = np.where(
    X0["homeTeamAbbr"] == X0["possessionTeam"], 1, 0
)
playTeam["yardlineSide"] = np.where(X0["homeTeamAbbr"] == X0["yardlineSide"], 1, 0)
# Replace the raw team columns with the encoded ones.
X1 = X0.drop(
    [
        "possessionTeam",
        "homeTeamAbbr",
        "visitorTeamAbbr",
        "yardlineSide",
        "DateElapsed",
    ],
    axis=1,
).join(playTeam)

In [None]:
from sklearn.base import TransformerMixin, BaseEstimator

class NumDataSelector(TransformerMixin, BaseEstimator):
    """Pipeline step that keeps only the non-object-dtype columns of a DataFrame."""

    def fit(self, data, y=None):
        """Nothing is learned; return self so the step can be chained."""
        return self

    def transform(self, data, y=None):
        """Return the non-object columns of `data` and remember their labels."""
        selected = features_type(data, number=0)
        self.num_columns = selected
        return data[selected]
    
#     def get_feature_names(self):
#         return data[self.num_columns].columns.tolist()
    
class CatDataSelector(TransformerMixin, BaseEstimator):
    """Pipeline step that keeps only the object-dtype columns of a DataFrame."""

    def fit(self, data, y=None):
        """Nothing is learned; return self so the step can be chained."""
        return self

    def transform(self, data, y=None):
        """Return the object-dtype columns of `data` and remember their labels."""
        self.cat_columns = features_type(data, number=1)
        return data[self.cat_columns]

    def get_feature_names(self):
        """Return the column labels selected by the most recent `transform` call.

        Raises AttributeError if `transform` has not been called yet.
        """
        # The previous body referenced an undefined `data` variable and raised
        # NameError; the selected labels are already stored on the instance.
        return list(self.cat_columns)
    
class CatEncoder(TransformerMixin, BaseEstimator):
    """Pipeline step wrapping `ce.CatBoostEncoder`; output columns get a `_cb` suffix."""

    def __init__(self):
        self._encoder = ce.CatBoostEncoder()

    def fit(self, data, y):
        """Fit the wrapped encoder on `data` with target `y` and return self."""
        self._encoder.fit(data, y)
        return self

    def transform(self, data, y=None):
        """Encode `data` and return it as a DataFrame with `_cb`-suffixed columns."""
        encoded = self._encoder.transform(data)
        framed = pd.DataFrame(encoded, columns=data.columns)
        # Kept on the instance so get_feature_names can report the labels.
        self.encoded_data = framed.add_suffix('_cb')
        return self.encoded_data

    def get_feature_names(self):
        """Return the column labels produced by the most recent `transform` call."""
        return self.encoded_data.columns.tolist()
            
    
class SimpleImputerWrapper(TransformerMixin, BaseEstimator):
    """Constant-fill `SimpleImputer` that returns a DataFrame instead of an array.

    Parameters
    ----------
    fill_value : value used to replace missing entries.
    """

    def __init__(self, fill_value):
        # Stored under the constructor argument's own name so that
        # BaseEstimator.get_params (used by clone, repr and the search
        # utilities) can find it.
        self.fill_value = fill_value

    def fit(self, data, y=None):
        """Fit a fresh constant-strategy imputer on `data` and return self."""
        # The imputer is built here rather than in __init__ so that it always
        # reflects the current value of `fill_value`.
        self._imputer = SimpleImputer(fill_value=self.fill_value, strategy='constant')
        self._imputer.fit(data)
        return self

    def transform(self, data, y=None):
        """Return `data` with missing entries filled, keeping labels and index."""
        imputed_data = self._imputer.transform(data)
        # Keep the caller's index so that later steps which pair rows with a
        # target Series (such as the categorical encoder) still line up.
        return pd.DataFrame(imputed_data, columns=data.columns, index=data.index)
    
    
class ZScaler(TransformerMixin, BaseEstimator):
    """Pipeline step that applies `z_score` to the incoming DataFrame."""

    def fit(self, data, y=None):
        """Nothing is learned; return self so the step can be chained."""
        return self

    def transform(self, data, y=None):
        """Return the result of `z_score(data)`."""
        scaled = z_score(data)
        return scaled

#     def get_feature_names(self):
#         return z_score(data).columns.tolist()
    
class FeatureSelector(TransformerMixin, BaseEstimator):
    """Pipeline step that drops a configured set of columns from a DataFrame.

    Parameters
    ----------
    remove_feature_names : collection of column labels to drop, or None to
        keep every column.
    """

    def __init__(self, remove_feature_names=None):
        # Stored under the constructor argument's own name so that
        # BaseEstimator.get_params (used by clone, repr and the search
        # utilities) can find it.
        self.remove_feature_names = remove_feature_names

    def fit(self, data, y=None):
        """Nothing is learned; return self so the step can be chained."""
        return self

    def transform(self, data, y=None):
        """Return `data` without the configured columns, original order kept."""
        self._feature_names = data.columns.tolist()

        if self.remove_feature_names is not None:
            self._feature_names = [
                feat
                for feat in self._feature_names
                if feat not in self.remove_feature_names
            ]

        return data[self._feature_names]
    
#     def get_feature_names(self):
#         return self._feature_names


In [None]:
# Numeric columns removed before modelling.
drop_list = ["DateYear", "DateWeek", "playId", "gameId", "week"]

# Numeric branch: select non-object columns, fill gaps with 0, drop the
# columns in drop_list, then z-score.
numeric_transformer = Pipeline(
    steps=[
        ("num_columns", NumDataSelector()),
        ("imputer", SimpleImputerWrapper(fill_value=0)),
        ("selector", FeatureSelector(remove_feature_names=drop_list)),
        ("scaler", ZScaler()),
    ]
)

# Categorical branch: select object columns, fill gaps with "missing",
# then apply the CatBoost encoder wrapper.
categorical_transformer = Pipeline(
    steps=[
        ("cat_columns", CatDataSelector()),
        ("imputer", SimpleImputerWrapper(fill_value="missing")),
        ("encoder", CatEncoder()),
    ]
)

# Step list (not yet a Pipeline) so that different models can be appended.
# NOTE(review): list.copy() is shallow, so every pipeline built from this list
# shares the same FeatureUnion instance; refitting one pipeline refits the
# preprocessing used by the others.
full_pipeline = [
    (
        "prep",
        FeatureUnion(
            transformer_list=[
                ("num_pipeline", numeric_transformer),
                ("cat_pipeline", categorical_transformer),
            ]
        ),
    )
]

clf_rf1 = RandomForestClassifier(n_jobs=-1)
clf_rf1_steps = full_pipeline.copy()
clf_rf1_steps.append(("RF Classifier", clf_rf1))

clf_rf1_pipe = Pipeline(steps=clf_rf1_steps)

# shuffle=False: the last 30% of rows (in current row order) form the validation set.
X_train, X_valid, y_train, y_valid = train_test_split(
    X1, y, test_size=0.3, shuffle=False
)

clf_rf1_pipe.fit(X_train.copy(), y_train)

In [None]:
score_row = print_score(
    clf_rf1_pipe,
    X_train,
    y_train,
    X_valid,
    y_valid,
    model_name="Random Forest with Feat1",
)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
model_score = pd.concat([model_score, pd.DataFrame([score_row])], ignore_index=True)
model_score

In [None]:
# Rebuild the column order the forest is assumed to have seen: non-object
# columns (minus drop_list) followed by object columns.
# NOTE(review): this mirrors the num/cat order of the FeatureUnion by
# convention only — confirm it matches the transformed matrix.
col_names = list(features_type(X_valid.drop(drop_list, axis=1), number=0)) + list(
    features_type(X_valid.drop(drop_list, axis=1), number=1)
)

# Top 20 features by random-forest importance.
fi = rf_feat_importance(clf_rf1_pipe.named_steps["RF Classifier"], X_valid[col_names])
fi[:20]

In [None]:
# Same numeric branch as before but without the drop_list selector, for use
# on an already reduced feature set.
numeric_transformer2 = Pipeline(
    steps=[
        ("num_columns", NumDataSelector()),
        ("imputer", SimpleImputerWrapper(fill_value=0)),
        # ('selector', FeatureSelector(remove_feature_names=drop_list)),
        ("scaler", ZScaler()),
    ]
)

# NOTE(review): `categorical_transformer` is the same instance used by
# `full_pipeline`, so fitting this pipeline refits that shared branch.
part_pipeline = [
    (
        "prep",
        FeatureUnion(
            transformer_list=[
                ("num_pipeline", numeric_transformer2),
                ("cat_pipeline", categorical_transformer),
            ]
        ),
    )
]
part_pipeline1 = part_pipeline.copy()
# Reuses the `clf_rf1` estimator object from the earlier pipeline.
part_pipeline1.append(("RF Classifier", clf_rf1))

part_pipe = Pipeline(steps=part_pipeline1)

In [None]:
# Restrict the data to the 20 most important features and refit.
imp_features = fi["cols"][:20].values
# shuffle=False: the last 30% of rows (in current row order) form the validation set.
X_train, X_valid, y_train, y_valid = train_test_split(
    X1[imp_features], y, test_size=0.3, shuffle=False
)

part_pipe.fit(X_train.copy(), y_train)

In [None]:
score_row = print_score(
    part_pipe,
    X_train,
    y_train,
    X_valid,
    y_valid,
    model_name="Random Forest with Feat1 20",
)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
model_score = pd.concat([model_score, pd.DataFrame([score_row])], ignore_index=True)
model_score

# Hyperparameter tuning
Parfit optimization is used for Random Forest hyperparameter tuning. The combination of *min_samples_leaf* and *max_features* gives the best score on the validation set. 

In [None]:
# Candidate random-forest settings evaluated by parfit.
paramGrid = ParameterGrid(
    {
        "min_samples_leaf": [15, 25, 50, 100, 150, 175, 200, 225, 250],
        "max_features": ["sqrt", "log2", 0.4, 0.5, 0.6, 0.7],
        "n_estimators": [100, 500],
        "n_jobs": [-1],
    }
)

# Apply the already fitted preprocessing step once and reuse the transformed
# matrices for every candidate model.
X_train_trans = pd.DataFrame(part_pipe.named_steps["prep"].transform(X_train))
X_valid_trans = pd.DataFrame(part_pipe.named_steps["prep"].transform(X_valid))

# Each candidate is scored on the validation split with ROC AUC.
best_model, best_score, all_models, all_scores = pf.bestFit(
    RandomForestClassifier,
    paramGrid,
    X_train_trans,
    y_train,
    X_valid_trans,
    y_valid,
    metric=roc_auc_score,
    scoreLabel="AUC",
)
print(best_model)

In [None]:
# Validation score of the best parfit candidate.
print(best_score)

In [None]:
# Random forest with hand-entered hyperparameters
# (presumably taken from the parfit search above — confirm).
clf_rf2 = RandomForestClassifier(
    n_estimators=500, max_features=0.7, min_samples_leaf=225, n_jobs=-1
)
# Shallow copy: the "prep" step object is shared with `part_pipe`.
clf_rf2_steps = part_pipeline.copy()
clf_rf2_steps.append(("RF2 Classifier", clf_rf2))

clf_rf2_pipe = Pipeline(steps=clf_rf2_steps)
clf_rf2_pipe.fit(X_train.copy(), y_train)

In [None]:
score_row = print_score(
    clf_rf2_pipe,
    X_train,
    y_train,
    X_valid,
    y_valid,
    model_name="Random Forest Hypertunning with Feat1 20",
)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
model_score = pd.concat([model_score, pd.DataFrame([score_row])], ignore_index=True)
model_score

Some feature engineering, hyperparameter tuning and oob scoring give the best RF model so far. ROC is around 0.65, which is not a good result: ROC = 0.5 corresponds to a random model.

## LGB model
LightGBM (LGB) is tried since it is considered to be one of the best models in its class. Model hyperparameters are set based on the previous models. Some hyperparameter search is done as well. 

In [None]:
# LightGBM datasets built from the transformed top-20 feature matrices.
d_train = lgb.Dataset(X_train_trans, label=y_train)
d_test = lgb.Dataset(X_valid_trans, label=y_valid)

# `min_data` is an alias of `min_data_in_leaf`; the dict used to carry both
# (225 and 10). LightGBM keeps the value given under the main name and ignores
# the alias with a warning, so only the effective setting is kept here.
# NOTE(review): `bagging_fraction` has no effect unless `bagging_freq` is also
# set to a positive value — confirm whether bagging was intended.
params_lgb = {
    "max_bin": 512,
    "min_data_in_leaf": 225,
    "learning_rate": 0.1,
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "binary_logloss",
    "num_leaves": 80,
    "verbose": -1,
    "boost_from_average": True,
    "bagging_fraction": 0.7,
}

# Up to 10000 rounds, stopping once the validation metric has not improved
# for 50 rounds.
# NOTE(review): `early_stopping_rounds` / `verbose_eval` were removed from
# lgb.train in LightGBM 4.0 in favour of callbacks — adjust when upgrading.
model = lgb.train(
    params_lgb,
    d_train,
    10000,
    valid_sets=[d_test],
    early_stopping_rounds=50,
    verbose_eval=1000,
)

In [None]:
score_row = print_score(
    model,
    X_train_trans,
    y_train,
    X_valid_trans,
    y_valid,
    model_name="LightGBM with Feat1 20",
)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
model_score = pd.concat([model_score, pd.DataFrame([score_row])], ignore_index=True)
model_score

In [None]:
# Best (lowest) RMSE seen so far. Named `min_rmse` rather than `min` so the
# builtin min() is not shadowed for the rest of the notebook.
min_rmse = float("inf")
pp = {}
iterations = 500
# Random search over LightGBM parameters (disabled; the result is hard-coded
# in the next cell).
# for i in range(iterations):
# #     try:
#     params = {}
#     params['learning_rate'] = np.random.uniform(0, 0.4)
#     params['boosting_type'] = 'gbdt'
#     params['objective'] = 'binary'
#     params['metric'] = 'rmse'
#     params['sub_feature'] = np.random.uniform(0.3, 0.7)
#     params['num_leaves'] = np.random.randint(100, 300)
#     params['min_data'] = np.random.randint(150, 250)
#     params['max_depth'] = np.random.randint(3, 9)
#     clf = lgb.train(params, d_train, 10000, valid_sets=[d_test],
#                     early_stopping_rounds=50, verbose_eval=1000)
#     y_pred=clf.predict(X_valid_trans)
#     mae=rmse(y_pred,y_valid)
#     if mae < min_rmse:
#         min_rmse = mae
#         pp = params
# #     except:
# #         print('failed with')
# #         print(params)

# print('Minimum rmse is: ', min_rmse)
# print('Used params', pp)

In [None]:
# Hard-coded LightGBM parameters
# (presumably the outcome of the disabled random search — confirm).
pp = {
    "learning_rate": 0.325229431778417,
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "rmse",
    "sub_feature": 0.5937219626468773,
    "num_leaves": 281,
    "min_data": 217,
    "max_depth": 8,
}
# Up to 10000 rounds with early stopping after 50 rounds without improvement
# on the validation set.
model2 = lgb.train(
    pp, d_train, 10000, valid_sets=[d_test], early_stopping_rounds=50, verbose_eval=1000
)

In [None]:
score_row = print_score(
    model2,
    X_train_trans,
    y_train,
    X_valid_trans,
    y_valid,
    model_name="LightGBM Hypertunning with Feat1 20",
)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
model_score = pd.concat([model_score, pd.DataFrame([score_row])], ignore_index=True)
model_score

### SHAP values
To find out what is inside the LGB model.

In [None]:
# Explain `model`, the LightGBM booster trained with `params_lgb`.
explainer = shap.TreeExplainer(model)
expected_value = explainer.expected_value

# Detailed plots use the first 20 validation rows.
select = range(20)
# Label the transformed matrix: non-object columns first, then object columns.
# NOTE(review): assumes this matches the column order produced by the
# preprocessing step — confirm.
col_names2 = list(features_type(X_valid, number=0)) + list(
    features_type(X_valid, number=1)
)
X_valid_shap = pd.DataFrame(X_valid_trans)
X_valid_shap.columns = col_names2
features = X_valid_shap.iloc[select]
features_display = X_valid_shap.loc[features.index]

shap_values = explainer.shap_values(features)
shap_interaction_values = explainer.shap_interaction_values(features)

In [None]:
# Decision plot for the selected rows, using the index-0 entries of the
# expected value and SHAP values.
# NOTE(review): the shape of these outputs depends on the SHAP version — confirm.
shap.decision_plot(expected_value[0], shap_values[0], features_display)

In [None]:
# Bar summary of the SHAP values.
# NOTE(review): `shap_values` was computed on the 20 selected rows while the
# full `X_valid_shap` is passed as features — confirm the intended pairing.
shap.summary_plot(shap_values, X_valid_shap, plot_type="bar")

In [None]:
# Recompute SHAP values for the whole validation set (overwrites the 20-row
# values) and plot the dependence on the visitor's pre-snap score.
shap_values = explainer.shap_values(X_valid_shap)
shap.dependence_plot("preSnapVisitorScore", shap_values[0], X_valid_shap)

In [None]:
# Decision plot built from the SHAP interaction values of the 20 selected rows.
shap.decision_plot(expected_value, shap_interaction_values, features)

### Plots

In [None]:
# Per home team: percentage of plays belonging to lost (0) and won (1) games.
tmp = pd.crosstab(X0['homeTeamAbbr'], y, normalize='index') * 100
tmp = tmp.reset_index()
tmp.rename(columns={0:'Lose', 1:'Win'}, inplace=True)

plt.figure(figsize=(14,10))
plt.suptitle('Home Team Distributions', fontsize=22)

# Panel 1: distribution of the home pre-snap score per home team, split by outcome.
plt.subplot(311)
g = sns.boxenplot(x='homeTeamAbbr', y='preSnapHomeScore', hue='win', data=plays_full, order=tmp['homeTeamAbbr'])
plt.legend(title='Home team win', loc='upper center')

g.set_title("Home Team Scores", fontsize=19)
g.set_xlabel("Home Team Name", fontsize=17)
g.set_ylabel("Score", fontsize=17)

# Panel 2: the "Win" percentage from the crosstab above, per home team.
plt.subplot(312)
gt = sns.pointplot(x='homeTeamAbbr', y='Win', data=tmp, color='purple', legend=False)

gt.set_ylabel("% of Win", fontsize=16)
gt.set_title("Home Team Wins %", fontsize=19)
gt.set_xlabel("Home Team", fontsize=17)

# Panel 3: distribution of EPA per home team, split by outcome.
plt.subplot(313)
g3 = sns.boxenplot(x='homeTeamAbbr', y='epa', hue='win', 
              data=plays_full, order=tmp['homeTeamAbbr'])
g3.set_title("Epa Distribuition by Team", fontsize=20)
g3.set_xlabel("Home Team Name", fontsize=17)
g3.set_ylabel("Epa", fontsize=17)

plt.subplots_adjust(hspace = 0.6, top = 0.85)

plt.show()


## XGBoost

In [None]:
# XGBoost classifier on the full engineered feature set.
clf_xgb = XGBClassifier(n_estimators=1000, learning_rate=0.05, n_jobs=-1)
# Shallow copy: the "prep" step object is shared with the other pipelines
# built from `full_pipeline`, so this fit also refits that shared step.
clf_xgb_steps = full_pipeline.copy()
clf_xgb_steps.append(("XGBClassifier", clf_xgb))

clf_xgb_pipe = Pipeline(steps=clf_xgb_steps)
# Back to all X1 columns (the previous split used only the top-20 features).
X_train, X_valid, y_train, y_valid = train_test_split(
    X1, y, test_size=0.3, shuffle=False
)

clf_xgb_pipe.fit(X_train.copy(), y_train)

In [None]:
score_row = print_score(
    clf_xgb_pipe,
    X_train,
    y_train,
    X_valid,
    y_valid,
    model_name="XGB with Feat1",
)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
model_score = pd.concat([model_score, pd.DataFrame([score_row])], ignore_index=True)
model_score

In [None]:
# NOTE: this rebinds `model2`, which previously held the tuned LightGBM model.
model2 = clf_xgb_pipe.named_steps['XGBClassifier']

# Column names in the order the preprocessing step emits them: non-object
# columns (minus drop_list) followed by object columns.
kept_columns = X_valid.drop(drop_list, axis=1)
feature_names = list(features_type(kept_columns, number=0)) + list(
    features_type(kept_columns, number=1)
)

# Create the axes ourselves so that figsize applies to the plot actually drawn
# (plot_importance opens its own figure when no axes are passed).
fig, ax = plt.subplots(figsize=(8, 8))
xgb.plot_importance(model2, max_num_features=20, ax=ax)

# The model was trained on an unnamed matrix, so ticks read "f<index>".
# Translate each tick through its index instead of overwriting the ticks with
# the first columns of X_valid, which mislabelled the ranked features.
tick_texts = [tick.get_text() for tick in ax.get_yticklabels()]
ax.set_yticklabels(
    [
        feature_names[int(text[1:])]
        if text.startswith("f") and text[1:].isdigit()
        else text
        for text in tick_texts
    ]
)


In [None]:
# DMatrix versions of the transformed train/validation data and a base
# parameter dict; in the visible code these feed the commented-out xgb.cv
# search cells below.
dtrain = xgb.DMatrix(clf_xgb_pipe.named_steps["prep"].transform(X_train), label=y_train)
dvalid = xgb.DMatrix(clf_xgb_pipe.named_steps["prep"].transform(X_valid), label=y_valid)
params = {
    "objective": "reg:logistic",
    "colsample_bytree": 0.5,
    "learning_rate": 0.1,
    "max_depth": 10,
    "alpha": 10,
}

In [None]:
# gridsearch_params = [
#     (max_depth, min_child_weight)
#     for max_depth in range(2, 6)
#     for min_child_weight in range(20,81)
# ]

# num_boost_round = 48
# rmse = float("Inf")
# best_params = None
# for max_depth, min_child_weight in gridsearch_params:
#     params['max_depth'] = max_depth
#     params['min_child_weight'] = min_child_weight
#     cv_results = xgb.cv(
#         params,
#         dtrain,
#         num_boost_round=num_boost_round,
#         nfold=5,
#         early_stopping_rounds=10
#     )
#     mean_rmse = cv_results['test-rmse-mean'].min()
#     boost_rounds = cv_results['test-rmse-mean'].argmin()
#     if mean_rmse < rmse:
#         rmse = mean_rmse
#         best_params = (max_depth,min_child_weight)
# print("Best params: {}, {}, RMSE: {}".format(best_params[0], best_params[1], rmse))


In [None]:
# params['max_depth'] = 5
# params['min_child_weight'] = 20
# gridsearch_params = [
#     (subsample, colsample)
#     for subsample in [i/10. for i in range(7,11)]
#     for colsample in [i/10. for i in range(4,7)]
# ]

# num_boost_round = 50
# rmse = float("Inf")
# best_params = None
# for subsample, colsample in reversed(gridsearch_params):
#     params['subsample'] = subsample
#     params['colsample_bytree'] = colsample

#     cv_results = xgb.cv(
#         params,
#         dtrain,
#         num_boost_round=num_boost_round,
#         nfold=5,
#         early_stopping_rounds=10
#     )
#     mean_rmse = cv_results['test-rmse-mean'].min()
#     boost_rounds = cv_results['test-rmse-mean'].argmin()
#     if mean_rmse < rmse:
#         rmse = mean_rmse
#         best_params = (subsample, colsample)
# print("Best params: {}, {}, RMSE: {}".format(best_params[0], best_params[1], rmse))

In [None]:
# params['subsample'] = 1
# params['colsample_bytree'] = 0.6

# num_boost_round = 18
# rmse = float("Inf")
# best_params = None
# for eta in [.3, .2, .1, .05, .01, .005]:
#     params['eta'] = eta
#     cv_results = xgb.cv(
#         params,
#         dtrain,
#         num_boost_round=num_boost_round,
#         nfold=5,
#         early_stopping_rounds=10
#     )
#     mean_rmse = cv_results['test-rmse-mean'].min()
#     boost_rounds = cv_results['test-rmse-mean'].argmin()
#     if mean_rmse < rmse:
#         rmse = mean_rmse
#         best_params = eta
# print("Best params: {}, RMSE: {}".format(best_params, rmse))

In [None]:
# Hand-entered XGBoost settings
# (presumably from the disabled CV search cells above — confirm).
# "num_boost_round" was removed from this dict: it is an argument of
# xgb.train / xgb.cv, not of XGBClassifier, where the number of boosting
# rounds is set by n_estimators. Passing it only produced an
# unused-parameter warning.
params = {
    "max_depth": 5,
    "eval_metric": "rmse",
    "learning_rate": 0.1,
    "n_estimators": 1000,
    "n_jobs": -1,
    "min_child_weight": 20,
    "subsample": 1,
    "colsample_bytree": 0.6,
}

clf_xgb2 = XGBClassifier(**params)
# Shallow copy: the "prep" step object is shared with the other pipelines
# built from `full_pipeline`.
clf_xgb2_steps = full_pipeline.copy()
clf_xgb2_steps.append(("XGBClassifier", clf_xgb2))

clf_xgb2_pipe = Pipeline(steps=clf_xgb2_steps)
clf_xgb2_pipe.fit(X_train.copy(), y_train)

In [None]:
score_row = print_score(
    clf_xgb2_pipe,
    X_train,
    y_train,
    X_valid,
    y_valid,
    model_name="XGB with Feat1 tunning",
)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
model_score = pd.concat([model_score, pd.DataFrame([score_row])], ignore_index=True)
model_score

### Random Search CV

In [None]:
# gbm_param_grid = {
#     'n_estimators': [25, 50, 75, 100, 150, 200],
#     'max_depth': range(4, 6),
#     'colsample_bytree': [0.3, 0.4, 0.5, 0.6],
#     'learning_rate': [0.1, 0.2, 0.3],
#     'min_child_weight': range(10, 200)
# }
# gbm = xgb.XGBClassifier(n_estimators=10)
# randomized_mse = RandomizedSearchCV(param_distributions=gbm_param_grid, estimator=gbm,
#                                     scoring='neg_mean_squared_error', n_iter=5, cv=4,
#                                    verbose=1)
# randomized_mse.fit(clf_xgb_pipe.named_steps['prep'].transform(X_train), y_train)

# print("Best parameters found: ", randomized_mse.best_params_)
# print("Lowest RMSE found: ", np.sqrt(np.abs(randomized_mse.best_score_)))

### Grid Search CV

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for the (currently commented-out) grid search below.
gbm_param_grid = {
    "n_estimators": range(99, 100),
    # Was `range(4, 5, 6)`: start 4, stop 5, step 6, which yields only [4].
    # NOTE(review): the three numbers look like an intended value list - confirm.
    "max_depth": [4, 5, 6],
    # Was `np.arange(0.48, 0.53, 0.1)`: the 0.1 step overshoots the stop value
    # and yields only [0.48]. An explicit list also avoids float-step rounding.
    # NOTE(review): a 0.01 step is assumed - confirm.
    "colsample_bytree": [0.48, 0.49, 0.50, 0.51, 0.52],
    "learning_rate": [0.08, 0.1, 0.12],
    "min_child_weight": range(137, 160),
}

gbm = xgb.XGBClassifier()
# grid_mse = GridSearchCV(param_grid=gbm_param_grid, estimator=gbm,
#                         scoring='neg_mean_squared_error', cv=4, verbose=1)
# grid_mse.fit(clf_xgb_pipe.named_steps['prep'].transform(X_train), y_train)
# print("Best parameters found: ", grid_mse.best_params_)
# print("Lowest RMSE found: ", np.sqrt(np.abs(grid_mse.best_score_)))

In [None]:
# Fixed hyper-parameters for an XGBoost regressor with a logistic objective
# (presumably the values found by the grid search above - confirm).
xgb3_settings = dict(
    n_estimators=99,
    objective="reg:logistic",
    colsample_bytree=0.48,
    learning_rate=0.1,
    max_depth=4,
    min_child_weight=137,
)
clf_xgb3 = XGBRegressor(**xgb3_settings)

# Shared steps plus the model as the final step. The step keeps the label
# "XGBClassifier" even though the estimator is an XGBRegressor.
clf_xgb3_steps = [*full_pipeline, ("XGBClassifier", clf_xgb3)]
clf_xgb3_pipe = Pipeline(steps=clf_xgb3_steps)

clf_xgb3_pipe.fit(X_train.copy(), y_train)

In [None]:
# Score the grid-search XGB pipeline and add the result to the running score table.
new_score = print_score(
    clf_xgb3_pipe,
    X_train,
    y_train,
    X_valid,
    y_valid,
    model_name="XGB Grid search with Feat1",
)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement. print_score's return type is not visible in this cell, so mirror
# what append([x]) did: use a DataFrame as-is, wrap anything else as one row.
if not isinstance(new_score, pd.DataFrame):
    new_score = pd.DataFrame([new_score])
model_score = pd.concat([model_score, new_score], ignore_index=True)
model_score

### Feature Engineering: Second Try
Create feature interaction and take out outliers.

In [None]:
# Second round of engineered features, collected in X_new (one row per row of X1).
X_new = pd.DataFrame()
# NOTE: a former `X1.sort_values(["gameId", "playId"])` statement stood here.
# sort_values is not in-place and its result was discarded, so it never
# reordered anything; it has been removed. Rows must stay in their existing
# order anyway, because yardsToGo_adj below is assigned positionally from X0.

# The 0.01 offset keeps the ratio finite when the visitor score is 0.
X_new["score_ratio"] = (X1.loc[:, "preSnapHomeScore"] + 0.01) / (
    X1.loc[:, "preSnapVisitorScore"] + 0.01
)
X_new["score_diff"] = X1.loc[:, "preSnapHomeScore"] - X1.loc[:, "preSnapVisitorScore"]
# Keep yardsToGo when the home team has possession, otherwise use 50 - yardsToGo.
# Assumes X0 and X1 share the same row order - TODO confirm.
X_new["yardsToGo_adj"] = [
    k if i == j else 50 - k
    for i, j, k in zip(X0["homeTeamAbbr"], X0["possessionTeam"], X0["yardsToGo"])
]

# "penaltyJerseyNumbers" is split once on the first space: part 0 is compared
# against the team abbreviations, part 1 is kept as the jersey number.
penalty_team = X1["penaltyJerseyNumbers"].str.split(" ", n=1, expand=True)
penalised_abbr = penalty_team[0]
# Encode which side was penalised: -1 = home, 1 = visitor, 0 = neither / none.
# The previous code assigned this column twice with Series.where, so the first
# assignment was dead and the result was a mix of visitor abbreviations and
# the constant 1. Series.where KEEPS values where the condition holds, the
# opposite of the apparent intent.
# NOTE(review): 0 for "no penalty" is an assumption - confirm.
X_new["penalty_team"] = np.select(
    [penalised_abbr == X0["homeTeamAbbr"], penalised_abbr == X0["visitorTeamAbbr"]],
    [-1, 1],
    default=0,
)
X_new["jerseyNumber"] = penalty_team[1]

In [None]:
# Numeric and categorical columns that get combined with the possession factor.
numeric_interaction_cols = [
    "yardsToGo",
    "down",
    "defendersInTheBox",
    "absoluteYardlineNumber",
    "numberOfPassRushers",
    "epa",
]
interactions_num = X1[numeric_interaction_cols]
interactions_cat = X1[["playType", "offenseFormation", "typeDropback"]]
factor = X1["possessionTeam"]

# Numeric columns keep their value where factor == 1 and are negated elsewhere.
interactions_num1 = interactions_num.where(factor == 1, -interactions_num)

# Categorical columns become "<value>_<factor>" strings in "<name>Inter" columns.
factor_as_text = factor.astype("str")
interactions_cat1 = pd.DataFrame(
    {
        column + "Inter": interactions_cat[column].astype("str") + "_" + factor_as_text
        for column in interactions_cat
    }
)

In [None]:
# Columns excluded later when the feature-name list is rebuilt.
drop_list = ["DateYear", "DateWeek", "playId", "gameId", "week"]

# Base frame plus the new features; the raw numeric interaction columns are
# replaced by their signed versions, and the categorical interactions are added.
X3 = (
    X0.join(X_new)
    .drop(columns=interactions_num.columns.tolist())
    .join(interactions_cat1)
    .join(interactions_num1)
)

# Shared steps followed by the previously configured random forest.
clf_rf4_steps = [*full_pipeline, ("RF2 Classifier", clf_rf2)]
clf_rf4_pipe = Pipeline(steps=clf_rf4_steps)

# Ordered (unshuffled) 70/30 split: the last 30% of rows form the validation set.
X_train, X_valid, y_train, y_valid = train_test_split(
    X3,
    y,
    test_size=0.3,
    shuffle=False,
)
clf_rf4_pipe.fit(X_train.copy(), y_train)

In [None]:
# Score the random-forest pipeline and add the result to the running score table.
new_score = print_score(
    clf_rf4_pipe,
    X_train,
    y_train,
    X_valid,
    y_valid,
    model_name="Random Forest with Feat2 tunning",
)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement. print_score's return type is not visible in this cell, so mirror
# what append([x]) did: use a DataFrame as-is, wrap anything else as one row.
if not isinstance(new_score, pd.DataFrame):
    new_score = pd.DataFrame([new_score])
model_score = pd.concat([model_score, new_score], ignore_index=True)
model_score

In [None]:
# Refit the tuned XGB pipeline on the current X_train / y_train. When the
# notebook is run top to bottom this is the split built from X3 - confirm.
# A copy of X_train is passed to fit.
clf_xgb2_pipe.fit(X_train.copy(), y_train)

In [None]:
# Score the refitted XGB pipeline and add the result to the running score table.
new_score = print_score(
    clf_xgb2_pipe,
    X_train,
    y_train,
    X_valid,
    y_valid,
    model_name="XGB with Feat2 tunning",
)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement. print_score's return type is not visible in this cell, so mirror
# what append([x]) did: use a DataFrame as-is, wrap anything else as one row.
if not isinstance(new_score, pd.DataFrame):
    new_score = pd.DataFrame([new_score])
model_score = pd.concat([model_score, new_score], ignore_index=True)
display_all(model_score)

In [None]:
# LightGBM classifier configured from params_lgb, appended to the shared steps.
clf_lgbm = LGBMClassifier(**params_lgb)
clf_lgbm_steps = [*full_pipeline, ("LGBM Classifier", clf_lgbm)]
clf_lgbm_pipe = Pipeline(steps=clf_lgbm_steps)

clf_lgbm_pipe.fit(X_train.copy(), y_train)

In [None]:
# Score the LightGBM pipeline and add the result to the running score table.
new_score = print_score(
    clf_lgbm_pipe,
    X_train,
    y_train,
    X_valid,
    y_valid,
    model_name="LGBM with Feat2",
)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement. print_score's return type is not visible in this cell, so mirror
# what append([x]) did: use a DataFrame as-is, wrap anything else as one row.
if not isinstance(new_score, pd.DataFrame):
    new_score = pd.DataFrame([new_score])
model_score = pd.concat([model_score, new_score], ignore_index=True)
display_all(model_score)

In [None]:
# Feature names paired with the model's importances: the number=0 group
# (presumably numeric columns) without drop_list, then the number=1 group
# (presumably categorical columns).
col_names = list(features_type(X_valid.drop(drop_list, axis=1), number=0)) + list(
    features_type(X_valid, number=1)
)

fi = rf_feat_importance(
    clf_lgbm_pipe.named_steps["LGBM Classifier"], X_valid[col_names]
)
# Plot the first 20 rows (presumably sorted by importance - confirm).
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=fi["imp"][:20], y=fi["cols"][:20])

In [None]:
# Same importance plot for the tuned XGB model.
fi = rf_feat_importance(clf_xgb2_pipe.named_steps["XGBClassifier"], X_valid[col_names])
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=fi["imp"][:20], y=fi["cols"][:20])

Check the newly created features by plotting them, to see whether they might be important to the model.

In [None]:
# Copy of the feature frame with the target attached as a "win" column.
X_plotting = X3.assign(win=y.copy())

In [None]:
# Draw one feature_plot per listed column of X_plotting.
plotted_features = ("playType", "offenseFormation", "typeDropback", "passResult")
for feature_name in plotted_features:
    feature_plot(X_plotting, feature_name)

### Add cumulative columns
A new brilliant idea: add cumulative columns so that the model has cumulative information about the game so far. This should make it easier to forecast whether the home team will win or lose. In the previous datasets, each row contains information about just one play (there are many plays in one game). All time-related attributes are dropped because tree-based models do not capture trends.

In [None]:
# Build per-game running ("cum") features on plays ordered within each game.
sort_keys = ["gameId", "playId", "gameClock"]
group_keys = ["gameId", "possessionTeam"]

X5 = X_plotting.sort_values(sort_keys).reset_index(drop=True)
# Drop every column whose name starts with "Date".
col_list4 = [c for c in X5.columns if c[:4] != "Date"]
# Take the target from the already-sorted frame so it is guaranteed to share
# X5's row order and index. It was previously re-derived by a second sort
# whose index was not reset.
y = X5["win"]
X6 = X5[col_list4].copy()

# Running sum of every number=0 (presumably numeric) column per game and
# possession team. The deprecated `axis=0` argument is omitted; it was the default.
num_cols = list(features_type(X6, number=0))
for col in num_cols:
    X6[col + "cum"] = X6.groupby(group_keys)[col].cumsum()

# NOTE(review): cumcount numbers the rows inside each group and does not depend
# on the selected column, so every "<col>cum" column created here holds the
# same values. Confirm whether a per-category running count was intended.
cat_cols = list(features_type(X6, number=1))
for col in cat_cols:
    X6[col + "cum"] = X6.groupby(group_keys)[col].cumcount()

# Remove identifiers' running sums, raw team columns and anything derived from
# the target ("win", "wincum") before modelling.
# NOTE(review): X6 has a fresh 0..n-1 index in sorted order and playTeam is
# joined on index; verify playTeam is indexed the same way.
X7 = X6.drop(
    columns=[
        "gameIdcum",
        "playIdcum",
        "possessionTeam",
        "homeTeamAbbr",
        "visitorTeamAbbr",
        "yardlineSide",
        "win",
        "wincum",
        "weekcum",
    ]
).join(playTeam)

In [None]:
# Ordered (unshuffled) 70/30 split of the cumulative feature set.
X_train, X_valid, y_train, y_valid = train_test_split(
    X7,
    y,
    test_size=0.3,
    shuffle=False,
)

# Refit all three pipelines on the new split, each on its own copy of X_train.
for fitted_pipe in (clf_rf4_pipe, clf_xgb2_pipe):
    fitted_pipe.fit(X_train.copy(), y_train)
# Kept as the trailing expression so the notebook still displays its result.
clf_lgbm_pipe.fit(X_train.copy(), y_train)

In [None]:
# Score the three refitted pipelines (same order as before: XGB, RF, LGBM)
# and add one row per model to the running score table.
feat3_runs = [
    (clf_xgb2_pipe, "XGB with Feat3"),
    (clf_rf4_pipe, "Random Forest with Feat3"),
    (clf_lgbm_pipe, "LGBM with Feat3"),
]

feat3_scores = []
for scored_pipe, label in feat3_runs:
    new_score = print_score(
        scored_pipe,
        X_train,
        y_train,
        X_valid,
        y_valid,
        model_name=label,
    )
    # print_score's return type is not visible in this cell, so mirror what
    # append([x]) did: use a DataFrame as-is, wrap anything else as one row.
    if not isinstance(new_score, pd.DataFrame):
        new_score = pd.DataFrame([new_score])
    feat3_scores.append(new_score)

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement and adds all three rows in one call.
model_score = pd.concat([model_score, *feat3_scores], ignore_index=True)
display_all(model_score)

In [None]:
# Feature names paired with the models' importances: the number=0 group
# (presumably numeric columns) without the identifier columns, then the
# number=1 group (presumably categorical columns).
col_names = list(
    features_type(X_valid.drop(["gameId", "playId", "week"], axis=1), number=0)
) + list(features_type(X_valid, number=1))

fi_LGBM = rf_feat_importance(
    clf_lgbm_pipe.named_steps["LGBM Classifier"], X_valid[col_names]
)
fi_RF = rf_feat_importance(
    clf_rf4_pipe.named_steps["RF2 Classifier"], X_valid[col_names]
)
fi_XGB = rf_feat_importance(
    clf_xgb2_pipe.named_steps["XGBClassifier"], X_valid[col_names]
)

# One stacked panel per model, each showing the first 20 rows of its table.
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
plt.figure(figsize=(20, 16))
plt.suptitle("Feature importance", fontsize=22)

plt.subplot(311)
g1 = sns.barplot(x=fi_LGBM["imp"][:20], y=fi_LGBM["cols"][:20])
g1.set_title("LGB model", fontsize=20)
g1.set_xlabel("Importance")

plt.subplot(312)
g2 = sns.barplot(x=fi_RF["imp"][:20], y=fi_RF["cols"][:20])
g2.set_title("Random Forest model", fontsize=20)
g2.set_xlabel("Importance")

plt.subplot(313)
g3 = sns.barplot(x=fi_XGB["imp"][:20], y=fi_XGB["cols"][:20])
g3.set_title("XGB model", fontsize=20)
g3.set_xlabel("Importance")

In [None]:
from sklearn.model_selection import KFold, TimeSeriesSplit, StratifiedKFold
from sklearn.metrics import make_scorer
from hyperopt import hp, fmin, tpe, Trials, space_eval, STATUS_OK, STATUS_RUNNING

import time

# Apply the already-fitted "prep" step once to both splits, so the search
# below works on transformed matrices.
fitted_prep = clf_rf4_pipe.named_steps["prep"]
X_train_trans = fitted_prep.transform(X_train)
X_valid_trans = fitted_prep.transform(X_valid)


def objective(params):
    """Hyperopt objective: negative mean ROC AUC of an XGBoost classifier.

    Trains one ``xgb.XGBClassifier`` per fold of a 3-fold, unshuffled
    ``StratifiedKFold`` over the module-level ``X_train_trans`` / ``y_train``
    and scores each fold with ROC AUC on the positive-class probability.

    Args:
        params: One sample from the hyperopt search space. Keys this function
            does not use are ignored.

    Returns:
        dict with ``"loss"`` (the negated mean fold AUC, because hyperopt
        minimises) and ``"status"`` set to ``STATUS_OK``.
    """
    started = time.time()
    # Pass real numbers, rounded to the precision used before, instead of
    # formatted strings. Only XGBoost parameters are forwarded: "num_leaves",
    # "min_child_samples", "feature_fraction" and "bagging_fraction" are
    # LightGBM names that XGBoost does not use.
    params = {
        "max_depth": int(params["max_depth"]),
        "gamma": round(params["gamma"], 3),
        "subsample": round(params["subsample"], 2),
        "reg_alpha": round(params["reg_alpha"], 3),
        "reg_lambda": round(params["reg_lambda"], 3),
        "learning_rate": round(params["learning_rate"], 3),
        "colsample_bytree": round(params["colsample_bytree"], 3),
    }

    print("\n############## New Run ################")
    print(f"params = {params}")
    FOLDS = 3
    skf = StratifiedKFold(n_splits=FOLDS, shuffle=False)
    fold_scores = []
    for tr_idx, val_idx in skf.split(X_train_trans, y_train):
        # `verbose` is a fit() argument, not a constructor parameter, so it is
        # no longer passed here.
        clf = xgb.XGBClassifier(n_estimators=600, tree_method="gpu_hist", **params)
        X_tr, X_vl = X_train_trans[tr_idx, :], X_train_trans[val_idx, :]
        y_tr, y_vl = y_train.iloc[tr_idx], y_train.iloc[val_idx]
        clf.fit(X_tr, y_tr)
        # Same value the former make_scorer(roc_auc_score, needs_proba=True)
        # produced for a binary target, without the deprecated needs_proba flag.
        fold_scores.append(roc_auc_score(y_vl, clf.predict_proba(X_vl)[:, 1]))

    mean_auc = sum(fold_scores) / FOLDS
    elapsed = time.time() - started
    print(f"Total Time Run: {round(elapsed / 60,2)}")
    print(f"Mean ROC_AUC: {mean_auc}")
    return {"loss": -mean_auc, "status": STATUS_OK}


# Search space sampled by hyperopt for each evaluation of `objective`.
# NOTE(review): "num_leaves", "min_child_samples", "feature_fraction" and
# "bagging_fraction" are LightGBM parameter names; confirm whether they belong
# in a search that trains XGBoost models.
space = {
    "max_depth": hp.choice("max_depth", list(range(3, 7))),
    "reg_alpha": hp.uniform("reg_alpha", 0.01, 0.4),
    "reg_lambda": hp.uniform("reg_lambda", 0.01, 0.4),
    "learning_rate": hp.uniform("learning_rate", 0.01, 0.4),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.4, 0.9),
    "gamma": hp.uniform("gamma", 0.01, 0.7),
    "num_leaves": hp.choice("num_leaves", list(range(20, 250, 10))),
    "min_child_samples": hp.choice("min_child_samples", list(range(100, 250, 10))),
    "subsample": hp.choice("subsample", [0.2, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
    "feature_fraction": hp.uniform("feature_fraction", 0.4, 0.8),
    "bagging_fraction": hp.uniform("bagging_fraction", 0.4, 0.9),
}

# Pass the Trials object to fmin so the evaluation history is actually
# recorded; it was previously created but never used.
trials = Trials()
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=250,
    trials=trials,
)

# fmin reports hp.choice results as indices; space_eval maps them back to values.
best_params = space_eval(space, best)

In [None]:
# Display the hyper-parameter values selected by the hyperopt search.
best_params

In [None]:
# Final model built from the hyperopt result. The search scored every
# candidate with n_estimators=600, so the same value is used here; otherwise
# the tuned learning rate would be paired with the estimator's default number
# of trees. An n_estimators entry in best_params, if ever added, takes precedence.
# NOTE(review): the search also used tree_method="gpu_hist"; it is not forced
# here so this cell still runs without a GPU.
clf_xgbg = XGBClassifier(**{"n_estimators": 600, **best_params})
# Work on a copy so the shared step list is not mutated by the append below.
clf_xgbg_steps = full_pipeline.copy()
clf_xgbg_steps.append(("XGBClassifier", clf_xgbg))

clf_xgbg_pipe = Pipeline(steps=clf_xgbg_steps)

clf_xgbg_pipe.fit(X_train.copy(), y_train)

In [None]:
# Score the hyperopt-tuned XGB pipeline and add the result to the running score table.
new_score = print_score(
    clf_xgbg_pipe,
    X_train,
    y_train,
    X_valid,
    y_valid,
    model_name="XGB hyperopt with Feat3",
)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement. print_score's return type is not visible in this cell, so mirror
# what append([x]) did: use a DataFrame as-is, wrap anything else as one row.
if not isinstance(new_score, pd.DataFrame):
    new_score = pd.DataFrame([new_score])
model_score = pd.concat([model_score, new_score], ignore_index=True)
display_all(model_score)

In [None]:
# Importance table for the hyperopt-tuned XGB model.
fi_XGB_hyp = rf_feat_importance(
    clf_xgbg_pipe.named_steps["XGBClassifier"], X_valid[col_names]
)

plt.figure()
# Plot the first 20 rows (presumably sorted by importance - confirm).
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=fi_XGB_hyp["imp"][:20], y=fi_XGB_hyp["cols"][:20])

In [None]:
import scipy
from scipy import stats  # explicit: `import scipy` alone need not load scipy.stats
from scipy.cluster import hierarchy as hc

# Hierarchical clustering of the transformed features by Spearman rank
# correlation. Rounding keeps the distance matrix symmetric with a zero
# diagonal so that squareform accepts it.
corr = np.round(stats.spearmanr(X_train_trans).correlation, 4)
corr_condensed = hc.distance.squareform(1 - corr)
z = hc.linkage(corr_condensed, method="average")
fig = plt.figure(figsize=(16, 30))
# Label the leaves with col_names, the feature list that the importance cells
# pair with models fitted on this same transformed matrix. X_train.columns
# still contains the identifier columns excluded from col_names, so it does
# not line up with the columns of X_train_trans.
# NOTE(review): assumes col_names matches the transformer's output order - confirm.
dendrogram = hc.dendrogram(
    z, labels=col_names, orientation="left", leaf_font_size=16
)
plt.show()

<img src="https://static0.thesportsterimages.com/wordpress/wp-content/uploads/2018/06/NFL11.jpg?q=50&fit=crop&w=740&h=370" width="850px"/>