In [2]:
import os
import pandas as pd
import numpy as np
from random import shuffle
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.model_selection import (
    GridSearchCV, GroupShuffleSplit, cross_val_score)
from sklearn.metrics import accuracy_score

# Train on Word Embeddings
## Get Embeddings

In [3]:
import re
import fasttext
def preprocess_tweet(tweet):
    tweet = tweet.lower()
    # remove links
    tweet = re.sub(r"http\S+|https\S+|www\S+", '', tweet)
    # remove all non-alphanumeric characters, except for hashtags, mentions, emojis, the dollar sign, and the ampersand
    tweet = re.sub(r"[^\w\s#@\&\$\u00a9\u00ae\u2000-\u3300\ud83c\ud000-\udfff\ud83d\ud000-\udfff\ud83e\ud000-\udfff]|_+", '', tweet)
    # isolate emojis
    tweet = re.sub(r"(\ud83c\ud000-\udfff\ud83d\ud000-\udfff\ud83e\ud000-\udfff)", r" \1 ", tweet)
    return tweet
def get_embedding(tweet, embedding_model):
    tweet = preprocess_tweet(tweet)
    tweet = re.sub("\n|\r", ' ', tweet)
    return embedding_model.get_sentence_vector(tweet)

In [4]:
embedding_model = fasttext.load_model("data/fasttext_model.bin")

In [5]:
def load_data(folder):
    data = []
    for filename in os.listdir(folder):
        df = pd.read_csv(os.path.join(folder, filename))
        df.drop_duplicates(subset="Tweet", inplace=True)
        data.append(df)
    return pd.concat(data)

In [6]:
train_df = load_data("data/train_tweets")
test_df = load_data("data/eval_tweets")

In [7]:
train_df["Embedding"] = train_df["Tweet"].apply(lambda x: get_embedding(x, embedding_model))
test_df["Embedding"] = test_df["Tweet"].apply(lambda x: get_embedding(x, embedding_model))

In [8]:
train_df_backup = train_df.copy()
test_df_backup = test_df.copy()

In [97]:
train_df = train_df_backup.copy()
test_df = test_df_backup.copy()

In [108]:
def get_features_embedding(df, dim=100, label=True):
    df["MatchID"] = df["MatchID"].apply(int)
    df["PeriodID"] = df["PeriodID"].apply(int)
    df.sort_values(by=["MatchID", "PeriodID"], inplace=True)

    features_df = pd.DataFrame(df["ID"].unique())
    features_df.rename(columns={0: "ID"}, inplace=True)
    if label:
        features_df["EventType"] = (df.groupby("ID", sort=False)
                                    ["EventType"].first().values)

    embed_avgs = np.empty((0, dim))
    embed_stds = np.empty((0, dim))
    embed_prev = np.empty((0, dim))
    embed_next = np.empty((0, dim))
    last_period = df.groupby("MatchID", sort=False)["PeriodID"].last()
    grouped_df = df.groupby("ID", sort=False)
    for (i, group) in enumerate(grouped_df.groups):
        group_df = grouped_df.get_group(group).reset_index()
        embeddings = np.array(group_df["Embedding"].to_list())
        embed_avg = np.mean(embeddings, axis=0)
        embed_std = np.std(embeddings, axis=0)
        embed_avgs = np.vstack((embed_avgs, embed_avg))
        embed_stds = np.vstack((embed_stds, embed_std))
        if group_df["PeriodID"][0] == 0:
            count = 0
            embed_prev = np.vstack((embed_prev, np.zeros(dim)))
        else:
            count += 1
            embed_next = np.vstack((embed_next, embed_avg))
        if group_df["PeriodID"][0] == last_period[group_df["MatchID"][0]]:
            embed_next = np.vstack((embed_next, np.zeros(dim)))
            match_embed = embed_avgs[i-count:i+1]
            match_embed_avg = np.mean(match_embed, axis=0)
            match_embed_std = np.std(match_embed, axis=0)
            embed_avgs[i-count:i+1] -= match_embed_avg
            embed_stds[i-count:i+1] -= match_embed_std
            embed_prev[i-count+1:i+1] -= match_embed_avg
            embed_next[i-count:i] -= match_embed_avg
        else:
            embed_prev = np.vstack((embed_prev, embed_avg))
    features_df["EmbedAVG"] = list(embed_avgs)
    features_df["EmbedSTD"] = list(embed_stds)
    features_df["EmbedPrev"] = list(embed_prev)
    features_df["EmbedNext"] = list(embed_next)

    features_df["MatchID"] = (df.groupby("ID", sort=False)["MatchID"]
                              .first().to_numpy())
    tweet_per_match = (df.groupby("MatchID", sort=False)["Tweet"]
                       .count().reset_index())
    match_to_idx = {match: idx for idx, match
                    in enumerate(tweet_per_match["MatchID"])}
    match_tweet_count = features_df["MatchID"].apply(
        lambda x: tweet_per_match["Tweet"][match_to_idx[x]])
    features_df["TweetRatio"] = (df.groupby("ID", sort=False)["Tweet"]
                                 .count().values / match_tweet_count)

    return features_df

In [109]:
train_df = get_features_embedding(train_df)
test_df = get_features_embedding(test_df, label=False)

In [111]:
X_train = np.hstack((np.array(train_df["EmbedAVG"].to_list()),
                     np.array(train_df["EmbedSTD"].to_list()),
                     np.array(train_df["EmbedPrev"].to_list()),
                     np.array(train_df["EmbedNext"].to_list()),
                     train_df["TweetRatio"].to_numpy().reshape(-1, 1)))
y_train = train_df["EventType"].to_numpy()

X_test = np.hstack((np.array(test_df["EmbedAVG"].to_list()),
                    np.array(test_df["EmbedSTD"].to_list()),
                    np.array(test_df["EmbedPrev"].to_list()),
                    np.array(test_df["EmbedNext"].to_list()),
                    test_df["TweetRatio"].to_numpy().reshape(-1, 1)))

# standardize features
mean = np.mean(X_train, axis=0)
std = np.std(X_train, axis=0)
X_train = (X_train - mean) / std
X_test = (X_test - mean) / std

## Model Selection

In [112]:
from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

In [113]:
names = ["Logistic Regression", "SVC", "Linear SVC",
         "QDA", "KNN", "Gaussian NB", "Random Forest",
         "AdaBoost", "Gradient Boosting", "MLP", "XGBoost"]
models = [
    LogisticRegression(max_iter=1000),
    SVC(),
    LinearSVC(max_iter=10000),
    QuadraticDiscriminantAnalysis(),
    KNeighborsClassifier(),
    GaussianNB(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    MLPClassifier((100, 100), max_iter=1000),
    XGBClassifier(objective="binary:logistic", booster="gblinear",
                  eval_metric=accuracy_score)
]

In [114]:
for name, model in zip(names, models):
    scores = cross_val_score(
        model, X_train, y_train, groups=train_df["MatchID"].values,
        scoring="accuracy", n_jobs=-1, cv=GroupShuffleSplit(random_state=42))
    print(name, ": mean score = ", scores.mean(),
          ", score std = ", scores.std(), sep = '')

Logistic Regression: mean score = 0.6785622408817256, score std = 0.04023852054014571
SVC: mean score = 0.724685729065036, score std = 0.05599299605179374
Linear SVC: mean score = 0.6674306606539704, score std = 0.040091038945186036
QDA: mean score = 0.5582828732903635, score std = 0.035320658316686314
KNN: mean score = 0.6866242218449349, score std = 0.05153569357039164
Gaussian NB: mean score = 0.6592016276276739, score std = 0.06709690019912476
Random Forest: mean score = 0.7363986689099044, score std = 0.035725206628006316




AdaBoost: mean score = 0.6949830092674395, score std = 0.033545236624923626
Gradient Boosting: mean score = 0.747500749026266, score std = 0.032409581452497484
MLP: mean score = 0.7163077614485783, score std = 0.0502150802276802
XGBoost: mean score = 0.6763845846553326, score std = 0.0476669092839954


## Hyperparameter Tuning

In [115]:
scores = cross_val_score(
    XGBClassifier(objective="binary:logistic", eval_metric=accuracy_score,
                  tree_method="exact", booster="gbtree", random_state=42),
    X_train, y_train, groups=train_df["MatchID"].to_numpy(),
    scoring="accuracy", n_jobs=-1, cv=GroupShuffleSplit(random_state=42))
print(f"Accuracy: {scores.mean()} (+/- {scores.std() * 2})")

Accuracy: 0.7469339474068326 (+/- 0.06826551777702064)


In [117]:
param_grid = {
    "max_depth": np.linspace(3, 8, 6, dtype=np.int64),
    "min_child_weight": np.linspace(1, 10, 10, dtype=np.int64)
}
grid_search = GridSearchCV(
    XGBClassifier(objective="binary:logistic", tree_method="exact",
                  booster="gbtree", eval_metric=accuracy_score, random_state=42),
    param_grid, scoring="accuracy", n_jobs=-1,
    cv=GroupShuffleSplit(random_state=42))
grid_search.fit(X_train, y_train, groups=train_df["MatchID"].to_numpy())
print("Best accuracy: ", grid_search.best_score_, "+/-",
      2 * grid_search.cv_results_["std_test_score"][grid_search.best_index_])
grid_search.best_params_

Best accuracy:  0.7528731239132718 +/- 0.058693177475172185


{'max_depth': 6, 'min_child_weight': 4}

In [118]:
param_grid = {
    "subsample": np.linspace(0.5, 1.0, 6),
    "colsample_bytree": np.linspace(0.5, 1.0, 6)
}
grid_search = GridSearchCV(
    XGBClassifier(objective="binary:logistic", eval_metric=accuracy_score,
                  tree_method="exact", booster="gbtree", random_state=42,
                  max_depth=6, min_child_weight=4),
    param_grid, scoring="accuracy", n_jobs=-1,
    cv=GroupShuffleSplit(random_state=42))
grid_search.fit(X_train, y_train, groups=train_df["MatchID"].to_numpy())
print("Best accuracy: ", grid_search.best_score_, "+/-",
      2 * grid_search.cv_results_["std_test_score"][grid_search.best_index_])
grid_search.best_params_

Best accuracy:  0.7528731239132718 +/- 0.058693177475172185


{'colsample_bytree': 1.0, 'subsample': 1.0}

In [119]:
param_grid = {
    "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3],
    "n_estimators": [50, 100, 300, 500, 800, 1000]
}
grid_search = GridSearchCV(
    XGBClassifier(objective="binary:logistic", eval_metric=accuracy_score,
                  tree_method="exact", booster="gbtree", random_state=42,
                  max_depth=6, min_child_weight=4),
    param_grid, scoring="accuracy", n_jobs=-1,
    cv=GroupShuffleSplit(random_state=42))
grid_search.fit(X_train, y_train, groups=train_df["MatchID"].to_numpy())
print("Best accuracy: ", grid_search.best_score_, "+/-",
      2 * grid_search.cv_results_["std_test_score"][grid_search.best_index_])
grid_search.best_params_

Best accuracy:  0.7528731239132718 +/- 0.058693177475172185


{'learning_rate': 0.3, 'n_estimators': 100}

In [120]:
param_grid = {
    "reg_alpha": np.logspace(-1, 2, 4),
    "reg_lambda": np.logspace(-1, 2, 4)
}
grid_search = GridSearchCV(
    XGBClassifier(objective="binary:logistic", eval_metric=accuracy_score,
                  tree_method="exact", booster="gbtree", random_state=42,
                  max_depth=6, min_child_weight=4,
                  learning_rate=0.3, n_estimators=100),
    param_grid, scoring="accuracy", n_jobs=-1,
    cv=GroupShuffleSplit(random_state=42))
grid_search.fit(X_train, y_train, groups=train_df["MatchID"].to_numpy())
print("Best accuracy: ", grid_search.best_score_, "+/-",
      2 * grid_search.cv_results_["std_test_score"][grid_search.best_index_])
grid_search.best_params_

Best accuracy:  0.7541821657486294 +/- 0.0497319277381883


{'reg_alpha': 10.0, 'reg_lambda': 100.0}

In [122]:
param_grid = {
    "reg_alpha": np.linspace(5, 30, 6),
    "reg_lambda": np.linspace(50, 300, 6)
}
grid_search = GridSearchCV(
    XGBClassifier(objective="binary:logistic", eval_metric=accuracy_score,
                  tree_method="exact", booster="gbtree", random_state=42,
                  max_depth=6, min_child_weight=4,
                  learning_rate=0.3, n_estimators=100),
    param_grid, scoring="accuracy", n_jobs=-1,
    cv=GroupShuffleSplit(random_state=42))
grid_search.fit(X_train, y_train, groups=train_df["MatchID"].to_numpy())
print("Best accuracy: ", grid_search.best_score_, "+/-",
      2 * grid_search.cv_results_["std_test_score"][grid_search.best_index_])
grid_search.best_params_

Best accuracy:  0.7647215926987225 +/- 0.052761911255428146


{'reg_alpha': 25.0, 'reg_lambda': 50.0}

In [None]:
scores = cross_val_score(
    grid_search.best_estimator_, X_train, y_train,
    groups=train_df["MatchID"].to_numpy(), scoring="accuracy",
    n_jobs=-1, cv=GroupShuffleSplit(random_state=42))
print(f"Accuracy: {scores.mean()} +/- {scores.std() * 2}")

Accuracy: 0.7335096169850989 (+/- 0.048298032901688645)


## Predict

In [123]:
xgbclf = XGBClassifier(
    objective="binary:logistic", eval_metric=accuracy_score,
    tree_method="exact", booster="gbtree", random_state=42,
    max_depth=6, min_child_weight=4, # subsample=0.7, colsample_bytree=0.5,
    reg_alpha=25.0, reg_lambda=50.0, learning_rate=0.3, n_estimators=100)
xgbclf.fit(X_train, y_train)

In [124]:
xgbclf.feature_importances_

array([5.7936427e-03, 6.3698746e-02, 1.7936929e-03, 7.1609643e-04,
       1.1503189e-02, 0.0000000e+00, 1.2814348e-02, 0.0000000e+00,
       1.3128836e-03, 0.0000000e+00, 1.4437318e-03, 1.0033710e-02,
       2.2017707e-03, 6.8314904e-03, 3.4913097e-03, 2.2605036e-03,
       0.0000000e+00, 5.2046031e-03, 1.0420449e-02, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 5.3930235e-05,
       1.4216763e-03, 0.0000000e+00, 4.7631650e-03, 0.0000000e+00,
       1.2580832e-02, 1.0736532e-03, 2.5150136e-03, 0.0000000e+00,
       4.6474198e-03, 2.6539455e-03, 0.0000000e+00, 1.9949262e-03,
       1.1283332e-02, 0.0000000e+00, 3.1016346e-03, 2.2061539e-03,
       1.5957153e-03, 6.0652304e-03, 0.0000000e+00, 1.2113618e-03,
       0.0000000e+00, 0.0000000e+00, 1.7270067e-03, 1.8506893e-03,
       1.2026102e-03, 4.2923670e-03, 7.5421385e-06, 1.8395314e-02,
       1.9819008e-03, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 1.6375838e-02, 0.0000000e+00, 0.0000000e

In [125]:
test_df_copy = test_df.copy()

In [95]:
test_df = test_df_copy.copy()

In [127]:
test_df["EventType"] = xgbclf.predict(X_test)
test_df.drop(columns=["MatchID", "TweetRatio", "EmbedAVG",
                      "EmbedSTD", "EmbedPrev", "EmbedNext"],
             inplace=True)
test_df["MatchID"] = test_df["ID"].apply(lambda x: int(x.split("_")[0]))
test_df["PeriodID"] = test_df["ID"].apply(lambda x: int(x.split("_")[1]))
test_df.sort_values(by=["MatchID", "PeriodID"], inplace=True)
test_df.drop(columns=["MatchID", "PeriodID"], inplace=True)
test_df["EventType"] = test_df["EventType"].apply(lambda x: float(x))
test_df.to_csv("data/xgboost_pred.csv", index=False)

# Train on Tweet Classifier Output
## Extract features

In [3]:
def get_features_confidence(df, label=True):
    df.sort_values(by="ID", inplace=True)
    features_df = df.groupby("ID").agg({"Confidence": "median"}).reset_index()
    features_df["ConfMedian"] = features_df["Confidence"]
    features_df["ConfRange"] = df.groupby("ID")["Confidence"].apply(
        lambda conf: np.percentile(conf, 75) - np.percentile(conf, 25)
        ).reset_index()["Confidence"]
    df["MatchID"] = df["ID"].apply(lambda x: x.split("_")[0])
    features_df["MatchID"] = (df.groupby("ID")["MatchID"].first()
                              .reset_index()["MatchID"])
    tweet_per_match = df.groupby("MatchID")["Confidence"].count().reset_index()
    match_to_idx = {match: idx for idx, match
                    in enumerate(tweet_per_match["MatchID"])}
    match_tweet_count = df["MatchID"].apply(
        lambda x: tweet_per_match["Confidence"][match_to_idx[x]])
    features_df["TweetRatio"] = (
        df.groupby("ID")["Confidence"].count().reset_index()["Confidence"]
        / match_tweet_count)
    features_df["ID"] = features_df["ID"].unique()
    if label:
        features_df["EventType"] = \
            df.groupby("ID")["EventType"].first().reset_index()["EventType"]
    features_df.drop(columns=["Confidence"], inplace=True)
    return features_df

In [15]:
train_df = get_features_confidence(
    pd.read_csv("data/distilbert_single_train_full.csv"))
test_df = get_features_confidence(
    pd.read_csv("data/distilbert_single_pred_full.csv"), label=False)
X_train = train_df.drop(columns=["ID", "EventType", "MatchID"]).to_numpy()
y_train = train_df["EventType"].to_numpy()
X_test = test_df.drop(columns=["ID", "MatchID"]).to_numpy()

# standardize features
mean = np.mean(X_train, axis=0)
std = np.std(X_train, axis=0)
X_train = (X_train - mean) / std
X_test = (X_test - mean) / std

## Model Selection

In [16]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

In [17]:
names = ["Logistic Regression", "SVC", "Linear SVC",
         "QDA", "KNN", "Gaussian NB", "Random Forest",
         "AdaBoost", "Gradient Boosting", "MLP", "XGBoost"]
models = [
    LogisticRegression(max_iter=1000),
    SVC(),
    LinearSVC(max_iter=10000),
    QuadraticDiscriminantAnalysis(),
    KNeighborsClassifier(),
    GaussianNB(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    MLPClassifier((400, 400), max_iter=1000),
    XGBClassifier(objective="binary:logistic", booster="gblinear",
                  eval_metric=accuracy_score)
]

In [18]:
for name, model in zip(names, models):
    scores = cross_val_score(
        model, X_train, y_train, groups=train_df["MatchID"].values,
        scoring="accuracy", n_jobs=-1, cv=GroupShuffleSplit(random_state=42))
    print(name, ": mean score = ", scores.mean(),
          ", score std = ", scores.std(), sep = '')

Logistic Regression: mean score = 0.8507467432599822, score std = 0.026361704709923876
SVC: mean score = 0.8684004217620341, score std = 0.009394563764481322
Linear SVC: mean score = 0.8471367567552589, score std = 0.02633978156925499
QDA: mean score = 0.8287924637054649, score std = 0.026200792338179027
KNN: mean score = 0.8393576580845575, score std = 0.017835676482537394
Gaussian NB: mean score = 0.8299183909861529, score std = 0.02243286980749796
Random Forest: mean score = 0.8574941321872046, score std = 0.027888747829485413
AdaBoost: mean score = 0.8748695502775261, score std = 0.012172318590108724




Gradient Boosting: mean score = 0.8823005567148007, score std = 0.035650222770698806
MLP: mean score = 0.8651652686446807, score std = 0.03478798230189002
XGBoost: mean score = 0.8484727891439228, score std = 0.023589494869869183


## Hyperparameter Tuning

In [19]:
scores = cross_val_score(
    XGBClassifier(objective="binary:logistic", eval_metric=accuracy_score,
                  tree_method="exact", booster="gbtree", random_state=42),
    X_train, y_train, groups=train_df["MatchID"].to_numpy(),
    scoring="accuracy", n_jobs=-1, cv=GroupShuffleSplit(random_state=42))
print(f"Accuracy: {scores.mean()} (+/- {scores.std() * 2})")

Accuracy: 0.8272435412492692 (+/- 0.04595405021029534)


In [22]:
param_grid = {
    "max_depth": np.linspace(1, 10, 10, dtype=np.int64),
    "min_child_weight": np.linspace(1, 10, 10, dtype=np.int64)
}
grid_search = GridSearchCV(
    XGBClassifier(objective="binary:logistic", tree_method="exact",
                  booster="gbtree", eval_metric=accuracy_score, random_state=42),
    param_grid, scoring="accuracy", n_jobs=-1,
    cv=GroupShuffleSplit(random_state=42))
grid_search.fit(X_train, y_train, groups=train_df["MatchID"].to_numpy())
print("Best accuracy: ", grid_search.best_score_, "+/-",
      2 * grid_search.cv_results_["std_test_score"][grid_search.best_index_])
grid_search.best_params_

Best accuracy:  0.8836917479292925 +/- 0.042290842415051356


{'max_depth': 1, 'min_child_weight': 5}

In [43]:
param_grid = {
    "subsample": np.linspace(0.5, 1.0, 6),
    "colsample_bytree": np.linspace(0.5, 1.0, 6)
}
grid_search = GridSearchCV(
    XGBClassifier(objective="binary:logistic", eval_metric=accuracy_score,
                  tree_method="exact", booster="gbtree", random_state=42,
                  max_depth=1, min_child_weight=5),
    param_grid, scoring="accuracy", n_jobs=-1,
    cv=GroupShuffleSplit(random_state=42))
grid_search.fit(X_train, y_train, groups=train_df["MatchID"].to_numpy())
print("Best accuracy: ", grid_search.best_score_, "+/-",
      2 * grid_search.cv_results_["std_test_score"][grid_search.best_index_])
grid_search.best_params_

Best accuracy:  0.8904307819778479 +/- 0.045327490676506724


{'colsample_bytree': 0.5, 'subsample': 0.9}

In [44]:
param_grid = {
    "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3],
    "n_estimators": [50, 100, 300, 500, 800, 1000]
}
grid_search = GridSearchCV(
    XGBClassifier(objective="binary:logistic", eval_metric=accuracy_score,
                  tree_method="exact", booster="gbtree", random_state=42,
                  max_depth=1, min_child_weight=5,
                  subsample=0.9, colsample_bytree=0.5),
    param_grid, scoring="accuracy", n_jobs=-1,
    cv=GroupShuffleSplit(random_state=42))
grid_search.fit(X_train, y_train, groups=train_df["MatchID"].to_numpy())
print("Best accuracy: ", grid_search.best_score_, "+/-",
      2 * grid_search.cv_results_["std_test_score"][grid_search.best_index_])
grid_search.best_params_

Best accuracy:  0.8970732430507639 +/- 0.0643504313344076


{'learning_rate': 0.3, 'n_estimators': 50}

In [51]:
param_grid = {
    "reg_alpha": np.logspace(-4, 3, 8),
    "reg_lambda": np.logspace(-4, 3, 8)
}
grid_search = GridSearchCV(
    XGBClassifier(objective="binary:logistic", eval_metric=accuracy_score,
                  tree_method="exact", booster="gbtree", random_state=42,
                  max_depth=1, min_child_weight=5,
                  subsample=0.9, colsample_bytree=0.5),
    param_grid, scoring="accuracy", n_jobs=-1,
    cv=GroupShuffleSplit(random_state=42))
grid_search.fit(X_train, y_train, groups=train_df["MatchID"].to_numpy())
print("Best accuracy: ", grid_search.best_score_, "+/-",
      2 * grid_search.cv_results_["std_test_score"][grid_search.best_index_])
grid_search.best_params_

Best accuracy:  0.9018910568159461 +/- 0.08306698066075276


{'reg_alpha': 10.0, 'reg_lambda': 0.0001}

In [None]:
param_grid = {
    "reg_alpha": np.linspace(5, 30, 6),
}
grid_search = GridSearchCV(
    XGBClassifier(objective="binary:logistic", eval_metric=accuracy_score,
                  tree_method="exact", booster="gbtree", random_state=42,
                  max_depth=1, min_child_weight=5,
                  subsample=0.9, colsample_bytree=0.5),
    param_grid, scoring="accuracy", n_jobs=-1,
    cv=GroupShuffleSplit(random_state=42))
grid_search.fit(X_train, y_train, groups=train_df["MatchID"].to_numpy())
print("Best accuracy: ", grid_search.best_score_, "+/-",
      2 * grid_search.cv_results_["std_test_score"][grid_search.best_index_])
grid_search.best_params_

Best accuracy:  0.9018910568159461 +/- 0.08306698066075276


{'reg_alpha': 10.0}

In [77]:
scores = cross_val_score(
    grid_search.best_estimator_, X_train, y_train,
    groups=train_df["MatchID"].to_numpy(), scoring="accuracy",
    n_jobs=-1, cv=GroupShuffleSplit(random_state=42))
print(f"Accuracy: {scores.mean()} (+/- {scores.std() * 2})")

Accuracy: 0.9018910568159461 (+/- 0.08306698066075276)


## Predict

In [59]:
xgbclf = XGBClassifier(
    objective="binary:logistic", eval_metric=accuracy_score,
    tree_method="exact", booster="gbtree", random_state=42,
    max_depth=1, min_child_weight=5, subsample=0.9, colsample_bytree=0.5)
xgbclf.fit(X_train, y_train)

In [60]:
xgbclf.feature_importances_

array([0.80947196, 0.10028598, 0.09024208], dtype=float32)

In [61]:
test_df_copy = test_df.copy()

In [73]:
test_df = test_df_copy.copy()

In [74]:
test_df["EventType"] = xgbclf.predict(X_test)
# test_df.drop(columns=["ConfMedian", "ConfRange", "TweetRatio"], inplace=True)
test_df["MatchID"] = test_df["ID"].apply(lambda x: int(x.split("_")[0]))
test_df["PeriodID"] = test_df["ID"].apply(lambda x: int(x.split("_")[1]))
test_df.sort_values(by=["MatchID", "PeriodID"], inplace=True)
test_df.drop(columns=["MatchID", "PeriodID"], inplace=True)
test_df.to_csv("data/distilbert_xgboost_pred.csv", index=False)

In [69]:
train_df_copy = train_df.copy()

In [76]:
train_df = train_df_copy.copy()

In [70]:
train_df.drop(columns=["ConfMedian", "ConfRange", "TweetRatio"], inplace=True)
train_df["MatchID"] = train_df["ID"].apply(lambda x: int(x.split("_")[0]))
train_df["PeriodID"] = train_df["ID"].apply(lambda x: int(x.split("_")[1]))
train_df.sort_values(by=["MatchID", "PeriodID"], inplace=True)
train_df.drop(columns=["MatchID", "PeriodID"], inplace=True)
train_df.to_csv("data/distilbert_xgboost_pred.csv", index=False)