In [None]:
import os
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import (
    GridSearchCV, GroupShuffleSplit, cross_val_score)
from sklearn.metrics import accuracy_score

# Train on Word Embeddings
## Get Embeddings

In [28]:
import re
import gensim.downloader
def preprocess_tweet(tweet):
    tweet = tweet.lower()
    # remove links
    tweet = re.sub(r"http\S+|https\S+|www\S+", '', tweet)
    # remove all punctuation
    tweet = re.sub(r"[^\w\s]|_+", ' ', tweet)
    return tweet
def get_embedding(tweet, embedding_model, vector_size=100):
    tweet = preprocess_tweet(tweet)
    words = tweet.split()
    word_vectors = [embedding_model[word] for word in words if word in embedding_model]
    if not word_vectors:  # If no words in the tweet are in the vocabulary, return a zero vector
        return np.zeros(vector_size)
    return np.mean(word_vectors, axis=0)

In [14]:
embedding_model = gensim.downloader.load("glove-twitter-100")

In [4]:
def load_data(folder):
    data = []
    for filename in os.listdir(folder):
        df = pd.read_csv(os.path.join(folder, filename))
        df.drop_duplicates(subset="Tweet", inplace=True)
        data.append(df)
    return pd.concat(data)

In [5]:
train_df = load_data("data/train_tweets")
test_df = load_data("data/eval_tweets")

In [29]:
train_df["Embedding"] = train_df["Tweet"].apply(lambda x: get_embedding(x, embedding_model))
test_df["Embedding"] = test_df["Tweet"].apply(lambda x: get_embedding(x, embedding_model))

In [30]:
train_df_backup = train_df.copy()
test_df_backup = test_df.copy()

In [97]:
train_df = train_df_backup.copy()
test_df = test_df_backup.copy()

In [31]:
def get_features_embedding(df, dim=100, label=True):
    df["MatchID"] = df["MatchID"].apply(int)
    df["PeriodID"] = df["PeriodID"].apply(int)
    df.sort_values(by=["MatchID", "PeriodID"], inplace=True)

    features_df = pd.DataFrame(df["ID"].unique())
    features_df.rename(columns={0: "ID"}, inplace=True)
    if label:
        features_df["EventType"] = (df.groupby("ID", sort=False)
                                    ["EventType"].first().values)

    embed_avgs = np.empty((0, dim))
    embed_stds = np.empty((0, dim))
    embed_prev = np.empty((0, dim))
    embed_next = np.empty((0, dim))
    last_period = df.groupby("MatchID", sort=False)["PeriodID"].last()
    grouped_df = df.groupby("ID", sort=False)
    for (i, group) in enumerate(grouped_df.groups):
        group_df = grouped_df.get_group(group).reset_index()
        embeddings = np.array(group_df["Embedding"].to_list())
        embed_avg = np.mean(embeddings, axis=0)
        embed_std = np.std(embeddings, axis=0)
        embed_avgs = np.vstack((embed_avgs, embed_avg))
        embed_stds = np.vstack((embed_stds, embed_std))
        if group_df["PeriodID"][0] == 0:
            count = 0
            embed_prev = np.vstack((embed_prev, np.zeros(dim)))
        else:
            count += 1
            embed_next = np.vstack((embed_next, embed_avg))
        if group_df["PeriodID"][0] == last_period[group_df["MatchID"][0]]:
            embed_next = np.vstack((embed_next, np.zeros(dim)))
            match_embed = embed_avgs[i-count:i+1]
            match_embed_avg = np.mean(match_embed, axis=0)
            match_embed_std = np.std(match_embed, axis=0)
            embed_avgs[i-count:i+1] -= match_embed_avg
            embed_stds[i-count:i+1] -= match_embed_std
            embed_prev[i-count+1:i+1] -= match_embed_avg
            embed_next[i-count:i] -= match_embed_avg
        else:
            embed_prev = np.vstack((embed_prev, embed_avg))
    features_df["EmbedAVG"] = list(embed_avgs)
    features_df["EmbedSTD"] = list(embed_stds)
    features_df["EmbedPrev"] = list(embed_prev)
    features_df["EmbedNext"] = list(embed_next)

    features_df["MatchID"] = (df.groupby("ID", sort=False)["MatchID"]
                              .first().to_numpy())
    tweet_per_match = (df.groupby("MatchID", sort=False)["Tweet"]
                       .count().reset_index())
    match_to_idx = {match: idx for idx, match
                    in enumerate(tweet_per_match["MatchID"])}
    match_tweet_count = features_df["MatchID"].apply(
        lambda x: tweet_per_match["Tweet"][match_to_idx[x]])
    features_df["TweetRatio"] = (df.groupby("ID", sort=False)["Tweet"]
                                 .count().values / match_tweet_count)

    return features_df

In [32]:
train_df = get_features_embedding(train_df)
test_df = get_features_embedding(test_df, label=False)

In [33]:
X_train = np.hstack((np.array(train_df["EmbedAVG"].to_list()),
                     np.array(train_df["EmbedSTD"].to_list()),
                     np.array(train_df["EmbedPrev"].to_list()),
                     np.array(train_df["EmbedNext"].to_list()),
                     train_df["TweetRatio"].to_numpy().reshape(-1, 1)))
y_train = train_df["EventType"].to_numpy()

X_test = np.hstack((np.array(test_df["EmbedAVG"].to_list()),
                    np.array(test_df["EmbedSTD"].to_list()),
                    np.array(test_df["EmbedPrev"].to_list()),
                    np.array(test_df["EmbedNext"].to_list()),
                    test_df["TweetRatio"].to_numpy().reshape(-1, 1)))

# standardize features
mean = np.mean(X_train, axis=0)
std = np.std(X_train, axis=0)
X_train = (X_train - mean) / std
X_test = (X_test - mean) / std

## Model Selection

In [34]:
from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

In [35]:
names = ["Logistic Regression", "SVC", "Linear SVC",
         "QDA", "KNN", "Gaussian NB", "Random Forest",
         "AdaBoost", "Gradient Boosting", "MLP", "XGBoost"]
models = [
    LogisticRegression(max_iter=1000),
    SVC(),
    LinearSVC(max_iter=10000),
    QuadraticDiscriminantAnalysis(),
    KNeighborsClassifier(),
    GaussianNB(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    MLPClassifier((100, 100), max_iter=1000),
    XGBClassifier(objective="binary:logistic", booster="gblinear",
                  tree_method="exact", eval_metric=accuracy_score)
]

In [36]:
for name, model in zip(names, models):
    scores = cross_val_score(
        model, X_train, y_train, groups=train_df["MatchID"].values,
        scoring="accuracy", n_jobs=-1, cv=GroupShuffleSplit(random_state=42))
    print(name, ": mean score = ", scores.mean(),
          ", score std = ", scores.std(), sep = '')

Logistic Regression: mean score = 0.6644281536566565, score std = 0.031085986673913293
SVC: mean score = 0.7077530108295114, score std = 0.054286322563554576
Linear SVC: mean score = 0.6522288076782234, score std = 0.039355408100349334
QDA: mean score = 0.5362636905356369, score std = 0.04549489418813663
KNN: mean score = 0.6766369488895206, score std = 0.04098753163379512
Gaussian NB: mean score = 0.6377158924167812, score std = 0.07386106771424668
Random Forest: mean score = 0.7100964643399478, score std = 0.04588609674213177




AdaBoost: mean score = 0.689467103022481, score std = 0.03790410177723541
Gradient Boosting: mean score = 0.7255465330967179, score std = 0.03718930369886771
MLP: mean score = 0.69921493084631, score std = 0.03134951197628419


Parameters: { "tree_method" } are not used.

Parameters: { "tree_method" } are not used.

Parameters: { "tree_method" } are not used.

Parameters: { "tree_method" } are not used.

Parameters: { "tree_method" } are not used.



XGBoost: mean score = 0.6688440027963647, score std = 0.027565806474649148


## Hyperparameter Tuning

In [37]:
param_grid = {
    "max_depth": np.linspace(3, 8, 6, dtype=np.int64),
    "min_child_weight": np.linspace(1, 10, 10, dtype=np.int64)
}
grid_search = GridSearchCV(
    XGBClassifier(objective="binary:logistic", tree_method="exact",
                  booster="gbtree", eval_metric=accuracy_score, random_state=42),
    param_grid, scoring="accuracy", n_jobs=-1,
    cv=GroupShuffleSplit(random_state=42))
grid_search.fit(X_train, y_train, groups=train_df["MatchID"].to_numpy())
print("Best accuracy: ", grid_search.best_score_, "+/-",
      2 * grid_search.cv_results_["std_test_score"][grid_search.best_index_])
grid_search.best_params_

Best accuracy:  0.7327356615886528 +/- 0.0641660835401166


{'max_depth': 7, 'min_child_weight': 7}

In [38]:
param_grid = {
    "subsample": np.linspace(0.5, 1.0, 6),
    "colsample_bytree": np.linspace(0.5, 1.0, 6)
}
grid_search = GridSearchCV(
    XGBClassifier(objective="binary:logistic", eval_metric=accuracy_score,
                  tree_method="exact", booster="gbtree", random_state=42,
                  max_depth=7, min_child_weight=7),
    param_grid, scoring="accuracy", n_jobs=-1,
    cv=GroupShuffleSplit(random_state=42))
grid_search.fit(X_train, y_train, groups=train_df["MatchID"].to_numpy())
print("Best accuracy: ", grid_search.best_score_, "+/-",
      2 * grid_search.cv_results_["std_test_score"][grid_search.best_index_])
grid_search.best_params_

Best accuracy:  0.7327356615886528 +/- 0.0641660835401166


{'colsample_bytree': 1.0, 'subsample': 1.0}

In [39]:
param_grid = {
    "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3],
    "n_estimators": [50, 100, 300, 500, 800, 1000]
}
grid_search = GridSearchCV(
    XGBClassifier(objective="binary:logistic", eval_metric=accuracy_score,
                  tree_method="exact", booster="gbtree", random_state=42,
                  max_depth=7, min_child_weight=7),
    param_grid, scoring="accuracy", n_jobs=-1,
    cv=GroupShuffleSplit(random_state=42))
grid_search.fit(X_train, y_train, groups=train_df["MatchID"].to_numpy())
print("Best accuracy: ", grid_search.best_score_, "+/-",
      2 * grid_search.cv_results_["std_test_score"][grid_search.best_index_])
grid_search.best_params_

Best accuracy:  0.7334773256305265 +/- 0.08238640772574293


{'learning_rate': 0.05, 'n_estimators': 50}

In [41]:
param_grid = {
    "reg_alpha": np.logspace(-1, 2, 4),
    "reg_lambda": np.logspace(-1, 2, 4)
}
grid_search = GridSearchCV(
    XGBClassifier(objective="binary:logistic", eval_metric=accuracy_score,
                  tree_method="exact", booster="gbtree", random_state=42,
                  max_depth=7, min_child_weight=7,
                  learning_rate=0.3, n_estimators=100),
    param_grid, scoring="accuracy", n_jobs=-1,
    cv=GroupShuffleSplit(random_state=42))
grid_search.fit(X_train, y_train, groups=train_df["MatchID"].to_numpy())
print("Best accuracy: ", grid_search.best_score_, "+/-",
      2 * grid_search.cv_results_["std_test_score"][grid_search.best_index_])
grid_search.best_params_

Best accuracy:  0.7351190247549979 +/- 0.06716372985068172


{'reg_alpha': 10.0, 'reg_lambda': 1.0}

In [42]:
param_grid = {
    "reg_alpha": np.linspace(5, 30, 6),
    "reg_lambda": np.linspace(0.5, 3, 6)
}
grid_search = GridSearchCV(
    XGBClassifier(objective="binary:logistic", eval_metric=accuracy_score,
                  tree_method="exact", booster="gbtree", random_state=42,
                  max_depth=7, min_child_weight=7,
                  learning_rate=0.3, n_estimators=100),
    param_grid, scoring="accuracy", n_jobs=-1,
    cv=GroupShuffleSplit(random_state=42))
grid_search.fit(X_train, y_train, groups=train_df["MatchID"].to_numpy())
print("Best accuracy: ", grid_search.best_score_, "+/-",
      2 * grid_search.cv_results_["std_test_score"][grid_search.best_index_])
grid_search.best_params_

Best accuracy:  0.7351190247549979 +/- 0.06716372985068172


{'reg_alpha': 10.0, 'reg_lambda': 1.0}

## Predict

In [43]:
xgbclf = XGBClassifier(
    objective="binary:logistic", eval_metric=accuracy_score,
    tree_method="exact", booster="gbtree", random_state=42,
    max_depth=6, min_child_weight=4, # subsample=0.7, colsample_bytree=0.5,
    reg_alpha=10.0, reg_lambda=1.0, learning_rate=0.3, n_estimators=100)
xgbclf.fit(X_train, y_train)

In [124]:
xgbclf.feature_importances_

array([5.7936427e-03, 6.3698746e-02, 1.7936929e-03, 7.1609643e-04,
       1.1503189e-02, 0.0000000e+00, 1.2814348e-02, 0.0000000e+00,
       1.3128836e-03, 0.0000000e+00, 1.4437318e-03, 1.0033710e-02,
       2.2017707e-03, 6.8314904e-03, 3.4913097e-03, 2.2605036e-03,
       0.0000000e+00, 5.2046031e-03, 1.0420449e-02, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 5.3930235e-05,
       1.4216763e-03, 0.0000000e+00, 4.7631650e-03, 0.0000000e+00,
       1.2580832e-02, 1.0736532e-03, 2.5150136e-03, 0.0000000e+00,
       4.6474198e-03, 2.6539455e-03, 0.0000000e+00, 1.9949262e-03,
       1.1283332e-02, 0.0000000e+00, 3.1016346e-03, 2.2061539e-03,
       1.5957153e-03, 6.0652304e-03, 0.0000000e+00, 1.2113618e-03,
       0.0000000e+00, 0.0000000e+00, 1.7270067e-03, 1.8506893e-03,
       1.2026102e-03, 4.2923670e-03, 7.5421385e-06, 1.8395314e-02,
       1.9819008e-03, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 1.6375838e-02, 0.0000000e+00, 0.0000000e

In [44]:
test_df_copy = test_df.copy()

In [95]:
test_df = test_df_copy.copy()

In [45]:
test_df["EventType"] = xgbclf.predict(X_test)
test_df.drop(columns=["MatchID", "TweetRatio", "EmbedAVG",
                      "EmbedSTD", "EmbedPrev", "EmbedNext"],
             inplace=True)
test_df["MatchID"] = test_df["ID"].apply(lambda x: int(x.split("_")[0]))
test_df["PeriodID"] = test_df["ID"].apply(lambda x: int(x.split("_")[1]))
test_df.sort_values(by=["MatchID", "PeriodID"], inplace=True)
test_df.drop(columns=["MatchID", "PeriodID"], inplace=True)
test_df["EventType"] = test_df["EventType"].apply(lambda x: float(x))
test_df.to_csv("data/xgboost_glove_pred.csv", index=False)