# Importing Libraries

In [1]:
import gc
import os
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from tqdm.notebook import tqdm, trange

import tensorflow as tf
from tensorflow import keras
from keras import layers

warnings.filterwarnings('ignore')

In [2]:
# Report the installed TensorFlow / Keras versions, then whether a GPU is visible.
print(
    f"Tensor Flow Version: {tf.__version__}",
    f"Keras Version: {tf.keras.__version__}",
    "",
    sep="\n",
)
# `gpu` is reused by the following cell to decide how to configure the device.
gpu = bool(tf.config.list_physical_devices('GPU'))
print("GPU is", "AVAILABLE" if gpu else "NOT AVAILABLE")

Tensor Flow Version: 2.10.0
Keras Version: 2.10.0

GPU is AVAILABLE


In [3]:
# Configure TensorFlow to use the GPU if one is available.
if gpu:
    print("Using GPU")
    try:
        # Memory growth has to be configured identically on every visible GPU,
        # so enable it on all of them rather than only on the first one.
        for device in tf.config.list_physical_devices('GPU'):
            tf.config.experimental.set_memory_growth(device, True)
    except RuntimeError as err:
        # Raised when the GPUs were already initialised (e.g. this cell is re-run
        # after a model has been built); the existing setting stays in effect.
        print(f"Could not change GPU memory growth: {err}")
else:
    # No GPU is visible, so TensorFlow falls back to the CPU.
    print("Using CPU")

Using GPU


In [4]:
# Reset Keras' global state, then seed TensorFlow's and NumPy's global random
# number generators with the same fixed value.
keras.backend.clear_session()
tf.random.set_seed(42)
np.random.seed(42)

# Data Analysis

In [5]:
# Load only the first training file for a quick look at its dimensions.
# (`df` is rebuilt from all training files further down.)
df = pd.read_csv('data/train_0.csv')
df.shape

(2149381, 61)

In [6]:
# Preview the first rows of the loaded file.
df.head()

Unnamed: 0,game_num,event_id,event_time,ball_pos_x,ball_pos_y,ball_pos_z,ball_vel_x,ball_vel_y,ball_vel_z,p0_pos_x,...,boost0_timer,boost1_timer,boost2_timer,boost3_timer,boost4_timer,boost5_timer,player_scoring_next,team_scoring_next,team_A_scoring_within_10sec,team_B_scoring_within_10sec
0,1,1002,-33.31303,-0.0,0.0,1.8548,-0.0,0.0,0.0,41.8048,...,0.0,0.0,0.0,0.0,0.0,0.0,3,B,0,0
1,1,1002,-33.206146,-0.0,0.0,1.8548,-0.0,0.0,0.0,42.2616,...,0.0,0.0,0.0,0.0,0.0,0.0,3,B,0,0
2,1,1002,-33.098114,-0.0,0.0,1.8548,-0.0,0.0,0.0,43.227,...,0.0,0.0,0.0,0.0,0.0,0.0,3,B,0,0
3,1,1002,-32.99319,-0.0,0.0,1.8548,-0.0,0.0,0.0,43.8984,...,0.0,0.0,0.0,0.0,0.0,0.0,3,B,0,0
4,1,1002,-32.887756,-0.0,0.0,1.8548,-0.0,0.0,0.0,44.9606,...,0.0,0.0,0.0,0.0,0.0,0.0,3,B,0,0


# Data Engineering

In [7]:
# Reference points used by get_new_features for the "distance to goal" features.
# NOTE(review): both points share y = -105 and differ only in x (+10 / -10), so
# they sit at the same end of the field. If they are meant to be the two opposing
# goals, one of them presumably needs the opposite y sign — TODO confirm against
# the field layout before changing, since it alters the engineered features.
goal1_pos_x = 10
goal1_pos_y = -105
goal1_pos_z = 0

goal2_pos_x = -10
goal2_pos_y = -105
goal2_pos_z = 0

In [8]:
def distance(x1, y1, z1, x2, y2, z2):
    """Return the Euclidean distance between the 3-D points (x1, y1, z1) and (x2, y2, z2).

    The arithmetic is element-wise, so each argument may be a scalar or an
    array-like that supports subtraction and ``**`` (e.g. a pandas Series).
    """
    delta_x = x1 - x2
    delta_y = y1 - y2
    delta_z = z1 - z2
    return np.sqrt(delta_x**2 + delta_y**2 + delta_z**2)
    
def velocity(x1, y1, z1):
    """Return the magnitude of the 3-D vector (x1, y1, z1).

    Works element-wise on scalars or array-likes such as pandas Series.
    """
    squared_norm = x1**2 + y1**2 + z1**2
    return np.sqrt(squared_norm)
    
def get_new_features(df):
    """Add distance and speed features to ``df`` in place and return it.

    New columns, in insertion order:
      * for each player p0..p5: ``p{j}_dis_ball``, ``p{j}_dis_goal1``,
        ``p{j}_dis_goal2``, ``p{j}_velocity``
      * ``ball_vel``, ``ball_dis_goal1``, ``ball_dis_goal2``
      * per-team sums over players 0-2 (``t1_*``) and players 3-5 (``t2_*``)
        of ``dis_ball``, ``dis_goal1`` and ``dis_goal2``

    The goal reference points are read from the module-level ``goal1_pos_*`` /
    ``goal2_pos_*`` constants at call time.
    """
    axes = 'xyz'
    goal_points = (
        ('goal1', (goal1_pos_x, goal1_pos_y, goal1_pos_z)),
        ('goal2', (goal2_pos_x, goal2_pos_y, goal2_pos_z)),
    )
    ball_pos = [df[f'ball_pos_{axis}'] for axis in axes]

    # Per-player features; the column order matters because downstream code
    # consumes the frame positionally.
    for player in (f'p{j}' for j in range(6)):
        player_pos = [df[f'{player}_pos_{axis}'] for axis in axes]
        df[f'{player}_dis_ball'] = distance(*ball_pos, *player_pos)
        for goal_name, goal_xyz in goal_points:
            df[f'{player}_dis_{goal_name}'] = distance(*goal_xyz, *player_pos)
        df[f'{player}_velocity'] = velocity(*(df[f'{player}_vel_{axis}'] for axis in axes))

    # Ball features.
    df['ball_vel'] = velocity(*(df[f'ball_vel_{axis}'] for axis in axes))
    for goal_name, goal_xyz in goal_points:
        df[f'ball_dis_{goal_name}'] = distance(*goal_xyz, *ball_pos)

    # Team aggregates: sum the three members' distances for each distance kind.
    for team, members in (('t1', (0, 1, 2)), ('t2', (3, 4, 5))):
        for kind in ('dis_ball', 'dis_goal1', 'dis_goal2'):
            first, second, third = (df[f'p{member}_{kind}'] for member in members)
            df[f'{team}_{kind}'] = first + second + third

    return df

In [9]:

# The training data is sharded across data/train_0.csv ... data/train_9.csv.
# Read every shard first and concatenate once; calling pd.concat inside the loop
# re-copies all previously loaded rows on every iteration.
df = pd.concat(
    [pd.read_csv(f'data/train_{i}.csv') for i in range(10)],
    axis=0,
)

# Remove the columns that are used neither as model inputs nor as targets.
df = df.drop(columns=['game_num', 'event_id', 'event_time',
                      'player_scoring_next', 'team_scoring_next'])

# Third class for the 3-way one-hot target: 1 when neither team scores within 10 s.
# Expressed as NOR so a row can never be flagged "no team scored" while a
# scoring flag is set (NOT(XOR) would do that if both flags were 1).
df['no_team_scored'] = (
    (df['team_A_scoring_within_10sec'] == 0)
    & (df['team_B_scoring_within_10sec'] == 0)
).astype(int)

target_cols = ['team_A_scoring_within_10sec', 'team_B_scoring_within_10sec', 'no_team_scored']
X = df.drop(columns=target_cols)
y = df[target_cols]

# Hold out 1% of the rows for validation.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.01, random_state=0)

In [10]:
# Numeric preprocessing: fill missing values with the column mean, then standardise.
# (Pipeline, SimpleImputer and StandardScaler are imported in the notebook's import cell.)
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="mean")),
    ('std_scaler', StandardScaler()),
])

# Add the engineered distance / speed columns to both splits before scaling.
X_train = get_new_features(X_train)
X_val = get_new_features(X_val)

X_val.head()

Unnamed: 0,ball_pos_x,ball_pos_y,ball_pos_z,ball_vel_x,ball_vel_y,ball_vel_z,p0_pos_x,p0_pos_y,p0_pos_z,p0_vel_x,...,p5_velocity,ball_vel,ball_dis_goal1,ball_dis_goal2,t1_dis_ball,t1_dis_goal1,t1_dis_goal2,t2_dis_ball,t2_dis_goal1,t2_dis_goal2
373908,-28.1702,-43.8228,24.0866,23.046799,-39.096,9.273,-23.938,-102.0596,12.938399,12.919001,...,36.478764,46.321061,76.024853,68.212684,177.202825,171.095182,161.623261,194.74981,335.408771,327.528219
1340703,67.0324,26.9916,6.7344,43.917202,25.1326,-0.1254,-12.7286,9.1304,1.5168,-29.3234,...,33.322783,50.600237,143.943841,152.974263,151.236594,343.355985,363.301333,136.015379,462.546108,474.095146
591893,37.3242,40.1516,14.4388,-30.8048,39.913403,19.174799,45.096203,-9.7024,0.3402,13.2022,...,16.224641,53.941527,148.405114,153.352684,95.560704,416.684355,433.651541,127.659802,514.788414,530.541564
812495,14.058399,47.034,20.2348,-56.9544,-11.7224,22.360802,-15.516601,60.308598,0.3398,-2.8294,...,4.208329,62.299469,153.428338,155.250091,150.425227,488.056015,501.096671,155.139157,473.867833,488.769823
1385857,-54.6252,-71.2984,12.7876,-10.5896,-31.323,-1.6548,-56.2416,-63.9498,0.3402,18.060999,...,26.698055,33.106016,73.998223,57.364876,95.156471,190.357617,149.660924,191.3353,353.633323,322.15109


In [11]:
# Learn the imputation means and scaling statistics from the training rows only,
# then apply those same fitted statistics to the validation rows.
train_columns, val_columns = X_train.columns, X_val.columns
X_train = pd.DataFrame(data=num_pipeline.fit_transform(X_train), columns=train_columns)
X_val = pd.DataFrame(data=num_pipeline.transform(X_val), columns=val_columns)

X_val.head()

Unnamed: 0,ball_pos_x,ball_pos_y,ball_pos_z,ball_vel_x,ball_vel_y,ball_vel_z,p0_pos_x,p0_pos_y,p0_pos_z,p0_vel_x,...,p5_velocity,ball_vel,ball_dis_goal1,ball_dis_goal2,t1_dis_ball,t1_dis_goal1,t1_dis_goal2,t2_dis_ball,t2_dis_goal1,t2_dis_goal2
0,-0.570442,-0.734366,0.803933,0.994968,-1.394887,0.74544,-0.52431,-1.423876,1.4586,0.617019,...,0.511763,0.739478,-0.858329,-1.004313,0.07487,-1.034569,-1.107615,0.376046,-0.576857,-0.639359
1,1.371875,0.445582,-0.8551,1.895923,0.895462,-0.019543,-0.27705,0.485142,-0.304076,-1.399243,...,0.221809,1.031553,0.425777,0.597045,-0.376049,0.331996,0.49098,-0.641474,0.439249,0.531586
2,0.76577,0.664861,-0.118487,-1.329753,1.422536,1.551396,0.998468,0.161802,-0.485658,0.630536,...,-1.34907,1.259613,0.510124,0.604195,-1.342893,0.913719,1.048609,-0.786226,0.856779,0.982545
3,0.291103,0.779539,0.435664,-2.458606,-0.418763,1.810721,-0.338548,1.363819,-0.48572,-0.134664,...,-2.45306,1.830085,0.605095,0.640042,-0.390139,1.479919,1.583211,-0.310172,0.529734,0.648824
4,-1.110176,-1.192179,-0.276358,-0.457082,-1.117707,-0.144028,-1.236873,-0.76957,-0.485658,0.86245,...,-0.386833,-0.162515,-0.896645,-1.209256,-1.349912,-0.881758,-1.202434,0.316893,-0.431203,-0.682317


# Model Development

In [13]:
# Number of input columns after feature engineering. X_val.head() above shows the
# engineered frame; this must equal X_train.shape[1] or fitting will fail.
N_FEATURES = 87

# Self-normalising MLP: SELU activations with LeCun-normal initialisation and
# AlphaDropout between the hidden layers.
# `Input` takes `shape` (not `input_shape`); passing `input_shape` raises.
model = keras.models.Sequential([
    keras.layers.Input(shape=(N_FEATURES,)),
    keras.layers.AlphaDropout(rate=0.2),
    keras.layers.Dense(256, activation='selu', kernel_initializer='lecun_normal'),
    keras.layers.AlphaDropout(rate=0.2),
    keras.layers.Dense(256, activation='selu', kernel_initializer='lecun_normal'),
    keras.layers.AlphaDropout(rate=0.2),
    keras.layers.Dense(256, activation='selu', kernel_initializer='lecun_normal'),
    keras.layers.AlphaDropout(rate=0.2),
    keras.layers.Dense(256, activation='selu', kernel_initializer='lecun_normal'),
    # Three raw logits, one per target column; softmax is applied at prediction time.
    keras.layers.Dense(3)
])

# from_logits=True because the last layer has no activation.
model.compile(loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
              optimizer="nadam",
              metrics=["accuracy"])

In [15]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 batch_normalization (BatchN  (None, 87)               348       
 ormalization)                                                   
                                                                 
 dense (Dense)               (None, 256)               22528     
                                                                 
 batch_normalization_1 (Batc  (None, 256)              1024      
 hNormalization)                                                 
                                                                 
 activation (Activation)     (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 256)               65792     
                                                                 
 batch_normalization_2 (Batc  (None, 256)             

In [16]:
# Root directory under which every training run gets its own log folder.
# (`os` and `time` are imported in the notebook's import cell.)
root_logdir = os.path.join(os.curdir, "my_logs")

def get_run_logdir():
    """Return a path under ``root_logdir`` named after the current local time.

    The folder name has the form ``run_YY_MM_DD-HH_MM_SS``, so each call made in
    a different second yields a distinct directory.
    """
    run_id = time.strftime("run_%y_%m_%d-%H_%M_%S")
    return os.path.join(root_logdir, run_id)

run_logdir = get_run_logdir()

In [17]:
print('Training on dataset.')

# Keep the best model seen so far on disk, stop once validation stops improving
# for 10 epochs (restoring the best weights), and log the run for TensorBoard.
checkpoint_cb = keras.callbacks.ModelCheckpoint("model.h5", save_best_only=True)
early_stopping_cb = keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)
tensorboard_cb = keras.callbacks.TensorBoard(run_logdir)

# With steps_per_epoch=5000 and batch_size=32 each "epoch" consumes
# 160,000 samples rather than one full pass over the training set.
history = model.fit(X_train, y_train,
            epochs=100,
            steps_per_epoch=5000,
            batch_size=32,
            validation_data=(X_val, y_val),
            verbose=1,
            callbacks=[checkpoint_cb, early_stopping_cb, tensorboard_cb])

# Release the large training frames; cells above must be re-run before this
# cell can be executed again.
del df, X_train, X_val, y_train, y_val
gc.collect()
print()

Training on dataset.
Epoch 1/100


2022-10-29 16:16:28.958202: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-10-29 16:16:30.203990: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2022-10-29 16:22:22.682615: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
   65/10000 [..............................] - ETA: 5:50 - loss: 0.4426 - accuracy: 0.8832

KeyboardInterrupt: 

In [None]:
# Load the test set and discard its first column (presumably the row id, which
# is not a model input — verify against the file), then count missing cells.
df_test = pd.read_csv('data/test.csv').pipe(
    lambda frame: frame.drop(columns=frame.columns[0])
)

df_test.isna().sum().sum()

261422

In [None]:
# Apply the same feature engineering and the already-fitted preprocessing
# pipeline to the test rows, then confirm no missing values remain.
df_test = get_new_features(df_test)
test_columns = df_test.columns
df_test = pd.DataFrame(data=num_pipeline.transform(df_test), columns=test_columns)

df_test.isna().sum().sum()

0

In [None]:
# The network emits raw logits; softmax over the last axis turns each row into
# class probabilities.
preds = model.predict(df_test)
score = tf.nn.softmax(preds, axis=-1)

   90/21911 [..............................] - ETA: 37s

2022-10-29 02:10:19.673127: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




In [None]:
# Fill the sample submission with the predicted probabilities: column 0 of
# `score` is team A, column 1 is team B (same order as the training targets).
ss = pd.read_csv('data/sample_submission.csv')
for class_idx, target in enumerate(('team_A_scoring_within_10sec',
                                    'team_B_scoring_within_10sec')):
    ss[target] = score[:, class_idx]
ss.to_csv('Submission.csv', index=False)
ss.head()

Unnamed: 0,id,team_A_scoring_within_10sec,team_B_scoring_within_10sec
0,0,0.0,0.0
1,1,0.0,0.0
2,2,0.0,0.0
3,3,0.0,0.0
4,4,0.0,2.4781959999999998e-36
