# Importing Libraries

In [23]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm, trange

import tensorflow as tf
from tensorflow import keras
from keras import layers
import gc

import warnings
warnings.filterwarnings('ignore')

In [24]:
print(f"Tensor Flow Version: {tf.__version__}")
print(f"Keras Version: {tf.keras.__version__}")
print()
gpu = len(tf.config.list_physical_devices('GPU'))>0
print("GPU is", "AVAILABLE" if gpu else "NOT AVAILABLE")

Tensor Flow Version: 2.10.0
Keras Version: 2.10.0

GPU is AVAILABLE


In [25]:
#set tensorflow to user gpu if available
if gpu:
    print("Using GPU")
    tf.config.experimental.set_memory_growth(tf.config.list_physical_devices('GPU')[0], True)
else:
    print("Not using CPU")

Using GPU


# Data Analysis

In [26]:
df = pd.read_csv('data/train_0.csv')
df.shape

(2149381, 61)

In [27]:
df.head()

Unnamed: 0,game_num,event_id,event_time,ball_pos_x,ball_pos_y,ball_pos_z,ball_vel_x,ball_vel_y,ball_vel_z,p0_pos_x,...,boost0_timer,boost1_timer,boost2_timer,boost3_timer,boost4_timer,boost5_timer,player_scoring_next,team_scoring_next,team_A_scoring_within_10sec,team_B_scoring_within_10sec
0,1,1002,-33.31303,-0.0,0.0,1.8548,-0.0,0.0,0.0,41.8048,...,0.0,0.0,0.0,0.0,0.0,0.0,3,B,0,0
1,1,1002,-33.206146,-0.0,0.0,1.8548,-0.0,0.0,0.0,42.2616,...,0.0,0.0,0.0,0.0,0.0,0.0,3,B,0,0
2,1,1002,-33.098114,-0.0,0.0,1.8548,-0.0,0.0,0.0,43.227,...,0.0,0.0,0.0,0.0,0.0,0.0,3,B,0,0
3,1,1002,-32.99319,-0.0,0.0,1.8548,-0.0,0.0,0.0,43.8984,...,0.0,0.0,0.0,0.0,0.0,0.0,3,B,0,0
4,1,1002,-32.887756,-0.0,0.0,1.8548,-0.0,0.0,0.0,44.9606,...,0.0,0.0,0.0,0.0,0.0,0.0,3,B,0,0


#### Even if team_scoring_next is B the team_B_scoring_within_10sec is 0 this is because the event time is -33 that implies that there is still 33 sec in the game to come to an end. And the target column is about last 10sec this is why is contains 0.

Some of the important points regarding extra columns than the testing data is that:
* game_num, event_id, event_time these three are not there in the testing data.
* player_scoring_next or team_scoring_next contain informaton regarding the target variable.******

### We will be removing team_scoring_next column any how and in the remaining columns removing 20000 data points from 2 million rows of data is not going to effect the model's performance much.

#### Here we can see that there are no highly correlated features here.

# Model Development

In [30]:
goal1_pos_x = 10
goal1_pos_y = -105
goal1_pos_z = 0

goal2_pos_x = -10
goal2_pos_y = -105
goal2_pos_z = 0

In [31]:
input_ = keras.layers.Input(shape=[87])
hidden1 = keras.layers.Dense(128, activation='relu')(input_)
hidden2 = keras.layers.Dense(128, activation='relu')(hidden1)
concat = keras.layers.Concatenate()([input_, hidden2])
output = keras.layers.Dense(3, activation='softmax')(concat)
model = keras.Model(inputs=[input_], outputs=[output])

model.compile(optimizer='SGD', loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True), metrics=['accuracy'])

In [32]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_5 (Dense)             (None, 256)               22528     
                                                                 
 batch_normalization_4 (Batc  (None, 256)              1024      
 hNormalization)                                                 
                                                                 
 dropout_4 (Dropout)         (None, 256)               0         
                                                                 
 dense_6 (Dense)             (None, 256)               65792     
                                                                 
 batch_normalization_5 (Batc  (None, 256)              1024      
 hNormalization)                                                 
                                                                 
 dropout_5 (Dropout)         (None, 256)              

In [33]:
to_remove = ['game_num', 'event_id', 'event_time', 'player_scoring_next', 'team_scoring_next']

In [34]:
#calculate distance between ball and player
def distance(x1, y1, z1, x2, y2, z2):
    return np.sqrt((x1-x2)**2 + (y1-y2)**2 + (z1-z2)**2)

In [35]:
#calculate velocity of ball
def velocity(x1, y1, z1):
    return np.sqrt(x1**2 + y1**2 + z1**2)

In [None]:
import os
root_logdir = os.path.join(os.curdir, "my_logs")

def get_run_logdir():
    import time
    run_id = time.strftime("run_%Y_%m_%d-%H_%M_%S")
    return os.path.join(root_logdir, run_id)

run_logdir = get_run_logdir()

In [36]:
for i in range(10):
    path = f'data/train_{i}.csv'
    df_temp= pd.read_csv(path)

    #read 10 files into one dataframe
    if i == 0:
        df = df_temp.copy()
    else:
        df = pd.concat([df, df_temp], axis=0)

# Data loading and preprocessing 
df['no_team_scored'] = np.logical_xor(df['team_A_scoring_within_10sec'],
                                        df['team_B_scoring_within_10sec'])
df['no_team_scored'] = (~df['no_team_scored']).astype(int)

df.drop(to_remove, axis=1, inplace=True)
df.dropna(inplace=True, axis=0)

features = df.loc[:,'ball_pos_x':'boost5_timer']
target = df.loc[:,'team_A_scoring_within_10sec':'no_team_scored']

#add distance column to train and test
for j in range(0, 6):
    features[f'p{j}_dis_ball'] = distance(features['ball_pos_x'], features['ball_pos_y'], features['ball_pos_z'], features[f'p{j}_pos_x'], features[f'p{j}_pos_y'], features[f'p{j}_pos_z'])
    features[f'p{j}_dis_goal1'] = distance(goal1_pos_x, goal1_pos_y, goal1_pos_z, features[f'p{j}_pos_x'], features[f'p{j}_pos_y'], features[f'p{j}_pos_z'])
    features[f'p{j}_dis_goal2'] = distance(goal2_pos_x, goal2_pos_y, goal2_pos_z, features[f'p{j}_pos_x'], features[f'p{j}_pos_y'], features[f'p{j}_pos_z'])
    features[f'p{j}_velocity'] = velocity(features[f'p{j}_vel_x'], features[f'p{j}_vel_y'], features[f'p{j}_vel_z'])

features[f'ball_vel'] = velocity(features[f'ball_vel_x'], features[f'ball_vel_y'], features[f'ball_vel_z'])
features[f'ball_dis_goal1'] = distance(goal1_pos_x, goal1_pos_y, goal1_pos_z, features[f'ball_pos_x'], features[f'ball_pos_y'], features[f'ball_pos_z'])
features[f'ball_dis_goal2'] = distance(goal2_pos_x, goal2_pos_y, goal2_pos_z, features[f'ball_pos_x'], features[f'ball_pos_y'], features[f'ball_pos_z'])

features[f't1_dis_ball'] = features[f'p0_dis_ball'] + features[f'p1_dis_ball'] + features[f'p2_dis_ball']
features[f't1_dis_goal1'] = features[f'p0_dis_goal1'] + features[f'p1_dis_goal1'] + features[f'p2_dis_goal1']
features[f't1_dis_goal2'] = features[f'p0_dis_goal2'] + features[f'p1_dis_goal2'] + features[f'p2_dis_goal2']

features[f't2_dis_ball'] = features[f'p3_dis_ball'] + features[f'p4_dis_ball'] + features[f'p5_dis_ball']
features[f't2_dis_goal1'] = features[f'p3_dis_goal1'] + features[f'p4_dis_goal1'] + features[f'p5_dis_goal1']
features[f't2_dis_goal2'] = features[f'p3_dis_goal2'] + features[f'p4_dis_goal2'] + features[f'p5_dis_goal2']
    
X_train, X_val,\
Y_train, Y_val = train_test_split(features, target,
                                    test_size = 0.2,
                                    random_state=42)

print(f'Training on dataset.')

checkpoint_cb = keras.callbacks.ModelCheckpoint("model.h5", save_best_only=True)
early_stopping_cb = keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)
tensorboard_cb = keras.callbacks.TensorBoard(run_logdir)

history = model.fit(X_train, Y_train, epochs=100, validation_data=(X_val, Y_val), callbacks=[checkpoint_cb, early_stopping_cb, tensorboard_cb])

del df, X_train, X_val, Y_train, Y_val
gc.collect()
print()

Training on dataset.


2022-10-25 08:06:09.439574: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2022-10-25 08:07:56.318892: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.





#### From the above training we can say that training on all the data has no effect on the accuracy as the accuracy score or AUC is stick to the 90% and 96% respectively.

## Predictions
One of the important factor here is predicting three classes B's score, A's Score and no ones score to get better results on the leaderboard. Due to this reason only sum of the probabilities for just A and B won't be equal to one.

In [37]:
df_test = pd.read_csv('data/test.csv')
df_test.shape

(701143, 55)

In [38]:
df_test.isnull().sum().sum()

261422

In [39]:
for col in df_test.columns:
    if df_test[col].isnull().sum() > 0:
        temp = df_test[col].mean()
        df_test[col] = df_test[col].fillna(temp)

df_test.isnull().sum().sum()

0

In [40]:
#add distance column to train and test
for i in range(0, 6):
    df_test[f'p{i}_dis_ball'] = distance(df_test['ball_pos_x'], df_test['ball_pos_y'], df_test['ball_pos_z'], df_test[f'p{i}_pos_x'], df_test[f'p{i}_pos_y'], df_test[f'p{i}_pos_z'])
    df_test[f'p{i}_dis_goal1'] = distance(goal1_pos_x, goal1_pos_y, goal1_pos_z, df_test[f'p{i}_pos_x'], df_test[f'p{i}_pos_y'], df_test[f'p{i}_pos_z'])
    df_test[f'p{i}_dis_goal2'] = distance(goal2_pos_x, goal2_pos_y, goal2_pos_z, df_test[f'p{i}_pos_x'], df_test[f'p{i}_pos_y'], df_test[f'p{i}_pos_z'])
    df_test[f'p{i}_velocity'] = velocity(df_test[f'p{i}_vel_x'], df_test[f'p{i}_vel_y'], df_test[f'p{i}_vel_z'])

df_test[f'ball_vel'] = velocity(df_test[f'ball_vel_x'], df_test[f'ball_vel_y'], df_test[f'ball_vel_z'])
df_test[f'ball_dis_goal1'] = distance(goal1_pos_x, goal1_pos_y, goal1_pos_z, df_test[f'ball_pos_x'], df_test[f'ball_pos_y'], df_test[f'ball_pos_z'])
df_test[f'ball_dis_goal2'] = distance(goal2_pos_x, goal2_pos_y, goal2_pos_z, df_test[f'ball_pos_x'], df_test[f'ball_pos_y'], df_test[f'ball_pos_z'])

df_test[f't1_dis_ball'] = df_test[f'p0_dis_ball'] + df_test[f'p1_dis_ball'] + df_test[f'p2_dis_ball']
df_test[f't1_dis_goal1'] = df_test[f'p0_dis_goal1'] + df_test[f'p1_dis_goal1'] + df_test[f'p2_dis_goal1']
df_test[f't1_dis_goal2'] = df_test[f'p0_dis_goal2'] + df_test[f'p1_dis_goal2'] + df_test[f'p2_dis_goal2']

df_test[f't2_dis_ball'] = df_test[f'p3_dis_ball'] + df_test[f'p4_dis_ball'] + df_test[f'p5_dis_ball']
df_test[f't2_dis_goal1'] = df_test[f'p3_dis_goal1'] + df_test[f'p4_dis_goal1'] + df_test[f'p5_dis_goal1']
df_test[f't2_dis_goal2'] = df_test[f'p3_dis_goal2'] + df_test[f'p4_dis_goal2'] + df_test[f'p5_dis_goal2']

In [41]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 701143 entries, 0 to 701142
Data columns (total 88 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   id              701143 non-null  int64  
 1   ball_pos_x      701143 non-null  float64
 2   ball_pos_y      701143 non-null  float64
 3   ball_pos_z      701143 non-null  float64
 4   ball_vel_x      701143 non-null  float64
 5   ball_vel_y      701143 non-null  float64
 6   ball_vel_z      701143 non-null  float64
 7   p0_pos_x        701143 non-null  float64
 8   p0_pos_y        701143 non-null  float64
 9   p0_pos_z        701143 non-null  float64
 10  p0_vel_x        701143 non-null  float64
 11  p0_vel_y        701143 non-null  float64
 12  p0_vel_z        701143 non-null  float64
 13  p0_boost        701143 non-null  float64
 14  p1_pos_x        701143 non-null  float64
 15  p1_pos_y        701143 non-null  float64
 16  p1_pos_z        701143 non-null  float64
 17  p1_vel_x  

In [42]:
test = df_test.loc[:,'ball_pos_x':'t2_dis_goal2']

preds = model.predict(test)

   39/21911 [..............................] - ETA: 59s 

2022-10-25 08:15:43.802763: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




In [43]:
preds.shape

(701143, 3)

In [44]:
ss = pd.read_csv('data/sample_submission.csv')
ss['team_A_scoring_within_10sec'] = preds[:,0]
ss['team_B_scoring_within_10sec'] = preds[:,1]
ss.to_csv('Submission.csv', index=False)
ss.head()

Unnamed: 0,id,team_A_scoring_within_10sec,team_B_scoring_within_10sec
0,0,0.043137,0.017739
1,1,0.028453,0.035305
2,2,0.014787,0.111737
3,3,0.055,0.017336
4,4,0.019111,0.042746


#### This will be our baseline score as we have done nothing but fit and predict on the training and testing data respectively. Let's see what feature engineering or different techniques could help us to get better results than this.