In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

### Loading the dataset

In [18]:
df = pd.read_csv(r"C:\repository\understat_shot_data\shots_data.csv")
df.head()

Unnamed: 0,id,minute,result,X,Y,xG,player,h_a,player_id,situation,year,shotType,match_id,h_team,a_team,h_goals,a_goals,date,player_assisted,lastAction
0,378451.0,20,BlockedShot,0.876,0.602,0.019479,Ollie Watkins,h,8865,OpenPlay,2020,Head,14104,Aston Villa,Sheffield United,1,0,2020-09-21 17:00:00,Matthew Cash,Aerial
1,378458.0,54,MissedShots,0.878,0.43,0.031428,Ollie Watkins,h,8865,OpenPlay,2020,Head,14104,Aston Villa,Sheffield United,1,0,2020-09-21 17:00:00,Trézéguet,Cross
2,378464.0,77,MissedShots,0.858,0.679,0.060696,Ollie Watkins,h,8865,OpenPlay,2020,RightFoot,14104,Aston Villa,Sheffield United,1,0,2020-09-21 17:00:00,John McGinn,Pass
3,379954.0,34,MissedShots,0.831,0.655,0.12771,Ollie Watkins,a,8865,OpenPlay,2020,RightFoot,14114,Fulham,Aston Villa,0,3,2020-09-28 17:00:00,Jack Grealish,TakeOn
4,379956.0,46,MissedShots,0.943,0.5,0.445354,Ollie Watkins,a,8865,FromCorner,2020,Head,14114,Fulham,Aston Villa,0,3,2020-09-28 17:00:00,Conor Hourihane,Cross


In [5]:
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 292070 entries, 0 to 292069
Data columns (total 20 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   id               292070 non-null  float64
 1   minute           292070 non-null  int64  
 2   result           292070 non-null  object 
 3   X                292070 non-null  float64
 4   Y                292070 non-null  float64
 5   xG               292070 non-null  float64
 6   player           292070 non-null  object 
 7   h_a              292070 non-null  object 
 8   player_id        292070 non-null  int64  
 9   situation        292070 non-null  object 
 10  year             292070 non-null  int64  
 11  shotType         292070 non-null  object 
 12  match_id         292070 non-null  int64  
 13  h_team           292070 non-null  object 
 14  a_team           292070 non-null  object 
 15  h_goals          292070 non-null  int64  
 16  a_goals          292070 non-null  int6

### Feature-engineering and Preprocessing

In [19]:
target = 'isGoal'

def get_angle(b):
    b = np.array(b)
    a = np.array([100,45])
    c = np.array([100,55])
    ba = a - b
    bc = c - b

    cosine_angle = np.dot(ba, bc) / (np.linalg.norm(ba) * np.linalg.norm(bc))
    angle = np.arccos(cosine_angle)
    return np.degrees(angle)

##target encoding
df = df[~df['result'].isin(['BlockedShot', 'OwnGoal'])]

df[target] = df['result'].map({'MissedShots': 0, 
                               'SavedShot': 0, 
                               'ShotOnPost': 0, 
                               'Goal': 1})

##features encoding
df["h_state"] = np.where(df["h_goals"]==df["a_goals"], 2, np.where(df["h_goals"]>df["a_goals"], 1, 0)) ##get home state
df["a_state"] = df["h_state"].map({2:2, 1:0, 0:1}) ##get away state by inverting home state

df["game_state"] = np.where(df["h_a"]=='h', df["h_state"], df["a_state"]) ##get game_state of given team (h/a)
df.drop(["a_goals", "h_goals", "h_a"], axis=1, inplace=True) ##drop the columns we don't need anymore
df["shotType"] = df["shotType"].map({'RightFoot':0, 
                                        'LeftFoot':0,
                                        'OtherBodyPart':2, 
                                        'Head':1}) ##map to bodypart

df["situation"] = df["situation"].map({"OpenPlay": 0,
                                        "FromCorner":1, 
                                        "SetPiece": 2, 
                                        "DirectFreekick": 3}) ##map to situations

df[["X", "Y"]] = df[["X", "Y"]]*100
df["goal_x"] = 100
df["goal_y"] = 50
df["length"] = np.sqrt( np.square(df["goal_x"] - df["X"]) + np.square(df["goal_y"] - df["Y"]) )
df["angle"] = df[["X", "Y"]].apply(get_angle, axis=1)

features = ['minute', 'X', 'Y', 'situation', 'shotType', 'game_state', 'length', 'angle']

df = df[[target]+features]
df = df.dropna()
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

print(train_df.shape)
print(test_df.shape)

(172465, 9)
(43117, 9)


### Modelling

#### Building PyTorch Dataset and Dataloader

In [32]:
from torch.utils.data import DataLoader, Dataset

batch_size = 100

class ShotsDataset(Dataset):
    """Creating a pytorch dataloader for shots. 
    Need to implement three functions: __init__, __getitem__, and __len__
    """

    def __init__(self, df):

        xy = df.copy(deep=True).values
        xy[:, 1:] = StandardScaler().fit_transform(xy[:, 1:])

        self.x = torch.from_numpy(xy[:, 1:].astype(np.float32)) ## size [n_samples, n_features]
        self.y = torch.from_numpy(xy[:, [0]].astype(np.float32)) ## size [n_samples, 1]
        self.n_samples = xy.shape[0]

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx] ## returns x, y

    def __len__(self):
        return self.n_samples

train_dataset = ShotsDataset(train_df)
test_dataset = ShotsDataset(test_df)

train_loader = DataLoader(train_dataset, shuffle=False, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

In [33]:
for (x, y) in train_dataset:
    print(x)
    print(y)
    break

tensor([ 1.6318, -0.1743, -1.3000, -0.5207, -0.5016,  0.1148,  0.4136, -0.5906])
tensor([0.])


#### Specifying Model Architecture

In [35]:
input_size = x.shape[0]
hidden_size = 64
num_classes = 1

class XGModel(nn.Module):
    
    def __init__(self, input_size, hidden_size, num_classes):

        super(XGModel, self).__init__()
        self.input_size = input_size
        self.fc1 = nn.Linear(self.input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):

        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        out = torch.sigmoid(out) ##sigmoid activation function for last layer
        
        return out

model = XGModel(input_size, hidden_size, num_classes)
criterion = nn.BCELoss() ##binary cross-entropy without logits
lr = 0.001 ##mess around with this (0.0001 - 0.01)
optimizer = torch.optim.SGD(model.parameters(), lr=lr) ##try Adam

#### Training Loop

In [66]:
epochs = 10 ##increase epochs for better fitting
n_total_steps = len(train_loader)
for epoch in range(epochs):

    for num, (xs, ys) in enumerate(train_loader):
        ##forward pass
        ys_pred = model(xs)
        loss = criterion(ys_pred, ys)
        
        ##backward pass
        loss.backward()
        optimizer.step()
        
        ##empty gradients for every batch
        optimizer.zero_grad()

        if (num+1) % 400 == 0: ##print out some feedback every 400th batch
            print (f'Epoch [{epoch+1}/{epochs}], Step [{num+1}/{n_total_steps}], Loss: {loss.item():.4f}')

Epoch [1/10], Step [400/1725], Loss: 0.4243
Epoch [1/10], Step [800/1725], Loss: 0.3496
Epoch [1/10], Step [1200/1725], Loss: 0.4041
Epoch [1/10], Step [1600/1725], Loss: 0.4003
Epoch [2/10], Step [400/1725], Loss: 0.4072
Epoch [2/10], Step [800/1725], Loss: 0.3285
Epoch [2/10], Step [1200/1725], Loss: 0.3951
Epoch [2/10], Step [1600/1725], Loss: 0.3966
Epoch [3/10], Step [400/1725], Loss: 0.3951
Epoch [3/10], Step [800/1725], Loss: 0.3143
Epoch [3/10], Step [1200/1725], Loss: 0.3903
Epoch [3/10], Step [1600/1725], Loss: 0.3945
Epoch [4/10], Step [400/1725], Loss: 0.3867
Epoch [4/10], Step [800/1725], Loss: 0.3045
Epoch [4/10], Step [1200/1725], Loss: 0.3879
Epoch [4/10], Step [1600/1725], Loss: 0.3931
Epoch [5/10], Step [400/1725], Loss: 0.3807
Epoch [5/10], Step [800/1725], Loss: 0.2975
Epoch [5/10], Step [1200/1725], Loss: 0.3870
Epoch [5/10], Step [1600/1725], Loss: 0.3921
Epoch [6/10], Step [400/1725], Loss: 0.3764
Epoch [6/10], Step [800/1725], Loss: 0.2924
Epoch [6/10], Step [12

### Evaluating Model

In [67]:
from sklearn.metrics import roc_auc_score

pred_ys = []
true_ys = []
with torch.no_grad():
    for (test_x, test_y) in test_loader:
        y_pred = np.array(model(test_x)).flatten()
        pred_ys.append(y_pred); true_ys.append(np.array(test_y).flatten())
        
pred_ys = np.concatenate(pred_ys)
true_ys = np.concatenate(true_ys)

roc_auc_score(true_ys, pred_ys) ##using AUC-ROC score       