# NLP with GloVe embeddings

## Set-up

In [1]:
import os
import itertools

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import nltk
from nltk.tokenize import word_tokenize

In [2]:
os.chdir('..')

In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/stevengeorge/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Load pre-trained GloVe embeddings

In [4]:
%%time
word_to_vector = {}
with open('data/external/GloVe/glove.6B/glove.6B.100d.txt', "r") as f:
    # Each line starts with the word/character followed by the 100d vector representation
    for line in f:  # itertools.islice(f, 5):
        
        # Split by whitespace:
        components = line.split()
        
        word = components[0]
        vector_values = components[1:]
        
        vector_array = np.array(vector_values, dtype=np.float64)  # Convert vector to numpy array
        
        # Add to dictionary
        word_to_vector[word] = vector_array

CPU times: user 7.64 s, sys: 265 ms, total: 7.91 s
Wall time: 7.93 s


## Load training and dev data

In [5]:
train = pd.read_parquet('data/processed/train.parquet')
print(train.shape)

dev = pd.read_parquet('data/processed/dev.parquet')
print(dev.shape)

(749640, 13)
(92689, 13)


In [6]:
train.head()

Unnamed: 0,id_odsp,sort_order,time,text,event_type,event_team,opponent,is_goal,assist_method,fast_break,season,country,event_team_was_home
0,UFot0hit/,1,2,Attempt missed. Mladen Petric (Hamburg) left f...,Attempt,Hamburg SV,Borussia Dortmund,0,Pass,0,2012,germany,0
1,UFot0hit/,2,4,"Corner, Borussia Dortmund. Conceded by Dennis...",Corner,Borussia Dortmund,Hamburg SV,0,,0,2012,germany,1
2,UFot0hit/,3,4,"Corner, Borussia Dortmund. Conceded by Heiko ...",Corner,Borussia Dortmund,Hamburg SV,0,,0,2012,germany,1
3,UFot0hit/,4,7,Foul by Sven Bender (Borussia Dortmund).,Foul,Borussia Dortmund,Hamburg SV,0,,0,2012,germany,1
4,UFot0hit/,5,7,Gokhan Tore (Hamburg) wins a free kick in the ...,Free kick won,Hamburg SV,Borussia Dortmund,0,,0,2012,germany,0


- Remove `time` and `event_team_was_home` for now but may want to add these back as they are not extracted from the text itself

In [7]:
def process_data_for_nlp(df):
    
    df = df.copy()
    
    # Sort by event order
    df.sort_values(['id_odsp', 'sort_order'], inplace=True)
    # Create target
    df['next_event_is_goal'] = df.groupby('id_odsp')['is_goal'].shift(-1)
    # Drop redundant columns
    df.drop(
        columns=['sort_order', 'event_type', 'event_team', 'opponent', 'is_goal', 'assist_method', 'fast_break', 'season', 'country', 'event_team_was_home', 'time'], 
        inplace=True
    )
    # Drop entries with null target due to -1 shift
    df.dropna(subset=['next_event_is_goal'], axis=0, inplace=True)
    
    df.reset_index(drop=True, inplace=True)
    
    return df

In [8]:
train_p = process_data_for_nlp(train)
dev_p = process_data_for_nlp(dev)

In [9]:
train_p.head()

Unnamed: 0,id_odsp,text,next_event_is_goal
0,004f4ING/,Bafetimbi Gomis (Swansea City) wins a free kic...,0.0
1,004f4ING/,Foul by Maya Yoshida (Southampton).,0.0
2,004f4ING/,Dusan Tadic (Southampton) wins a free kick on ...,0.0
3,004f4ING/,Foul by Neil Taylor (Swansea City).,0.0
4,004f4ING/,Attempt saved. James Ward-Prowse (Southampton)...,0.0


In [10]:
dev_p.head()

Unnamed: 0,id_odsp,text,next_event_is_goal
0,00nmICd9/,Foul by Juan Manuel FalcA³n (Metz).,0.0
1,00nmICd9/,TiemouA© Bakayoko (Monaco) wins a free kick in...,0.0
2,00nmICd9/,Foul by Anthony Martial (Monaco).,0.0
3,00nmICd9/,Sylvain Marchal (Metz) wins a free kick in the...,0.0
4,00nmICd9/,Foul by Cheick Doukoure (Metz).,0.0


## Tokenize text commentary

GloVe we are using is uncased so change all text to lower case:

In [11]:
train_p['text_lowercase'] = train_p['text'].str.lower()

In [12]:
%%time
train_p['text_split'] = train_p['text_lowercase'].apply(lambda x: word_tokenize(x))

CPU times: user 1min 47s, sys: 751 ms, total: 1min 48s
Wall time: 1min 48s


In [13]:
train_p.head()

Unnamed: 0,id_odsp,text,next_event_is_goal,text_lowercase,text_split
0,004f4ING/,Bafetimbi Gomis (Swansea City) wins a free kic...,0.0,bafetimbi gomis (swansea city) wins a free kic...,"[bafetimbi, gomis, (, swansea, city, ), wins, ..."
1,004f4ING/,Foul by Maya Yoshida (Southampton).,0.0,foul by maya yoshida (southampton).,"[foul, by, maya, yoshida, (, southampton, ), .]"
2,004f4ING/,Dusan Tadic (Southampton) wins a free kick on ...,0.0,dusan tadic (southampton) wins a free kick on ...,"[dusan, tadic, (, southampton, ), wins, a, fre..."
3,004f4ING/,Foul by Neil Taylor (Swansea City).,0.0,foul by neil taylor (swansea city).,"[foul, by, neil, taylor, (, swansea, city, ), .]"
4,004f4ING/,Attempt saved. James Ward-Prowse (Southampton)...,0.0,attempt saved. james ward-prowse (southampton)...,"[attempt, saved, ., james, ward-prowse, (, sou..."


In [14]:
max_length = train_p['text_split'].apply(len).max()
max_length

50

We assume that items which are not in the GloVe matrix are player names

In [15]:
train_p.shape[0]

742443

In [16]:
def zero_pad(unpadded_matrix, max_length):
    
    if unpadded_matrix.shape[1] < max_length:
        num_missing_items = max_length - unpadded_matrix.shape[1]
        zero_pad = np.zeros((unpadded_matrix.shape[0], num_missing_items))

    padded_matrix = np.concatenate((unpadded_matrix, zero_pad), axis=1)
    
    return padded_matrix

In [17]:
m = train_p.shape[0]

train_3d = np.zeros((50, m, 100))  # seq_len, examples, input_size

unknown_items = []  # Keep track of items not in GloVe

for i, row in train_p.iterrows():
    text_split = row['text_split']  # List of items in text commentary
    
    vector_list = []
    for item in text_split:
        try:
            word_vector = word_to_vector[item].reshape(100, 1)  # Get GloVe vector for item
            vector_list.append(word_vector)
        except KeyError:
            unknown_items.append(item)  # Add to unknown list
            word_vector = word_to_vector['name'].reshape(100, 1)  # Substitute unknown item with vector for 'name'
    
    text_commentary_matrix = np.concatenate(vector_list, axis=1)  # Concanenate vectors into a matrix of dim (100, num_items)
    text_commentary_matrix = zero_pad(text_commentary_matrix, max_length=max_length)  # zero-padding to (100, 50)
    
    train_3d[:, i, :] = text_commentary_matrix.T  # Add to 3D input matrix

In [18]:
len(set(unknown_items))

1884

In [19]:
train_3d.shape

(50, 742443, 100)

## y labels

In [20]:
y_train = train_p['next_event_is_goal'].values
y_dev = dev_p['next_event_is_goal'].values

## Create PyTorch custom dataset

https://pytorch.org/tutorials/beginner/data_loading_tutorial.html

In [21]:
from torch.utils.data import Dataset

In [22]:
class CommentaryDataset(Dataset):

    def __init__(self, X, y):
        """
        :param X: 3D numpy array (seq_len, examples, input_size)
        :param y: Labels
        """
        self.X = X
        self.y = y.reshape(-1, 1)

    def __len__(self):
        return self.X.shape[1]

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        return self.X[:, idx, :], self.y[idx]

In [23]:
train_comm = CommentaryDataset(X=train_3d, y=y_train)

In [24]:
train_comm

<__main__.CommentaryDataset at 0x7f7fbcf31c10>

In [25]:
train_3d.shape

(50, 742443, 100)

In [26]:
len(train_comm)

742443

## Train model

In [27]:
from torch import nn
from torch.utils.data import DataLoader

In [48]:
torch.set_num_threads(4)

In [50]:
torch.get_num_threads()

4

In [42]:
data_loader = DataLoader(train_comm, batch_size=256, num_workers=8)

In [51]:
class CommentaryClassifier(nn.Module):

    def __init__(self):
        super(CommentaryClassifier, self).__init__()
        # LSTM
        self.lstm = nn.LSTM(input_size=100, hidden_size=10)
        # MLPs
#         self.fc_1 = nn.Linear(in_features=30, out_features=20)
        self.fc_2 = nn.Linear(in_features=10, out_features=1)
        # Activation functions
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        all_h_t, _ = self.lstm(x.float())
        h_T = all_h_t[:, -1, :]  # Final cell outputs
#         x = self.fc_1(h_T)
#         x = self.relu(x)
        x = self.fc_2(h_T)
        x = self.sigmoid(x)
        return x

In [63]:
commentary_model = CommentaryClassifier()

In [64]:
print(commentary_model)

CommentaryClassifier(
  (lstm): LSTM(100, 10)
  (fc_2): Linear(in_features=10, out_features=1, bias=True)
  (relu): ReLU()
  (sigmoid): Sigmoid()
)


In [65]:
import torch.optim as optim

criterion = nn.BCELoss()
optimiser = optim.Adam(params=commentary_model.parameters())

In [66]:
for epoch in range(2):

    for i, data in enumerate(data_loader):
        
        X, y = data

        optimiser.zero_grad()  # Set gradients to 0 otherwise will accumulate

        y_pred = commentary_model(X)
        loss = criterion(y_pred, y.float())
        if i % 50 == 0:
            print(i)
            print(loss)
        loss.backward()
        optimiser.step()

0
tensor(0.8488, grad_fn=<BinaryCrossEntropyBackward>)
50
tensor(0.6706, grad_fn=<BinaryCrossEntropyBackward>)
100
tensor(0.2989, grad_fn=<BinaryCrossEntropyBackward>)
150
tensor(0.2309, grad_fn=<BinaryCrossEntropyBackward>)
200
tensor(0.1327, grad_fn=<BinaryCrossEntropyBackward>)
250
tensor(0.0987, grad_fn=<BinaryCrossEntropyBackward>)
300
tensor(0.1119, grad_fn=<BinaryCrossEntropyBackward>)
350
tensor(0.1454, grad_fn=<BinaryCrossEntropyBackward>)
400
tensor(0.1970, grad_fn=<BinaryCrossEntropyBackward>)
450
tensor(0.1175, grad_fn=<BinaryCrossEntropyBackward>)
500
tensor(0.1305, grad_fn=<BinaryCrossEntropyBackward>)
550
tensor(0.1218, grad_fn=<BinaryCrossEntropyBackward>)
600
tensor(0.1581, grad_fn=<BinaryCrossEntropyBackward>)
650
tensor(0.1440, grad_fn=<BinaryCrossEntropyBackward>)
700
tensor(0.1159, grad_fn=<BinaryCrossEntropyBackward>)
750
tensor(0.1013, grad_fn=<BinaryCrossEntropyBackward>)
800
tensor(0.1082, grad_fn=<BinaryCrossEntropyBackward>)
850
tensor(0.1153, grad_fn=<Binary

KeyboardInterrupt: 

## References

https://nlp.stanford.edu/projects/glove/

## Questions

- How to deal with player names when using pretrained weights?

## TODO

- Change CSV read method so name accents correctly imported