# NLP with GloVe embeddings

## Set-up

In [2]:
import os
import itertools

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import nltk
from nltk.tokenize import word_tokenize

In [3]:
# Work from the project root so the relative 'data/...' paths used below resolve.
# Guarded so that re-running this cell (or starting the kernel in the project root)
# does not climb one directory further each time.
# NOTE(review): assumes the notebook's own folder has no 'data' directory - confirm.
if not os.path.isdir('data'):
    os.chdir('..')

In [4]:
# Fetch the 'punkt' tokenizer data (a no-op if it is already up to date locally);
# presumably required by the word_tokenize calls further down - confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/stevengeorge/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Load pre-trained GloVe embeddings

In [5]:
# Map each GloVe token to its embedding vector (a 1D float64 numpy array).
word_to_vector = {}

# The encoding is given explicitly: without it open() falls back to the
# platform's locale encoding, which can fail or mis-decode non-ASCII tokens.
# NOTE(review): assumes the GloVe file is UTF-8 encoded - confirm.
with open('data/external/GloVe/glove.6B/glove.6B.100d.txt', "r", encoding="utf-8") as f:
    # Each line starts with the word/character followed by the 100d vector representation.
    # Tip: iterate over itertools.islice(f, 5) instead of f to smoke-test on a few lines.
    for line in f:

        # Split by whitespace: first field is the token, the rest are the vector values
        components = line.split()

        word = components[0]
        vector_values = components[1:]

        # Convert the list of numeric strings to a numpy array
        vector_array = np.array(vector_values, dtype=np.float64)

        # Add to dictionary
        word_to_vector[word] = vector_array

## Load training and dev data

In [6]:
# Read the processed train and dev splits, then report each frame's (rows, columns)
train = pd.read_parquet('data/processed/train.parquet')
dev = pd.read_parquet('data/processed/dev.parquet')

print(train.shape, dev.shape, sep='\n')

(749640, 13)
(92689, 13)


In [7]:
# Preview the first rows of the raw training frame (one row per match event)
train.head()

Unnamed: 0,id_odsp,sort_order,time,text,event_type,event_team,opponent,is_goal,assist_method,fast_break,season,country,event_team_was_home
0,UFot0hit/,1,2,Attempt missed. Mladen Petric (Hamburg) left f...,Attempt,Hamburg SV,Borussia Dortmund,0,Pass,0,2012,germany,0
1,UFot0hit/,2,4,"Corner, Borussia Dortmund. Conceded by Dennis...",Corner,Borussia Dortmund,Hamburg SV,0,,0,2012,germany,1
2,UFot0hit/,3,4,"Corner, Borussia Dortmund. Conceded by Heiko ...",Corner,Borussia Dortmund,Hamburg SV,0,,0,2012,germany,1
3,UFot0hit/,4,7,Foul by Sven Bender (Borussia Dortmund).,Foul,Borussia Dortmund,Hamburg SV,0,,0,2012,germany,1
4,UFot0hit/,5,7,Gokhan Tore (Hamburg) wins a free kick in the ...,Free kick won,Hamburg SV,Borussia Dortmund,0,,0,2012,germany,0


- Drop `time` and `event_team_was_home` for now. We may want to add them back later as extra features, since they cannot be extracted from the commentary text itself.

In [8]:
def process_data_for_nlp(df):
    """Reduce an events frame to match id, commentary text and the prediction target.

    The events are ordered by match (`id_odsp`) and by `sort_order` within each
    match. The target `next_event_is_goal` is the `is_goal` value of the following
    event in the same match, so the last event of every match gets NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Events frame containing `id_odsp`, `sort_order`, `is_goal` and the other
        columns listed in `redundant_columns` below.

    Returns
    -------
    pandas.DataFrame
        A new, sorted frame (the input is left untouched) without the redundant
        columns and with `next_event_is_goal` appended as the last column.
    """
    redundant_columns = [
        'sort_order', 'event_type', 'event_team', 'opponent', 'is_goal',
        'assist_method', 'fast_break', 'season', 'country',
        'event_team_was_home', 'time',
    ]

    def next_is_goal(events):
        # Shift within each match so the target never leaks across matches
        return events.groupby('id_odsp')['is_goal'].shift(-1)

    return (
        df
        .sort_values(['id_odsp', 'sort_order'])
        .assign(next_event_is_goal=next_is_goal)
        .drop(columns=redundant_columns)
    )

In [9]:
# Run both splits through the same preparation step
train_p, dev_p = (process_data_for_nlp(split) for split in (train, dev))

In [10]:
# Preview the processed training frame: match id, commentary text and target
train_p.head()

Unnamed: 0,id_odsp,text,next_event_is_goal
516547,004f4ING/,Bafetimbi Gomis (Swansea City) wins a free kic...,0.0
516548,004f4ING/,Foul by Maya Yoshida (Southampton).,0.0
516549,004f4ING/,Dusan Tadic (Southampton) wins a free kick on ...,0.0
516550,004f4ING/,Foul by Neil Taylor (Swansea City).,0.0
516551,004f4ING/,Attempt saved. James Ward-Prowse (Southampton)...,0.0


In [11]:
# Preview the processed dev frame: match id, commentary text and target
dev_p.head()

Unnamed: 0,id_odsp,text,next_event_is_goal
61337,00nmICd9/,Foul by Juan Manuel FalcA³n (Metz).,0.0
61338,00nmICd9/,TiemouA© Bakayoko (Monaco) wins a free kick in...,0.0
61339,00nmICd9/,Foul by Anthony Martial (Monaco).,0.0
61340,00nmICd9/,Sylvain Marchal (Metz) wins a free kick in the...,0.0
61341,00nmICd9/,Foul by Cheick Doukoure (Metz).,0.0


## Tokenize text commentary

The GloVe vectors we are using (`glove.6B`) are uncased, so convert all text to lower case:

In [12]:
# Lower-cased copy of the commentary, stored in a new column so the original text is kept
train_p['text_lowercase'] = train_p['text'].str.lower()

In [13]:
%%time
train_p['text_split'] = train_p['text_lowercase'].apply(lambda x: word_tokenize(x))

CPU times: user 1min 44s, sys: 325 ms, total: 1min 44s
Wall time: 1min 44s


In [14]:
# Preview again, now including the text_lowercase and text_split columns
train_p.head()

Unnamed: 0,id_odsp,text,next_event_is_goal,text_lowercase,text_split
516547,004f4ING/,Bafetimbi Gomis (Swansea City) wins a free kic...,0.0,bafetimbi gomis (swansea city) wins a free kic...,"[bafetimbi, gomis, (, swansea, city, ), wins, ..."
516548,004f4ING/,Foul by Maya Yoshida (Southampton).,0.0,foul by maya yoshida (southampton).,"[foul, by, maya, yoshida, (, southampton, ), .]"
516549,004f4ING/,Dusan Tadic (Southampton) wins a free kick on ...,0.0,dusan tadic (southampton) wins a free kick on ...,"[dusan, tadic, (, southampton, ), wins, a, fre..."
516550,004f4ING/,Foul by Neil Taylor (Swansea City).,0.0,foul by neil taylor (swansea city).,"[foul, by, neil, taylor, (, swansea, city, ), .]"
516551,004f4ING/,Attempt saved. James Ward-Prowse (Southampton)...,0.0,attempt saved. james ward-prowse (southampton)...,"[attempt, saved, ., james, ward-prowse, (, sou..."


In [15]:
# Number of items in the longest tokenized commentary
max_length = train_p['text_split'].map(len).max()
max_length

50

We assume that tokens which are not in the GloVe vocabulary are player names, and represent them with the GloVe vector for the word `name`.

In [16]:
# Number of training examples (rows)
len(train_p)

749640

In [18]:
def zero_pad(unpadded_matrix, max_length):
    """Right-pad a 2D matrix with zero columns until it has `max_length` columns.

    Parameters
    ----------
    unpadded_matrix : numpy.ndarray
        2D array of shape (rows, num_items).
    max_length : int
        Number of columns the result must have.

    Returns
    -------
    numpy.ndarray
        A new array of shape (rows, max_length). The original columns come first,
        followed by zeros. A matrix that already has `max_length` columns is
        returned as an unpadded copy.

    Raises
    ------
    ValueError
        If the matrix already has more than `max_length` columns.
    """
    num_missing_items = max_length - unpadded_matrix.shape[1]
    if num_missing_items < 0:
        raise ValueError(
            f"matrix has {unpadded_matrix.shape[1]} columns, "
            f"which exceeds max_length={max_length}"
        )

    # Built unconditionally: with zero missing items this is a (rows, 0) block, so the
    # concatenation below also covers the already-full-width case.
    padding = np.zeros((unpadded_matrix.shape[0], num_missing_items))

    return np.concatenate((unpadded_matrix, padding), axis=1)

In [19]:
m = train_p.shape[0]
embedding_dim = 100  # Length of each glove.6B.100d vector

# seq_len, examples, input_size. The array starts out as all zeros, so any positions
# after the end of a commentary are already zero-padded and need no further handling.
# NOTE(review): float64 at (50, 749640, 100) is roughly 30 GB - consider float32.
train_3d = np.zeros((max_length, m, embedding_dim))

unknown_items = []  # Keep track of items not in GloVe
name_vector = word_to_vector['name']  # Stand-in vector for items not in GloVe

# enumerate() gives each row's position in train_p. The DataFrame index must not be
# used here: train_p was re-sorted, so its index labels no longer match row positions
# and would misalign train_3d with train_p's rows (including the target column).
for example_idx, text_split in enumerate(train_p['text_split']):
    for step, item in enumerate(text_split):
        word_vector = word_to_vector.get(item)  # GloVe vector for item, None if missing
        if word_vector is None:
            unknown_items.append(item)  # Add to unknown list
            word_vector = name_vector  # Substitute unknown item with vector for 'name'

        # Every item, known or substituted, occupies its own time step
        train_3d[step, example_idx, :] = word_vector

In [20]:
# Number of distinct items that had no GloVe vector
len(set(unknown_items))

1887

In [21]:
# Sanity check: expect (seq_len, examples, input_size)
train_3d.shape

(50, 749640, 100)

## Create PyTorch custom dataset

https://pytorch.org/tutorials/beginner/data_loading_tutorial.html

In [22]:
from torch.utils.data import Dataset

In [23]:
class CommentaryDataset(Dataset):
    """Map-style PyTorch dataset over a pre-built 3D array of embedded commentary.

    Examples are laid out along axis 1 of the array, so item `idx` is the 2D slice
    `numpy_3d[:, idx, :]`.
    """

    def __init__(self, numpy_3d):
        """Keep a reference to the 3D array; no copy is made."""
        self.numpy_3d = numpy_3d

    def __len__(self):
        """Return the number of examples, i.e. the size of axis 1."""
        return self.numpy_3d.shape[1]

    def __getitem__(self, idx):
        """Return the slice of the array at position(s) `idx` along axis 1."""
        # Tensor indices are turned into plain Python ints/lists before indexing
        position = idx.tolist() if torch.is_tensor(idx) else idx
        return self.numpy_3d[:, position, :]

In [24]:
# Wrap the embedded training array so it can be indexed per example (along axis 1)
train_comm = CommentaryDataset(numpy_3d=train_3d)

In [25]:
# Display the dataset object (default repr, since the class defines no __repr__)
train_comm

<__main__.CommentaryDataset at 0x7f9356885ad0>

## References

https://nlp.stanford.edu/projects/glove/

## Questions

- How to deal with player names when using pretrained weights?

## TODO

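- Change the CSV read method so that accented characters in names are imported correctly (currently they are garbled, e.g. `FalcA³n` and `TiemouA©` in the dev data preview above)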