In [1]:
from torch import nn
from fastai.tabular.all import * 
from fastcore.utils import *
import pandas as pd
import numpy as np

In [2]:
data = pd.read_pickle('event_data.pickle')

In [3]:
data.head()

Unnamed: 0,time,event,player_sub,main_player,commentary
0,89',Yellow Card,na,K. Phillips,"A hasty challenge from Kalvin Phillips now and his booking is more costly than usual; it is his fifth, earning him a one-match ban."
1,87',Substitution,H. Kane,Carlos Vinícius,"Harry Kane - who became a father this week - makes way for Tottenham's final change, with Carlos Vinicius on his place."
2,85',Yellow Card,na,P. Højbjerg,"Hojbjerg picks up a late booking here, as the minutes tick down. Tottenham will be pleased with this though; a winning start to 2021 is just what they need."
3,78',Substitution,T. Ndombèlé,Lucas Moura,"Tottenham make their second change now, with Ndombele making way for Lucas Moura."
4,76',Substitution,H. Winks,M. Sissoko,Lloris gets a glance to this one and Ayling's return ball sails wide of the mark. Moussa Sissoko replaces Winks for the hosts.


In [4]:
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer

class Preprocessing:
    
    def __init__(self, data, max_len):
        self.data = data
        self.max_len = max_len
        self.max_words = 500
        
    def prepare_tokens(self):
        self.tokens = Tokenizer(num_words=self.max_words)
        self.tokens.fit_on_texts(self.data)
        
    def sequence_to_token(self, x):
        sequences = self.tokens.texts_to_sequences(x)
        return sequence.pad_sequences(sequences, maxlen=self.max_len)

In [5]:
pp = Preprocessing(data['commentary'], 15)

In [6]:
pp.prepare_tokens()

In [74]:
seq = pp.sequence_to_token(data['commentary'][0])

In [75]:
seq

array([[0, 0, 0, ..., 0, 0, 2],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 2],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)

In [60]:
data['commentary'][0]

'A hasty challenge from Kalvin Phillips now and his booking is more costly than usual; it is his fifth, earning him a one-match ban.'

In [7]:
cont, cat = cont_cat_split(data, dep_var='commentary')
cont, cat

([], ['time', 'event', 'player_sub', 'main_player'])

In [8]:
procs_nn = [Categorify]
device = torch.device('cpu')
to_nn = TabularPandas(data, procs_nn, cat,
                      y_names='commentary', reduce_memory=False)
dls = to_nn.dataloaders(1024, device=device)

In [9]:
learn = tabular_learner(dls, layers=[1000, 700, 400, 300, 400], n_out=len(data['commentary']))
learn.fit_one_cycle(4, 5e-4)

epoch,train_loss,valid_loss,time
0,8.095457,,00:24
1,3.281805,,00:24
2,1.365951,,00:24
3,0.629997,,00:31


  warn("Your generator is empty.")


In [10]:
def embed_features(learner, x):
    x = x.copy()
    for i, col in enumerate(learn.dls.cat_names):
        embed = learn.model.embeds[i]
        print(col)
        print(x[col])
        embed_data = embed(tensor(x[col], dtype=torch.int64).to(device))
        embed_names = [f'{col}_{j}' for j in range(embed_data.shape[1])]
        features = pd.DataFrame(data=embed_data, index=x.index, columns=embed_names)
        x = x.drop(col, axis=1)
        x = x.join(features)
    return x

In [11]:
embed_xs = embed_features(learn, to_nn.xs)

time
0        74
1        72
2        70
3        62
4        60
         ..
39422    27
39423    26
39424    19
39425    19
39426     3
Name: time, Length: 39427, dtype: int8
event
0        7
1        6
2        7
3        6
4        6
        ..
39422    7
39423    7
39424    1
39425    2
39426    7
Name: event, Length: 39427, dtype: int8
player_sub
0        103
1         39
2        103
3         96
4         40
        ... 
39422    103
39423    103
39424    103
39425    103
39426    103
Name: player_sub, Length: 39427, dtype: int8
main_player
0        103
1         37
2        138
3        111
4        126
        ... 
39422    112
39423     66
39424    180
39425     78
39426    176
Name: main_player, Length: 39427, dtype: int16


In [12]:
embed_xs.head()

Unnamed: 0,time_0,time_1,time_2,time_3,time_4,time_5,time_6,time_7,time_8,time_9,...,main_player_20,main_player_21,main_player_22,main_player_23,main_player_24,main_player_25,main_player_26,main_player_27,main_player_28,main_player_29
0,0.001002,0.007036,0.005984,-0.004348,0.001814,0.008016,0.006264,0.00862,-0.003003,0.001439,...,-0.005906,-0.00815,-0.002877,0.007581,0.018791,0.001031,0.002131,-0.000487,0.003769,0.003684
1,0.010639,-0.016127,-0.009577,-0.00957,-0.009424,-0.01553,0.013682,0.015672,-0.009195,0.006055,...,0.013013,-0.00097,0.002576,0.001886,0.003016,-0.00208,0.009623,-4e-06,-0.009072,-0.00386
2,-0.007014,-0.00099,0.00442,0.011357,-0.002832,-0.009795,0.005411,-0.000264,0.009012,-0.002365,...,-0.015261,0.008452,-0.005777,0.00514,0.002874,0.012239,-0.008042,-0.004935,0.008402,0.001875
3,0.004507,-0.003805,0.016493,-0.007968,-0.010683,-0.003151,-0.010673,0.001802,-0.003902,0.013069,...,-0.003367,0.002342,-0.014807,0.006426,-0.004879,0.018765,0.007695,-0.006071,0.001078,-0.003385
4,0.004794,0.00121,-0.00205,-0.007702,-0.022302,-0.001115,0.002429,-0.011666,-0.013835,-0.012294,...,-0.021521,0.016103,0.016421,-0.004788,0.005815,0.003511,-0.001796,0.011811,-0.011315,0.004347


In [13]:
data['tok_comm'] = data['commentary'].apply(lambda x : pp.sequence_to_token(x))

In [14]:
commentary = data['commentary']
data = data.drop('commentary', axis=1)

In [15]:
from sklearn.model_selection import train_test_split

x_tr, x_te, y_tr, y_te = train_test_split(embed_xs, data['tok_comm'],
                                          test_size=0.33, random_state=42)

In [16]:
tr_id, te_id = x_te.index, x_te.index
x_tr = torch.FloatTensor(x_tr.values)
x_te = torch.FloatTensor(x_te.values)

In [98]:
"""
    Network Process
"""

fc1 = torch.nn.Linear(x_tr.shape[1], 200)
tanh = torch.nn.Tanh()
fc2 = torch.nn.Linear(200, x_tr.shape[1])
dropout = torch.nn.Dropout(p=0.4)
# batch = torch.nn.BatchNorm1d(x_tr.shape[1])
h = torch.nn.Linear(embed_xs.shape[1], 200)
fc3 = torch.nn.Linear(200, x_tr.shape[0])
# gc = torch.sigmoid(torch.mm(h.weight, h) + h.bias)
lstm1 = torch.nn.LSTM(76, 128)
tags = torch.nn.Linear(128,x_tr.shape[0])

In [99]:
lay1 = fc1(x_tr)
tanh_ = tanh(lay1)
lay2 = fc2(tanh_)
drop = dropout(lay2)
# batch_ = batch(drop)
h_ = h(drop)
sig = torch.nn.Sigmoid()
# gc = sig(torch.mm(h.weight.T, h_.T) + h.bias)
sig = sig(h_)
lay3 = fc3(gc)
lstm_, _ = lstm1(lay3.view(x_tr.shape[0], 1, 76))
# lstm_ = lstm1(sig, y_tr)
t = tags(lstm_)

In [101]:
t.shape

torch.Size([26416, 1, 26416])

In [81]:
lay3.view(x_tr.shape[0], 1, 76).shape

torch.Size([26416, 1, 76])

In [163]:
torch.mm(h_,x_tr[0:2]).shape

torch.Size([2, 76])

In [171]:
torch.mm(h2.weight,torch.mm(h_,x_tr[0:2])).shape

torch.Size([76, 76])

In [175]:
h2.weight[

tensor([0.5562, 0.5094], grad_fn=<SelectBackward>)