In [3]:
from torch import nn
from fastai.tabular.all import * 
from fastcore.utils import *
import pandas as pd
import numpy as np

In [84]:
data = pd.read_pickle('event_data.pickle')

In [85]:
data.head()

Unnamed: 0,time,event,player_sub,main_player,commentary
0,89',Yellow Card,na,K. Phillips,"A hasty challenge from Kalvin Phillips now and his booking is more costly than usual; it is his fifth, earning him a one-match ban."
1,87',Substitution,H. Kane,Carlos Vinícius,"Harry Kane - who became a father this week - makes way for Tottenham's final change, with Carlos Vinicius on his place."
2,85',Yellow Card,na,P. Højbjerg,"Hojbjerg picks up a late booking here, as the minutes tick down. Tottenham will be pleased with this though; a winning start to 2021 is just what they need."
3,78',Substitution,T. Ndombèlé,Lucas Moura,"Tottenham make their second change now, with Ndombele making way for Lucas Moura."
4,76',Substitution,H. Winks,M. Sissoko,Lloris gets a glance to this one and Ayling's return ball sails wide of the mark. Moussa Sissoko replaces Winks for the hosts.


In [86]:
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer

class Preprocessing:
    
    def __init__(self, data, max_len):
        self.data = data
        self.max_len = max_len
        self.max_words = 500
        
    def prepare_tokens(self):
        self.tokens = Tokenizer(num_words=self.max_words)
        self.tokens.fit_on_texts(self.data)
        
    def sequence_to_token(self, x):
        sequences = self.tokens.texts_to_sequences(x)
        return sequence.pad_sequences(sequences, maxlen=self.max_len)

In [87]:
pp = Preprocessing(data['commentary'], 15)

In [88]:
pp.prepare_tokens()

In [74]:
seq = pp.sequence_to_token(data['commentary'][0])

In [75]:
seq

array([[0, 0, 0, ..., 0, 0, 2],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 2],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)

In [60]:
data['commentary'][0]

'A hasty challenge from Kalvin Phillips now and his booking is more costly than usual; it is his fifth, earning him a one-match ban.'

In [89]:
cont, cat = cont_cat_split(data, dep_var='commentary')
cont, cat

([], ['time', 'event', 'player_sub', 'main_player'])

In [90]:
procs_nn = [Categorify]
device = torch.device('cpu')
to_nn = TabularPandas(data, procs_nn, cat,
                      y_names='commentary', reduce_memory=False)
dls = to_nn.dataloaders(1024, device=device)

In [91]:
learn = tabular_learner(dls, layers=[1000, 700, 400, 300, 400], n_out=len(data['commentary']))
learn.fit_one_cycle(4, 5e-4)

epoch,train_loss,valid_loss,time
0,8.042166,,00:28
1,3.250052,,00:28
2,1.352558,,00:45
3,0.624793,,00:45


  warn("Your generator is empty.")


In [92]:
def embed_features(learner, x):
    x = x.copy()
    for i, col in enumerate(learn.dls.cat_names):
        embed = learn.model.embeds[i]
        print(col)
        print(x[col])
        embed_data = embed(tensor(x[col], dtype=torch.int64).to(device))
        embed_names = [f'{col}_{j}' for j in range(embed_data.shape[1])]
        features = pd.DataFrame(data=embed_data, index=x.index, columns=embed_names)
        x = x.drop(col, axis=1)
        x = x.join(features)
    return x

In [94]:
embed_xs = embed_features(learn, to_nn.xs)

time
0        74
1        72
2        70
3        62
4        60
         ..
39422    27
39423    26
39424    19
39425    19
39426     3
Name: time, Length: 39427, dtype: int8
event
0        7
1        6
2        7
3        6
4        6
        ..
39422    7
39423    7
39424    1
39425    2
39426    7
Name: event, Length: 39427, dtype: int8
player_sub
0        103
1         39
2        103
3         96
4         40
        ... 
39422    103
39423    103
39424    103
39425    103
39426    103
Name: player_sub, Length: 39427, dtype: int8
main_player
0        103
1         37
2        138
3        111
4        126
        ... 
39422    112
39423     66
39424    180
39425     78
39426    176
Name: main_player, Length: 39427, dtype: int16


In [96]:
embed_xs.head()

Unnamed: 0,time_0,time_1,time_2,time_3,time_4,time_5,time_6,time_7,time_8,time_9,...,main_player_20,main_player_21,main_player_22,main_player_23,main_player_24,main_player_25,main_player_26,main_player_27,main_player_28,main_player_29
0,-0.020288,0.003959,-0.01166,-0.00113,0.01024,-0.003644,-0.000574,0.009377,-0.016221,-0.006467,...,-0.016803,0.004418,0.00949,0.007831,-0.005404,0.012244,0.007176,0.00689,-0.00866,-0.006264
1,0.004089,-0.001297,-0.009571,-0.007007,0.002438,-0.002379,-0.000666,-0.008988,0.006091,-0.015454,...,-0.004803,-0.013276,-0.009412,-0.00077,0.009372,-0.004189,-0.006955,0.011875,-0.010172,0.002468
2,-0.01349,-0.010492,-0.009037,0.00632,-0.007285,-0.00504,0.003885,0.01727,0.022771,-0.005183,...,-0.008552,0.006543,0.002609,-0.006397,-0.001587,0.003737,-0.01476,-0.006371,0.005499,0.006144
3,-0.006778,0.004057,-0.00831,-0.012266,0.011967,0.003065,-0.015615,-0.010273,-0.008508,-0.005554,...,-0.002602,0.005265,0.004084,0.003608,-0.011681,0.008063,0.012082,-0.004978,-0.016104,0.018137
4,-0.012479,0.016021,0.006382,0.001375,0.006784,0.005326,-0.004663,0.003616,0.002764,-0.000717,...,0.002206,-0.008704,0.010753,0.001466,0.002518,0.006115,-0.008925,-0.000543,-0.002836,-0.025285


In [99]:
data['tok_comm'] = data['commentary'].apply(lambda x : pp.sequence_to_token(x))

In [78]:
commentary = data['commentary']
data = data.drop('commentary', axis=1)

In [101]:
from sklearn.model_selection import train_test_split

x_tr, x_te, y_tr, y_te = train_test_split(embed_xs, data['tok_comm'],
                                          test_size=0.33, random_state=42)

In [209]:
"""
    Network Process
"""

# x_tr = torch.FloatTensor(x_tr.values)
# x_te = torch.FloatTensor(x_te.values)

fc1 = torch.nn.Linear(x_tr.shape[1], 200)
tanh = torch.nn.Tanh()
fc2 = torch.nn.Linear(200, x_tr.shape[1])
dropout = torch.nn.Dropout(p=0.4)
# batch = torch.nn.BatchNorm1d(x_tr.shape[1])
h = torch.nn.Linear(embed_xs.shape[1], 276)
h2 = torch.nn.Linear(276, 1)
print(h)
# gc = torch.sigmoid(torch.mm(h.weight, h) + h.bias)

Linear(in_features=76, out_features=276, bias=True)


In [212]:
lay1 = fc1(x_tr[0])
tanh_ = tanh(lay1)
lay2 = fc2(tanh_)
drop = dropout(lay2)
# batch_ = batch(drop)
h_ = h(drop)
sig = torch.nn.Sigmoid()
gc = sig(torch.mm(h.weight.T, torch.reshape(h_, (len(h_),1))) + h.bias)


In [211]:
h2.weight.shape, h_.shape, x_tr[0].shape

(torch.Size([1, 276]), torch.Size([276]), torch.Size([76]))

In [194]:
torch.reshape(h_, (len(h_),1)).shape

torch.Size([200, 1])

In [163]:
torch.mm(h_,x_tr[0:2]).shape

torch.Size([2, 76])

In [171]:
torch.mm(h2.weight,torch.mm(h_,x_tr[0:2])).shape

torch.Size([76, 76])

In [175]:
h2.weight[

tensor([0.5562, 0.5094], grad_fn=<SelectBackward>)