In [9]:
import torch
import pandas as pd
import numpy as np

In [393]:
data = pd.read_pickle('event_data.pickle')

In [394]:
data.head()

Unnamed: 0,time,event,player_sub,main_player,commentary
0,89',Yellow Card,na,K. Phillips,"A hasty challenge from Kalvin Phillips now and his booking is more costly than usual; it is his fifth, earning him a one-match ban."
1,87',Substitution,H. Kane,Carlos Vinícius,"Harry Kane - who became a father this week - makes way for Tottenham's final change, with Carlos Vinicius on his place."
2,85',Yellow Card,na,P. Højbjerg,"Hojbjerg picks up a late booking here, as the minutes tick down. Tottenham will be pleased with this though; a winning start to 2021 is just what they need."
3,78',Substitution,T. Ndombèlé,Lucas Moura,"Tottenham make their second change now, with Ndombele making way for Lucas Moura."
4,76',Substitution,H. Winks,M. Sissoko,Lloris gets a glance to this one and Ayling's return ball sails wide of the mark. Moussa Sissoko replaces Winks for the hosts.


In [395]:
data.iloc[4,:]

time                                                                                                                                      76'
event                                                                                                                            Substitution
player_sub                                                                                                                           H. Winks
main_player                                                                                                                        M. Sissoko
commentary     Lloris gets a glance to this one and Ayling's return ball sails wide of the mark. Moussa Sissoko replaces Winks for the hosts.
Name: 4, dtype: object

In [16]:
class Event_Data(torch.utils.data.Dataset):
    def __init__(self, path_to_pickle, list_idx, target):
        self.data = pd.read_pickle(path_to_pickle)
        self.list_idx = list_idx
        self.target = target
        self.X = self.data.drop(self.target, axis=1)
        self.Y = self.data[self.target]
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        ID = self.list_idx[idx]
        
        X = self.X.iloc[ID]
        y = self.Y.iloc[ID]
        
        return X,y

In [10]:
train_idx = np.random.randint(0, 30000, size=27000)

In [11]:
device = torch.device('cpu')

In [20]:
from sklearn.model_selection import train_test_split

x_tr, x_te, y_tr, y_te = train_test_split(data.drop('commentary', axis=1), data['commentary'],
                                          test_size=0.33, random_state=42)

# train_df = Event_Data('processd_event_data.pickle', train_idx, 'commentary')

## Create Word Embeddings

In [30]:
from fastai.tabular.all import *
from fastcore.utils import *

cont, cat = cont_cat_split(data, dep_var='commentary')

procs_nn = [Categorify]
device = torch.device('cpu')
to_nn = TabularPandas(data, procs_nn, cat,
                      y_names='commentary')
dls = to_nn.dataloaders(1024, device=device)

learn = tabular_learner(dls, layers=[1000, 700, 400, 300, 400], n_out=len(data['commentary']))
learn.fit_one_cycle(4, 5e-4)

epoch,train_loss,valid_loss,time
0,8.080174,,00:25
1,3.287371,,00:24
2,1.369253,,00:24
3,0.632966,,00:26


  warn("Your generator is empty.")


In [34]:
def embed_features(learner, to, df):
    embed_dict = {}
    to = to.copy()
    for i, col in enumerate(learn.dls.cat_names):
        print("Processing embedding for column {}".format(col))
        embed = learn.model.embeds[i]
        for j in range(df.shape[0]):
            embed_dict[df[col][j]] = embed(tensor(to[col][j], dtype=torch.int64).to(device))

    return embed_dict

In [35]:
embed_dict = embed_features(learn, to_nn, data)rch

Processing embedding for column time
Processing embedding for column event
Processing embedding for column player_sub
Processing embedding for column main_player


In [336]:
embed_dict['Yellow Card'], embed_dict['Red Card']

(tensor([-0.0095, -0.0011,  0.0071,  0.0162, -0.0072],
        grad_fn=<EmbeddingBackward>),
 tensor([ 0.0038,  0.0175, -0.0015,  0.0043, -0.0007],
        grad_fn=<EmbeddingBackward>))

In [54]:
type(embed_dict['Substitution'])

torch.Tensor

In [88]:
to_nn.cat_names, cont

((#4) ['time','event','player_sub','main_player'], ['pr_time'])

In [160]:
def p_xi(x, to, cols):
    embed_x = pd.Series(index=cols, dtype='object')
    for col in to.cat_names:
        embed_x[col] = embed_dict[x[col]].detach().numpy()
    for col in cont:
        embed_x[col] = x[col]
    for col in to.y_names:
        embed_x[col] = x[col]
    return embed_x

In [90]:
p(data.iloc[0,:], to_nn, data.columns)

time                                                                                                                                                              [0.0034495662, -0.0039111697, -0.0075463657, -0.00084400753, -0.006754394, -0.0064389757, 0.0071556196, 0.0015094089, 0.008235161, 0.0048262253, -0.0007968449, -0.015690666, 0.008338168, -0.002305122, -0.00034313306, 0.0058118557, -0.014901204, -0.00025771974, 0.0005035794]
event                                                                                                                                                                                                                                                                                                                                                                          [-0.009511359, -0.0010568684, 0.007130699, 0.01616274, -0.0072473283]
player_sub                                                                                                                    

In [161]:
p = pd.DataFrame([p_xi(data.iloc[i,:], to_nn, data.columns) for i in range(len(data))])

In [162]:
p.head()

Unnamed: 0,time,event,player_sub,main_player,commentary,pr_time
0,"[0.0034495662, -0.0039111697, -0.0075463657, -0.00084400753, -0.006754394, -0.0064389757, 0.0071556196, 0.0015094089, 0.008235161, 0.0048262253, -0.0007968449, -0.015690666, 0.008338168, -0.002305122, -0.00034313306, 0.0058118557, -0.014901204, -0.00025771974, 0.0005035794]","[-0.009511359, -0.0010568684, 0.007130699, 0.01616274, -0.0072473283]","[0.007230572, 0.0016731989, -0.008829094, 0.008920854, -0.0030575502, -0.010636577, -0.01534504, 0.016586123, 0.0048511606, 0.017197015, -0.011527017, -0.0053833374, 0.005277304, 0.0104470225, -0.0060375524, -0.00046216545, 0.0017174676, -0.0043630023, 0.0021293939, -0.0128255775, -0.015709348, 0.010428692]","[-0.0069870693, -0.0051646424, -0.0011794823, -0.007324319, -0.002198929, -0.0030441142, 0.0066291797, 0.0020430384, 0.02313626, -0.003214444, -0.0121002095, -0.013178531, 0.00028668268, 0.021146901, 0.02319959, -0.004274289, 0.006760166, 0.0042834016, -0.009212152, -0.0008394789, 0.0053389953, -0.0074290936, 0.0060005942, 0.0013890793, 0.02122414, 0.0026654047, 0.012881363, -0.015294595, -0.0010508687, 0.00019637421]","A hasty challenge from Kalvin Phillips now and his booking is more costly than usual; it is his fifth, earning him a one-match ban.",89
1,"[-0.011621012, 0.012392433, 0.0017693002, 0.0064299973, 0.0057424023, 0.012771621, -0.0018318192, -0.007798832, 0.0050749048, 0.0074686357, 0.011615076, 0.0038090723, 0.01324271, -0.0024024055, 0.0030063528, 0.0065668602, -0.008800555, -0.013174559, -0.00068424013]","[0.0008827144, -0.0019168973, 0.007145642, -0.00048759853, 0.0047499086]","[-0.004435647, 0.005875734, -0.01329946, -0.002099448, 0.008533904, -0.0053840815, 0.00049884454, -0.0013355388, 0.0036161775, 0.010086446, -0.016094016, -0.0020553875, -0.0048411135, -0.011048381, -0.010938119, -0.0067419102, -0.000102650476, -0.000797499, 0.006562047, 0.010153793, 0.01130392, 0.019240316, 0.0076453444, -0.004339522, 0.0031551553, -0.008467184, -0.003491769, -0.0030524803, 0.019709066, -0.013849651]","[0.0009201856, -0.017269863, -0.0015933602, -0.007972267, 0.0040540327, 0.017663514, 0.019291056, -0.005946982, -0.002946117, 0.009359054, -0.0034637502, 0.010648801, -0.0023220477, -0.0006266782, -0.0023512526, 0.0024309964, -0.0022670496, 0.00046189103, 0.02141611, -0.01564967, 0.004248105, -0.0057851234, -0.007909473, 0.0072552236, -0.0018675883, 0.0041767512, -0.0015516944, 0.00035908516, 0.006981004, 0.013180493]","Harry Kane - who became a father this week - makes way for Tottenham's final change, with Carlos Vinicius on his place.",87
2,"[-0.00469359, 0.0047177873, 0.0055352603, 0.006129575, 0.01186114, -0.010104363, -0.00295194, 0.0014424023, -0.008374792, -0.016567338, 0.008236578, 0.007971317, -0.018111927, 0.0016413045, -0.015762057, -0.004128613, -0.0066969227, 0.0073460937, 0.0034484067]","[-0.009511359, -0.0010568684, 0.007130699, 0.01616274, -0.0072473283]","[0.007230572, 0.0016731989, -0.008829094, 0.008920854, -0.0030575502, -0.010636577, -0.01534504, 0.016586123, 0.0048511606, 0.017197015, -0.011527017, -0.0053833374, 0.005277304, 0.0104470225, -0.0060375524, -0.00046216545, 0.0017174676, -0.0043630023, 0.0021293939, -0.0128255775, -0.015709348, 0.010428692]","[-0.0009942069, -0.0083088055, 0.004603518, -0.0035380563, -0.0056925896, -0.0051157293, 0.006227409, 0.0038516803, 0.0021254946, 0.0022249818, -0.004978213, 0.0032631774, -0.008528752, -0.011582153, -0.005467155, -0.009760925, 0.007913494, -0.005370656, -0.0046006693, 0.007144398, 0.005827086, 0.0007507539, -0.01685207, -0.009857431, 0.01296837, -0.013914956, 0.0035912772, -0.0090568615, -0.0027956043, -0.012057228]","Hojbjerg picks up a late booking here, as the minutes tick down. Tottenham will be pleased with this though; a winning start to 2021 is just what they need.",85
3,"[0.0089444285, -0.0024795418, -0.020351885, 0.0012407773, -0.002417905, -0.0066117006, 0.0046422984, 0.0088318, -0.003301167, -0.014725702, 0.007408219, -0.0060086357, -0.0050223265, -0.0047090957, -0.0014562249, 0.005127333, 0.00015306966, 0.0033763, 0.0006759903]","[0.0008827144, -0.0019168973, 0.007145642, -0.00048759853, 0.0047499086]","[-0.0034324247, -0.0019396286, -0.0022038834, 0.0019862857, 0.0067689205, 0.005154672, 0.01641062, -0.00041963262, -0.010665516, 0.006587107, -0.0154437255, 0.0068435385, 0.00470834, -0.00017066565, 0.006482162, -0.0035122668, -0.004809087, -0.016020877, -0.009234772, 0.0017032009, -0.0017214569, -0.0025316987, -0.0023181476, 0.0052922196, -0.008255422, -0.002360449, 0.004632171, 0.016975027, 0.006421529, -0.0051925457]","[-0.01003878, 0.0022597383, -0.002363576, 0.014024183, 0.0071605574, -0.00058340316, -0.010151597, 0.0053068884, -0.0028311918, 0.010240652, 0.007899876, 0.011960734, 0.002062169, -0.007000639, 0.0077446345, 0.0009497138, -0.0068769394, -0.010298149, 3.2159398e-05, -0.008069817, 0.011101558, 0.016110541, -0.0074076736, 0.013378741, 0.0021959287, 0.0009988102, 5.13064e-05, -0.0017681556, -0.0023809574, 0.01362069]","Tottenham make their second change now, with Ndombele making way for Lucas Moura.",78
4,"[0.0089202495, 8.8754474e-05, 0.0047929706, -0.016913377, 0.007475003, -0.005662797, 0.008808478, -0.008236848, 0.0034129722, 0.0174556, -0.0026300766, -0.001378433, -0.009398671, 0.0016954796, 0.006537314, 0.019178096, 0.002164626, 0.0059580747, -0.00025065683]","[0.0008827144, -0.0019168973, 0.007145642, -0.00048759853, 0.0047499086]","[0.008725187, -0.009094689, -0.010279432, 0.009901614, 0.0039169565, -0.0018292674, 0.0086918855, -0.0028349506, 0.013394293, 0.010911023, 0.0026450534, 0.015356486, -0.013570592, 0.008475194, 0.0054233363, 0.011936354, 0.0067334366, 0.005249077, -0.017679615, 0.0026901977, -0.0037255138, -0.010874673, -0.010830536, 0.0079192985, -0.0017048441, -0.008588641, 0.008572544, 0.004866321, -0.0031195695, 0.008125464]","[0.0054608886, 0.0065199276, -0.0089607, 0.0015420194, 0.008799728, -0.009721656, 0.014010737, 0.006988066, -0.007727739, -0.009506973, -0.00012773763, -0.006769145, 0.022198277, 0.0016995219, 0.010892223, 0.0017287275, 0.0013854591, 0.0038922937, 0.009921165, -0.017095508, 0.005953383, -0.009567861, -0.0031522375, 0.0158486, -0.007054536, 8.426949e-05, 0.0002374138, -0.005704489, 0.004844863, -0.01451412]",Lloris gets a glance to this one and Ayling's return ball sails wide of the mark. Moussa Sissoko replaces Winks for the hosts.,76


In [157]:
p

[tensor([-0.0116,  0.0124,  0.0018,  0.0064,  0.0057,  0.0128, -0.0018, -0.0078,
          0.0051,  0.0075,  0.0116,  0.0038,  0.0132, -0.0024,  0.0030,  0.0066,
         -0.0088, -0.0132, -0.0007]),
 tensor([ 0.0009, -0.0019,  0.0071, -0.0005,  0.0047])]

In [225]:
s = 0
for c in p.columns:
    try:
        s += p[c][0].shape[0]
        print("Column {} Size {}".format(c, p[c][0].shape))
    except : continue
s

Column time Size (19,)
Column event Size (5,)
Column player_sub Size (22,)
Column main_player Size (30,)


76

In [180]:
def embed_features_df(learner, x):
    x = x.copy()
    for i, col in enumerate(learn.dls.cat_names):
        embed = learn.model.embeds[i]
        print(col)
        print(x[col])
        embed_data = embed(tensor(x[col], dtype=torch.int64).to(device))
        embed_names = [f'{col}_{j}' for j in range(embed_data.shape[1])]
        features = pd.DataFrame(data=embed_data, index=x.index, columns=embed_names)
        x = x.drop(col, axis=1)
        x = x.join(features)
    return x

In [182]:
embed_df = embed_features_df(learn, to_nn.train.xs)

time
0        74
1        72
2        70
3        62
4        60
         ..
39422    27
39423    26
39424    19
39425    19
39426     3
Name: time, Length: 39427, dtype: int8
event
0        6
1        5
2        6
3        5
4        5
        ..
39422    6
39423    6
39424    1
39425    2
39426    6
Name: event, Length: 39427, dtype: int8
player_sub
0        103
1         38
2        103
3         96
4         39
        ... 
39422    103
39423    103
39424    103
39425    103
39426    103
Name: player_sub, Length: 39427, dtype: int8
main_player
0        103
1         37
2        138
3        111
4        126
        ... 
39422    112
39423     66
39424    180
39425     78
39426    176
Name: main_player, Length: 39427, dtype: int16


In [188]:
embed_df.shape

(39427, 76)

In [193]:
x_train = embed_df.iloc[:25000]
x_test = embed_df.iloc[25000:]
x_train = torch.FloatTensor(x_train.values)
x_test = torch.FloatTensor(x_test.values)

## Multi Layer Perceptron

In [372]:
class Model(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Model, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.fc1 = torch.nn.Linear(self.input_size, self.hidden_size)
        # custom sigmoid activation function
        self.sig = Custom_Activation()
#         self.dropout = torch.nn.Dropout(p=0.5)
        self.fc2 = torch.nn.Linear(self.hidden_size, self.input_size)
        self.batch_norm = torch.nn.BatchNorm1d(self.input_size)
        self.tanh2 = torch.nn.Tanh()
        self.fc3 = torch.nn.Linear(self.input_size, self.output_size, bias=True)
        self.softmax = torch.nn.Softmax()
    def forward(self, x):
        fc1 = self.fc1(x)
        sig = self.sig(fc1, self.fc1)
        dropout = self.dropout(sig)
        fc2 = self.fc2(dropout)
        batch = self.batch_norm(fc2)
        tanh = self.tanh2(batch)
        output = self.fc3(tanh)
        output = self.softmax(output)
        return output

In [371]:
def custom_loss(targets: Tensor, sentence) -> Tensor:

    def loss(t_ck, g_ck):
        return t_ck*np.log(g_ck) + (1 - t_ck)*np.log(1-g_ck)
    
    """
        Calculate the probability distribution
    """
    U = model.fc3.weight
    b = model.fc3.bias

    g_c = torch.mm(h,U) + b
    sigmiod = torch.nn.Sigmoid()
    g_c = sigmoid(g_c)
    print(g_c.shape)
    total_loss = 0
    wrd = {0 : range(19), 1 : range(19, 24), 2 : range(24, 46), 3 : range(66,76)}
    print(len(targets))
    for c in range(len(targets)):
        sent = sentence[c]
        for k in range(4):
            g_ck = g_c[c][wrd[k]].sum()/len(wrd[k])
#             print(g_ck)
            word = targets[c][wrd[k]]
#             print(word)
            t_ck = 0
            word = word.detach().numpy()
#             print(word)
            for k_,v in embed_dict.items():
                v = v.detach().numpy()
                if np.array_equal(v, word):
#                     print(word, v)
#                     print(k_)
                    word = k_
                    break
#             print(word, type(word))
#             print(word)
            try :
                if word in sent:
                    print(word, sent)    
                    t_ck = 1
#             if k > 0 : print(word)
            except : continue
            l = loss(t_ck, g_ck.detach().numpy())
            print(l)
            total_loss += l
    return -1 * total_loss

def g_c(U_c, b_c, h):
    
    g_c = torch.mm(h,U_c) + b_c
    return torch.sigmoid(g_c)

In [299]:
data['commentary'][0]

'A hasty challenge from Kalvin Phillips now and his booking is more costly than usual; it is his fifth, earning him a one-match ban.'

In [354]:
custom_loss(x_train[:200], data['commentary'][:200])

torch.Size([25000, 76])
200
-0.694058115889167
-0.6948283710681473
-0.7090648517586233
-0.6940621727070492
-0.6948265799216768
-0.6940590704330705
-0.6948292069375981
-0.7090635194174401
-0.6940577579354381
-0.6948281322484325
-0.6940616954335029
-0.6948257440544219
-0.6940610988418905
-0.6948229976383605
Rodrigo Leeds make that final change quickly, with Pablo Hernandez replacing Rodrigo in the attacking midfield area.
-0.6774808168411255
-0.6940583545250575
-0.6948245499595553
-0.7090641250268488
-0.6940570420283648
-0.6948234752753946
-0.7090621870780319
-0.6940603829324253
-0.6948206094566123
-0.7090637616611596
-0.694057280663999
-0.6948300428077476
-0.7090653362467665
-0.6940579965712432
-0.6948321921913402
-0.7090636405392925
-0.6940589511150327
-0.6948306398582821
-0.7090661841015817
-0.6940612181601845
-0.69482729637988
-0.7090631560519709
-0.6940595477053638
-0.6948293263475767
-0.7090632771737793
-0.6940625306623583
-0.694825863463987
-0.7090637616611596
-0.6940566840750203


393.27418843097024

In [386]:
model = Model(embed_df.shape[1], 256, 76)
criterion = torch.nn.KLDivLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

In [387]:
model.eval()

Model(
  (fc1): Linear(in_features=76, out_features=256, bias=True)
  (sig): Custom_Activation()
  (fc2): Linear(in_features=256, out_features=76, bias=True)
  (batch_norm): BatchNorm1d(76, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (tanh2): Tanh()
  (fc3): Linear(in_features=76, out_features=76, bias=True)
  (softmax): Softmax(dim=None)
)

In [392]:
h = model(x_train)

RuntimeError: The size of tensor a (76) must match the size of tensor b (256) at non-singleton dimension 1

In [389]:
h.shape

torch.Size([25000, 256])

In [390]:
model.fc1.weight.shape

torch.Size([256, 76])

In [391]:
model.fc1.bias.shape

torch.Size([256])

In [352]:
model.fc3.bias

Parameter containing:
tensor([ 0.0673,  0.0618, -0.0784,  0.0577,  0.0440, -0.0673,  0.0105, -0.0783,
         0.0185, -0.0910, -0.0819,  0.0205,  0.0125, -0.0843,  0.0829,  0.1110,
         0.0569,  0.0924, -0.0845, -0.0174, -0.0053, -0.1097,  0.0819,  0.0418,
         0.1041, -0.0076,  0.0902, -0.1025, -0.0162,  0.0225,  0.0527,  0.0351,
         0.0637,  0.0701,  0.0319, -0.0827,  0.0099,  0.0893,  0.0563,  0.1031,
         0.0805,  0.0158,  0.0608,  0.0673, -0.0657,  0.0406, -0.0964,  0.1060,
        -0.1115, -0.0989, -0.0703, -0.0299, -0.0033, -0.1123, -0.0517,  0.0416,
        -0.1033,  0.0753,  0.0658, -0.0561, -0.0038, -0.0902, -0.0627, -0.0910,
        -0.0003, -0.1028, -0.0633, -0.0816, -0.0432, -0.1071,  0.0907, -0.0597,
        -0.0129, -0.0076,  0.0259, -0.0469], requires_grad=True)

In [202]:
for epoch in range(3):
    
    optimizer.zero_grad()
    

In [176]:
"""
    Get hidden representation of time
"""
# for i in range(len(p)):
x = torch.FloatTensor(p.iloc[:, 0])
print(x.shape)
fc1 = torch.nn.Linear(x.shape[0], 100)(x.T)
tanh = torch.nn.Tanh()(fc1)
dropout = torch.nn.Dropout(p=0.5)(tanh)
fc2 = torch.nn.Linear(100,x.shape[0])(dropout)
batch = torch.nn.BatchNorm1d(x.shape[0])(fc2)
tanh = torch.nn.Tanh()(batch)
output = torch.nn.Linear(x.shape[0],100)(tanh)
# output = torch.nn.Softmax()(output)

torch.Size([39427, 19])


In [340]:
np.dot([1,23,45,6], [

AttributeError: 'list' object has no attribute 'T'