**Entity Embeddings of Categorical Variables**
> https://arxiv.org/abs/1604.06737

**Generating Live Soccer-Math commentary from Play Data**
> https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=&ved=2ahUKEwicl4Xi5J7wAhV7GDQIHW5lAOoQFjACegQIAxAD&url=https%3A%2F%2Fojs.aaai.org%2Findex.php%2FAAAI%2Farticle%2Fview%2F4691%2F4569&usg=AOvVaw0f2DIG3tjdsCeu_CJQ_2qW


In [3]:
!pip install fastai
!pip install fastcore
!pip install nbdev

Collecting fastai
  Downloading fastai-2.3.0-py3-none-any.whl (193 kB)
[K     |████████████████████████████████| 193 kB 2.8 MB/s eta 0:00:01
Collecting fastprogress>=0.2.4
  Downloading fastprogress-1.0.0-py3-none-any.whl (12 kB)
Collecting torchvision<0.9,>=0.8
  Downloading torchvision-0.8.2-cp37-cp37m-manylinux1_x86_64.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 3.1 MB/s eta 0:00:01
[?25hCollecting torch<1.8,>=1.7.0
  Downloading torch-1.7.1-cp37-cp37m-manylinux1_x86_64.whl (776.8 MB)
[K     |████████████████████████████████| 776.8 MB 13 kB/s  eta 0:00:012     |███████████████                 | 365.7 MB 3.1 MB/s eta 0:02:14     |███████████████▊                | 382.6 MB 2.6 MB/s eta 0:02:31     |████████████████████▎           | 493.4 MB 2.6 MB/s eta 0:01:49     |███████████████████████         | 556.9 MB 2.7 MB/s eta 0:01:23     |█████████████████████████▎      | 613.6 MB 3.0 MB/s eta 0:00:54     |██████████████████████████████▍ | 738.1 MB 3.1 MB/s eta 0:00

In [1]:
import pandas as pd
import numpy as np
import torch
from fastai.tabular.all import *
from fastcore.utils import *

  return torch._C._cuda_getDeviceCount() > 0


# Load Data

In [2]:
df = pd.read_pickle('event_data.pickle')
df.head()

Unnamed: 0,time,event,player_sub,main_player,commentary
0,89',Yellow Card,,K. Phillips,"A hasty challenge from Kalvin Phillips now and his booking is more costly than usual; it is his fifth, earning him a one-match ban."
1,87',Substitution,H. Kane,Carlos Vinícius,"Harry Kane - who became a father this week - makes way for Tottenham's final change, with Carlos Vinicius on his place."
2,85',Yellow Card,,P. Højbjerg,"Hojbjerg picks up a late booking here, as the minutes tick down. Tottenham will be pleased with this though; a winning start to 2021 is just what they need."
3,78',Substitution,T. Ndombèlé,Lucas Moura,"Tottenham make their second change now, with Ndombele making way for Lucas Moura."
4,76',Substitution,H. Winks,M. Sissoko,Lloris gets a glance to this one and Ayling's return ball sails wide of the mark. Moussa Sissoko replaces Winks for the hosts.


# Preprocess data

In [3]:
def process_time(time):
    sp = time.strip().split("'")
    minutes = int(sp[0])
    if len(sp[1]) > 0:
        time_to_add = int(sp[1].strip().split('+')[1].strip())
        minutes += time_to_add
    return minutes

df['pr_time'] = df['time'].apply(lambda x : process_time(x))

In [4]:
df.head()

Unnamed: 0,time,event,player_sub,main_player,commentary,pr_time
0,89',Yellow Card,,K. Phillips,"A hasty challenge from Kalvin Phillips now and his booking is more costly than usual; it is his fifth, earning him a one-match ban.",89
1,87',Substitution,H. Kane,Carlos Vinícius,"Harry Kane - who became a father this week - makes way for Tottenham's final change, with Carlos Vinicius on his place.",87
2,85',Yellow Card,,P. Højbjerg,"Hojbjerg picks up a late booking here, as the minutes tick down. Tottenham will be pleased with this though; a winning start to 2021 is just what they need.",85
3,78',Substitution,T. Ndombèlé,Lucas Moura,"Tottenham make their second change now, with Ndombele making way for Lucas Moura.",78
4,76',Substitution,H. Winks,M. Sissoko,Lloris gets a glance to this one and Ayling's return ball sails wide of the mark. Moussa Sissoko replaces Winks for the hosts.,76


In [5]:
# df = df.drop('pr_player_sub', axis=1)
df['player_sub'] = df['player_sub'].replace('', 'na')

In [6]:
df.head()

Unnamed: 0,time,event,player_sub,main_player,commentary,pr_time
0,89',Yellow Card,na,K. Phillips,"A hasty challenge from Kalvin Phillips now and his booking is more costly than usual; it is his fifth, earning him a one-match ban.",89
1,87',Substitution,H. Kane,Carlos Vinícius,"Harry Kane - who became a father this week - makes way for Tottenham's final change, with Carlos Vinicius on his place.",87
2,85',Yellow Card,na,P. Højbjerg,"Hojbjerg picks up a late booking here, as the minutes tick down. Tottenham will be pleased with this though; a winning start to 2021 is just what they need.",85
3,78',Substitution,T. Ndombèlé,Lucas Moura,"Tottenham make their second change now, with Ndombele making way for Lucas Moura.",78
4,76',Substitution,H. Winks,M. Sissoko,Lloris gets a glance to this one and Ayling's return ball sails wide of the mark. Moussa Sissoko replaces Winks for the hosts.,76


In [7]:
df.to_pickle('processd_event_data.pickle')

# Entity Embedding for Categorical Variable

In [12]:
cont, cat = cont_cat_split(df, dep_var='commentary', max_card=12000)
cont, cat

([], ['time', 'event', 'player_sub', 'main_player', 'pr_time'])

In [16]:
from sklearn.model_selection import train_test_split

x_tr, x_te, y_tr, y_te = train_test_split(df.drop('commentary', axis=1), df['commentary'],
                                          test_size=0.33, random_state=42)

In [18]:
df_train = pd.concat([x_tr,y_tr], axis=1)
df_train

Unnamed: 0,time,event,player_sub,main_player,pr_time,commentary
13278,63',Substitution,A. Mac Allister,Y. Bissouma,63,"Meanwhile, Potter switches to a 3-5-2 as he replaces Mac Allister with an extra midfielder in Bissouma."
31005,44',Yellow Card,na,I. Diallo,44,Diallo pulls back Albrighton on the charge and goes into the book.
21149,42',Yellow Card,na,L. Ayling,42,"Ayling races across to challenge Trossard and he arrives a little late, catching the winger and earning himself a yellow card."
5720,82',Substitution,J. Bowen,A. Yarmolenko,82,"Bowen is replaced by Yarmolenko. Bowen has had a great game today, he's looked a real threat on the wing for the Hammers and looks absolutely shattered as he leaves the pitch."
30994,44',Red Card,na,A. Robinson,44,"FULHAM DOWN TO 10! Cavaleiro loses possession and Robinson tries to win it back. He flies into the challenge on Azpilicueta, completely missing the ball and taking out the right-back. The referee goes straight to his pocket and pulls out the red card."
...,...,...,...,...,...,...
6265,42',Yellow Card,na,L. Ayling,42,"Ayling races across to challenge Trossard and he arrives a little late, catching the winger and earning himself a yellow card."
11284,79',Substitution,Ivan Cavaleiro,J. Onomah,79,First change for Fulham now and it's Cavaleiro that's making way for Onomah.
38158,38',Goal,na,Fábio Silva,38,GOAL! FABIO SILVA LEVELS THINGS FOR WOLVES! 1-1! It is his first top-flight goal at Molineux! The corner sails past but is pumped back in and the striker plays a delightful one-two with Boly before lashing it past Button. All square in the Black Country derby!
860,44',Red Card,na,A. Robinson,44,"FULHAM DOWN TO 10! Cavaleiro loses possession and Robinson tries to win it back. He flies into the challenge on Azpilicueta, completely missing the ball and taking out the right-back. The referee goes straight to his pocket and pulls out the red card."


In [19]:
procs_nn = [Categorify]
device = torch.device('cpu')
to_nn = TabularPandas(df_train, procs_nn, cat,
                      y_names='commentary')
dls = to_nn.dataloaders(1024, device=device)

In [46]:
learn = tabular_learner(dls, layers=[1000, 700, 400, 300, 400], n_out=len(df['commentary']))
learn.fit_one_cycle(4, 5e-4)

epoch,train_loss,valid_loss,time
0,8.828974,,00:16
1,5.499105,,00:16
2,2.822285,,00:16
3,1.574487,,00:16


  warn("Your generator is empty.")


In [48]:
d = {'time': "54'", 'event': "Goal", 'player_sub' : 'na',
     'main_player':"Ronaldo", 'pr_time':54}
ser = pd.Series(data=d, index=['time','event','player_sub','main_player','pr_time'])
preds, *_ = learn.predict(ser)
preds.show()

Unnamed: 0,time,event,player_sub,main_player,pr_time,commentary
0,54',Goal,na,#na#,54,"MOUNT SCORES! Chilwell's cross is palmed away from the goal by Areola, but he gives it straight to Mount, who is unmarked in the middle. He hits the shot on the volley and the keeper can't stop it from hitting the back of the net. 1-0 Chelsea!"


In [58]:
learn.save('learn8')

Path('models/learn8.pth')

In [335]:
def embed_features(learner, x):
    x = x.copy()
    for i, col in enumerate(learn.dls.cat_names):
        embed = learn.model.embeds[i]
        embed_data = embed(tensor(x[col], dtype=torch.int64).to(device))
        embed_names = [f'{col}_{j}' for j in range(embed_data.shape[1])]
        features = pd.DataFrame(data=embed_data, index=x.index, columns=embed_names)
        x = x.drop(col, axis=1)
        x = x.join(features)
    return x

In [336]:
procs = [Categorify]
to = TabularPandas(df, procs, cat, cont, 'commentary')

In [351]:
to_nn

       time  event  player_sub  main_player  commentary
0        74      6           1          103           2
1        72      5          39           37         132
2        70      6           1          138         140
3        62      5          97          111         241
4        60      5          40          126         157
...     ...    ...         ...          ...         ...
39422    27      6           1          112          12
39423    26      6           1           66          60
39424    19      1           1          180         239
39425    19      2           1           78         251
39426     3      6           1          176          85

[39427 rows x 5 columns]

In [270]:
# learn = tabular_learner.load('learn8')
embed_xs = embed_features(learn, to.train.xs)

In [271]:
embed_xs

Unnamed: 0,time_0,time_1,time_2,time_3,time_4,time_5,time_6,time_7,time_8,time_9,...,main_player_20,main_player_21,main_player_22,main_player_23,main_player_24,main_player_25,main_player_26,main_player_27,main_player_28,main_player_29
26396,0.009272,-0.002824,0.003962,-0.006342,0.016476,-0.003549,-0.009164,0.008592,0.018193,-0.014618,...,-0.005709,0.008870,-0.007880,-0.018357,-0.015600,-0.005754,0.007568,0.003455,-0.011277,-0.000204
38745,-0.020504,-0.008770,-0.013708,0.005249,-0.018457,0.012703,0.009251,-0.019604,-0.008307,0.024089,...,-0.000819,0.012906,-0.004641,-0.016956,0.005487,0.003337,-0.001050,0.001424,0.003118,-0.006053
36803,0.001058,0.014992,-0.006306,0.001608,-0.008621,0.003343,0.008123,0.003123,0.000379,0.009782,...,0.013773,0.024786,-0.008647,-0.014300,0.009119,-0.011203,0.010427,0.006670,-0.003991,0.006512
5666,0.000623,0.003393,0.010767,-0.011167,-0.021848,0.013770,0.012681,0.019405,0.004220,0.006290,...,0.024075,0.030655,0.024002,0.007984,-0.010708,0.004334,-0.002106,-0.000689,0.006994,-0.010476
15343,-0.013840,-0.012460,-0.011892,-0.004029,0.005450,0.021677,0.009537,0.001611,0.002201,0.021115,...,0.008763,0.000251,-0.023877,0.018262,0.006965,0.016937,-0.009682,0.004383,0.002513,-0.006790
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8416,0.008087,0.008334,-0.005012,0.004421,0.002362,0.013663,0.003883,0.002078,0.004461,-0.020376,...,0.006359,-0.007324,-0.014274,-0.021047,-0.008078,-0.007178,-0.000400,-0.005866,-0.004927,-0.009891
6418,0.002275,-0.010911,-0.005570,0.005220,-0.000660,0.020634,0.007460,-0.004608,-0.009752,0.015232,...,0.003109,-0.004437,0.016766,-0.008146,0.002294,0.015499,0.012244,-0.007887,0.008784,0.008331
18010,0.005397,0.008453,0.007461,0.007038,0.005692,-0.014341,-0.012176,0.002900,-0.005706,0.018476,...,0.006550,0.010458,-0.001705,0.006696,0.020810,0.005401,0.011313,-0.010671,0.017289,0.009141
34531,-0.010301,-0.012759,0.001879,-0.011800,-0.004245,-0.006991,0.001426,0.000601,0.002194,-0.008748,...,-0.014909,-0.009151,-0.005021,0.001174,-0.009815,-0.004380,-0.001192,0.006388,-0.013542,0.003266


In [350]:
to.ys

Unnamed: 0,commentary
0,2
1,132
2,140
3,241
4,157
...,...
39422,12
39423,60
39424,239
39425,251


In [277]:
embed_valid_xs = embed_features(learn, to.valid.xs)

In [278]:
embed_valid_xs

Unnamed: 0,time_0,time_1,time_2,time_3,time_4,time_5,time_6,time_7,time_8,time_9,...,main_player_20,main_player_21,main_player_22,main_player_23,main_player_24,main_player_25,main_player_26,main_player_27,main_player_28,main_player_29
26614,-0.021242,-0.006020,0.003531,0.011634,0.018741,0.007572,0.003504,0.009854,0.016033,0.011028,...,-0.002679,0.000861,0.012142,0.003357,-0.011981,0.012229,-0.002458,0.000768,-0.002417,-0.006615
28533,0.007693,0.004825,-0.010953,-0.013567,0.005793,0.012431,0.005530,0.007038,0.000105,0.004454,...,0.011084,-0.016017,0.001854,0.010596,-0.000100,0.001486,-0.024934,0.000604,-0.019131,0.006365
11568,0.012565,0.003069,-0.005915,-0.013634,0.007143,0.010780,0.009232,-0.003443,-0.002451,0.010082,...,-0.019058,-0.010733,0.017766,-0.003420,0.007331,-0.013142,-0.010556,-0.008791,0.005042,0.004618
7628,0.002883,0.005252,-0.008948,-0.005496,-0.008258,-0.002465,-0.002871,-0.014287,-0.008704,0.005342,...,-0.009660,0.001067,-0.011077,-0.015028,-0.000418,-0.001467,-0.000653,-0.005423,0.001942,0.008168
22395,0.018721,0.010925,-0.018227,-0.028583,-0.006423,0.010003,0.009473,-0.001753,0.016994,-0.000779,...,0.001289,0.006768,-0.000929,0.011695,0.021778,0.003233,-0.003960,-0.014444,-0.019404,0.031912
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11142,0.014238,-0.006220,-0.002246,0.001548,-0.009250,0.006653,-0.007739,-0.008992,-0.003056,0.002443,...,0.002878,-0.017461,0.002731,-0.007421,0.015626,0.013654,-0.000473,-0.003548,-0.002914,0.007465
30134,-0.001226,0.002923,0.004506,-0.013412,-0.000674,0.001110,0.017062,0.000702,-0.010121,-0.022211,...,-0.014484,-0.001304,0.011523,-0.006265,-0.002075,-0.001508,-0.016273,-0.003331,0.002709,-0.016223
18378,0.008164,0.002036,-0.002127,0.003153,0.005036,-0.003247,0.007702,-0.006719,0.008519,0.012474,...,0.002599,0.021801,-0.012157,0.006728,-0.005496,0.007274,-0.019366,0.000078,-0.010061,0.009369
15645,0.013241,0.001374,0.001792,-0.017407,-0.005450,-0.002707,-0.024002,-0.012195,-0.007054,0.008149,...,0.005421,-0.001957,0.001166,-0.015784,-0.009983,0.000609,-0.011086,0.004046,-0.013029,-0.007875


In [291]:
# y_embed = embed_features(learn, to.train.ys)
ys = to.ys
show = to.show

In [318]:
to.train.ys.values.ravel()

array([167,  59,  85, ...,  84, 152,  30], dtype=int16)

In [346]:
row, clas, probs = learn.predict(df.iloc[10])
row.show()

Unnamed: 0,time,event,player_sub,main_player,commentary
0,50',Assist,,Son Heung-Min,A round of applause for that back-heel from Boly though. That's the assist of dreams! The dam finally breaks for West Brom. They will have to be on even greater guard now.


In [347]:
clas, probs

(tensor(3),
 tensor([9.0028e-05, 4.4254e-04, 3.4545e-05,  ..., 1.1167e-05, 4.7655e-05,
         1.4618e-05]))

In [68]:
save_pickle('data/emb_xs', embed_xs)

In [69]:
save_pickle('data/emb_valid_xs', embed_valid_xs)

In [10]:
embed_xs = load_pickle('emb_xs')

In [12]:
embed_valid_xs = load_pickle('emb_valid_xs')

# Multi-Layer Perceptron

In [230]:
class Model(torch.nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Model, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.fc1 = torch.nn.Linear(self.input_size,
                                   self.hidden_size)
        self.tanh = torch.nn.Tanh()
        self.dropout = torch.nn.Dropout(p=0.5)
        self.fc2 = torch.nn.Linear(self.hidden_size, self.input_size)
        self.batch_norm = torch.nn.BatchNorm1d(self.input_size)
#         self.tanh2 = torch.nn.Tanh()
#         self.fc3 = torch.nn.Linear(self.input_size, 200, bias=True)
        
    def forward(self, x):
        fc1 = self.fc1(x)
        tanh = self.tanh(fc1)
        dropout = self.dropout(tanh)
        fc2 = self.fc2(dropout)
#         batch = self.batch_norm(fc2)
#         tanh = self.tanh2(batch)
#         output = self.fc3(tanh)
#         output = self.softmax(output)
        return fc2

In [15]:
x_train = embed_xs.iloc[:25000]
x_test = embed_xs.iloc[25000:]
# y_train = to.train.ys[:25000]
# y_test = to.train.ys[25000:2

In [16]:
x_train = torch.FloatTensor(x_train.values)
x_test = torch.FloatTensor(x_test.values)
# y_train = torch.FloatTensor(y_train.values)
# y_test = torch.FloatTensor(y_test.values)

In [231]:
model = Model(x_train.shape[1], 500)
criterion = torch.nn.KLDivLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

In [209]:
x_train.T.shape

torch.Size([77, 25000])

In [232]:
model.eval()

Model(
  (fc1): Linear(in_features=77, out_features=500, bias=True)
  (tanh): Tanh()
  (dropout): Dropout(p=0.5, inplace=False)
  (fc2): Linear(in_features=500, out_features=77, bias=True)
  (batch_norm): BatchNorm1d(77, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

In [233]:
model.eval()
h = model(x_train)
h.shape

torch.Size([25000, 77])

In [229]:
x_train.shape

torch.Size([25000, 77])

In [227]:
h

tensor([[ 0.4831,  0.4566, -0.0076,  ..., -0.1909, -0.3050,  0.3882],
        [ 0.5071,  0.3313, -0.0269,  ..., -0.3138, -0.3624,  0.3437],
        [ 0.4606,  0.4319, -0.0033,  ..., -0.1805, -0.2931,  0.3690],
        ...,
        [ 0.5133,  0.3685, -0.0353,  ..., -0.3045, -0.3499,  0.3497],
        [ 0.5357,  0.4605, -0.0427,  ..., -0.2724, -0.3356,  0.3835],
        [ 0.5357,  0.4605, -0.0427,  ..., -0.2724, -0.3356,  0.3835]],
       grad_fn=<NativeBatchNormBackward>)

In [234]:
U, b = model.fc2.weight, model.fc2.bias

In [235]:
U.shape

torch.Size([77, 500])

In [236]:
b.shape

torch.Size([77])

In [249]:
"""
        g_c = sigmoid(U_c*h + b_c)
    
    where  U_c is the weight parameter matrix
           b_c is the bias term
           h is the hidden representation
"""

def g(c):
    """
        c : index of the context word 
    """
    U_c = U[:,c].T
#     U_c = torch.reshape(U_c, (U_c.shape[0],1))
    b_c = b[c]
#     print(h.shape, U_c.shape, b_c.shape)
    g_c = torch.matmul(h, U_c) + b_c
#     g_c = g_c.detach().numpy()
    sigmoid = torch.nn.Sigmoid()
#     g_c = 1/(1 + np.exp(-g_c))
    g_c =  sigmoid(g_c)
    return g_c

In [246]:
# g(1).shape

g_c = torch.mm(h, U) #+ b
g_c.shape

torch.Size([25000, 500])

In [252]:
def loss():
    for c in range(x_train.shape[1]):
        g_c = g(c)
        for i in range(x_train.shape[0]):
            g_ck = g_c[i]
            word = x_train[i,c]
            print(word)
            break
        break

In [254]:
x_train[0,0]

tensor(17.)

In [256]:
x_train

tensor([[ 1.7000e+01, -6.9678e-03, -8.7373e-03,  ...,  1.5556e-03,
         -6.4716e-03, -2.5462e-03],
        [ 8.7000e+01, -8.9309e-03,  1.4219e-02,  ..., -1.3334e-02,
         -6.0982e-03, -8.5742e-03],
        [ 1.5000e+01,  1.5483e-03,  5.9269e-03,  ...,  6.8440e-03,
         -9.3233e-03, -3.3145e-04],
        ...,
        [ 7.2000e+01,  3.8889e-03,  1.8350e-02,  ..., -7.4292e-03,
         -7.6149e-04,  1.5664e-03],
        [ 4.4000e+01,  1.4752e-02, -6.6803e-03,  ...,  2.0876e-03,
          9.3981e-03, -7.5252e-03],
        [ 4.4000e+01,  1.4752e-02, -6.6803e-03,  ...,  2.0876e-03,
          9.3981e-03, -7.5252e-03]])