In [1]:
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
##Read in the shots data
df = pd.read_csv(r"https://raw.githubusercontent.com/AbhishekSharma99/undershot_backup/master/understat_all_shots.csv")
df.head()

Unnamed: 0,X,Y,a_goals,a_team,date,h_a,h_goals,h_team,id,lastAction,match_id,minute,player,player_assisted,player_id,result,season,shotType,situation,xG
0,0.707,0.379,0,Hoffenheim,29-08-2015 17:30,h,0,Darmstadt,76737,Aerial,1044,93,György Garics,,2,MissedShots,2015,RightFoot,FromCorner,0.011869
1,0.728,0.373,1,Darmstadt,12-09-2015 17:30,a,0,Bayer Leverkusen,76808,Pass,1053,1,György Garics,Konstantin Rausch,2,SavedShot,2015,RightFoot,SetPiece,0.01598
2,0.016,0.464,0,Darmstadt,01-11-2015 18:30,a,2,VfB Stuttgart,78492,Foul,1118,67,György Garics,,2,OwnGoal,2015,Head,SetPiece,0.0
3,0.875,0.521,2,Darmstadt,20-12-2015 20:30,a,3,Borussia M.Gladbach,79876,Aerial,1173,58,György Garics,,2,MissedShots,2015,Head,FromCorner,0.016684
4,0.927,0.557,2,Werder Bremen,07-12-2014 16:30,a,5,Eintracht Frankfurt,27374,Pass,5320,78,Luca Caldirola,Fin Bartels,3,Goal,2014,LeftFoot,OpenPlay,0.4327


In [3]:
df = df.query("result != 'OwnGoal'") ##Remove own-goals

##Remove Russian League shots
rpl = [
"Zenit St. Petersburg",
"Ural",
"Spartak Moscow",
"Rubin Kazan",
"PFC Sochi",
"Lokomotiv Moscow",
"Krylya Sovetov Samara",
"FK Akhmat",
"FC Ufa",
"FC Tambov",
"FC Rostov",
"FC Orenburg",
"FC Krasnodar",
"Dinamo Moscow",
"CSKA Moscow",
"Arsenal Tula"]

df = df[~df['h_team'].isin(rpl)] 


In [4]:
columns = ["X", "Y", "a_goals", "h_goals", "h_a", "lastAction", "minute", "result", "shotType", "situation"]
df = df[columns] 
dummies = pd.get_dummies(df, columns=["lastAction","result", "shotType", "situation"]) ##One-hot encode everything from the columns above
dummies['h_a'] = dummies["h_a"].apply(lambda x: 0 if x=='h' else 1) ##Home-away to 0,1
dummies["minute"] = dummies["minute"]/90 ##Normalise the timestamp 

In [5]:
dummies.head()

Unnamed: 0,X,Y,a_goals,h_goals,h_a,minute,lastAction_Aerial,lastAction_BallRecovery,lastAction_BallTouch,lastAction_BlockedPass,lastAction_Card,lastAction_Challenge,lastAction_ChanceMissed,lastAction_Chipped,lastAction_Clearance,lastAction_CornerAwarded,lastAction_Cross,lastAction_Dispossessed,lastAction_End,lastAction_Error,lastAction_FormationChange,lastAction_Foul,lastAction_Goal,lastAction_GoodSkill,lastAction_HeadPass,lastAction_Interception,lastAction_KeeperPickup,lastAction_KeeperSweeper,lastAction_LayOff,lastAction_None,lastAction_OffsidePass,lastAction_OffsideProvoked,lastAction_Pass,lastAction_Punch,lastAction_Rebound,lastAction_Save,lastAction_ShieldBallOpp,lastAction_Smother,lastAction_Standard,lastAction_Start,lastAction_SubstitutionOn,lastAction_Tackle,lastAction_TakeOn,lastAction_Throughball,result_BlockedShot,result_Goal,result_MissedShots,result_SavedShot,result_ShotOnPost,shotType_Head,shotType_LeftFoot,shotType_OtherBodyPart,shotType_RightFoot,situation_DirectFreekick,situation_FromCorner,situation_OpenPlay,situation_Penalty,situation_SetPiece
0,0.707,0.379,0,0,0,1.033333,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0
1,0.728,0.373,1,0,1,0.011111,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1
3,0.875,0.521,2,3,1,0.644444,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0
4,0.927,0.557,2,5,1,0.866667,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0
5,0.899,0.598,2,2,1,0.122222,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1


In [6]:
##Predictors

X = dummies[['X', 'Y', 'minute', 'h_goals', 'a_goals', 'lastAction_Standard',
       'situation_DirectFreekick', 'h_a', 'lastAction_Throughball',
       'lastAction_Rebound', 'shotType_RightFoot', 'lastAction_Aerial',
       'lastAction_Cross', 'lastAction_Pass', 'situation_OpenPlay',
       'shotType_Head', 'shotType_LeftFoot', 'situation_FromCorner',
       'lastAction_None', 'lastAction_Chipped', 'situation_SetPiece',
       'lastAction_HeadPass', 'lastAction_TakeOn', 'lastAction_BallTouch',
       'shotType_OtherBodyPart', 'lastAction_BallRecovery',
       'lastAction_LayOff', 'lastAction_Dispossessed']] 

##Results
y = dummies["result_Goal"]       

In [7]:
print(X.shape, y.shape)

(239143, 28) (239143,)


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

In [9]:
X_train = X_train.to_numpy()
X_test = X_test.to_numpy()
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

In [10]:
##Model

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(28, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(14, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(7, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(1,activation=tf.nn.sigmoid))

model.compile(optimizer="adam",
              loss="binary_crossentropy",
              metrics=[tf.keras.metrics.AUC(name='auc')])
model.fit(X_train, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f862061c950>

In [11]:
val_loss, val_acc = model.evaluate(X_test, y_test)
print(val_loss, val_acc)

0.24108663201332092 0.8643580079078674


In [12]:
predictions = model.predict([X_test])

Consider rewriting this model with the Functional API.


In [13]:
from sklearn.metrics import roc_auc_score

roc_auc_score(y_test,predictions)

0.8645294958480068