In [1]:
import pandas as pd
import numpy as np

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.callbacks import ReduceLROnPlateau

In [4]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [5]:
data=pd.read_csv('archive/events.csv')

In [6]:
data.head()

Unnamed: 0,id_odsp,id_event,sort_order,time,text,event_type,event_type2,side,event_team,opponent,...,player_in,player_out,shot_place,shot_outcome,is_goal,location,bodypart,assist_method,situation,fast_break
0,UFot0hit/,UFot0hit1,1,2,Attempt missed. Mladen Petric (Hamburg) left f...,1,12.0,2,Hamburg SV,Borussia Dortmund,...,,,6.0,2.0,0,9.0,2.0,1,1.0,0
1,UFot0hit/,UFot0hit2,2,4,"Corner, Borussia Dortmund. Conceded by Dennis...",2,,1,Borussia Dortmund,Hamburg SV,...,,,,,0,,,0,,0
2,UFot0hit/,UFot0hit3,3,4,"Corner, Borussia Dortmund. Conceded by Heiko ...",2,,1,Borussia Dortmund,Hamburg SV,...,,,,,0,,,0,,0
3,UFot0hit/,UFot0hit4,4,7,Foul by Sven Bender (Borussia Dortmund).,3,,1,Borussia Dortmund,Hamburg SV,...,,,,,0,,,0,,0
4,UFot0hit/,UFot0hit5,5,7,Gokhan Tore (Hamburg) wins a free kick in the ...,8,,2,Hamburg SV,Borussia Dortmund,...,,,,,0,2.0,,0,,0


In [7]:
data.columns

Index(['id_odsp', 'id_event', 'sort_order', 'time', 'text', 'event_type',
       'event_type2', 'side', 'event_team', 'opponent', 'player', 'player2',
       'player_in', 'player_out', 'shot_place', 'shot_outcome', 'is_goal',
       'location', 'bodypart', 'assist_method', 'situation', 'fast_break'],
      dtype='object')

In [8]:
data.describe()

Unnamed: 0,sort_order,time,event_type,event_type2,side,shot_place,shot_outcome,is_goal,location,bodypart,assist_method,situation,fast_break
count,941009.0,941009.0,941009.0,214293.0,941009.0,227459.0,228498.0,941009.0,467067.0,229185.0,941009.0,229137.0,941009.0
mean,53.858826,49.663663,4.326575,12.233764,1.48117,5.733693,1.926555,0.025978,6.209073,1.624831,0.264332,1.281316,0.004876
std,32.014268,26.488977,2.995313,0.46885,0.499646,3.3261,0.797055,0.159071,5.421736,0.7404,0.655501,0.709394,0.069655
min,1.0,0.0,1.0,12.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
25%,27.0,27.0,2.0,12.0,1.0,2.0,1.0,0.0,2.0,1.0,0.0,1.0,0.0
50%,53.0,51.0,3.0,12.0,1.0,5.0,2.0,0.0,3.0,1.0,0.0,1.0,0.0
75%,79.0,73.0,8.0,12.0,2.0,9.0,3.0,0.0,11.0,2.0,0.0,1.0,0.0
max,180.0,100.0,11.0,15.0,2.0,13.0,4.0,1.0,19.0,3.0,4.0,4.0,1.0


In [9]:
data.count()

id_odsp          941009
id_event         941009
sort_order       941009
time             941009
text             941009
event_type       941009
event_type2      214293
side             941009
event_team       941009
opponent         941009
player           880009
player2          291310
player_in         51715
player_out        51738
shot_place       227459
shot_outcome     228498
is_goal          941009
location         467067
bodypart         229185
assist_method    941009
situation        229137
fast_break       941009
dtype: int64

# Dictionary for the dataset
<p>event_type</p>
0	Announcement
1	Attempt
2	Corner
3	Foul
4	Yellow card
5	Second yellow card
6	Red card
7	Substitution
8	Free kick won
9	Offside
10	Hand ball
11	Penalty conceded


<p>event_type2</p>
12	Key Pass
13	Failed through ball
14	Sending off
15	Own goal


<p>side</p>
1	Home
2	Away


<p>shot_place</p>
1	Bit too high
2	Blocked
3	Bottom left corner
4	Bottom right corner
5	Centre of the goal
6	High and wide
7	Hits the bar
8	Misses to the left
9	Misses to the right
10	Too high
11	Top centre of the goal
12	Top left corner
13	Top right corner


<p>shot_outcome</p>
1	On target
2	Off target
3	Blocked
4	Hit the bar


<p>location</p>
1	Attacking half
2	Defensive half
3	Centre of the box
4	Left wing
5	Right wing
6	Difficult angle and long range
7	Difficult angle on the left
8	Difficult angle on the right
9	Left side of the box
10	Left side of the six yard box
11	Right side of the box
12	Right side of the six yard box
13	Very close range
14	Penalty spot
15	Outside the box
16	Long range
17	More than 35 yards
18	More than 40 yards
19	Not recorded


<p>bodypart</p>
1	right foot
2	left foot
3	head


<p>assist_method</p>
0	None
1	Pass
2	Cross
3	Headed pass
4	Through ball


<p>situation</p>
1	Open play
2	Set piece
3	Corner
4	Free kick


In [10]:
#picking out the goal attempts from the data
shot_attempts=data[data.event_type==1]

In [11]:
shot_attempts.head()

Unnamed: 0,id_odsp,id_event,sort_order,time,text,event_type,event_type2,side,event_team,opponent,...,player_in,player_out,shot_place,shot_outcome,is_goal,location,bodypart,assist_method,situation,fast_break
0,UFot0hit/,UFot0hit1,1,2,Attempt missed. Mladen Petric (Hamburg) left f...,1,12.0,2,Hamburg SV,Borussia Dortmund,...,,,6.0,2.0,0,9.0,2.0,1,1.0,0
11,UFot0hit/,UFot0hit12,12,14,Attempt missed. Shinji Kagawa (Borussia Dortmu...,1,12.0,1,Borussia Dortmund,Hamburg SV,...,,,13.0,2.0,0,15.0,1.0,1,1.0,0
13,UFot0hit/,UFot0hit14,14,17,"Goal! Borussia Dortmund 1, Hamburg 0. Kevin G...",1,12.0,1,Borussia Dortmund,Hamburg SV,...,,,4.0,1.0,1,9.0,2.0,1,1.0,0
14,UFot0hit/,UFot0hit15,15,19,Attempt blocked. Mats Hummels (Borussia Dortmu...,1,,1,Borussia Dortmund,Hamburg SV,...,,,2.0,3.0,0,15.0,1.0,0,1.0,0
17,UFot0hit/,UFot0hit18,18,20,Attempt blocked. Tomas Rincon (Hamburg) right ...,1,,2,Hamburg SV,Borussia Dortmund,...,,,2.0,3.0,0,15.0,1.0,0,1.0,0


In [12]:
shot_attempts.count()

id_odsp          229135
id_event         229135
sort_order       229135
time             229135
text             229135
event_type       229135
event_type2      168560
side             229135
event_team       229135
opponent         229135
player           229122
player2          167798
player_in             0
player_out            0
shot_place       227452
shot_outcome     228498
is_goal          229135
location         229135
bodypart         229135
assist_method    229135
situation        229135
fast_break       229135
dtype: int64

In [13]:
print("percentage of events used : ",(229135/941009)*100)

percentage of events used :  24.349926515049273


In [14]:
print(shot_attempts.iloc[1])

id_odsp                                                  UFot0hit/
id_event                                                UFot0hit12
sort_order                                                      12
time                                                            14
text             Attempt missed. Shinji Kagawa (Borussia Dortmu...
event_type                                                       1
event_type2                                                   12.0
side                                                             1
event_team                                       Borussia Dortmund
opponent                                                Hamburg SV
player                                               shinji kagawa
player2                                                mario gotze
player_in                                                      NaN
player_out                                                     NaN
shot_place                                                    

In [15]:
shinji=(shot_attempts.player=='shinji kagawa')

In [16]:
total_shots_by_shinji=shot_attempts.id_odsp[shinji].count()
total_goal_by_shinji=shot_attempts.id_odsp[shinji][shot_attempts.is_goal==1].count()

In [17]:
print("Number of attempts by shinji = ", total_shots_by_shinji)
print("Number of goals by shinji = ", total_goal_by_shinji)
print("Ratio = ", total_goal_by_shinji/total_shots_by_shinji)

Number of attempts by shinji =  151
Number of goals by shinji =  25
Ratio =  0.16556291390728478


In [18]:
shot_attempts.is_goal[shot_attempts.location ==19].count()

1450

In [19]:
shot_attempts.is_goal[shot_attempts.location ==19].value_counts()

is_goal
1    1438
0      12
Name: count, dtype: int64

In [20]:
(shot_attempts.is_goal==1).value_counts()

is_goal
False    204694
True      24441
Name: count, dtype: int64

In [21]:
print("Ratio of goals and misses",24441/204694 )

Ratio of goals and misses 0.1194026204969369


In [22]:
shot_attempts=shot_attempts[shot_attempts.location!=19]

In [23]:
shot_attempts.count()

id_odsp          227685
id_event         227685
sort_order       227685
time             227685
text             227685
event_type       227685
event_type2      167859
side             227685
event_team       227685
opponent         227685
player           227684
player2          167798
player_in             0
player_out            0
shot_place       226677
shot_outcome     227685
is_goal          227685
location         227685
bodypart         227685
assist_method    227685
situation        227685
fast_break       227685
dtype: int64

In [24]:
shot_attempts.isna().sum()

id_odsp               0
id_event              0
sort_order            0
time                  0
text                  0
event_type            0
event_type2       59826
side                  0
event_team            0
opponent              0
player                1
player2           59887
player_in        227685
player_out       227685
shot_place         1008
shot_outcome          0
is_goal               0
location              0
bodypart              0
assist_method         0
situation             0
fast_break            0
dtype: int64

In [25]:
shot_attempts.dtypes

id_odsp           object
id_event          object
sort_order         int64
time               int64
text              object
event_type         int64
event_type2      float64
side               int64
event_team        object
opponent          object
player            object
player2           object
player_in         object
player_out        object
shot_place       float64
shot_outcome     float64
is_goal            int64
location         float64
bodypart         float64
assist_method      int64
situation        float64
fast_break         int64
dtype: object

In [26]:
x=shot_attempts[['time', 'side', 'bodypart', 'location', 'situation', 'assist_method', 'fast_break']]
y=shot_attempts['is_goal']

In [27]:
x.head(3)

Unnamed: 0,time,side,bodypart,location,situation,assist_method,fast_break
0,2,2,2.0,9.0,1.0,1,0
11,14,1,1.0,15.0,1.0,1,0
13,17,1,2.0,9.0,1.0,1,0


In [28]:
encode = ['side', 'bodypart', 'location', 'situation', 'assist_method', 'fast_break']

In [29]:
transformers = [
    ('onehot', OneHotEncoder(sparse_output=False), encode)
]
encoder = ColumnTransformer(transformers=transformers,remainder='passthrough')

In [30]:
on_hot=encoder.fit_transform(x)

In [31]:
on_hot_df=pd.DataFrame(on_hot)

In [32]:
on_hot_df.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,2.0
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,14.0
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,17.0


In [33]:
from sklearn.model_selection import train_test_split

In [34]:
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size = 0.8, stratify = y)
x_train_on, x_test_on, y_train_on, y_test_on = train_test_split(on_hot_df, y, train_size = 0.8, stratify = y)


In [35]:
print(x_train.shape)
print(x_train_on.shape)

(182148, 7)
(182148, 31)


In [36]:
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [37]:
classifiers = [
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    KNeighborsClassifier(),
    LinearSVC()
]

In [38]:
for clf in classifiers:
    clf.fit(x_train, y_train)
    name = clf.__class__.__name__
    
    print("="*30)
    print(name)
    
    print('****Results****')
    train_predictions = clf.predict(x_test)
    acc = accuracy_score(y_test, train_predictions)
    print("Accuracy: {:.4%}".format(acc))
    
print("="*30)

DecisionTreeClassifier
****Results****
Accuracy: 89.8742%
RandomForestClassifier
****Results****
Accuracy: 89.8917%
AdaBoostClassifier
****Results****
Accuracy: 90.9788%
GradientBoostingClassifier
****Results****
Accuracy: 90.9854%
GaussianNB
****Results****
Accuracy: 89.2637%
LinearDiscriminantAnalysis
****Results****
Accuracy: 89.4152%
QuadraticDiscriminantAnalysis
****Results****
Accuracy: 88.9123%
KNeighborsClassifier
****Results****
Accuracy: 89.8237%
LinearSVC
****Results****
Accuracy: 89.8961%


In [39]:
for clf in classifiers:
    clf.fit(x_train_on, y_train_on)
    name = clf.__class__.__name__
    
    print("="*30)
    print(name)
    
    print('****Results****')
    train_predictions = clf.predict(x_test_on)
    acc = accuracy_score(y_test_on, train_predictions)
    print("Accuracy: {:.4%}".format(acc))
    
print("="*30)

DecisionTreeClassifier
****Results****
Accuracy: 89.8720%
RandomForestClassifier
****Results****
Accuracy: 89.7248%
AdaBoostClassifier
****Results****
Accuracy: 90.9195%
GradientBoostingClassifier
****Results****
Accuracy: 90.9436%
GaussianNB
****Results****
Accuracy: 62.6743%
LinearDiscriminantAnalysis
****Results****
Accuracy: 90.5044%
QuadraticDiscriminantAnalysis
****Results****
Accuracy: 14.4652%
KNeighborsClassifier
****Results****
Accuracy: 89.8742%
LinearSVC
****Results****
Accuracy: 90.8338%


In [40]:
model1 = Sequential()
model1.add(Dense(24,activation='relu',input_shape=(7,)))
model1.add(Dense(100,activation='relu'))
model1.add(Dense(75,activation='relu'))
model1.add(Dense(20,activation='relu'))
model1.add(Dense(1,activation='sigmoid'))

In [41]:
go=tf.keras.optimizers.legacy.Adam()
model1.compile(optimizer=go,loss='binary_crossentropy',metrics=['accuracy'])
red_lr = ReduceLROnPlateau(monitor='val_loss',factor=0.3,patience=3,min_lr=0.0000000000001)

In [42]:
model_history=model1.fit(x_train,y_train,epochs=45,validation_split=0.10,verbose=1,callbacks=red_lr)

Epoch 1/45
Epoch 2/45
Epoch 3/45
Epoch 4/45
Epoch 5/45
Epoch 6/45
Epoch 7/45
Epoch 8/45
Epoch 9/45
Epoch 10/45
Epoch 11/45
Epoch 12/45
Epoch 13/45
Epoch 14/45
Epoch 15/45
Epoch 16/45
Epoch 17/45
Epoch 18/45
Epoch 19/45
Epoch 20/45
Epoch 21/45
Epoch 22/45
Epoch 23/45
Epoch 24/45
Epoch 25/45
Epoch 26/45
Epoch 27/45
Epoch 28/45
Epoch 29/45
Epoch 30/45
Epoch 31/45
Epoch 32/45
Epoch 33/45
Epoch 34/45
Epoch 35/45
Epoch 36/45
Epoch 37/45
Epoch 38/45
Epoch 39/45
Epoch 40/45
Epoch 41/45
Epoch 42/45
Epoch 43/45
Epoch 44/45
Epoch 45/45


In [69]:
print("="*30)
print("Neural Net non encoded")
    
print('****Results****')
predictions = model1.predict(x_test)
threshold = 0.47
binary_predictions = (predictions > threshold).astype(int)
accuracy = accuracy_score(y_test, binary_predictions)

print("Accuracy: {:.4%}".format(accuracy))
    
print("="*30)

Neural Net non encoded
****Results****
Accuracy: 90.9239%


In [44]:
model_on = Sequential()
model_on.add(Dense(62,activation='relu',input_shape=(31,)))
model_on.add(Dense(128,activation='relu'))
model_on.add(Dense(50,activation='relu'))
model_on.add(Dense(10,activation='relu'))
model_on.add(Dense(1,activation='sigmoid'))

In [45]:
go=tf.keras.optimizers.legacy.Adam()
model_on.compile(optimizer=go,loss='binary_crossentropy',metrics=['accuracy'])
red_lr = ReduceLROnPlateau(monitor='val_loss',factor=0.3,patience=3,min_lr=0.0000000000001)

In [46]:
model_history_on=model_on.fit(x_train_on,y_train_on,validation_split=0.1,epochs=25,verbose=1,callbacks=red_lr)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [59]:
print("="*30)
print("Neural Net encoded")
    
print('****Results****')
predictions = model_on.predict(x_test_on)
threshold = 0.47
binary_predictions = (predictions > threshold).astype(int)
accuracy = accuracy_score(y_test_on, binary_predictions)

print("Accuracy: {:.4%}".format(accuracy))
    
print("="*30)

Neural Net encoded
****Results****
Accuracy: 90.9348%


## 

In [48]:
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier()
model.fit(x_train, y_train)
print('XGBoost model precision on test dataset : ', model.score(x_test, y_test) * 100)

XGBoost model precision on test dataset :  90.98535257043723


In [49]:
probas = model.predict_proba(x)
shot_attempts['xgoalpercent'] = probas[:,1] 

In [50]:
list_of_players = shot_attempts.player.unique()
print(list_of_players)

['mladen petric' 'shinji kagawa' 'kevin grosskreutz' ... 'alhassan wakaso'
 'roman zozulia' 'alessandro bastoni']


In [51]:
comparison = pd.DataFrame()

In [52]:
for i in range(list_of_players.size):

    player = list_of_players[i]
    nb_shot = shot_attempts.id_odsp[shot_attempts.player == player].count()
    xg = shot_attempts[shot_attempts.player == player]['xgoalpercent'].sum(axis = 0)
    nb_goal = shot_attempts.id_odsp[shot_attempts.is_goal == 1][shot_attempts.player == player].count()
    xg_dif = nb_goal - xg
    nb_head = shot_attempts.id_odsp[shot_attempts.player == player][shot_attempts.bodypart == 3].count()
    xg_head = shot_attempts[shot_attempts.player == player][shot_attempts.bodypart == 3]['xgoalpercent'].sum(axis = 0)
    nb_goal_head = shot_attempts.id_odsp[shot_attempts.is_goal == 1][shot_attempts.bodypart == 3][shot_attempts.player == player].count()
    head_xg_dif = nb_goal_head - xg_head
    xg_per_shot = xg / nb_shot
    nb_counter_goals = shot_attempts.id_odsp[shot_attempts.is_goal == 1][shot_attempts.fast_break == 1][shot_attempts.player == player].count()  
    pct_count_goals = nb_counter_goals / nb_goal * 100
    com = pd.DataFrame([{'player' : player, 'nb_shots' : nb_shot, 'expected_goals' : xg, 'goals_scored' : nb_goal, 'xg_dif' : xg_dif, \
                                   'nb_headers' : nb_head, 'expected_head_goals' : xg_head, 'head_goals_scored' : nb_goal_head, 'head_xg_dif' : head_xg_dif, \
                                   'xg_per_shot' : xg_per_shot, 'pct_goals_counter' : pct_count_goals}])
    comparison = pd.concat([comparison, com], ignore_index=True)

In [53]:
comparison

Unnamed: 0,player,nb_shots,expected_goals,goals_scored,xg_dif,nb_headers,expected_head_goals,head_goals_scored,head_xg_dif,xg_per_shot,pct_goals_counter
0,mladen petric,62,8.128841,5,-3.128841,8,1.079347,0,-1.079347,0.131110,20.000000
1,shinji kagawa,151,17.205218,25,7.794782,13,1.317911,2,0.682089,0.113942,20.000000
2,kevin grosskreutz,110,10.789076,9,-1.789076,10,0.853300,0,-0.853300,0.098083,11.111111
3,mats hummels,120,17.970696,9,-8.970696,80,11.126770,5,-6.126770,0.149756,0.000000
4,tomas rincon,98,5.358366,3,-2.358366,2,0.121599,0,-0.121599,0.054677,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
4721,jose gimenez,1,0.076908,0,-0.076908,1,0.076908,0,-0.076908,0.076908,
4722,mame thiam,1,0.094157,0,-0.094157,0,0.000000,0,0.000000,0.094157,
4723,alhassan wakaso,1,0.108074,0,-0.108074,1,0.108074,0,-0.108074,0.108074,
4724,roman zozulia,3,0.272460,0,-0.272460,3,0.272460,0,-0.272460,0.090820,


In [54]:
comparison.to_csv('result.csv')