In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
import os
from IPython.display import display
%matplotlib inline

In [None]:
# Load the event-level table and the match-level table
events = pd.read_csv('./events.csv')
data_ginf = pd.read_csv('./ginf.csv')
# No `on=` is given, so pandas joins on the column names both frames share;
# how='left' keeps every row of data_ginf
df = pd.merge(data_ginf, events, how='left')
df.head()

In [None]:
# Print the column names, dtypes and non-null counts of the merged frame
df.info()

In [None]:
# Build {column name -> {integer code -> label}} from the data dictionary file
new_dict = {}

# Read the whole dictionary file into memory
with open('./dictionary.txt', 'r') as f:
    data = f.read()

# Sections are separated by two blank lines; the first line of a section is the
# column name and every following line is "<code>\t<label>"
sections = data.split('\n\n\n')

# Process each section
for section in sections:
    # Drop empty / whitespace-only lines (for example the one produced by the
    # file's final newline): they would otherwise reach int('') and raise
    lines = [line for line in section.split('\n') if line.strip()]
    if not lines:
        continue
    variable_name = lines[0].strip()
    codes = {}
    for line in lines[1:]:
        fields = line.split('\t')  # split once, reuse both parts
        codes[int(fields[0])] = fields[1]
    new_dict[variable_name] = codes
    print(section)

# Replace the integer codes in df with their labels
for name in new_dict:
    # Only decode columns that still hold numeric codes: mapping a column that
    # was already decoded (cell run twice) would turn every value into NaN
    if pd.api.types.is_numeric_dtype(df[name]):
        df[name] = df[name].map(new_dict[name])


In [None]:
# One frame per league, selected by the value of the 'country' column
bundesliga = df.loc[df['country'] == 'germany']
ligue1 = df.loc[df['country'] == 'france']
laliga = df.loc[df['country'] == 'spain']
premiereleague = df.loc[df['country'] == 'england']
seriea = df.loc[df['country'] == 'italy']

# Report (rows, columns) of every subset
for shape_label, league_frame in (
    ('Bundes Liga data shape:', bundesliga),
    ('Ligue 1 data shape:', ligue1),
    ('La Liga data shape:', laliga),
    ('Premiere League:', premiereleague),
    ('Serie A data shape:', seriea),
):
    print(shape_label, league_frame.shape)

In [None]:
def top_scorers(data):
    """Rank players by number of goals, excluding own goals.

    Parameters
    ----------
    data : pandas.DataFrame
        Event rows with 'player', 'is_goal' and 'situation' columns.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'player', single column 'G' (goal count), sorted by 'G'
        in descending order.
    """
    # Both comparisons are parenthesised: `&` binds tighter than `==`, so the
    # unparenthesised form compared is_goal against (1 & mask) and also kept
    # rows where is_goal == 0 and the situation was an own goal.
    # NOTE(review): ExpG() filters own goals through 'event_type2' - confirm
    # which column actually carries the 'Own goal' label.
    scored = (data['is_goal'] == 1) & (data['situation'] != 'Own goal')
    goals = (
        data.loc[scored]
        .groupby('player')['is_goal'].sum()
        .reset_index()
        .rename(columns={'is_goal': 'G'})
        .sort_values(by='G', ascending=False)
    )
    return goals[['player', 'G']].set_index('player')
# Goal ranking over every league in df
player_tp = top_scorers(df)
print('G : Goals')
player_tp.head(20)


In [None]:
import matplotlib.pyplot as plt

def plot_top_scorers(data, top_n=10):
    """Draw a bar chart of the `top_n` highest entries returned by top_scorers(data).

    Parameters
    ----------
    data : pandas.DataFrame
        Event rows, passed unchanged to top_scorers().
    top_n : int
        Number of leading players to plot.
    """
    leaders = top_scorers(data).head(top_n)

    # One bar per player, height = goal count
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(leaders.index, leaders['G'], color='blue')
    ax.set_xlabel('Player')
    ax.set_ylabel('Goals')
    ax.set_title('Top Scorers')
    plt.xticks(rotation=45, ha='right')  # slanted names stay readable
    plt.show()

# Plot the ten leading scorers of the full data set
plot_top_scorers(df, top_n=10)


In [None]:
def GPM(data, min_match_share=0.25):
    """Goals per match for each player, own goals excluded.

    A "match" here is a distinct (id_odsp, player) pair present in `data`,
    i.e. a match in which the player has at least one event row.

    Parameters
    ----------
    data : pandas.DataFrame
        Event rows with 'id_odsp', 'player', 'is_goal' and 'situation' columns.
    min_match_share : float, default 0.25
        A player is kept only if their match count is strictly greater than
        this share of the largest match count of any player.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'player' with columns 'Matches', 'G' and 'GPM',
        sorted by 'GPM' in descending order.
    """
    x = data[data['situation'] != 'Own goal']
    # Goals per (match, player); the match id column is renamed so that
    # counting it below yields the number of matches
    y = (
        x.groupby(['id_odsp', 'player'])['is_goal'].sum()
        .reset_index()
        .rename(columns={'id_odsp': 'Matches', 'is_goal': 'G'})
    )
    xy = y.groupby('player').agg({'Matches': 'count', 'G': 'sum'})
    xy['GPM'] = xy['G'] / xy['Matches']
    # Drop players with too few matches for the ratio to be meaningful
    xy = xy[xy['Matches'] > xy['Matches'].max() * min_match_share]
    return xy.sort_values(by='GPM', ascending=False)

# Legend for the column names of the table below
print('G : Goals')
print('GPM : Goals Per Match')
player_gpm = GPM(df)
player_gpm.head(20)

In [None]:
def twin_barplot(data1,x1,y1,s1,data2,x2,y2,s2):
    """Draw two seaborn bar plots side by side and write each value next to its bar.

    Parameters
    ----------
    data1, data2 : pandas.DataFrame
        Frames plotted in the left and right panel respectively.
    x1, x2 : str
        Column of the matching frame used for the bar values and annotations.
    y1, y2 : array-like
        Values passed as `y` to sns.barplot (the callers pass the frame index).
    s1, s2 : str
        Panel titles.
    """
    plt.figure(figsize=(20,10))

    plt.subplot(121)
    ax=sns.barplot(x=x1,y=y1,data=data1)
    for i,j in enumerate(data1[x1][:20]):
        ax.text(0.5,i,j,weight='bold')
    plt.title(s1)
    plt.ylabel("")
    plt.subplot(122)
    plt.subplots_adjust(wspace=.5)
    ax=sns.barplot(x=x2,y=y2,data=data2)
    # Annotate from data2, the frame actually drawn in this panel; reading the
    # global player_gpm here put all-league numbers on per-league plots
    for i,j in enumerate(data2[x2][:20]):
        ax.text(0.01,i,j,weight='bold')
    plt.title(s2)
twin_barplot(player_tp[:20],'G',player_tp.index[:20],'Goals',player_gpm[:20],'GPM',player_gpm.index[:20],'Goals Per Match')

In [None]:
def top_scorers_by_league(data, top_n=5):
    """Return the leading scorers of every country, own goals excluded.

    Parameters
    ----------
    data : pandas.DataFrame
        Event rows with 'country', 'player', 'is_goal' and 'situation' columns.
    top_n : int, default 5
        Number of players kept per country.

    Returns
    -------
    pandas.DataFrame
        Indexed by ('country', 'player') with one column 'G'; countries in
        ascending order, goals descending within each country.
    """
    # Keep goals only and exclude own goals
    goals = data.loc[(data['is_goal'] == 1) & (data['situation'] != 'Own goal')]

    # Goals per (country, player)
    goals_by_league = (
        goals.groupby(['country', 'player'])['is_goal'].sum()
        .reset_index()
        .rename(columns={'is_goal': 'G'})
    )

    # Order so that head() below picks the highest scorers of each country
    goals_by_league = goals_by_league.sort_values(by=['country', 'G'], ascending=[True, False])

    # The local result gets its own name rather than shadowing the function
    leaders = goals_by_league.groupby('country').head(top_n).set_index(['country', 'player'])

    return leaders

# Store the result under its own name: assigning it to `top_scorers_by_league`
# would overwrite the function and break any later call to it
league_top_scorers = top_scorers_by_league(df)

# Display top 5 scorers for each league
print('Top 5 Scorers for Each League:')
print(league_top_scorers)


In [None]:
def NPGPM(data, min_matches=31):
    """Non-penalty goals per match for each player, own goals excluded.

    A "match" here is a distinct (id_odsp, player) pair that remains after
    rows located at 'Penalty spot' and own-goal situations are removed.

    Parameters
    ----------
    data : pandas.DataFrame
        Event rows with 'id_odsp', 'player', 'is_goal', 'situation' and
        'location' columns.
    min_matches : int, default 31
        A player is kept only if their match count is strictly greater than
        this value.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'player' with columns 'Matches', 'NPG' and 'NPGPM',
        sorted by 'NPGPM' in descending order.
    """
    x = data[(data['situation'] != 'Own goal') & (data['location'] != 'Penalty spot')]
    # Goals per (match, player); the match id column is renamed so that
    # counting it below yields the number of matches
    y = (
        x.groupby(['id_odsp', 'player'])['is_goal'].sum()
        .reset_index()
        .rename(columns={'id_odsp': 'Matches', 'is_goal': 'NPG'})
    )
    xy = y.groupby('player').agg({'Matches': 'count', 'NPG': 'sum'})
    xy['NPGPM'] = xy['NPG'] / xy['Matches']
    xy = xy[xy['Matches'] > min_matches]

    return xy.sort_values(by='NPGPM', ascending=False)
# Legend for the column names of the table below
print('NPG : Non-Penalty Goals')
print('NPGPM : Non-Penalty Goals Per Match')
player_npg = NPGPM(df)
player_npg.head(20)


In [None]:
def double_bargraph(data,s):
    """Plot the columns of `data` as grouped horizontal bars and print the values on them.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose first two columns are annotated on the chart.
    s : str
        Chart title.
    """
    axes = data.plot(kind='barh', figsize=(20, 20), edgecolor='k', linewidth=1)
    plt.title(s)
    plt.legend(loc='best', prop={'size': 40})
    second_column = data.iloc[:, 1]
    first_column = data.iloc[:, 0]
    # Second column: bold label at the row position
    for row, value in enumerate(second_column):
        axes.text(0.5, row, value, weight='bold')
    # First column: white label shifted down by 0.2
    for row, value in enumerate(first_column):
        axes.text(0.5, row - 0.2, value, weight='bold', color='white')

# Side-by-side goals and non-penalty goals; players missing from one table get 0
xx = pd.concat([player_tp, player_npg], axis=1).fillna(0)
double_bargraph(xx.sort_values(by='G', ascending=False)[['G', 'NPG']].head(20), 'Goals Vs. Non-Penalty Goals')


In [None]:
def ExpG(data):
    """Per-player attempt and conversion statistics, penalties and own goals excluded.

    Only rows with event_type 'Attempt', a location other than 'Penalty spot'
    and an event_type2 other than 'Own goal' are used, and only players with
    more than 18 goals among those rows are returned.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'player' with columns
        'NPG' (goals), 'Avg GPA' (mean of the per-match goals/attempts ratio),
        'Attempts', 'Matches', 'APM' (attempts per match) and
        'GPM' (goals per match).
    """
    keep = (
        (data['location'] != 'Penalty spot')
        & (data['event_type2'] != 'Own goal')
        & (data['event_type'] == 'Attempt')
    )
    # Goals and attempt count for every (player, match)
    per_match = (
        data[keep]
        .groupby(['player', 'id_odsp'])
        .agg({'is_goal': 'sum', 'event_type': 'count'})
        .reset_index()
    )
    per_match['total'] = per_match['is_goal'] / per_match['event_type']

    # Roll the per-match rows up to one row per player
    per_player = per_match.groupby('player').agg(
        {'is_goal': 'sum', 'total': 'mean', 'event_type': 'sum', 'id_odsp': 'count'}
    )
    per_player['total2'] = per_player['event_type'] / per_player['id_odsp']
    per_player['GPM'] = per_player['is_goal'] / per_player['id_odsp']

    frequent_scorers = per_player[per_player['is_goal'] > 18]
    return frequent_scorers.rename(columns={
        'is_goal': 'NPG',
        'total': 'Avg GPA',
        'event_type': 'Attempts',
        'id_odsp': 'Matches',
        'total2': 'APM',
    })
# Legend for the column names of the table below
print('NPG : Non-Penalty Goals')
print('Avg GPA : Average Goal Per Attempt')
print('APM : Attempt Per Match')
print('GPM : Goal Per Match')

# First 20 rows of the ExpG() table when ordered by attempts, highest first
ExpG(df).sort_values(by='Attempts',ascending=False)[:20]

In [None]:
def bar(data, x, y, s):
    """Draw a seaborn bar plot of column `x` and write each value next to its bar.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame being plotted.
    x : str
        Column used for the bar values and the annotations.
    y : array-like
        Values passed as `y` to sns.barplot (the callers pass the frame index).
    s : str
        Chart title.
    """
    plt.figure(figsize=(15, 15))
    axes = sns.barplot(x=x, y=y, data=data)
    plt.title(s)
    for row, value in enumerate(data[x]):
        axes.text(0.01, row, value, weight='bold')
# 20 highest values of 'Avg GPA' among the players ExpG() returns
player_expg=ExpG(df).sort_values(by='Avg GPA',ascending=False)[:20]
# 'Avg GPA' is goals per attempt, so the title says so (league_repr uses the
# same wording); the previous title read 'Average Goals Per Match'
bar(player_expg,'Avg GPA',player_expg.index,'Average Goals Per Attempt')

In [None]:
def GPL(data,colors,labels):
    """Overlay, per league, the density of goals per match and a dashed line at its mean.

    Parameters
    ----------
    data : iterable of pandas.DataFrame
        One event frame per league, each with 'id_odsp' and 'is_goal' columns.
    colors : iterable of str
        Matplotlib colour for each league, in the same order as `data`.
    labels : iterable of str
        Legend label for each league, in the same order as `data`.
    """
    plt.figure(figsize=(15,12))
    plt.xticks(list(range(10)))
    plt.xlabel('Goals Per Match')
    for d,c,s in zip(data,colors,labels):
        # Total goals in each match of this league
        d=d.groupby('id_odsp')['is_goal'].sum()
        # `fill` is the current name of the argument seaborn used to call
        # `shade` (deprecated, then removed)
        sns.kdeplot(d,fill=True,color=c,label=s)
        plt.axvline(d.mean(),linestyle='dashed',color=c,label=(s+' Mean'))
    # Called after the loop so the labelled curves and lines exist; without a
    # legend the labels passed above are never displayed
    plt.legend(loc='best')
# In honour of this year's league winners, each league is drawn in the colour of the winning team's shirt
GPL([bundesliga,laliga,ligue1,seriea,premiereleague],['r','w','g','k','b'],['BundesLiga','LaLiga','Ligue1','SerieA','PremiereLeague'])

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Function to create a scatter plot for top scorers
def pointgraph(data, x, s):
    """Scatter column `x` against the frame index and print the integer value on every marker.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame being plotted; its index supplies the y values.
    x : str
        Column with the numeric values to plot.
    s : str
        Chart title.
    """
    plt.figure(figsize=(12, 8))
    values = data[x]
    axes = sns.scatterplot(x=values, y=data.index, s=700, alpha=0.7)
    # Label is offset slightly left of and below the marker position
    for row, value in enumerate(values):
        axes.text(value - 0.5, row - 0.2, int(value), color='white')
    plt.title(s)
    plt.tight_layout()
    plt.show()

# Function to represent various league statistics
def league_repr(data, n):
    """Produce the full set of scorer charts and the attempts table for one event frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Event rows (for example one league's subset of df).
    n : int
        Number of leading rows shown in every chart and in the table.
    """
    # Statistics tables
    tp = top_scorers(data)
    gpm = GPM(data)
    npgpm = NPGPM(data)
    expg = ExpG(data)
    goals_vs_npg = pd.concat([tp, npgpm], axis=1).fillna(0)[['G', 'NPG']]
    expg_by_gpa = expg.sort_values(by='Avg GPA', ascending=False)

    # Charts
    pointgraph(tp[:n], 'G', 'Top Scorers')
    twin_barplot(tp[:n], 'G', tp.index[:n], 'Goals', gpm[:n], 'GPM', gpm.index[:n], 'Goals Per Match')
    double_bargraph(goals_vs_npg.sort_values(by='G', ascending=False)[:n], 'Goals Vs.Non-Penalty Goals')
    bar(expg_by_gpa[:n], 'Avg GPA', expg_by_gpa.index[:n], 'Average Goals Per Attempt')

    # Table ordered by attempts
    print('sorted by number of attempts')
    display(expg.sort_values(by='Attempts', ascending=False)[:n])

# Example usage for the Premier League (rows of df where country == 'england')
# NOTE(review): premier_league_top_scorers is not read again in the visible cells
premier_league_top_scorers = top_scorers(premiereleague)
# Charts and table limited to the first 20 rows of each statistic
league_repr(premiereleague, 20)


In [None]:
# Same report for Ligue 1 (rows of df where country == 'france')
league_repr(ligue1,20)

In [None]:
# Same report for Serie A (rows of df where country == 'italy')
league_repr(seriea,20)

In [None]:
# Same report for La Liga (rows of df where country == 'spain')
league_repr(laliga,20)

In [None]:
# Same report for the Bundesliga (rows of df where country == 'germany')
league_repr(bundesliga,20)

In [None]:
import pandas as pd

# Re-read the two CSV files from disk for the modelling part; unlike df, these
# frames have not been passed through the dictionary.txt label mapping
# NOTE(review): data1 is not used in the visible cells below
data1 = pd.read_csv('./ginf.csv')
data = pd.read_csv('./events.csv')


In [None]:
# Keep only the rows whose event_type code is 1
# NOTE(review): presumably the 'Attempt' code from dictionary.txt - confirm
data_shot = data[data.event_type == 1]

In [None]:
# Boolean masks over data_shot's rows, one per player, matched on the exact
# lower-case name string
messi = (data_shot.player == 'lionel messi')
ronaldo = (data_shot.player == 'cristiano ronaldo')

In [None]:
# Rows of data_shot that are goals
scored = data_shot.is_goal == 1

# Messi: shots, goals, and goals per shot
nb_shot_messi = data_shot.loc[messi, 'id_odsp'].count()
nb_goal_messi = data_shot.loc[messi & scored, 'id_odsp'].count()
ratio_messi = nb_goal_messi / nb_shot_messi

# Ronaldo: shots, goals, and goals per shot
nb_shot_ronaldo = data_shot.loc[ronaldo, 'id_odsp'].count()
nb_goal_ronaldo = data_shot.loc[ronaldo & scored, 'id_odsp'].count()
ratio_ronaldo = nb_goal_ronaldo / nb_shot_ronaldo

print('Number of goals for Messi : ', nb_goal_messi)
print('Goal/shot ratio for Messi : ', ratio_messi)
print('Number of goals for Ronaldo : ', nb_goal_ronaldo)
print('Goal/shot ratio for Ronaldo : ', ratio_ronaldo)

In [None]:
# Count the shots whose location code is 19 and show how many were goals.
# The mask is built from `data` (all events) and applied to data_shot, so it
# relies on pandas matching the two by index label.
print ('Number of shots not located : ', data_shot.is_goal[data.location == 19].count()) 
print ('Split by goal or no goal : ', data_shot.is_goal[data.location == 19].value_counts()) 
# The backslash continues the string literal onto the next line, so this
# prints a line containing only spaces (a visual separator)
print('\
      ')
print('Number of shot recorded', data_shot.is_goal.count())

In [None]:
# Drop the shots with location code 19 (reported as "not located" by the
# preceding cell); data_shot is rebound to the filtered frame
data_shot = data_shot[data_shot.location != 19]

In [None]:
# Non-null count of every column in the remaining shots
data_shot.count()

In [None]:
# Feature matrix: seven columns of data_shot used as model inputs
X = data_shot[['time', 'side', 'bodypart', 'location', 'situation', 'assist_method', 'fast_break']]
# Target: the is_goal column (compared against 1 elsewhere in this notebook)
y = data_shot['is_goal']

In [None]:
from sklearn.model_selection import train_test_split

# 80 % of the shots for training, stratified on the target so both splits keep
# the same goal rate; random_state pins the shuffle so that a fresh
# "Restart & Run All" reproduces the same split and the same scores
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, stratify = y, random_state = 42)

In [None]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Candidate models with library-default hyper-parameters. The estimators that
# draw random numbers get a fixed seed so that re-running the cell on the same
# split reproduces the same accuracies.
classifiers = [
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    AdaBoostClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    KNeighborsClassifier(),
    LinearSVC(random_state=42)]


# Fit each model on the training split and report its accuracy on the test split
for clf in classifiers:
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__

    print("="*30)
    print(name)

    print('****Results****')
    # These are predictions for the held-out split, so the variable is named
    # accordingly (it used to be called train_predictions)
    test_predictions = clf.predict(X_test)
    acc = accuracy_score(y_test, test_predictions)
    print("Accuracy: {:.4%}".format(acc))

print("="*30)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Final model: scikit-learn gradient boosting with default hyper-parameters
model = GradientBoostingClassifier()
model.fit(X_train, y_train)
# score() returns mean accuracy, and the estimator is scikit-learn's
# GradientBoostingClassifier rather than XGBoost, so the message says so
print('Gradient boosting model accuracy on test dataset : ', model.score(X_test, y_test) * 100)

In [None]:
# Number of shots per is_goal value, i.e. the class balance of the target
data_shot.is_goal.value_counts()

In [None]:
from sklearn.dummy import DummyClassifier

# Baseline that always predicts the majority class
dummy_clf = DummyClassifier(strategy="most_frequent")
# Fit on the training split like every other model; the baseline must not
# learn the majority class from the data it is scored on
dummy_clf.fit(X_train, y_train)
score = dummy_clf.score(X_test, y_test)
# score() is mean accuracy, not precision
print('most frequent accuracy : ', score * 100, '%')

In [None]:
# Class probabilities for every shot in X
# NOTE(review): X contains the rows the model was trained on, so these values
# are partly in-sample - confirm that is intended
probas = model.predict_proba(X)
# Column 1 is taken as the goal probability (assumes model.classes_ is [0, 1]).
# assign() builds a new frame instead of writing a column into data_shot,
# which is itself a filtered slice and would raise SettingWithCopyWarning.
data_shot = data_shot.assign(xgoalpercent=probas[:, 1])

In [None]:
# Range of the second predict_proba column over all shots
print('Maximum xG value for a shot : ', probas[:,1].max())
print('Minimum xG value for a shot : ', probas[:,1].min())

In [None]:
# --- Lionel Messi: shots, summed xG, goals, and the derived ratios ---
nb_shot = data_shot.id_odsp[messi].count()
xg_messi = data_shot[messi]['xgoalpercent'].sum(axis = 0)
print('Lionel Messi :')
print('Number of shots for : ', nb_shot)

print('Expected goals for : ', xg_messi)

nb_goal = data_shot.id_odsp[data_shot.is_goal == 1][messi].count()
print('Number of goals for : ', nb_goal)
print('Difference between goals and xG : ', nb_goal - xg_messi)
print('xG/shots :', xg_messi / nb_shot)

# Separator line (spaces only)
print('    ')

# --- Cristiano Ronaldo: same figures, all computed from the ronaldo mask ---
nb_shot = data_shot.id_odsp[ronaldo].count()
xg_ronaldo = data_shot[ronaldo]['xgoalpercent'].sum(axis = 0)
print('Cristiano Ronaldo :')
print('Number of shots : ', nb_shot)

print('Expected goals : ', xg_ronaldo)

nb_goal = data_shot.id_odsp[data_shot.is_goal == 1][ronaldo].count()
print('Number of goals : ', nb_goal)
# The last two lines previously summed xG over the messi mask, so Ronaldo's
# difference and xG/shots were computed from Messi's expected goals
print('Difference between goals and xG : ', nb_goal - xg_ronaldo)
print('xG/shots :', xg_ronaldo / nb_shot)

In [None]:
# Distinct values of the player column among the remaining shots
list_of_players = data_shot.player.unique()
print(list_of_players)