### FIFA Player Performance Prediction and Analysis

# Load the datasets

In [None]:
import pandas as pd

# data1 <- ginf.csv: queried later in this notebook through its 'ht' and 'fthg' columns.
# data  <- events.csv: the event-level table used for the shot/goal analysis and the model features.
data1 = pd.read_csv('./ginf.csv')
data = pd.read_csv('./events.csv')


# Display the first few rows of each dataset

In [None]:
# Preview the first rows of both tables, each under its own heading.
for heading, frame in (("Data1:", data1), ("\nData:", data)):
    print(heading)
    print(frame.head())


# Display information about the datasets

In [None]:
# DataFrame.info() writes its summary straight to stdout and returns None, so
# wrapping it in print() would emit a stray "None" line after each summary.
print("Info for Data1:")
data1.info()

print("\nInfo for Data:")
data.info()



# Load the dictionary from the text file 
# Create a dictionary for event_type and event_type2
# Create dictionaries for side, shot_place, shot_outcome, location, bodypart, assist_method, and situation

In [None]:
def _parse_code_table(lines):
    """Build a ``{code: label}`` dict from tab-separated ``code<TAB>label`` lines.

    Lines whose first tab-separated field is not made up purely of digits
    (e.g. headers or blank lines) are skipped.
    """
    table = {}
    for line in lines:
        fields = line.split('\t')
        if fields[0].isdigit():
            table[int(fields[0])] = fields[1]
    return table


with open("./dictionary.txt", "r") as file:
    dictionary_text = file.read()

# Split once; every lookup table below is a fixed line range of this list.
dictionary_lines = dictionary_text.split('\n')

event_type_dict = _parse_code_table(dictionary_lines[2:17])
event_type2_dict = _parse_code_table(dictionary_lines[18:23])

side_dict = _parse_code_table(dictionary_lines[24:26])
shot_place_dict = _parse_code_table(dictionary_lines[26:39])
shot_outcome_dict = _parse_code_table(dictionary_lines[39:43])
location_dict = _parse_code_table(dictionary_lines[43:62])
bodypart_dict = _parse_code_table(dictionary_lines[62:65])
assist_method_dict = _parse_code_table(dictionary_lines[65:70])
situation_dict = _parse_code_table(dictionary_lines[70:74])


# Group data1 by team and sum the goals for each team and Display the top ten teams

In [None]:
# Total goals per team across home AND away fixtures.
# Grouping by 'ht' and summing 'fthg' alone only credits a team with the goals
# attached to its 'ht' rows, which understates the per-team total shown below.
# NOTE(review): relies on ginf.csv also providing the away-side columns
# 'at' (team) and 'ftag' (goals) -- confirm against the file.
home_goals = data1.groupby('ht')['fthg'].sum()
away_goals = data1.groupby('at')['ftag'].sum()

# fill_value=0 keeps teams that appear on only one side; the int cast undoes
# the float upcast that index alignment can introduce.
team_goals_data1 = (
    home_goals.add(away_goals, fill_value=0)
    .astype(int)
    .rename_axis('Team')
    .rename('TotalGoals')
    .reset_index()
)

# Sort the DataFrame by total goals in descending order
team_goals_data1 = team_goals_data1.sort_values(by='TotalGoals', ascending=False)

# Display the top ten teams
top_ten_teams_data1 = team_goals_data1.head(10)
print(top_ten_teams_data1)


# Top Ten Teams with Highest Number of Goals Scored

In [None]:
import matplotlib.pyplot as plt

# Wedge labels are the team names; wedge sizes are the goal totals.
labels = top_ten_teams_data1['Team']
sizes = top_ten_teams_data1['TotalGoals']

# Draw the pie on an explicit 8x8 figure, annotating each wedge with its share.
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=140)
ax.set_title('Top Ten Teams with Highest Number of Goals Scored')
plt.show()


# Filter data for events where attempts were made but didn't result in goals (event_type == 1 and is_goal == 0) and Display the top teams with the most missed chances.

In [None]:
# Attempts (event_type == 1) that did not end in a goal (is_goal == 0).
missed_chances = data[data['event_type'].eq(1) & data['is_goal'].eq(0)]

# Count the missed attempts per team, then rank teams from most to fewest.
# The count stays in the 'is_goal' column, which the plotting cell reads.
teams_missed_chances = (
    missed_chances
    .groupby('event_team')['is_goal']
    .count()
    .reset_index()
    .sort_values(by='is_goal', ascending=False)
)

top_teams_missed_chances = teams_missed_chances.head(10)
print(top_teams_missed_chances)


# Top Teams with the Most Missed Chances

In [None]:
import matplotlib.pyplot as plt

teams = top_teams_missed_chances['event_team']
missed_chances_counts = top_teams_missed_chances['is_goal']

# One yellow bar per team on an explicit 12x6 figure.
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(teams, missed_chances_counts, color='yellow')
ax.set_xlabel('Team')
ax.set_ylabel('Missed Chances Count')
ax.set_title('Top Teams with the Most Missed Chances')
# Slant the team names so they do not overlap.
plt.xticks(rotation=45, ha='right')
plt.show()


# Comparison of Missed Chances and Goals Scored for Top Teams

In [None]:
import matplotlib.pyplot as plt
import pandas as pd


# Hand-entered goal totals for five teams.
# NOTE(review): these figures are hard-coded rather than computed from `data`;
# confirm they still match the dataset.
data_goals_scored = {'event_team': ['Real Madrid', 'Barcelona', 'Bayern Munich', 'Juventus', 'AC Milan'],
                     'is_goal_scored': [361, 354, 325, 298, 223]}
top_teams_goals = pd.DataFrame(data_goals_scored)


# Hand-entered missed-chance totals for the same five teams (same caveat as above).
data_missed_chances = {'event_team': ['Real Madrid', 'Barcelona', 'Bayern Munich', 'Juventus', 'AC Milan'],
                       'is_goal_missed': [3152, 2636, 2656, 3079, 2756]}
# Use a dedicated name: reusing `top_teams_missed_chances` would replace the
# computed top-ten frame (column 'is_goal') with this one (column
# 'is_goal_missed') and break the earlier bar-chart cell on re-run.
comparison_missed_chances = pd.DataFrame(data_missed_chances)

# Merge the DataFrames on the 'event_team' column
comparison_data = pd.merge(comparison_missed_chances, top_teams_goals, on='event_team')

# Series used for the bars
teams = comparison_data['event_team']
missed_chances_counts = comparison_data['is_goal_missed']
goals_scored_counts = comparison_data['is_goal_scored']

# Set up figure and axis
fig, ax = plt.subplots(figsize=(14, 6))

# Stacked bars: goals scored are drawn on top of missed chances for each team
bar_width = 0.35
ax.bar(teams, missed_chances_counts, width=bar_width, label='Missed Chances', color='yellow')
ax.bar(teams, goals_scored_counts, width=bar_width, label='Goals Scored', color='green', bottom=missed_chances_counts)

# Add labels, title, and legend
ax.set_xlabel('Team')
ax.set_ylabel('Count')
ax.set_title('Comparison of Missed Chances and Goals Scored for Top Teams')
ax.legend()

# Rotate x-axis labels for better readability
plt.xticks(rotation=45, ha='right')

# Display the bar chart
plt.show()



# Group data by player and sum the goals for each player and Display the top ten strikers

In [None]:
# Sum is_goal per player, then rank players from most to fewest goals.
top_scorers = (
    data
    .groupby('player')['is_goal']
    .sum()
    .reset_index()
    .sort_values(by='is_goal', ascending=False)
)

# Keep the ten highest-scoring players.
top_ten_strikers = top_scorers.head(10)
print(top_ten_strikers)


# Top Ten Strikers with Highest Number of Goals

In [None]:
import matplotlib.pyplot as plt

players = top_ten_strikers['player']
goals = top_ten_strikers['is_goal']

# One blue bar per player on an explicit 10x6 figure.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(players, goals, color='blue')
ax.set_xlabel('Player')
ax.set_ylabel('Total Goals')
ax.set_title('Top Ten Strikers with Highest Number of Goals')
# Slant the player names so they do not overlap.
plt.xticks(rotation=45, ha='right')
plt.show()


# Filter data for headers and Display the top players with the most header goals

In [None]:
# Keep only headed events (bodypart == 3) that were actually scored.
# Filtering on bodypart alone and then counting rows would report every
# headed attempt, not headed goals.
header_goals = data[(data['bodypart'] == 3) & (data['is_goal'] == 1)]

# Group data by player and count the number of header goals for each player.
# The count stays in the 'is_goal' column, which the plotting cell reads.
players_header_goals = header_goals.groupby('player')['is_goal'].count().reset_index()

# Sort the DataFrame by the number of header goals in descending order
players_header_goals = players_header_goals.sort_values(by='is_goal', ascending=False)

# Display the top players with the most header goals
top_players_header_goals = players_header_goals.head(10)
print(top_players_header_goals)



# Top Players with the Most Header Goals

In [None]:
import matplotlib.pyplot as plt


players = top_players_header_goals['player']
header_goal_counts = top_players_header_goals['is_goal']

# One green bar per player on an explicit 12x6 figure.
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(players, header_goal_counts, color='green')
ax.set_xlabel('Player')
ax.set_ylabel('Header Goal Count')
ax.set_title('Top Players with the Most Header Goals')
# Slant the player names so they do not overlap.
plt.xticks(rotation=45, ha='right')
plt.show()


# Filter data for Right foot and Display the top players with the most right footed goals

In [None]:
# Keep only right-foot events (bodypart == 1) that were actually scored.
# Filtering on bodypart alone and then counting rows would report every
# right-footed attempt, not right-footed goals.
right_foot_goals = data[(data['bodypart'] == 1) & (data['is_goal'] == 1)]

# Group data by player and count the number of goals scored with the right foot for each player.
# The count stays in the 'is_goal' column, which the plotting cell reads.
players_right_foot_goals = right_foot_goals.groupby('player')['is_goal'].count().reset_index()

# Sort the DataFrame by the number of right foot goals in descending order
players_right_foot_goals = players_right_foot_goals.sort_values(by='is_goal', ascending=False)

top_players_right_foot_goals = players_right_foot_goals.head(10)
print(top_players_right_foot_goals)



# Top Players with the Most Right Foot Goals

In [None]:
import matplotlib.pyplot as plt


players = top_players_right_foot_goals['player']
right_foot_goal_counts = top_players_right_foot_goals['is_goal']

# One orange bar per player on an explicit 12x6 figure.
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(players, right_foot_goal_counts, color='orange')
ax.set_xlabel('Player')
ax.set_ylabel('Right Foot Goal Count')
ax.set_title('Top Players with the Most Right Foot Goals')
# Slant the player names so they do not overlap.
plt.xticks(rotation=45, ha='right')
plt.show()


# Filter data for goals scored with the left foot and Display the top players with the most left foot goals

In [None]:
# Keep only left-foot events (bodypart == 2) that were actually scored.
# Filtering on bodypart alone and then counting rows would report every
# left-footed attempt, not left-footed goals.
left_foot_goals = data[(data['bodypart'] == 2) & (data['is_goal'] == 1)]

# Group data by player and count the number of goals scored with the left foot for each player.
# The count stays in the 'is_goal' column, which the plotting cell reads.
players_left_foot_goals = left_foot_goals.groupby('player')['is_goal'].count().reset_index()

# Sort the DataFrame by the number of left foot goals in descending order
players_left_foot_goals = players_left_foot_goals.sort_values(by='is_goal', ascending=False)

top_players_left_foot_goals = players_left_foot_goals.head(10)
print(top_players_left_foot_goals)


# Top Players with the Most Left Foot Goals

In [None]:
import matplotlib.pyplot as plt

# Requires top_players_left_foot_goals from the preceding cell.
players = top_players_left_foot_goals['player']
left_foot_goal_counts = top_players_left_foot_goals['is_goal']

# One purple bar per player on an explicit 12x6 figure.
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(players, left_foot_goal_counts, color='purple')
ax.set_xlabel('Player')
ax.set_ylabel('Left Foot Goal Count')
ax.set_title('Top Players with the Most Left Foot Goals')
# Slant the player names so they do not overlap.
plt.xticks(rotation=45, ha='right')
plt.show()


In [None]:
# This analysis is about shots only, so build a frame restricted to the
# rows whose event_type is 1.
is_shot_event = data['event_type'] == 1
data_shot = data[is_shot_event]

In [None]:
# Boolean row masks over data_shot, one per player being compared.
messi = data_shot['player'].eq('lionel messi')
ronaldo = data_shot['player'].eq('cristiano ronaldo')

In [None]:
def _goal_shot_stats(shots, player_mask):
    """Return ``(shot_count, goal_count, goals_per_shot)`` for one player.

    Parameters
    ----------
    shots : DataFrame with at least the 'id_odsp' and 'is_goal' columns.
    player_mask : boolean Series aligned with ``shots`` selecting the player's rows.

    Both conditions are combined into a single mask on ``shots`` rather than
    indexing an already-filtered Series with a full-length mask, which only
    works through implicit index alignment. The ratio is NaN when the player
    has no shots.
    """
    shot_count = shots.loc[player_mask, 'id_odsp'].count()
    goal_count = shots.loc[player_mask & (shots['is_goal'] == 1), 'id_odsp'].count()
    ratio = goal_count / shot_count if shot_count else float('nan')
    return shot_count, goal_count, ratio


nb_shot_messi, nb_goal_messi, ratio_messi = _goal_shot_stats(data_shot, messi)
nb_shot_ronaldo, nb_goal_ronaldo, ratio_ronaldo = _goal_shot_stats(data_shot, ronaldo)

print('Number of goals for Messi : ', nb_goal_messi)
print('Goal/shot ratio for Messi : ', ratio_messi)
print('Number of goals for Ronaldo : ', nb_goal_ronaldo)
print('Goal/shot ratio for Ronaldo : ', ratio_ronaldo)

In [None]:
# Build the mask from data_shot itself: a mask derived from the full `data`
# frame has a different length and only works through implicit index alignment.
not_located = data_shot['location'] == 19

print('Number of shots not located : ', data_shot.loc[not_located, 'is_goal'].count())
print('Split by goal or no goal : ', data_shot.loc[not_located, 'is_goal'].value_counts())
# Spacer line between the two summaries.
print('      ')
print('Number of shot recorded', data_shot.is_goal.count())

We are going to drop those N/A values for the shot location, as they represent less than 6% of the goals. Unfortunately almost all of them are goals, but we should have enough data.

Of course, in the end some players will see their number of goals decrease, but we judge this acceptable.

In [None]:
# Drop the shots whose location code is 19 (reported above as "not located").
has_known_location = data_shot['location'].ne(19)
data_shot = data_shot[has_known_location]

In [None]:
# Non-null value count for every column of data_shot, displayed as the cell output.
data_shot.count()

In [None]:
# Columns fed to the classifiers as predictors.
feature_columns = ['time', 'side', 'bodypart', 'location', 'situation', 'assist_method', 'fast_break']

# X: predictor matrix; y: the is_goal target column.
X = data_shot.loc[:, feature_columns]
y = data_shot.loc[:, 'is_goal']

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split, stratified on y so both parts keep the same goal / no-goal ratio.
# A fixed random_state makes the split (and every score computed from it)
# reproducible across Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, stratify = y, random_state = 42)

In [None]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Fixed seed for every estimator that accepts one, so repeated runs report
# the same accuracies.
RANDOM_STATE = 42

classifiers = [
    DecisionTreeClassifier(random_state=RANDOM_STATE),
    RandomForestClassifier(random_state=RANDOM_STATE),
    AdaBoostClassifier(random_state=RANDOM_STATE),
    GradientBoostingClassifier(random_state=RANDOM_STATE),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    KNeighborsClassifier(),
    LinearSVC(random_state=RANDOM_STATE)]


# Fit each model on the training split and report its accuracy on the
# held-out test split.
for clf in classifiers:
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__

    print("="*30)
    print(name)

    print('****Results****')
    # These are predictions on the test split.
    test_predictions = clf.predict(X_test)
    acc = accuracy_score(y_test, test_predictions)
    print("Accuracy: {:.4%}".format(acc))

print("="*30)