In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Location of the player shooting statistics CSV inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/epl-player-shooting-stats-23-24-premier-league/player_shooting_2023_2024.csv'

# Read the raw table into memory and preview its first rows.
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [None]:
# Summary statistics of the loaded table, shown as the cell output.
df.describe()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the textual playing position in 'Pos' with integer codes so the
# column can be used as a numeric model input later in the notebook.
# The fitted encoder is kept in `label_encoder` so that the codes can be
# mapped back to the original position labels.
label_encoder = LabelEncoder()
positions = df['Pos']
encoded_positions = label_encoder.fit_transform(positions)

# Overwrite the original column with its encoded form.
df['Pos'] = encoded_positions

In [None]:
# Preview the frame again now that 'Pos' holds integer codes instead of text.
df.head()

Column Descriptions:

Rk: Index of the player in the list.
1. Player: Name of the player.
2. Nation: Nationality of the player.
3. Pos: Position of the player on the field.
4. Squad: Team the player belongs to.
5. Age: Age of the player as of August 1, 2023 (the start of the season).
6. Born: Birth year of the player.
7. 90s: Number of 90-minute intervals the player participated in.
8. Gls: Total goals scored by the player.
9. Sh: Total shots taken by the player.
10. SoT: Shots on target by the player.
11. SoT%: Shot accuracy percentage.
12. Sh/90: Shots per 90 minutes.
13. SoT/90: Shots on target per 90 minutes.
14. G/Sh: Goals per shot.
15. G/SoT: Goals per shot on target.
16. Dist: Average distance of shots taken by the player.
17. FK: Free kicks taken by the player.
18. PK: Penalty kicks made by the player.
19. PKatt: Penalty kick attempts by the player.
20. xG: Expected goals.
21. npxG: Non-penalty expected goals.
22. npxG/Sh: Non-penalty expected goals per shot.
23. G-xG: Difference between actual goals and expected goals.
24. np:G-xG: Difference between non-penalty actual goals and non-penalty expected goals.
25. Matches: Link to the matches the player appeared in, stored as a string.

In [None]:
# Number of missing values in each column, before any imputation.
df.isnull().sum()

In [None]:
# Names of the categorical (object-dtype) columns, as a plain Python list.
c_features = list(df.select_dtypes(include='object').columns)

In [None]:
# Fill the gaps in every categorical column with that column's most frequent
# value (its mode).
for feature in c_features:
    modes = df[feature].mode()
    # mode() ignores missing values, so it is empty when the whole column is
    # missing; indexing its first element would raise. There is nothing
    # sensible to fill with in that case, so leave the column untouched.
    if modes.empty:
        continue
    most_frequent_value = modes.iloc[0]
    df[feature] = df[feature].fillna(most_frequent_value)

In [None]:
# Names of the integer and floating-point columns.
numerical_features = list(df.select_dtypes(include=['int', 'float']).columns)

# Work out each column's mean first, then replace that column's missing
# entries with it.
column_means = {name: df[name].mean() for name in numerical_features}
for feature, mean_value in column_means.items():
    df[feature] = df[feature].fillna(mean_value)

In [None]:
# Re-count missing values per column to check the result of the imputation.
df.isnull().sum()

In [None]:
# features

In [None]:
# One row per player: descriptive fields are taken from the player's first
# row, goals are summed over all of the player's rows.
player_info = df.groupby('Player').agg(
    Nation=('Nation', 'first'),
    Pos=('Pos', 'first'),
    Squad=('Squad', 'first'),
    Age=('Age', 'first'),
    Gls=('Gls', 'sum'),  # Total goals scored by the player
)

In [None]:
# Rank players by total goals, highest first, and show the top ten.
top_player_info = player_info.sort_values('Gls', ascending=False)
top_player_info.iloc[:10]

In [None]:
# Model inputs for predicting shooting efficiency (goals per shot).
#
# 'Gls' and 'G-xG' are deliberately NOT used as predictors: the target is
# goals per shot, so 'Gls' together with 'Sh' lets the model reconstruct it
# directly, and 'G-xG' together with 'xG' gives 'Gls' back (goals = expected
# goals + the difference). Keeping them would be target leakage and would make
# the reported error meaningless.
features = ['Pos', 'Age', '90s', 'Sh', 'SoT', 'SoT%', 'xG']

# Goals per shot is undefined for a player who never took a shot. Such rows
# would only carry a mean-imputed target (presumably blank in the raw data,
# verify against the CSV), so they are excluded from modelling.
shooters = df[df['Sh'] > 0]

X = shooters[features]
y = shooters['G/Sh']

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Baseline model: a regression tree with default settings and a fixed seed,
# trained on the training split. fit() returns the estimator itself, so the
# last line displays the fitted model as the cell output.
decision_tree = DecisionTreeRegressor(random_state=1)
decision_tree = decision_tree.fit(X_train, y_train)
decision_tree

In [None]:
# Predict the target for the held-out rows.
predictions = decision_tree.predict(X_test)

# Show actual and predicted values side by side instead of dumping the raw
# prediction array. The true values live in y_test (X_test only holds the
# input columns), and both are aligned on the held-out rows' index.
prediction_comparison = pd.DataFrame(
    {'Actual G/Sh': y_test, 'Predicted G/Sh': predictions},
    index=y_test.index,
)
prediction_comparison.head(10)

In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared difference between the held-out targets and the baseline
# tree's predictions (lower is better).
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
print("Mean Squared Error is :", mse)

In [None]:
# Feature importances
feature_importances = decision_tree.feature_importances_

# Pair every score with the name of the column it belongs to and sort from
# most to least important; a bare array of numbers cannot be attributed to
# features by the reader.
feature_importance_table = pd.Series(
    feature_importances,
    index=X_train.columns,
    name='importance',
).sort_values(ascending=False)
print("Feature Importances:")
print(feature_importance_table.to_string())

In [None]:
# Hyperparameter tuning
#
# Search over tree-size settings with 5-fold cross-validation on the TRAINING
# data only, so the test set is used for a single final evaluation and never
# for choosing the settings. max_depth=5 stays in the grid as one candidate.
from sklearn.model_selection import GridSearchCV

param_grid = {
    'max_depth': [2, 3, 4, 5, 6, 8, None],
    'min_samples_leaf': [1, 2, 5, 10],
}
grid_search = GridSearchCV(
    DecisionTreeRegressor(random_state=1),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',  # GridSearchCV maximises, hence the negated MSE
    cv=5,
)
grid_search.fit(X_train, y_train)
print("Best parameters:", grid_search.best_params_)

# With the default refit=True, best_estimator_ has already been retrained on
# the whole training split using the winning settings.
tuned_decision_tree = grid_search.best_estimator_
tuned_predictions = tuned_decision_tree.predict(X_test)
tuned_mse = mean_squared_error(y_test, tuned_predictions)
print("Tuned Mean Squared Error:", tuned_mse)

In [None]:
# Scatter of goals scored ('Gls') against expected goals ('xG'), one point per
# row of the table.
fig, ax = plt.subplots(figsize=(16, 8))
ax.scatter(df['xG'], df['Gls'])
ax.set_title('Graph 1')
ax.set_xlabel('xG')
ax.set_ylabel('Gls')
plt.show()

In [None]:
# Average shooting metrics per playing position, one bar chart per metric.
#
# The list is called `shooting_metrics` (not `metrics`) so that the
# `sklearn.metrics` module imported earlier in the notebook is not overwritten.
shooting_metrics = ['Sh/90', 'SoT%', 'G/Sh']

fig, axs = plt.subplots(1, len(shooting_metrics), figsize=(15, 6))

for i, metric in enumerate(shooting_metrics):
    avg_values = df.groupby('Pos')[metric].mean()

    # Take the x positions from the grouped result itself. groupby() returns
    # its groups sorted by key, whereas df['Pos'].unique() is in order of first
    # appearance, so pairing the two would attach averages to the wrong
    # positions.
    positions = avg_values.index

    # 'Pos' holds the integer codes produced by `label_encoder`; translate them
    # back to the original position names so the axis is readable.
    position_labels = [
        str(label) for label in label_encoder.inverse_transform(positions)
    ]

    axs[i].bar(position_labels, avg_values.to_numpy())
    axs[i].set_title(metric)
    axs[i].set_xlabel('Playing Position')
    axs[i].set_ylabel(metric)

# Keep the y-axis labels of neighbouring panels from overlapping.
fig.tight_layout()
plt.show()

In [None]:
import seaborn as sns  # seaborn provides the boxplot used below

# Shooting metric whose distribution is compared across nationalities.
metric = 'G/Sh'

# Single-panel figure that the boxplot is drawn onto.
fig, ax = plt.subplots(figsize=(8, 6))

# One box per nationality; showmeans=True additionally marks each group's mean.
sns.boxplot(
    data=df,
    x="Nation",
    y=metric,
    showmeans=True,
    ax=ax,
)

# Small tick labels, since there is one tick per nationality.
plt.xticks(fontsize=5)
ax.set(title=metric, xlabel='Nationality', ylabel=metric)

fig.tight_layout()
plt.show()