# Importing the Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Read the per-season player statistics CSV and preview the first rows.
DATA_PATH = "/kaggle/input/basketball-players-stats-per-season-49-leagues/players_stats_by_season_full_details.csv"
df = pd.read_csv(DATA_PATH)
df.head(n=5)

# EDA

In [None]:
# Keep only the Euroleague rows. The explicit copy makes `df` an independent
# frame, so the column assignments and in-place drops applied to it later do
# not operate on a slice of the full dataset (SettingWithCopyWarning).
df = df[df["League"] == "Euroleague"].copy()
df.head()

In [None]:
# Try the year extraction on one row: take the text after "-" and then the
# token that follows the first space.
sample_season = df["Season"].iloc[1]
sample_season.split("-")[1].split(" ")[1]

In [None]:
# Reduce every Season string to its last value: the text after "-" and then
# the token that follows the first space. Iterating the column directly
# avoids one positional .iloc lookup per row.
year = [season.split("-")[1].split(" ")[1] for season in df.Season]

In [None]:
# Overwrite the Season column with the extracted values and preview the frame.
df["Season"] = year
df.head(n=5)

In [None]:
# Inspect column dtypes and non-null counts.
df.info()

In [None]:
# Success rate (in percent) for each shot type, computed as made / attempted.
# Each entry maps the new column to its (made, attempted) source columns,
# which are removed once the rate has been derived.
shot_rate_sources = {
    "field_goal_per": ("FGM", "FGA"),
    "three_points_per": ("3PM", "3PA"),
    "free_throws_per": ("FTM", "FTA"),
}
for rate_column, (made_column, attempted_column) in shot_rate_sources.items():
    df[rate_column] = df[made_column] / df[attempted_column] * 100
    df.drop([made_column, attempted_column], axis=1, inplace=True)

# Drop the full birth date, the height in feet and the weight in pounds.
df.drop(["birth_date", "height", "weight"], axis=1, inplace=True)

In [None]:
# Per-game figures: each source column is divided by GP.
per_game_sources = {
    'points_per_game': 'PTS',    # points per game
    'min_per_game': 'MIN',       # minutes per game
    'turnover_per_game': 'TOV',  # turnovers per game
    'fauls_per_game': 'PF',      # fouls per game (column name kept as spelled)
}
for per_game_column, source_column in per_game_sources.items():
    df[per_game_column] = df[source_column] / df['GP']

In [None]:
# Remove the columns that were just converted into per-game figures.
df = df.drop(columns=['MIN', 'TOV', 'PF', 'PTS'])

In [None]:
# Re-check column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics for the frame's columns.
df.describe()

In [None]:
# A player can have several rows across seasons. Only the latest team matters
# here, since that is the team the player would join the Euroleague with.
# Shane Larkin serves as the example.
is_example_player = df["Player"] == "Shane Larkin"
df[is_example_player]

In [None]:
# Order the rows newest season first, then keep the first row found for each
# player, i.e. the row from that player's most recent season.
newest_first = df.sort_values(by="Season", ascending=False)
df = newest_first.drop_duplicates(subset="Player", keep="first")
df

In [None]:
# Show the rows that remain for the example player after de-duplication.
df.loc[df["Player"] == "Shane Larkin"]

In [None]:
# Renumber the rows from 0; the previous labels move into an "index" column.
df = df.reset_index(drop=False)

In [None]:
# Discard the "index" column.
df = df.drop(columns="index")

In [None]:
# Correlation heatmap of the numeric features. numeric_only=True restricts the
# computation to numeric columns: the frame still holds text columns
# (e.g. Player, Team), which DataFrame.corr rejects in pandas >= 2.0.
corr = df.corr(numeric_only=True)
sns.heatmap(corr);

# Data Visualization

In [None]:
# Number of rows (players) per team.
df["Team"].value_counts()

In [None]:
# Bar chart of the 30 teams with the most players. The counts are computed
# once and reused for both axes.
top_teams = df.Team.value_counts().head(30)

sns.barplot(x=top_teams.index, y=top_teams.values)
plt.xticks(rotation=90)
plt.xlabel("Team Name")
plt.ylabel("Number of Players")
plt.title("Teams that Players Played the Most Recently");

In [None]:
# The 30 most frequent nationalities and their player counts.
top_nationalities = df["nationality"].value_counts().iloc[:30]

In [None]:
# Bar chart of the most common nationalities.
ax = sns.barplot(x=top_nationalities.index, y=top_nationalities.values)
ax.set_xlabel("Nationality")
ax.set_ylabel("Number of Players")
ax.set_title("Nationality of Players")
plt.xticks(rotation=90);

In [None]:
# The 20 most frequent high schools and their player counts.
high_school = df.high_school.value_counts().iloc[:20]

In [None]:
# Bar chart of the most common high schools.
ax = sns.barplot(x=high_school.index, y=high_school.values)
ax.set_xlabel("High School Name")
ax.set_ylabel("Number of Players")
ax.set_title("High School of Players")
plt.xticks(rotation=90);

In [None]:
# The high school column is removed because it is not a good predictor.
df.drop(columns="high_school", inplace=True)

In [None]:
# Check whether a specific birth month goes along with more points per game;
# the points are coloured by birth year.
sns.relplot(
    data=df,
    x="birth_month",
    y="points_per_game",
    hue="birth_year",
    kind="scatter",
)
plt.title("Relation Between Birth Month and Points Scored")
plt.xlabel("Birth Month")
plt.ylabel("Points Per Game");

In [None]:
# Free-throw success rate against points per game, coloured by turnovers per game.
sns.relplot(
    data=df,
    x="free_throws_per",
    y="points_per_game",
    hue="turnover_per_game",
    kind="scatter",
);

In [None]:
# Count the missing values in each column.
df.isnull().sum()

# Fill the Missing Values

In [None]:
# Preview the first five rows before filling the gaps.
df.head(n=5)

In [None]:
# Fill every missing value except those in the birth month / year columns.
# The draft team gets an explicit "missing" label; the numeric columns get
# their column mean. Note that .mean() must be called: passing the bare method
# would fill the gaps with a function object instead of a number. Assigning
# the result back (rather than fillna(inplace=True) on a selected column)
# guarantees that the frame itself is updated.
df["draft_team"] = df["draft_team"].fillna("missing")

mean_filled_columns = [
    "weight_kg",
    "draft_round",
    "draft_pick",
    "three_points_per",
    "free_throws_per",
]
for column in mean_filled_columns:
    df[column] = df[column].fillna(df[column].mean())

In [None]:
# Show the rows whose birth month is missing.
df.loc[df["birth_month"].isna()]

In [None]:
# Drop the rows without a birth month (they belong to a player from an older
# season). Selecting them by condition rather than by a fixed row position
# keeps this correct if the row order or the data changes.
missing_birth_month = df["birth_month"].isna()
df = df.drop(df.index[missing_birth_month], axis=0)
# reset_index() stores the previous labels in an "index" column; that column
# is removed in a later cell.
df = df.reset_index()

In [None]:
# Total number of missing cells in the whole frame.
df.isna().to_numpy().sum()

In [None]:
# Preview the first five rows after filling the gaps.
df.head(n=5)

In [None]:
# Remove the League column.
df.drop(columns="League", inplace=True)

In [None]:
# Remove the Stage column.
df.drop(columns="Stage", inplace=True)

In [None]:
# Total number of missing cells in the whole frame.
df.isna().to_numpy().sum()

# Handle Categorical Data

In [None]:
# Inspect column dtypes to see which columns still need encoding.
df.info()

In [None]:
# Convert the Season values to numbers; anything that cannot be parsed
# becomes NaN (errors="coerce").
df["Season"] = pd.to_numeric(df["Season"], errors="coerce")
# NOTE(review): weight_kg is discarded here although it was imputed earlier;
# confirm that dropping it is intended.
df.drop(columns="weight_kg", inplace=True)

In [None]:
# Columns handed to pd.get_dummies for one-hot encoding.
cat_col = [
    "Player",
    "Team",
    "birth_month",
    "nationality",
    "draft_team",
]

In [None]:
# One-hot encode the selected categorical columns.
categorical_frame = df[cat_col]
dummies = pd.get_dummies(data=categorical_frame)

In [None]:
# Swap the original categorical columns for their one-hot encoded versions.
df = pd.concat([df.drop(columns=cat_col), dummies], axis=1)

In [None]:
# Preview the first five rows of the encoded frame.
df.head(n=5)

In [None]:
# Discard the "index" column left behind by the earlier reset_index() call.
df = df.drop(columns="index")

In [None]:
# Names of the columns that are not of a numeric dtype.
set(df.select_dtypes(exclude=np.number).columns)

# Split the Data

In [None]:
# Prediction target: points per game. Every other column is used as a feature.
target_column = "points_per_game"
y = df[target_column]
X = df.drop(columns=target_column)

In [None]:
# Hold out 20% of the rows for testing; the seed fixes the split.
split_options = {"test_size": 0.2, "random_state": 50}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Modeling

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model
from sklearn.linear_model import ElasticNet
from sklearn.datasets import make_regression
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression

In [None]:
# Random forest regressor. The forest is seeded with the same value as the
# train/test split so that the reported score is reproducible between runs.
model = RandomForestRegressor(random_state=50)
model.fit(X_train, y_train)
# score() returns the model's score on the held-out test split.
model.score(X_test, y_test)

In [None]:
# Lasso regression with alpha=0.1, scored on the held-out test split.
model = linear_model.Lasso(alpha=0.1).fit(X_train, y_train)
model.score(X_test, y_test)

In [None]:
# Support vector regression. SVR is sensitive to feature scale, so the
# features are standardised first. The scaler is fitted on the training split
# only and then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = SVR()
model.fit(X_train_scaled, y_train)
model.score(X_test_scaled, y_test)

In [None]:
# Ordinary least-squares linear regression, scored on the held-out test split.
model = LinearRegression().fit(X_train, y_train)
model.score(X_test, y_test)