In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Introduction

   ![](https://i.ibb.co/7YNWZy3/main-photo.png)

In this notebook I will attempt to create a model predicting an NBA player's salary based on his stats from the NBA season before the signing. I will use data from this [dataset](https://www.kaggle.com/jarosawjaworski/current-nba-players-contracts-history), which I created for the purpose of this project. It is also my first original data science project.

# Imports

In [None]:
import pandas as ps
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import operator
from collections import OrderedDict
from sklearn import preprocessing
from matplotlib.ticker import FuncFormatter
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Data Visualisation

In [None]:
# Read the contract history and store age and average salary as integers.
CONTRACTS_CSV = '../input/current-nba-players-contracts-history/nba_contracts_history.csv'
all_contracts = pd.read_csv(CONTRACTS_CSV).astype({"AGE": int, "AVG_SALARY": int})

**Ten highest salaries from dataset:**

In [None]:
def _three_decimals(value):
    """Format a float with exactly three digits after the decimal point."""
    return '%.3f' % value


# Applies to every DataFrame rendered from here on, not just this table.
pd.set_option('display.float_format', _three_decimals)

# Ten rows with the largest average salary, limited to the identifying columns.
summary_columns = ["NAME", "AGE", "CONTRACT_START", "CONTRACT_END", "AVG_SALARY"]
contracts_by_salary = all_contracts[summary_columns].sort_values(by="AVG_SALARY", ascending=False)
contracts_by_salary.head(10)

**Average Player Salary, By Age:**

In [None]:
# One bar per age showing the average salary, drawn without error bars (ci=None).
# NOTE(review): newer seaborn releases deprecate `ci` in favour of `errorbar` —
# confirm against the installed version before switching.
plt.figure(figsize=(10, 6))
sns.barplot(x=all_contracts["AGE"], y=all_contracts["AVG_SALARY"], ci=None)

# Title and axis labels are applied to the axes the bar plot was drawn on.
plt.title("Average Salary, By Age")
plt.ylabel("Salary")
plt.xlabel("Age")

Players earn the most when they are between 27 and 32 years old. There is also a notable jump in salary values at age 23. It might be caused by the fact that the most promising players enter the league at age 19/20 after one year of college, and they sign their second contract at age 23.

**Average Player Salary, By +/- stat:**

In [None]:
# Scatter plot of average salary against +/- together with a fitted regression line.
sns.lmplot(
    data=all_contracts,
    x="+/-",
    y="AVG_SALARY",
    aspect=2,
)

# Replace the default column-name labels on the figure lmplot just created.
plt.ylabel("Salary")
plt.xlabel("+/-")

The plot shows that players with a better +/- stat value earn a bit more money on average.

**Heatmap of basic stats:**

In [None]:
# Columns left out of the correlation heatmap.
excluded_columns = [
    "NAME", "AGE", "CONTRACT_START", "CONTRACT_END", "GP",
    "AVG_SALARY", "+/-", "FG%", "3P%", "FT%", "W", "L",
]
player_stats = all_contracts.drop(columns=excluded_columns)

# Standardise every remaining column to zero mean and unit standard deviation.
column_means = player_stats.mean()
column_stds = player_stats.std()
stats_normalized = player_stats.sub(column_means).div(column_stds)

corr = stats_normalized.corr()

# Hide the upper triangle (diagonal included) so each pair of stats is shown once.
mask = np.triu(np.ones_like(corr, dtype=bool))
cmap = sns.diverging_palette(230, 20, as_cmap=True)

f, ax = plt.subplots(figsize=(11, 9))
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
)


The overall idea shown by this plot is that having more stats in one category correlates with having more in the other ones. The only pairings of stats that break this pattern are rebounds/blocks with 3PM/3PA/AST. This is probably because players who accumulate a big number of rebounds and blocks play the center or power forward position, and they on average shoot and pass less.

# Data Preparation For The Model

In [None]:
# Drop the identifier and contract-date columns; they are not used as model inputs.
all_contracts_clean = all_contracts.drop(columns=["NAME", "CONTRACT_START", "CONTRACT_END"])

# Positional split: the first TRAIN_ROWS rows are used for training/validation and
# every row after them is held out for the final test. Both slices share one
# boundary so that no row falls between them (the earlier [:139] / [140:] pair
# silently discarded row 139).
TRAIN_ROWS = 140
X_full = all_contracts_clean.iloc[:TRAIN_ROWS]
X_test_full = all_contracts_clean.iloc[TRAIN_ROWS:]
assert len(X_full) + len(X_test_full) == len(all_contracts_clean), "split must cover every row"

# Separate the target (average salary) from the features in both partitions.
y = X_full.AVG_SALARY
X = X_full.drop(columns=["AVG_SALARY"])

y_test = X_test_full.AVG_SALARY
X_test = X_test_full.drop(columns=["AVG_SALARY"])

# Discard any remaining object-dtype columns from the feature frames.
X = X.select_dtypes(exclude=['object'])
X_test = X_test.select_dtypes(exclude=['object'])

# Keep 10% of the training partition aside for validation; fixed seed for a repeatable split.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.9, test_size=0.1,
                                                      random_state=0)

# Creating A Model Using Random Forest Regression

In [None]:
# Fit a random forest with a fixed seed on the training split.
model = RandomForestRegressor(random_state=1).fit(X_train, y_train)

# Score the fitted model on the validation split using mean absolute error.
contracts_preds = model.predict(X_valid)
validation_mae = mean_absolute_error(y_valid, contracts_preds)
print(f"Validation MAE for Random Forest Model: {validation_mae}")

# Feature Importance

In [None]:
# Pair each importance score with its feature name and list them from highest to lowest score.
importance = model.feature_importances_
columns_names = list(X.columns)

importance_sorted_list = list(zip(importance, columns_names))
importance_sorted_list.sort(reverse=True)

for score, feature_name in importance_sorted_list:
    print(f"{feature_name} - Score: {round(score, 3)}")


The biggest impact on player salary comes from points and, surprisingly, free throws made and attempted. The least important are offensive rebounds, three-pointers made and personal fouls.

# Testing The Model

In [None]:
# Predict salaries for the held-out test rows.
y_pred = model.predict(X_test)

# Fetch the original (un-dropped) columns for exactly the rows that were scored,
# using the test frame's index so the two can never drift apart.
test_data_with_all_columns = all_contracts.loc[X_test.index]

# Explicit column selection: a misspelled name now raises instead of being silently
# dropped (the earlier 'CONTACT_START' typo removed the contract start year from
# the table). rename/assign build a new frame, so no column is ever written onto
# a slice of all_contracts.
results = (
    test_data_with_all_columns[['NAME', 'CONTRACT_START', 'CONTRACT_END', 'AVG_SALARY']]
    .rename(columns={'AVG_SALARY': 'ACTUAL_SALARY'})
    .assign(
        PREDICTED_SALARY=y_pred.astype(int),
        # Positive values: the model predicts more than the player was actually paid.
        DIFFRENCE=lambda frame: frame['PREDICTED_SALARY'] - frame['ACTUAL_SALARY'],
    )
)
results.sort_values(by="DIFFRENCE", ascending=False)

According to the model, the best contracts from the test data were signed by Wilson Chandler in 2015 and Andre Iguodala in 2016. The most overpaid players were Brook Lopez in 2014 (probably because he was injured in 2013) and Ricky Rubio in 2018.

# Conclusion And Plans For Further Improvements

My main goal for this project was to go through all the steps required in a machine learning project, and I am happy to have achieved that. I am aware of a lot of improvements that can be made to increase the quality of the model, but for now I will move on and revisit this project after I gain more knowledge in the data science field.

**This is the list of things that can be improved:**
* I spent very little time on improving the model. There are various methods I am aware of that can help increase the quality of the model's predictions. When I return to this project, my goal will be to significantly extend this part of the project.

* The salary shouldn't be represented as the flat dollar value of the contract, but rather as a percentage of the salary cap during the period the specific contract covers. This would make predictions more accurate when the salary cap changes. Right now the model undervalues more recent contracts, because the cap has been increasing (at least until the 2020/2021 season).

* Player stats should be represented with per-game values rather than as totals. If the player was injured during the season before the contract was signed, his predicted salary will be too low.

* Stats could be taken from the average of the last 3 seasons instead of only from the last season.

* The data set is limited; it doesn't include all players from the last 10 seasons.