In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import datetime
import numpy as np

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the player CSV from the Kaggle input directory into a DataFrame.
train = pd.read_csv(
    filepath_or_buffer='../input/nba2k20-player-dataset/nba2k20-full.csv',
)

In [None]:
# Print the (rows, columns) shape explicitly: a bare expression that is not the
# last line of a cell is evaluated but never displayed.
print(train.shape)
# The final expression is rendered as the cell output: the column names as a list.
list(train.columns)

In [None]:
# Preview the first five rows of the table.
train.head(n=5)

In [None]:
# Print the distinct values of every column except the ones listed below.
skipped = ['full_name', 'b_day', 'height', 'weight', 'salary']
for _name, values in train.drop(columns=skipped).items():
    print(values.unique())

In [None]:
# Encode 'Undrafted' as zero so both draft columns can be stored as integers.
# The replaced Series is assigned back to the column: replace(..., inplace=True)
# on train[col] mutates an intermediate Series, which recent pandas versions
# warn about and which leaves `train` unchanged under copy-on-write.
# The replacement is the string '0' so the column stays homogeneous until the cast.
for draft_col in ('draft_round', 'draft_peak'):
    train[draft_col] = train[draft_col].replace('Undrafted', '0').astype('int64')
# Show the distinct draft rounds after the conversion.
train['draft_round'].unique()

In [None]:
# Collapse reversed hybrid labels ('G-F', 'C-F') onto one spelling each.
# Any label that is not a key of this mapping becomes NaN.
position_aliases = {
    'F': 'F',
    'G': 'G',
    'C': 'C',
    'F-G': 'F-G',
    'G-F': 'F-G',
    'F-C': 'F-C',
    'C-F': 'F-C',
}
train['position'] = train['position'].map(position_aliases)

In [None]:
def weight_conv(weight):
    """Return the second space-separated token of the text following the first '/'.

    The argument is converted with ``str`` first, and the result is a string.
    For example ``'215 lbs. / 97.5 kg.'`` gives ``'97.5'``.

    Raises:
        IndexError: if the text has no '/' or the part after it has no second token.
    """
    after_slash = str(weight).split('/')[1]
    tokens = after_slash.split(' ')
    return tokens[1]
def remove_dollar(salary):
    """Return ``str(salary)`` with every leading '$' character stripped."""
    as_text = str(salary)
    return as_text.lstrip('$')
def age(date, today=None):
    """Return the age in completed years for a birth date given as 'MM/DD/YY'.

    Args:
        date: Birth date string in ``%m/%d/%y`` format (two-digit year).
        today: Optional ``datetime.date`` to measure the age against.
            Defaults to the current date; pass a fixed date for reproducible results.

    Returns:
        int: Number of full years between the birth date and ``today``.

    Raises:
        ValueError: if ``date`` does not match the ``%m/%d/%y`` format.
    """
    bday = datetime.datetime.strptime(date, "%m/%d/%y")
    if today is None:
        today = datetime.date.today()
    birth_year = bday.year
    # %y maps 00-68 to 2000-2068; a birth year in the future can only mean
    # the previous century.
    if birth_year > today.year:
        birth_year -= 100
    # Subtract one year when this year's birthday has not happened yet.
    had_birthday = (today.month, today.day) >= (bday.month, bday.day)
    return today.year - birth_year - (0 if had_birthday else 1)
def jersey(jersey):
    """Return ``str(jersey)`` without any leading '#' characters."""
    label = str(jersey)
    return label.lstrip('#')
def height_clean(height):
    """Return the second space-separated token of the text following the first '/'.

    ``height`` must already be a string. For example ``'6-9 / 2.06'`` gives
    ``'2.06'`` (the result is still a string).

    Raises:
        IndexError: if the text has no '/' or the part after it has no second token.
    """
    after_slash = height.split('/')[1]
    tokens = after_slash.split(' ')
    return tokens[1]

In [None]:
# Count the missing values in each column.
train.isna().sum()

In [None]:
# Run each raw column through its helper, then cast where a numeric dtype is wanted.
# 'weight' is left as strings here.
train['weight'] = train['weight'].apply(weight_conv)
train['salary'] = train['salary'].apply(remove_dollar).astype(float)
train['jersey'] = train['jersey'].apply(jersey).astype('int32')
# 'age' is a new column derived from the birth-date strings in 'b_day'.
train['age'] = train['b_day'].apply(age).astype(float)
train['height'] = train['height'].apply(height_clean).astype(float)

In [None]:
# Convert the weight column to floating point.
train['weight'] = train['weight'].astype(float)

In [None]:
# Rename the two measurement columns so that their labels carry the unit.
train.rename(
    columns={"height": "height(in_m)", "weight": "weight(in_kg)"},
    inplace=True,
)

In [None]:
# 'exp' is the number of years between the draft year and a fixed reference year.
REFERENCE_YEAR = 2021
train['exp'] = REFERENCE_YEAR - train['draft_year']

In [None]:
# Remove the two columns that 'age' and 'exp' were derived from.
train.drop(columns=['b_day', 'draft_year'], inplace=True)

In [None]:
# Display the dtype of every column in train.
train.dtypes

In [None]:
# Turn 'exp' into a flag: 1 when the value is at least 10, otherwise 0.
train['exp'] = train['exp'].ge(10).astype('int64')

In [None]:
# Display the distinct values present in the 'exp' column.
train['exp'].unique()

In [None]:
# Relational plot of salary against age, coloured by position.
sns.relplot(
    x='age',
    y='salary',
    hue='position',
    data=train,
)

In [None]:
# Bar plot of salary for each age, with one bar colour per draft round.
sns.barplot(
    x='age',
    y='salary',
    hue='draft_round',
    data=train,
)

In [None]:
# One row of panels: salary against age, position and exp, coloured by draft round.
g = sns.PairGrid(
    train,
    x_vars=['age', 'position', 'exp'],
    y_vars=['salary'],
    hue='draft_round',
    height=4.5,
    aspect=1.1,
)
# Draw a semi-transparent scatter on every panel; the value returned by map() is kept in `ax`.
ax = g.map(plt.scatter, alpha=0.6)

In [None]:
# Display the column labels of train.
train.columns

In [None]:
# Feature matrix: every column except the target and the listed non-feature columns.
excluded = ['full_name', 'jersey', 'team', 'salary', 'college']
X = train.drop(columns=excluded)
# Target vector: the salary column.
y = train['salary']

In [None]:
# Display the dtype of every feature column in X.
X.dtypes

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace the position labels with the integer codes produced by a LabelEncoder.
le = LabelEncoder()
le.fit(X['position'])
X['position'] = le.transform(X['position'])

In [None]:
# Encode draft_round with the same encoder object. Fitting again replaces whatever
# `le` had learned before, so it can no longer decode an earlier column.
X['draft_round'] = le.fit(X['draft_round']).transform(X['draft_round'])

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Variance inflation factor of each column that remains after leaving out
# height, age and country.
vif_check = X.drop(columns=['height(in_m)', 'age', 'country'])
design = vif_check.values
vif = pd.DataFrame({
    'feature': vif_check.columns,
    "Vif": [variance_inflation_factor(design, idx) for idx in range(design.shape[1])],
})
vif

In [None]:
# Recompute the variance inflation factors with weight and draft_round left out as well.
vif_check = X.drop(columns=['height(in_m)', 'age', 'weight(in_kg)', 'country', 'draft_round'])
design = vif_check.values
vif = pd.DataFrame({
    'feature': vif_check.columns,
    "Vif": [variance_inflation_factor(design, idx) for idx in range(design.shape[1])],
})
vif

In [None]:
# Remove the listed columns from the feature matrix in place.
X.drop(
    columns=['height(in_m)', 'age', 'weight(in_kg)', 'country', 'draft_round'],
    inplace=True,
)

In [None]:
# Display the feature columns that remain in X.
X.columns

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation. random_state pins the shuffle so the
# split, and every metric computed from it, is the same on each run.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [None]:
from sklearn.ensemble import RandomForestRegressor
# 300-tree random forest. random_state fixes the bootstrap and feature sampling
# so refitting on the same data gives the same model.
rf = RandomForestRegressor(n_estimators=300, random_state=0)
rf.fit(X_train, y_train)
# Predictions for the held-out rows.
pred2 = rf.predict(X_valid)

In [None]:
# Both metrics use the held-out rows only: scoring on the rows the model was
# fitted on overstates its quality.
print('RMSE ', np.sqrt(mean_squared_error(y_valid,pred2)))
# score() of a regressor is the coefficient of determination (R^2), not an accuracy.
print('R^2 (validation, %) ', round(rf.score(X_valid,y_valid)*100,2))

In [None]:
import plotly.graph_objects as go
# Overlay the predictions and the true validation values as two line traces,
# both plotted against the row position 0..n-1.
fig = go.Figure()
for series, label in ((pred2, 'Prediction'), (y_valid, 'True value')):
    positions = list(range(len(series)))
    fig.add_trace(go.Scatter(x=positions, y=series, mode='lines', name=label))

fig.show()

In [None]:
from xgboost import XGBRegressor
# Hyper-parameters for the gradient-boosted regressor, collected in one place.
xgb_params = dict(
    n_estimators=300,
    learning_rate=0.2,
    min_child_weight=3,
    max_depth=2,
    subsample=0.75,
    seed=0,
)
xgb = XGBRegressor(**xgb_params)
xgb.fit(X_train, y_train)
# Predictions for the held-out rows.
pred3 = xgb.predict(X_valid)

In [None]:
# Both metrics use the held-out rows only: scoring on the rows the model was
# fitted on overstates its quality.
print('RMSE ', np.sqrt(mean_squared_error(y_valid,pred3)))
# score() of a regressor is the coefficient of determination (R^2), not an accuracy.
print('R^2 (validation, %) ', round(xgb.score(X_valid,y_valid)*100,2))

In [None]:
# Overlay the XGBoost predictions and the true validation values as two line
# traces, both plotted against the row position 0..n-1.
fig = go.Figure()
for series, label in ((pred3, 'Prediction'), (y_valid, 'True value')):
    positions = list(range(len(series)))
    fig.add_trace(go.Scatter(x=positions, y=series, mode='lines', name=label))

fig.show()