In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the NBA 2K20 player dataset and preview the first rows
DATA_PATH = '/kaggle/input/nba2k20-player-dataset/nba2k20-full.csv'
data = pd.read_csv(DATA_PATH)
data.head()

In [None]:
# Parse the raw string columns b_day / height / weight into numeric columns.
# Every step is skipped when the column is already numeric, so re-running this
# cell is a no-op. Without the guard a second run would re-read the integer
# years as epoch offsets in pd.to_datetime and crash on .split for floats.
is_numeric = pd.api.types.is_numeric_dtype

# convert b_day column to get just the year
if not is_numeric(data['b_day']):
    data['b_day'] = pd.to_datetime(data['b_day']).dt.year

# get height in meters: keep the part after the '/' separator
if not is_numeric(data['height']):
    data['height'] = pd.to_numeric(
        data['height'].str.split('/').str[1].str.strip()
    )

# get weight in kgs: keep the part after '/' and before the 'kg' suffix
if not is_numeric(data['weight']):
    data['weight'] = pd.to_numeric(
        data['weight'].str.split('/').str[1].str.split('kg').str[0].str.strip()
    )

In [None]:
# Summarise the frame: column names, dtypes and non-null counts
data.info()

In [None]:
# convert salary to a number by removing the leading $ sign.
# Only the '$' character is stripped (not "whatever the first character is"),
# and the step is skipped when the column is already numeric so the cell can
# be re-run safely.
if not pd.api.types.is_numeric_dtype(data['salary']):
    data['salary'] = pd.to_numeric(data['salary'].str.lstrip('$'))

# convert draft_peak to a number by replacing 'Undrafted' with 0
data['draft_peak'] = pd.to_numeric(data['draft_peak'].replace('Undrafted', 0))

# update position column (determined in EDA): fold the mirrored labels so that
# 'G-F' is counted as 'F-G' and 'C-F' as 'F-C'. Every other label is kept
# unchanged instead of being turned into NaN when it is not listed.
data['position'] = data['position'].replace({'G-F': 'F-G', 'C-F': 'F-C'})

# EDA

In [None]:
# Histogram of the target variable (salary)
sns.histplot(data=data, x='salary')

In [None]:
def plot_numerical(feature):
    """Plot `feature` against salary with a fitted regression line.

    Reads the module-level `data` frame and shows the figure immediately.
    """
    sns.lmplot(data=data, x=feature, y='salary')
    plt.show()
    
def plot_categorical(feature, figsize=None):
    """Plot per-category salary statistics for one column of `data`.

    For every distinct value of `feature` in the module-level `data` frame,
    draws grouped bars for the median, mean and minimum salary. The number of
    rows per category is overlaid as a dashed red line on a secondary axis and
    as a red label above each mean bar.

    Parameters
    ----------
    feature : str
        Name of the column in `data` to group by.
    figsize : tuple of (float, float), optional
        Figure size forwarded to `plt.subplots`. Defaults to (18, 5).
    """
    if figsize is None:
        figsize = (18, 5)

    stats = data.groupby([feature])['salary'].describe()[['mean', '50%', 'min', 'count']]

    labels = stats.index.values
    x = np.arange(len(labels))
    width = 0.9              # total width occupied by one group of bars
    bar_width = width / 3    # three bars share that width
    fig, ax1 = plt.subplots(figsize=figsize)

    # plot bars for median, mean and min salary; the group is centred on the
    # tick at x so the bars line up with their category label
    ax1.bar(x - bar_width, stats['50%'], bar_width, label='median')
    mean_bars = ax1.bar(x, stats['mean'], bar_width, label='mean')
    ax1.bar(x + bar_width, stats['min'], bar_width, label='min')

    ax1.set_ylabel('Salary', fontsize=12)
    ax1.set_title(feature, fontsize=15)
    ax1.set_xticks(x)
    ax1.set_xticklabels(labels, rotation=90)
    ax1.legend()

    # plot counts of data points on a secondary y-axis, aligned with the
    # mean bars that carry the count labels
    ax2 = ax1.twinx()
    ax2.set_ylabel('Counts', fontsize=12)
    ax2.plot(x, stats['count'], color='red', linestyle='dashed')

    # annotate counts of data points just above each mean bar
    for bar, count in zip(mean_bars, stats['count']):
        bar_top = int(round(bar.get_height()))
        ax1.annotate('{}'.format(int(count)),
                     xy=(bar.get_x() + bar.get_width() / 2, bar_top),
                     xytext=(0, 3), textcoords="offset points",
                     ha='center', va='bottom', color='red')
    plt.show()

In [None]:
# Salary against the player rating
for feature in ('rating',):
    plot_numerical(feature)

In [None]:
# Salary statistics per jersey, team, college and country
for feature in ('jersey', 'team', 'college', 'country'):
    plot_categorical(feature)

In [None]:
# Salary statistics per playing position
for feature in ('position',):
    plot_categorical(feature)

In [None]:
# Salary against birth year, height and weight
for feature in ('b_day', 'height', 'weight'):
    plot_numerical(feature)

In [None]:
# Salary against draft year and draft pick
for feature in ('draft_year', 'draft_peak'):
    plot_numerical(feature)

In [None]:
# Salary statistics per draft round
plot_categorical(feature='draft_round')

**Columns that contribute towards a higher salary:**
* rating - positive relationship: higher-rated players earn more
* position - the 'C' position gets the highest salary
* b_day - older players tend to get a higher salary
* draft_year - the more recently a player was drafted, the lower the salary
* draft_round - players drafted in the 1st round get a higher salary, while Undrafted players get the least





In [None]:
# Column groups used by the encoding and correlation cells
categorical_features = [
    'jersey', 'team', 'position',
    'country', 'draft_round', 'college',
]
numerical_features = [
    'rating', 'b_day', 'height',
    'weight', 'draft_year', 'draft_peak',
]
# full_name contains all unique values
to_drop = ['full_name']

# Label encoding categorical features for correlation

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import joblib

In [None]:
# Label-encode every categorical feature on a copy of `data` and persist the
# fitted encoders (and their class-derived column names) under TextEncoding/.
df = data.copy()
path = '/kaggle/working'

# create directory to save label encoding models (once, not once per feature)
encoding_dir = os.path.join(path, "TextEncoding")
os.makedirs(encoding_dir, exist_ok=True)

for feature in categorical_features:
    # perform label encoding
    le = LabelEncoder()
    le.fit(df[feature])

    # save the encoder; joblib is given the path so it opens and closes the
    # file itself instead of receiving an open() handle that is never closed
    joblib.dump(le, os.path.join(encoding_dir, "le_{}.sav".format(feature)))

    # transform training data
    df[feature] = le.transform(df[feature])

    # get classes & remove the first one to avoid the dummy variable trap
    columns = ["{} {}".format(feature, cls) for cls in le.classes_][1:]

    # save classes
    joblib.dump(columns,
                os.path.join(encoding_dir, "le_{}_classes.sav".format(feature)))

# CORRELATION

In [None]:
# Spearman correlation between the numerical features, rounded to 2 decimals
numeric_corr = data[numerical_features].corr(method='spearman').round(2)

plt.figure(figsize=(5, 5))
sns.heatmap(numeric_corr, annot=True, mask=None, cmap='GnBu')
plt.show()

In [None]:
# Spearman correlation across the label-encoded categorical features,
# the numerical features and the salary target, rounded to 2 decimals
corr_columns = categorical_features + numerical_features + ['salary']
full_corr = df[corr_columns].corr(method='spearman').round(2)

plt.figure(figsize=(10, 10))
sns.heatmap(full_corr, annot=True, mask=None, cmap='GnBu')
plt.show()

**Observations (correlated feature pairs):**
* b_day and draft_year are correlated
* height and weight are correlated
* position is negatively correlated with height and weight

# Removing features using VIF

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Calculating VIF for every feature that is not in the exclusion set
excluded = {'team', 'college', 'draft_year', 'height', 'weight', 'b_day', 'country'}
candidates = [name for name in categorical_features + numerical_features
              if name not in excluded]

# rows with any missing value are dropped before building the design matrix
temp = df.dropna()
design = temp[candidates].values

# NOTE(review): no intercept column is added to the design matrix before the
# VIFs are computed - confirm this is intended.
vif = pd.DataFrame()
vif["variables"] = candidates
vif["VIF"] = [variance_inflation_factor(design, idx) for idx in range(len(candidates))]
print(vif)

# Handling Missing Values

In [None]:
# Percentage of missing values per column; display only the columns that have any
missing_pct = data.isnull().sum() * 100 / len(data)
missingValueFeatures = pd.DataFrame({'missing %': missing_pct})
missingValueFeatures[missingValueFeatures['missing %'].gt(0)]

team and college are not correlated with any other feature, and they are not correlated with each other either. As neither college nor team affects the salary much, we may drop them when training the model.

# Handling Categorical Features (Label Encoding & One Hot Encoding)

In [None]:
# Restrict the categorical features to the ones kept after the VIF and
# correlation observations.
# jersey is ignored because its data is very sparse and there are too few
# data points per value.
categorical_features = ['position', 'draft_round']

In [None]:
# Label-encode, then one-hot encode, the selected categorical features.
# Results:
#   df          - copy of `data` whose categorical columns hold label codes
#   encodedData - one frame with the encoded columns of all features
# The fitted encoders and column names are persisted under TextEncoding/.
df = data.copy()
path = '/kaggle/working'

# create directory to save the encoding models (once, not once per feature)
encoding_dir = os.path.join(path, "TextEncoding")
os.makedirs(encoding_dir, exist_ok=True)

encoded_parts = []
for feature in categorical_features:

    le = LabelEncoder()
    # Newer scikit-learn releases spell the dense-output switch `sparse_output`
    # and no longer accept `sparse`; fall back for older installations.
    try:
        ohe = OneHotEncoder(sparse_output=False)
    except TypeError:
        ohe = OneHotEncoder(sparse=False)

    # perform label encoding
    le.fit(df[feature])
    # save the encoder; joblib is given the path so it opens and closes the
    # file itself instead of receiving an open() handle that is never closed
    joblib.dump(le, os.path.join(encoding_dir, "le_{}.sav".format(feature)))

    # transform training data
    df[feature] = le.transform(df[feature])

    # get classes & remove the first one to avoid the dummy variable trap
    columns = ["{} {}".format(feature, cls) for cls in le.classes_][1:]

    # save classes (the list is used directly below; reloading the file that
    # was just written is unnecessary)
    joblib.dump(columns,
                os.path.join(encoding_dir, "le_{}_classes.sav".format(feature)))

    if len(le.classes_) > 2:
        # perform one hot encoding
        ohe.fit(df[[feature]])
        # save the encoder
        joblib.dump(ohe, os.path.join(encoding_dir, "ohe_{}.sav".format(feature)))

        # transform training data, removing the first encoded column to
        # avoid the dummy variable trap
        onehot = ohe.transform(df[[feature]])[:, 1:]

        # create DataFrame with columns as classes; reuse df's index so the
        # rows stay aligned when the pieces are concatenated column-wise
        tempData = pd.DataFrame(onehot, columns=columns, index=df.index)
    else:
        # a feature with at most two classes is already a single 0/1 column
        tempData = df[[feature]]

    encoded_parts.append(tempData)

# create one dataframe with all the encoded categorical features
if encoded_parts:
    encodedData = pd.concat(encoded_parts, axis=1)
else:
    encodedData = pd.DataFrame(index=df.index)

In [None]:
# Keep the numerical features and the salary target, then append the
# encoded categorical columns side by side
kept_columns = numerical_features + ['salary']
df = pd.concat([df[kept_columns], encodedData], axis=1)
df.info()

# Training Model

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn import metrics, preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVR
from xgboost import XGBRegressor

In [None]:
train_data = df.copy()

# Model inputs: every column except the target and the excluded features
excluded_cols = ['b_day', 'salary', 'height', 'weight', 'draft_peak']
feature_cols = [col for col in train_data.columns if col not in excluded_cols]

# Standardise the model inputs. StandardScaler centres each column on its mean
# and scales it to unit variance - it does not rescale values to [0, 1].
# NOTE(review): the scaler is fitted on all rows, before any train/test split
# is made from train_data - confirm this is acceptable for the evaluation.
scaler = StandardScaler()
scaler.fit(train_data[feature_cols])
train_data[feature_cols] = scaler.transform(train_data[feature_cols])

In [None]:
# Separate the model inputs from the salary target
X, y = train_data[feature_cols], train_data['salary']

# Hold out a fixed fraction of the rows for validation (seeded for repeatability)
validation_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=validation_size,
    random_state=0,
)

# Model 1: Linear Regression

In [None]:
# Ordinary least squares baseline fitted on the training split
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [None]:
# Report RMSE and R^2 for the training split, then for the validation split.
# For a regressor, model.score returns the coefficient of determination (R^2),
# not an accuracy, so the printed label says so.
for split_name, X_split, y_split in (('Train', X_train, y_train),
                                     ('Validation', X_test, y_test)):
    y_pred = model.predict(X_split)

    print('{} metrics...'.format(split_name))
    print('RMSE: ', np.sqrt(mean_squared_error(y_split, y_pred)))
    print('R^2 (%): ', round(model.score(X_split, y_split)*100, 2))

# After the loop y_pred holds the predictions for X_test (the last split).

In [None]:
import plotly.graph_objects as go

# Overlay the predictions in y_pred and the true values in y_test,
# both drawn as lines against their positional index
fig = go.Figure()
for series, label in ((y_pred, 'Prediction'), (y_test, 'True value')):
    positions = list(range(len(series)))
    fig.add_trace(go.Scatter(x=positions, y=series, mode='lines', name=label))

fig.show()

In [None]:
# Gradient-boosted regressor with shallow trees and row subsampling.
# eval_metric and early_stopping_rounds are set on the estimator: current
# XGBoost releases no longer accept them as fit() arguments (requires
# XGBoost >= 1.6). random_state is the documented name for the seed.
# NOTE(review): early stopping is driven by (X_test, y_test); metrics computed
# on that same split afterwards will be optimistic.
model = XGBRegressor(
    n_estimators=300,
    learning_rate=0.2,
    min_child_weight=3,
    max_depth=2,
    subsample=0.75,
    random_state=0,
    eval_metric="rmse",
    early_stopping_rounds=20)


model = model.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    verbose=False)

In [None]:
# Report RMSE and R^2 for the training split, then for the validation split.
# For a regressor, model.score returns the coefficient of determination (R^2),
# not an accuracy, so the printed label says so.
for split_name, X_split, y_split in (('Train', X_train, y_train),
                                     ('Validation', X_test, y_test)):
    y_pred = model.predict(X_split)

    print('{} metrics...'.format(split_name))
    print('RMSE: ', np.sqrt(mean_squared_error(y_split, y_pred)))
    print('R^2 (%): ', round(model.score(X_split, y_split)*100, 2))

# After the loop y_pred holds the predictions for X_test (the last split).

In [None]:
# Overlay the predictions in y_pred and the true values in y_test,
# both drawn as lines against their positional index
fig = go.Figure()
for series, label in ((y_pred, 'Prediction'), (y_test, 'True value')):
    positions = list(range(len(series)))
    fig.add_trace(go.Scatter(x=positions, y=series, mode='lines', name=label))

fig.show()