In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [None]:
# Load the player dataset from a path relative to the notebook's working directory.
# NOTE(review): the '../input/...' layout looks like a Kaggle kernel path — confirm before running elsewhere.
df = pd.read_csv('../input/nba2k20-player-dataset/nba2k20-full.csv')

In [None]:
# Preview the first rows of the frame to see which columns are available.
df.head()

In [None]:
# Remove the 'jersey' and 'college' columns in a single call; neither is
# referenced by the rest of the analysis shown in this notebook.
df.drop(columns=['jersey', 'college'], inplace=True)

In [None]:
# Rename two columns in one pass:
#   'b_day'      -> 'age'         (the values are only turned into an age in later cells)
#   'draft_peak' -> 'draft_pick'  (corrects the misspelled header)
df.rename(columns={'b_day': 'age', 'draft_peak': 'draft_pick'}, inplace=True)

In [None]:
# Clean the text columns so they can be converted to numbers afterwards.
#
# regex=False is passed explicitly: '$' is a regular-expression end-of-string
# anchor, so a regex-mode replace matches the empty end position and leaves the
# dollar sign in place. The default for `regex` has changed between pandas
# releases, so literal matching is requested to get the same result everywhere.
df['salary'] = df['salary'].str.replace('$', '', regex=False)

# Undrafted players get 0 as a numeric placeholder for both round and pick.
df['draft_round'] = df['draft_round'].str.replace('Undrafted', '0', regex=False)
df['draft_pick'] = df['draft_pick'].str.replace('Undrafted', '0', regex=False)

In [None]:
# Convert the cleaned text columns to numeric dtypes, one column at a time.
for numeric_col in ('salary', 'draft_year', 'draft_pick'):
    df[numeric_col] = pd.to_numeric(df[numeric_col])

In [None]:
# Reduce the birth-date / height / weight strings to single numbers.
# NOTE(review): the original comment said the metric measurements are removed,
# but the code keeps the part AFTER the '/' — presumably the metric value;
# confirm against the raw data.
df["age"] = pd.to_datetime(df["age"]).dt.year  # birth year only at this point

height_after_slash = df["height"].str.split("/").str.get(1)
df["height"] = height_after_slash.astype(float)

# The last three characters are stripped before the cast (presumably a unit suffix).
weight_after_slash = df["weight"].str.split("/").str.get(1)
df["weight"] = weight_after_slash.str.slice(0, -3).astype(float)

In [None]:
# Turn the birth year stored in 'age' into an age relative to a fixed
# reference year (2021). `year` is reused further down the notebook.
year = 2021
df['age'] = year - df['age'].to_numpy()

In [None]:
# 'draft_round' was not part of the earlier numeric conversion; convert it here.
df['draft_round'] = pd.to_numeric(df['draft_round'])

In [None]:
# Inspect the dtype of every column to verify the conversions took effect.
df.dtypes

In [None]:
# Display the cleaned DataFrame as this cell's output; this is the frame used
# for the exploration plots and (via a copy) for the prediction models.

df

In [None]:
# Create an independent (deep) copy to preserve `df`; the copy, `df2`, is the
# frame that gets reshaped for the machine-learning algorithms.
df2 = df.copy(deep=True)

In [None]:
# Replace the draft year with the number of years elapsed since the draft
# (relative to `year`) and rename the column to reflect its new meaning.
df2['draft_year'] = year - df2['draft_year'].to_numpy()
df2 = df2.rename(columns={'draft_year': 'active seasons'})

In [None]:
# Data exploration: salary spread per team, one box per team.
plt.figure(figsize=(10, 5))
x = sns.boxplot(data=df, x="team", y="salary")
x.set_title("Salary distribution by team", fontsize=18)
# Team names are long, so the tick labels are rotated to stay readable.
plt.xticks(rotation='vertical')

In [None]:
# Salary against age: individual players as jittered dots, with a per-age
# point estimate drawn on top of them.
plt.figure(figsize=(8, 5))
age_vs_salary = dict(x='age', y='salary', data=df2)
sns.stripplot(jitter=True, alpha=0.5, **age_vs_salary)
plt.title("Salary by age", fontsize=18)
sns.pointplot(join=False, palette='muted', **age_vs_salary)

In [None]:
# Salary against the 'active seasons' column of df2, one dot per player.
plt.figure(figsize=(8, 5))
plt.title("Salary by seasons played", fontsize=18)
sns.stripplot(data=df2, x='active seasons', y='salary')

In [None]:
# Salary per draft pick: bars for the per-pick estimate, with a smaller black
# point plot of the same quantity layered on top.
plt.figure(figsize=(13, 6))
pick_vs_salary = dict(x='draft_pick', y='salary', data=df)
sns.barplot(**pick_vs_salary)
plt.title("Salary by draft pick", fontsize=18)
sns.pointplot(scale=0.6, color='k', **pick_vs_salary)

In [None]:
# Salary spread per position.
#
# The tick labels used to be a hard-coded alphabetical list, while seaborn
# orders non-numeric categories by first appearance in the data unless an
# explicit `order` is given — so a label could end up under the wrong box.
# The order is now fixed explicitly and the labels are derived from it, so
# boxes and labels always agree.
position_order = sorted(df['position'].dropna().unique())
p = sns.boxplot(x='position', y='salary', data=df, order=position_order)
plt.xticks(range(len(position_order)), position_order)

In [None]:
# Rating spread per team, one box per team.
plt.figure(figsize=(10, 5))
x = sns.boxplot(data=df, x="team", y="rating")
x.set_title("Rating by team", fontsize=18)
# Rotate the team names so they do not overlap.
plt.xticks(rotation='vertical')

In [None]:
# Label-encode the 'position' column as integers in both frames.

from sklearn.preprocessing import LabelEncoder

encode = LabelEncoder()

# fit_transform learns the distinct labels and maps the column in one step.
# The encoder is re-fitted for each frame, so afterwards it is fitted on the
# positions found in df2.
df.position = encode.fit_transform(df.position)
df2.position = encode.fit_transform(df2.position)

In [None]:
# Pairwise correlations of the numeric columns only.
# `df` still carries non-numeric columns at this point (e.g. 'full_name',
# 'team', 'country' — presumably text), and recent pandas versions raise on
# those instead of silently skipping them. Selecting numeric dtypes first
# behaves the same on old and new pandas.
df.select_dtypes(include='number').corr()

In [None]:
# The heatmap visualizes the correlation between the numeric factors.

plt.figure(figsize=(8, 6))
# Only numeric columns are correlated: newer pandas raises if text columns are
# passed to corr() rather than dropping them implicitly.
corr = df.select_dtypes(include='number').corr()
# The `np.bool` alias has been removed from NumPy; the builtin `bool` is the
# supported spelling. An all-False mask hides nothing, i.e. the full matrix is drawn.
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool), cmap='vlag', square=True)

In [None]:
# Remove the columns that are not used as inputs to the prediction models.
df2.drop(
    columns=['full_name', 'country', 'draft_round', 'draft_pick'],
    inplace=True,
)

In [None]:
# 'team' is likewise not used as a model input.
df2.drop(columns=['team'], inplace=True)

In [None]:
# Reorder the columns so that 'salary' comes last, after the other six columns.
column_order = ['rating', 'position', 'age', 'height', 'weight', 'active seasons', 'salary']
df2 = df2.loc[:, column_order]

In [None]:
# Final dataset to be used in our algorithms: the remaining df2 columns,
# with 'salary' as the last column.

df2

In [None]:
# Here we enter the predictive analysis portion of the project.
# We are predicting the salary of NBA players based on their NBA 2K rating,
# position, age, height, weight, and number of seasons played.
#
# The features are selected by name: the previous positional slice
# (`iloc[:, 0:5]`) stopped one column short and silently left out
# 'active seasons', even though it is listed as a predictor above.
feature_columns = ['rating', 'position', 'age', 'height', 'weight', 'active seasons']

x = df2[feature_columns].values
y = df2['salary'].values

In [None]:
# Hold out 10% of the rows for testing and train on the other 90%; the small
# test share leaves more training data for this small dataset. The seed is
# fixed so the split is reproducible.

from sklearn.model_selection import train_test_split

split_parts = train_test_split(x, y, test_size=0.10, random_state=0)
xTrain, xTest, yTrain, yTest = split_parts

In [None]:
# Print the shapes of the training features and training targets.
for train_part in (xTrain, yTrain):
    print(train_part.shape)

In [None]:
# Linear regression baseline. The printed score is the model's `score` on the
# held-out test split; the original note reported roughly 80%, which it
# considered good for a training set of fewer than 400 rows.

from sklearn import linear_model
from sklearn.linear_model import LinearRegression

lr = LinearRegression().fit(xTrain, yTrain)

# Test-set predictions are kept in `pred` for the comparison plot.
pred = lr.predict(xTest)

lr_test_score = lr.score(xTest, yTest)
print('Linear Regression Score: %.3f' % lr_test_score)

In [None]:
# Scatter of salary against rating with a fitted regression line.
plt.figure(figsize=(8, 5))
reg_ax = sns.regplot(data=df, x='rating', y='salary', color='royalblue')
reg_ax.set_title('Linear Regression Plot')

In [None]:
f, ax = plt.subplots(1, figsize=(6, 3), sharex=True)

# Overlay the actual test salaries and the linear-regression predictions on
# the same strip so their spread can be compared.
lr_series = (
    (yTest, 'darkmagenta', 'Test Data'),
    (pred, 'lawngreen', 'Predicted Data'),
)
for values, colour, tag in lr_series:
    sns.stripplot(x=values.flatten(), color=colour, alpha=0.7, label=tag)

plt.xlabel('salary')
plt.title('Linear Regression Accuracy')
plt.legend()
plt.show()

In [None]:
# Random forest regressor, scored with R^2 on the test split. The original
# note reported a result similar to the linear-regression model.

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,mean_squared_error

# Fixed seed so the forest is reproducible.
rfr = RandomForestRegressor(n_estimators=43, random_state=0).fit(xTrain, yTrain)

rfrYpred = rfr.predict(xTest)
rfrScore = r2_score(yTest, rfrYpred)

print('R2 score: %.3f' % rfrScore)

In [None]:
f, ax = plt.subplots(1, figsize=(6, 3), sharex=True)

# Compare the actual test salaries with the random-forest predictions.
# `rfrYpred` is plotted here; the previous version plotted `pred`, which holds
# the linear-regression predictions, so this figure showed the wrong model.
sns.stripplot(x = yTest.flatten(), color = 'darkmagenta', alpha = 0.7, label = 'Test Data')
sns.stripplot(x = rfrYpred.flatten(), color = 'lawngreen', alpha = 0.7, label = 'Predicted Data')
plt.xlabel('salary')
plt.title('Random Forest Regressor Accuracy')
plt.legend()
plt.show()

In [None]:
# Support vector model for salary, scored with R^2 on the test split.
#
# Salary is a continuous target, so the regression variant (SVR) is used.
# The previous SVC is a classifier: it treated every distinct salary as its
# own class and could only ever predict salaries seen during training.
# NOTE(review): features and target are unscaled; C / epsilon may need tuning.

from sklearn.svm import SVR

# The variable name is kept as-is so any other cell referring to it still works.
svclassifier = SVR(kernel='linear')
svclassifier.fit(xTrain, yTrain)

# Test-set predictions, kept in `svm` for the comparison plot.
svm = svclassifier.predict(xTest)

svmScore = r2_score(yTest,svm)
print('R2 score: %.3f' % svmScore)

In [None]:
f, ax = plt.subplots(1, figsize=(6, 3), sharex=True)

# Compare the actual test salaries with the support-vector predictions.
# `svm` is plotted here; the previous version plotted `pred`, which holds the
# linear-regression predictions, so this figure showed the wrong model.
sns.stripplot(x = yTest.flatten(), color = 'darkmagenta', alpha = 0.7, label = 'Test Data')
sns.stripplot(x = svm.flatten(), color = 'lawngreen', alpha = 0.7, label = 'Predicted Data')
plt.xlabel('salary')
plt.title('SVM Accuracy')
plt.legend()
plt.show()

Our models achieved relatively high accuracy given that the dataset has only 429 rows. The same project repeated in the year 2050 would likely yield a higher accuracy, because by then many more players will have played in the NBA and been featured in the NBA 2K video game, giving the models a larger dataset to learn from.