# Importing data and data cleaning.

Importing the required libraries

In [None]:
import pandas as pd
import numpy as np


Reading uploaded file

In [None]:
# Load the NBA 2K20 player table from the dataset's CSV file and display it.
csv_path = "../input/nba2k20-player-dataset/nba2k20-full.csv"
df = pd.read_csv(csv_path)
df

In [None]:
# Getting information on the dataset: print pandas' summary of the raw frame
# so the columns and their dtypes can be inspected before any cleaning.
df.info()

In [None]:
# Count how many missing entries each column contains.
null_counts = df.isnull().sum()
null_counts

Null values are present in the team and college columns, which means those players have not gone to a college or do not belong to any team. We can replace them with 'Not any'.

In [None]:
# Fill every missing entry with the "Not any" placeholder.
# The `np.NaN` alias was removed in NumPy 2.0 (only `np.nan` remains), so a
# NaN-keyed replace raises AttributeError there; `fillna` expresses the same
# intent without touching the alias.
df.fillna("Not any", inplace=True)

# Confirm that no nulls are left in any column.
df.isna().sum()

In [None]:
# Re-check the frame summary after the missing values have been replaced.
df.info()

Converting all the string-encoded columns to numbers

In [None]:
# taking height in metres
# Each raw value is a string with two parts separated by "/"; the part after
# the slash is kept (the metric value, going by the comment above) and parsed
# as a float.
# The conversion is scoped to the 'height' column with an explicit dtype
# instead of a frame-wide `df.replace`, so other columns cannot be touched and
# the result does not depend on implicit dtype inference.
# The guard makes the cell safe to re-run once the column is already numeric.
if not pd.api.types.is_numeric_dtype(df['height']):
    df['height'] = (
        df['height']
        .str.split("/").str[1]
        .str.strip()
        .astype(float)
    )

In [None]:
# removing '#' from jersey
# Drop the leading character (the '#') and parse the remainder as an integer.
# Only the 'jersey' column is converted, with an explicit dtype, rather than
# running a frame-wide `df.replace` keyed on the jersey strings.
# The guard makes the cell safe to re-run once the column is already numeric.
if not pd.api.types.is_numeric_dtype(df['jersey']):
    df['jersey'] = df['jersey'].str[1:].astype(int)

In [None]:
# taking weight in kg
# Keep the part after the "/" separator, remove the literal " kg." suffix,
# trim the surrounding whitespace and parse the remainder as a float.
# Only the 'weight' column is converted, with an explicit dtype, rather than
# running a frame-wide `df.replace` keyed on the weight strings.
# The guard makes the cell safe to re-run once the column is already numeric.
if not pd.api.types.is_numeric_dtype(df['weight']):
    df['weight'] = (
        df['weight']
        .str.split("/").str[1]
        .str.replace(" kg.", "", regex=False)  # literal match, not a regex
        .str.strip()
        .astype(float)
    )

In [None]:
# removing '$' from salary
# Drop the leading character (the '$') and parse the remainder as an integer.
# Only the 'salary' column is converted, with an explicit dtype, rather than
# running a frame-wide `df.replace` keyed on the salary strings; the later
# cells select columns by numeric dtype, so the dtype must not be left to
# implicit inference.
# The guard makes the cell safe to re-run once the column is already numeric.
if not pd.api.types.is_numeric_dtype(df['salary']):
    df['salary'] = df['salary'].str[1:].astype(int)

In [None]:
# Map the 'Undrafted' marker to 0, then cast both draft columns to integers.
draft_columns = ['draft_round', 'draft_peak']
df.replace({"Undrafted": 0}, inplace=True)
df[draft_columns] = df[draft_columns].astype("int")

Dropping the full name column, as it is not needed

In [None]:
# Remove the player-name column in place, then print the remaining schema.
df.drop("full_name", axis=1, inplace=True)
df.info()

Getting dummies for categorical values

In [None]:
# generating dummies for categorical columns.
#
# Differences from a plain `pd.get_dummies(df[col])` per column:
#   * `prefix=col` keeps the indicator names unique. The same label can occur
#     in more than one source column (the "Not any" placeholder is written
#     into both `team` and `college`), and duplicate column labels would make
#     later label-based selection return repeated columns.
#   * `dtype=int` keeps the indicators numeric. pandas >= 2.0 returns `bool`
#     dummies by default, and a frame mixing bool with int/float columns is
#     cast to an object array, which statsmodels' OLS rejects.
#   * the pieces are collected in a list and concatenated once instead of
#     growing the frame inside the loop.
numeric_part = df.select_dtypes(include='number')
categorical_cols = df.select_dtypes(exclude='number').columns

dummy_frames = []
for col in categorical_cols:
    dummy_df = pd.get_dummies(df[col], prefix=col, dtype=int)
    # Drop the last level so each categorical keeps one reference category.
    dummy_frames.append(dummy_df.iloc[:, :-1])

numerical_df = pd.concat([numeric_part, *dummy_frames], axis=1)

df = numerical_df
df

We see that there are a lot of columns, so we will take only the columns that are highly correlated with salary.

In [None]:
# Keep only the columns whose correlation with salary is at least 0.2 in
# absolute value (moderately or highly correlated), excluding salary itself.
corr_df = df.corr()

salary_corr = corr_df['salary']
is_correlated = salary_corr.abs() >= 0.2
hcorf = [label for label in salary_corr[is_correlated].index if label != 'salary']
hcorf

So let's make a regression model from statsmodels

In [None]:
# Separate the features from the target, then split both into train and test
# partitions (70% / 30%, fixed seed so the split is reproducible).
from sklearn.model_selection import train_test_split

y = df['salary']
X = df[hcorf]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
X_train, X_test, y_train, y_test

In [None]:
# First model: ordinary least squares on every selected feature, via statsmodels.
import statsmodels.api as sm

# statsmodels does not add an intercept on its own, so prepend a constant column.
X_train_sm = sm.add_constant(X_train)

# Build the model, then fit it on the training partition.
first_ols = sm.OLS(y_train, X_train_sm)
lr = first_ols.fit()

# Show the regression report.
print(lr.summary())

In [None]:
# Measure the first model's error on the held-out test partition.
from sklearn.metrics import mean_absolute_error,mean_squared_log_error,mean_squared_error

# The test features need the same constant column the model was trained with.
X_test_sm = sm.add_constant(X_test)
y_test_pred = lr.predict(X_test_sm)

for label, metric in (("MAE:", mean_absolute_error), ("MSE:", mean_squared_error)):
    print(label, metric(y_test, y_test_pred))


Let's see if only one attribute is enough

In [None]:
# Plot the correlations among the selected features and salary to decide
# which single feature to use for the second model.
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('classic')
fig, ax = plt.subplots(figsize=(20, 5))
ax.set_title("Heatmap")

selected_corr = df[hcorf + ["salary"]].corr()
sns.heatmap(selected_corr, annot=True, ax=ax)
plt.show()

We can see that rating is more strongly correlated with salary than draft year, so let's build the second model using rating only.

In [None]:
# Same 70% / 30% split as before (same seed), but with rating as the only feature.
y = df['salary']
X = df['rating']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
X_train, X_test, y_train, y_test

In [None]:
# Second model: ordinary least squares of salary on rating alone.
import statsmodels.api as sm

# Prepend the constant column so the fit includes an intercept.
X_train_sm = sm.add_constant(X_train)

rating_ols = sm.OLS(y_train, X_train_sm)
lr_1 = rating_ols.fit()

print(lr_1.summary())

In [None]:
# Measure the second model's error on the held-out test partition.
# The raw predictions are scored, exactly as for the first model. Wrapping
# them in np.abs would mirror a negative prediction into a positive one
# (e.g. -2M becomes +2M), which is not what the model predicted and would make
# the two models' MAE/MSE figures incomparable.
y_test_pred = lr_1.predict(sm.add_constant(X_test))

print("MAE:",mean_absolute_error(y_test,y_test_pred))
print("MSE:",mean_squared_error(y_test,y_test_pred))


So our first model was the best