In [42]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso
from sklearn.svm import SVC, SVR, LinearSVC
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler

In [43]:
df=pd.read_csv('nba_cc_fake_data.csv')
df.drop('Unnamed: 0', axis=1, inplace=True)
df.head()

Unnamed: 0,Comp,Height,Points,Salary
0,9.0,76.0,27.0,0.0
1,7.0,78.0,39.0,0.0
2,9.0,76.0,39.0,0.0
3,9.0,74.0,39.0,0.0
4,9.0,74.0,26.0,0.0


In [61]:
# Number of rows with Salary = 0 :
df[df.Salary==0].Salary.count()

9413

### 1. Least Squares Regression is not a good model:
Explain why linear regression is not appropriate, given the nature of the data.


Answer: <br>
This is because a majority of the labels are zeroes(9,413 out of 10,000) and when predicting salary, the model tries to fit a line between the zeroes and the rows with non-zero salaries. The lowest non-zero salary is very high compared to zero and thus we get non-zero predictions using least-squares regression. Thus it is not a good idea to fit Linear regression to a problem with such a variation in labels.

### 2. Prediction using Least Squares Regression

Try least squares regression, anyway. How well do you do?

In [69]:
X=df.iloc[:,:3].values
Y=df.iloc[:,3].values

X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.2, random_state = 0)

In [63]:
lr=LinearRegression()
lr.fit(X_train,Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [64]:
y_pred=lr.predict(X_test)
y_pred

array([ 86344.59732927,  19121.40852818,  11914.96574179, ...,
       -22231.93667264, 127528.22085956, 118773.94590382])

In [65]:
r2_score(Y_test,y_pred)

0.17787434482059772

#### As expected the R2 Score is quite low ~ 0.18

### 3. Composite Model

You will next build a composite model. You will first predict the probability that a player actually makes it to the NBA at all, and then you will build a model to predict the salary of a player, conditioned on the fact of making it to the NBA.<br>
– Build a model that predicts the probability of making it to the NBA.<br>
– Do a train-test split of 8000/2000 points, train your best model on the training set, and
compute the AUC on the test set.<br>
– Now, build a model to predict the salary. Note that you may wish to consider a non-
linear transformation of your data. What is your R2 score on the test set?<br>

#### A. Building classification model to determine if player goes to NBA or not 

In [77]:
# Y_nba with 0,1 instead of salary
Y_nba=Y.copy()
Y_nba[np.where(Y>0)]=1
X_train, X_test, Y_train, Y_test = train_test_split(X,Y_nba,test_size = 0.2, random_state = 0)

In [78]:
def predict_auc(clf, X_train=X_train, X_test=X_test, Y_train=Y_train, Y_test=Y_test):
    clf.fit(X_train, Y_train)
    y_pred=clf.predict_proba(X_test)[:, 1]
    auc_score=roc_auc_score(Y_test,y_pred)
    return round(auc_score,4)

In [79]:
print("AUC Score for Logistic Regression on whole X,Y as training set: ",predict_auc(LogisticRegression(C=10), X,X,Y_nba,Y_nba))
print("AUC Score for Logistic Regression on train_X and AUC on test set: ",predict_auc(LogisticRegression(C=10)))


AUC Score for Logistic Regression on whole X,Y as training set:  0.9395
AUC Score for Logistic Regression on train_X and AUC on test set:  0.9427


In [71]:
# Trying with standardized features
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

In [80]:
print("AUC Score for Logistic Regresion on whole X,Y as training set: ",predict_auc(LogisticRegression(C=10), X_train_std, X_test_std, Y_train, Y_test))

AUC Score for Logistic Regresion on whole X,Y as training set:  0.9418


#### B. Finding Best model on 80/20 split of data

In [81]:
# Train-test split of 8000/2000 points
X_train, X_test, Y_train, Y_test = train_test_split(X,Y_nba,test_size = 0.2, random_state = 0)

In [82]:
print("AUC Score for DecisionTreeClassifier: ",predict_auc(DecisionTreeClassifier(max_depth=5), X_train, X_test, Y_train, Y_test))
print("AUC Score for RandomForestClassifier: ",predict_auc(RandomForestClassifier(n_estimators=8, max_depth=5), X_train, X_test, Y_train, Y_test))
print("AUC Score for GradientBoostingClassifier: ",predict_auc(GradientBoostingClassifier(n_estimators=30, max_depth=4), X_train, X_test, Y_train, Y_test))
print("AUC Score for AdaBoostClassifier: ",predict_auc(AdaBoostClassifier(n_estimators=100), X_train, X_test, Y_train, Y_test))
print("AUC Score for XGBoost: ",predict_auc(XGBClassifier(n_estimators=100), X_train, X_test, Y_train, Y_test))

AUC Score for DecisionTreeClassifier:  0.925
AUC Score for RandomForestClassifier:  0.9278
AUC Score for GradientBoostingClassifier:  0.9383
AUC Score for AdaBoostClassifier:  0.924
AUC Score for XGBoost:  0.9389


In [86]:
print("AUC Score for DecisionTreeClassifier: ",predict_auc(DecisionTreeClassifier(max_depth=5), X_train_std, X_test_std, Y_train, Y_test))
print("AUC Score for RandomForestClassifier: ",predict_auc(RandomForestClassifier(n_estimators=8, max_depth=5), X_train_std, X_test_std, Y_train, Y_test))
print("AUC Score for GradientBoostingClassifier: ",predict_auc(GradientBoostingClassifier(n_estimators=30, max_depth=4), X_train_std, X_test_std, Y_train, Y_test))
print("AUC Score for AdaBoostClassifier: ",predict_auc(AdaBoostClassifier(n_estimators=100), X_train_std, X_test_std, Y_train, Y_test))
print("AUC Score for XGBoost: ",predict_auc(XGBClassifier(n_estimators=100), X_train_std, X_test_std, Y_train, Y_test))

AUC Score for DecisionTreeClassifier:  0.925
AUC Score for RandomForestClassifier:  0.9283
AUC Score for GradientBoostingClassifier:  0.9383
AUC Score for AdaBoostClassifier:  0.924
AUC Score for XGBoost:  0.9389


In [87]:
# Tuning XGBoost Classifier
import warnings
warnings.filterwarnings("ignore")

xgparameters={  'max_delta_step':[0],
                'n_estimators':[80],
                'booster':['gbtree'],
                'min_child_weight':[0.5],
                'max_depth':[3],
                'learning_rate':[0.1]
             }

clf=GridSearchCV(XGBClassifier(random_state=7),xgparameters,cv=5)
clf.fit(X_train,Y_train)
print(clf.best_params_)
y_pred=clf.predict_proba(X_test)[:,1]

roc_auc_score(Y_test,y_pred)

{'booster': 'gbtree', 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 3, 'min_child_weight': 0.5, 'n_estimators': 80}


0.9373771622842255

In [88]:
print("AUC Score for XGBoost: ",predict_auc(XGBClassifier(n_estimators=80, max_depth=3), X_train, X_test, Y_train, Y_test))

AUC Score for XGBoost:  0.9386


#### C. Model to predict salary conditioned on going to NBA

In [89]:
# For linear regression model conditioned on probability of going to NBA
# We took the probability of going to NBA as a feature: Column 4
clf=XGBClassifier()
clf.fit(X,Y_nba)
y_pred=clf.predict_proba(X)
new_X=np.concatenate((X,y_pred), axis=1)
new_X=new_X[:,[0,1,2,4]]

In [90]:
lr = LinearRegression()
lr.fit(new_X,Y)
salary=lr.predict(new_X)
r2_score(Y,salary)

0.4887458866200912

#### To cross-verify this we took the rows from main dataframe where Salary is not 0 and found an r2_score similar to ours.

In [91]:
X_salary = df[df['Salary']>0].iloc[:,:3].values
Y_salary = df[df['Salary']>0].iloc[:,3].values

In [92]:
lr = LinearRegression()
lr.fit(X_salary,Y_salary)
salary=lr.predict(X_salary)
r2_score(Y_salary,salary)

0.5685697969800619

### 4. Predict for given player

Compute the expected NBA salary of a high school basketball player who is 6’ 6” tall, is
averaging 46 points per game, and is playing in the second most competitive league (comp =
9), according to your model.

In [93]:
predict_X=[9.0, 78.0, 46.0]

In [94]:
clf.predict_proba(predict_X)[:,1]

array([0.09216729], dtype=float32)

### The given player has a 0.09 probability of not going for NBA, hence salary prediction will be 0.