In [1]:
import pandas as pd
pd.options.display.max_rows = 99
pd.options.display.max_columns = 99

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from urllib.request import urlopen
from bs4 import BeautifulSoup

from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
def createRawData(year):
    """Scrape the per-game player statistics table for one NBA season.

    Downloads the Basketball Reference "per game" page for ``year`` and
    converts its table rows into a DataFrame of raw (unparsed) strings.

    Parameters
    ----------
    year : int or str
        Season identifier interpolated into the page URL.

    Returns
    -------
    pandas.DataFrame
        One row per ``<tr>`` after the first one. Columns are the ``<th>``
        texts of the first ``<tr>`` minus the leading ranking column and
        minus ``'Pos'``. A ``<tr>`` that contains no ``<td>`` cells yields a
        row made entirely of ``None``.

    Raises
    ------
    urllib.error.URLError
        If the page cannot be fetched.
    IndexError
        If the document contains no ``<tr>`` element.
    KeyError
        If the header row has no ``'Pos'`` column.
    """
    # Page that will be scraped for the requested season.
    url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html".format(year)

    # The context manager closes the HTTP response once parsing is done.
    with urlopen(url) as response:
        # Name the parser explicitly: without it BeautifulSoup picks whichever
        # parser happens to be installed and warns about the guess.
        soup = BeautifulSoup(response, "html.parser")

    table_rows = soup.find_all('tr')

    # Column names come from the first row; the first header is skipped because
    # the ranking order from Basketball Reference is not needed for the analysis.
    headers = [th.get_text() for th in table_rows[0].find_all('th')][1:]

    # Every row after the header row contributes the text of its <td> cells.
    player_stats = [
        [td.get_text() for td in row.find_all('td')]
        for row in table_rows[1:]
    ]

    stats = pd.DataFrame(player_stats, columns=headers)
    return stats.drop('Pos', axis=1)

In [3]:
def preprocessData(stats, corr_threshold=0.90):
    """Clean a raw per-game stats frame and return standardized features.

    Steps: drop repeated players, index by player, drop the team column and
    incomplete rows, convert everything to numbers, discard columns that are
    highly correlated with an earlier column, then standardize each column.

    Parameters
    ----------
    stats : pandas.DataFrame
        Raw frame containing at least the ``'Player'`` and ``'Tm'`` columns.
        **It is modified in place** by the cleaning steps (duplicates and
        incomplete rows removed, ``'Player'`` moved to the index, ``'Tm'``
        dropped). The notebook's later ``stats_2018.iloc[maxIndex, :]``
        lookup relies on the input ending up row-aligned with the result.
    corr_threshold : float, default 0.90
        A column is discarded when its correlation with any column to its
        left is greater than or equal to this value.

    Returns
    -------
    pandas.DataFrame
        Standardized features with a fresh 0..n-1 index (player names are not
        carried over) and any remaining NaN replaced by 0.

    Raises
    ------
    KeyError
        If ``'Player'`` or ``'Tm'`` is missing from ``stats``.
    ValueError
        If a remaining column holds text that ``pd.to_numeric`` cannot parse.

    Notes
    -----
    Column selection and scaling are both derived from the frame passed in,
    so two different inputs are not guaranteed to produce the same columns
    or to share a scale.
    """
    # In-place cleaning of the caller's frame (see the `stats` parameter note).
    stats.drop_duplicates(subset='Player', keep='first', inplace=True)
    stats.set_index("Player", inplace=True)  # Player is the index value
    # Keyword form: pandas >= 2.0 no longer accepts `axis` positionally.
    stats.drop(columns='Tm', inplace=True)
    stats.dropna(inplace=True)

    # From here on work on a numeric copy; the caller's frame is left alone.
    numeric = stats.apply(pd.to_numeric)

    # Flag column j when corr[i, j] >= threshold for any i < j, i.e. look only
    # at the strict upper triangle of the correlation matrix.
    corr = numeric.corr().to_numpy()
    above_diagonal = np.triu(np.ones(corr.shape, dtype=bool), k=1)
    redundant = ((corr >= corr_threshold) & above_diagonal).any(axis=0)
    numeric = numeric.loc[:, ~redundant]

    # Standardize the remaining feature columns.
    scaler = preprocessing.StandardScaler()
    x_scaled = scaler.fit_transform(numeric)
    stats_x = pd.DataFrame(x_scaled, columns=numeric.columns)

    # Replace any NaN values left after standardization.
    return stats_x.fillna(0)

In [4]:
def addTarget(n_rows=530, mvp_index=18):
    """Build the binary target column marking a single row as the MVP.

    Parameters
    ----------
    n_rows : int, default 530
        Number of rows in the returned frame.
    mvp_index : int, default 18
        Zero-based position of the row labelled 1; every other row is 0.

    Returns
    -------
    pandas.DataFrame
        A single integer column named ``"MVP"`` with a 0..n_rows-1 index.

    Raises
    ------
    ValueError
        If ``mvp_index`` does not fall inside ``range(n_rows)``.
    """
    if not 0 <= mvp_index < n_rows:
        raise ValueError(
            "mvp_index {} is outside range(0, {})".format(mvp_index, n_rows)
        )

    labels = np.zeros(n_rows, dtype=np.int64)
    labels[mvp_index] = 1
    return pd.DataFrame({"MVP": labels})

In [5]:
# Scrape both seasons first, then clean each one (2018 before 2019 in both passes).
# preprocessData also modifies the raw frame it is given, so after this cell
# stats_2018 / stats_2019 are no longer the untouched scrape results.
stats_2018, stats_2019 = [createRawData(season) for season in (2018, 2019)]
pp_2018, pp_2019 = [preprocessData(raw) for raw in (stats_2018, stats_2019)]

In [6]:
# Training set: 2019 features and the matching MVP indicator column.
x = pp_2019
y = addTarget()

# addTarget() returns a fixed-length column, so check that it lines up with
# the feature rows here instead of failing later inside the model fit.
assert len(x) == len(y), (
    "feature/target row mismatch: x has {} rows, y has {}".format(len(x), len(y))
)

In [None]:
# Fit a logistic regression with default settings on the 2019 data.
lr = LogisticRegression()
# y is a one-column DataFrame; flatten it to the 1-D label vector that fit()
# expects, which avoids scikit-learn's DataConversionWarning.
lr.fit(x, np.ravel(y))

In [8]:
# Score the 2018 feature frame with the fitted model. The next cell reads
# column 1 of this array as each row's score -- presumably the probability of
# the MVP class (column order follows lr.classes_; confirm if labels change).
results = lr.predict_proba(pp_2018)

In [None]:
# Find the highest- and lowest-scoring rows in column 1 of the predictions.
# np.argmax / np.argmin return the first position of the extreme value and
# raise ValueError when `arr` is empty, so no sentinel start values are needed.
arr = results[:, 1]
maxIndex = int(np.argmax(arr))
minIndex = int(np.argmin(arr))
max_ = arr[maxIndex]
min_ = arr[minIndex]
print("Maximum value: {} at index {}".format(max_, maxIndex))

In [11]:
# Show the raw 2018 stat line of the top-scoring row. The positional lookup works
# because preprocessData cleaned stats_2018 in place (inplace=True), leaving it
# indexed by player and with the same row order as pp_2018.
stats_2018.iloc[maxIndex, :]

Age       23
G         75
GS        75
MP      36.7
FG       9.9
FGA     18.7
FG%     .529
3P       0.6
3PA      1.9
3P%     .307
2P       9.3
2PA     16.8
2P%     .554
eFG%    .545
FT       6.5
FTA      8.5
FT%     .760
ORB      2.1
DRB      8.0
TRB     10.0
AST      4.8
STL      1.5
BLK      1.4
TOV      3.0
PF       3.1
PTS     26.9
Name: Giannis Antetokounmpo, dtype: object