In [1]:
from io import StringIO
from urllib.request import urlopen

import numpy as np
import pandas as pd
import requests
import sklearn as sk
from bs4 import BeautifulSoup
# Never truncate the column list when a wide DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Load the three raw CSV files from the local datafiles/ directory.
player_data_df = pd.read_csv('datafiles/player_data.csv')
players_df = pd.read_csv('datafiles/Players.csv')
# The season stats file carries an 'Unnamed: 0' column; discard it right after loading.
seasons_df = pd.read_csv('datafiles/Seasons_Stats.csv').drop(columns=['Unnamed: 0'])


In [2]:
# Preview the first five rows of the player_data.csv frame.
player_data_df.head(n=5)

Unnamed: 0,name,year_start,year_end,position,height,weight,birth_date,college
0,Alaa Abdelnaby,1991,1995,F-C,6-10,240.0,"June 24, 1968",Duke University
1,Zaid Abdul-Aziz,1969,1978,C-F,6-9,235.0,"April 7, 1946",Iowa State University
2,Kareem Abdul-Jabbar,1970,1989,C,7-2,225.0,"April 16, 1947","University of California, Los Angeles"
3,Mahmoud Abdul-Rauf,1991,2001,G,6-1,162.0,"March 9, 1969",Louisiana State University
4,Tariq Abdul-Wahad,1998,2003,F,6-6,223.0,"November 3, 1974",San Jose State University


In [3]:
# The source file spells the column 'collage'; rename it to 'college'.
# rename() ignores a missing label by default, so re-running this cell is a no-op.
players_df = players_df.rename({'collage': 'college'}, axis='columns')
players_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Player,height,weight,college,born,birth_city,birth_state
0,0,Curly Armstrong,180.0,77.0,Indiana University,1918.0,,
1,1,Cliff Barker,188.0,83.0,University of Kentucky,1921.0,Yorktown,Indiana
2,2,Leo Barnhorst,193.0,86.0,University of Notre Dame,1924.0,,
3,3,Ed Bartels,196.0,88.0,North Carolina State University,1925.0,,
4,4,Ralph Beard,178.0,79.0,University of Kentucky,1927.0,Hardinsburg,Kentucky


In [4]:
# Strip the '*' marker from player names and record which rows carried it in a
# separate 0/1 'All Star Season' column. Uses vectorised string methods instead
# of a row-by-row iterrows() loop with per-row .loc writes, which is slow on
# large frames.
# NOTE(review): on Basketball Reference a '*' next to a player name usually
# denotes a Hall of Fame inductee rather than an All-Star season -- confirm what
# this flag really represents before relying on the column name.
# Missing / non-string names count as "no marker" (na=False) and are left untouched.
starred = seasons_df['Player'].str.contains('*', regex=False, na=False)
flagged = starred
if 'All Star Season' in seasons_df.columns:
    # On a re-run the names are already clean, so keep the flags set earlier
    # instead of silently resetting every row to 0.
    flagged = starred | seasons_df['All Star Season'].astype(bool)
seasons_df['All Star Season'] = flagged.astype(int)
# Remove the marker itself rather than chopping off the last character, so a
# name is never truncated when the '*' is not the final character.
seasons_df.loc[starred, 'Player'] = (
    seasons_df.loc[starred, 'Player'].str.replace('*', '', regex=False)
)
seasons_df.head()

Unnamed: 0,Year,Player,Pos,Age,Tm,G,GS,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,blanl,OWS,DWS,WS,WS/48,blank2,OBPM,DBPM,BPM,VORP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,All Star Season
0,1950.0,Curly Armstrong,G-F,31.0,FTW,63.0,,,,0.368,,0.467,,,,,,,,,,-0.1,3.6,3.5,,,,,,,144.0,516.0,0.279,,,,144.0,516.0,0.279,0.279,170.0,241.0,0.705,,,,176.0,,,,217.0,458.0,0
1,1950.0,Cliff Barker,SG,29.0,INO,49.0,,,,0.435,,0.387,,,,,,,,,,1.6,0.6,2.2,,,,,,,102.0,274.0,0.372,,,,102.0,274.0,0.372,0.372,75.0,106.0,0.708,,,,109.0,,,,99.0,279.0,0
2,1950.0,Leo Barnhorst,SF,25.0,CHS,67.0,,,,0.394,,0.259,,,,,,,,,,0.9,2.8,3.6,,,,,,,174.0,499.0,0.349,,,,174.0,499.0,0.349,0.349,90.0,129.0,0.698,,,,140.0,,,,192.0,438.0,0
3,1950.0,Ed Bartels,F,24.0,TOT,15.0,,,,0.312,,0.395,,,,,,,,,,-0.5,-0.1,-0.6,,,,,,,22.0,86.0,0.256,,,,22.0,86.0,0.256,0.256,19.0,34.0,0.559,,,,20.0,,,,29.0,63.0,0
4,1950.0,Ed Bartels,F,24.0,DNN,13.0,,,,0.308,,0.378,,,,,,,,,,-0.5,-0.1,-0.6,,,,,,,21.0,82.0,0.256,,,,21.0,82.0,0.256,0.256,17.0,31.0,0.548,,,,20.0,,,,27.0,59.0,0


In [5]:
# Scrape the table with id 'dpoy_NBA' from the Basketball Reference DPOY awards
# page and parse it into a DataFrame.
url = 'https://www.basketball-reference.com/awards/dpoy.html'
# Use a context manager so the HTTP response is closed once the page is parsed.
with urlopen(url) as response:
    soup = BeautifulSoup(response, 'html.parser')
table = soup.find('table', id='dpoy_NBA')
if table is None:
    # Fail with a descriptive message instead of an IndexError on an empty result list.
    raise ValueError(f"No table with id 'dpoy_NBA' found at {url}")
output = str(table)
# read_html expects a path or file-like object; passing literal HTML as a plain
# string is deprecated in recent pandas, so wrap it in StringIO.
dpoyframe = pd.read_html(StringIO(output))[0]
dpoyframe.head()

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Per Game,Per Game,Per Game,Per Game,Per Game,Per Game,Shooting,Shooting,Shooting,Advanced,Advanced
Unnamed: 0_level_1,Season,Lg,Player,Voting,Age,Tm,G,MP,PTS,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48
0,2021-22,NBA,Marcus Smart,(V),27,BOS,71,32.3,12.1,3.8,5.9,1.7,0.3,0.418,0.331,0.793,5.6,0.116
1,2020-21,NBA,Rudy Gobert,(V),28,UTA,71,30.8,14.3,13.5,1.3,0.6,2.7,0.675,0.0,0.623,11.3,0.248
2,2019-20,NBA,Giannis Antetokounmpo,(V),25,MIL,63,30.4,29.5,13.6,5.6,1.0,1.0,0.553,0.304,0.633,11.1,0.279
3,2018-19,NBA,Rudy Gobert,(V),26,UTA,81,31.8,15.9,12.9,2.0,0.8,2.3,0.669,,0.636,14.4,0.268
4,2017-18,NBA,Rudy Gobert,(V),25,UTA,56,32.4,13.5,10.7,1.4,0.8,2.3,0.622,,0.682,8.1,0.214


In [6]:
# Scrape the table with id 'mvp_NBA' from the Basketball Reference MVP awards
# page and parse it into a DataFrame.
url = 'https://www.basketball-reference.com/awards/mvp.html'
# Use a context manager so the HTTP response is closed once the page is parsed.
with urlopen(url) as response:
    soup = BeautifulSoup(response, 'html.parser')
table = soup.find('table', id='mvp_NBA')
if table is None:
    # Fail with a descriptive message instead of an IndexError on an empty result list.
    raise ValueError(f"No table with id 'mvp_NBA' found at {url}")
output = str(table)
# read_html expects a path or file-like object; passing literal HTML as a plain
# string is deprecated in recent pandas, so wrap it in StringIO.
mvpframe = pd.read_html(StringIO(output))[0]
mvpframe.head()

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Per Game,Per Game,Per Game,Per Game,Per Game,Per Game,Shooting,Shooting,Shooting,Advanced,Advanced
Unnamed: 0_level_1,Season,Lg,Player,Voting,Age,Tm,G,MP,PTS,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48
0,2021-22,NBA,Nikola Jokić,(V),26,DEN,74,33.5,27.1,13.8,7.9,1.5,0.9,0.583,0.337,0.81,15.2,0.296
1,2020-21,NBA,Nikola Jokić,(V),25,DEN,72,34.6,26.4,10.8,8.3,1.3,0.7,0.566,0.388,0.868,15.6,0.301
2,2019-20,NBA,Giannis Antetokounmpo,(V),25,MIL,63,30.4,29.5,13.6,5.6,1.0,1.0,0.553,0.304,0.633,11.1,0.279
3,2018-19,NBA,Giannis Antetokounmpo,(V),24,MIL,72,32.8,27.7,12.5,5.9,1.3,1.5,0.578,0.256,0.729,14.4,0.292
4,2017-18,NBA,James Harden,(V),28,HOU,72,35.4,30.4,5.4,8.8,1.8,0.7,0.449,0.367,0.858,15.4,0.289


In [19]:
# Summary statistics (count, mean, std, min, quartiles, max) for the
# 'Per Game' column group of the MVP table.
df1 = mvpframe['Per Game']
# The percentiles are spelled out explicitly; they match describe()'s defaults.
df1.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,MP,PTS,TRB,AST,STL,BLK
count,67.0,67.0,67.0,67.0,49.0,49.0
mean,38.940299,26.085075,12.21194,5.635821,1.522449,1.353061
std,3.731198,5.155675,6.131304,2.709395,0.5463,1.084954
min,30.4,13.8,3.3,1.3,0.5,0.1
25%,36.85,23.85,7.35,3.75,1.1,0.6
50%,38.3,26.4,10.8,5.0,1.6,0.9
75%,41.1,29.7,15.65,7.25,1.8,2.0
max,47.3,37.6,27.0,12.8,3.2,4.1
