# Basketball Playoffs

Basketball tournaments are usually split into two parts. First, all teams play each other, aiming to achieve the greatest possible number of wins. Then, at the end of this first part of the season, a predetermined number of teams with the most wins qualify for the playoffs, where they play a series of knock-out matches for the trophy.

Over a period of 10 years, data on players, teams, coaches, games and several other metrics were gathered and arranged in this dataset. The goal is to use this data to predict which teams will qualify for the playoffs in the next season.

In [90]:
# Imports
import math
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb

## Data Preprocess

### Exploratory Data Analysis

In [91]:
# Read the seven raw tables in a fixed order. The list holds the very same
# DataFrame objects that are bound to the individual names just below.
df_collection = [
    pd.read_csv(csv_path)
    for csv_path in (
        'dataset/teams.csv',
        'dataset/players.csv',
        'dataset/coaches.csv',
        'dataset/players_teams.csv',
        'dataset/awards_players.csv',
        'dataset/series_post.csv',
        'dataset/teams_post.csv',
    )
]
(
    df_teams,
    df_players,
    df_coaches,
    df_players_teams,
    df_awards_players,
    df_series_post,
    df_teams_post,
) = df_collection

# Remove fully duplicated rows in place, so each named frame is deduplicated too.
for dataframe in df_collection:
    dataframe.drop_duplicates(inplace=True)

# Lift the row display cap so long results (such as the per-column counts below) print in full.
pd.set_option('display.max_rows', None)

# Number of missing values per column of the teams table (shown as the cell output).
df_teams.isna().sum()

year            0
lgID            0
tmID            0
franchID        0
confID          0
divID         142
rank            0
playoff         0
seeded          0
firstRound     62
semis         104
finals        122
name            0
o_fgm           0
o_fga           0
o_ftm           0
o_fta           0
o_3pm           0
o_3pa           0
o_oreb          0
o_dreb          0
o_reb           0
o_asts          0
o_pf            0
o_stl           0
o_to            0
o_blk           0
o_pts           0
d_fgm           0
d_fga           0
d_ftm           0
d_fta           0
d_3pm           0
d_3pa           0
d_oreb          0
d_dreb          0
d_reb           0
d_asts          0
d_pf            0
d_stl           0
d_to            0
d_blk           0
d_pts           0
tmORB           0
tmDRB           0
tmTRB           0
opptmORB        0
opptmDRB        0
opptmTRB        0
won             0
lost            0
GP              0
homeW           0
homeL           0
awayW           0
awayL     

In [92]:
# Turn empty strings into NaN, then discard every column that holds no data at all.
df_teams.replace("", np.nan, inplace=True)
df_teams.dropna(axis=1, how='all', inplace=True)

# 'seeded' is dropped only when every row is 0. The membership test keeps this
# cell re-runnable: on a second run the column is already gone and indexing it
# would raise KeyError.
if 'seeded' in df_teams.columns and (df_teams['seeded'] == 0).all():
    df_teams.drop(columns='seeded', inplace=True)

cols_to_remove = ['lgID', 'attend', 'arena']

# errors='ignore' skips columns already removed by a previous run of this cell
# instead of raising KeyError.
df_teams.drop(columns=cols_to_remove, errors='ignore', inplace=True)

df_teams.head()

Unnamed: 0,year,tmID,franchID,confID,rank,playoff,firstRound,semis,finals,name,...,won,lost,GP,homeW,homeL,awayW,awayL,confW,confL,min
0,9,ATL,ATL,EA,7,N,,,,Atlanta Dream,...,4,30,34,1,16,3,14,2,18,6825
1,10,ATL,ATL,EA,2,Y,L,,,Atlanta Dream,...,18,16,34,12,5,6,11,10,12,6950
2,1,CHA,CHA,EA,8,N,,,,Charlotte Sting,...,8,24,32,5,11,3,13,5,16,6475
3,2,CHA,CHA,EA,4,Y,W,W,L,Charlotte Sting,...,18,14,32,11,5,7,9,15,6,6500
4,3,CHA,CHA,EA,2,Y,L,,,Charlotte Sting,...,18,14,32,11,5,7,9,12,9,6450


In [93]:
cols_to_remove = ['college', 'collegeOther', 'birthDate', 'deathDate']

# errors='ignore' keeps the cell re-runnable: columns already removed by a
# previous run are skipped instead of raising KeyError.
df_players.drop(columns=cols_to_remove, errors='ignore', inplace=True)

df_players.head()

Unnamed: 0,bioID,pos,firstseason,lastseason,height,weight
0,abrahta01w,C,0,0,74.0,190
1,abrossv01w,F,0,0,74.0,169
2,adairje01w,C,0,0,76.0,197
3,adamsda01w,F-C,0,0,73.0,239
4,adamsjo01w,C,0,0,75.0,180


In [94]:
# Drop 'stint' only when it holds a single distinct value. The membership test
# avoids a KeyError when the cell is re-run after the column was removed.
if 'stint' in df_coaches.columns and df_coaches['stint'].nunique() == 1:
    df_coaches.drop(columns='stint', inplace=True)

# errors='ignore' makes the drop a no-op when 'lgID' was already removed by an
# earlier run of this cell.
df_coaches.drop(columns='lgID', errors='ignore', inplace=True)

df_coaches.head()

Unnamed: 0,coachID,year,tmID,stint,won,lost,post_wins,post_losses
0,adamsmi01w,5,WAS,0,17,17,1,2
1,adubari99w,1,NYL,0,20,12,4,3
2,adubari99w,2,NYL,0,21,11,3,3
3,adubari99w,3,NYL,0,18,14,4,4
4,adubari99w,4,NYL,0,16,18,0,0


In [95]:
# Treat an empty position string as a missing value.
df_players.replace(to_replace={'pos': ""}, value=np.nan, inplace=True)

# A row is discarded when its height or weight is non-positive, or when its
# position is missing.
has_bad_height = df_players['height'] <= 0.0
has_bad_weight = df_players['weight'] <= 0
lacks_position = df_players['pos'].isna()
remov_conditions = df_players.loc[has_bad_height | has_bad_weight | lacks_position].index

df_players.drop(index=remov_conditions, inplace=True)
df_players.head(10)

Unnamed: 0,bioID,pos,firstseason,lastseason,height,weight
0,abrahta01w,C,0,0,74.0,190
1,abrossv01w,F,0,0,74.0,169
2,adairje01w,C,0,0,76.0,197
3,adamsda01w,F-C,0,0,73.0,239
4,adamsjo01w,C,0,0,75.0,180
8,aguilel01w,G,0,0,67.0,165
9,ajavoma01w,G,0,0,68.0,160
11,aldrima01w,G,0,0,71.0,153
12,alexaer01w,G,0,0,67.0,140
13,alhalta01w,F-G,0,0,72.0,149


In [96]:
# Collect the columns that contain exactly one distinct value, then remove
# them all with a single in-place drop.
awards_constant_cols = [
    name
    for name in df_awards_players.columns
    if len(df_awards_players[name].unique()) == 1
]
df_awards_players.drop(columns=awards_constant_cols, inplace=True)

df_awards_players.head()

Unnamed: 0,playerID,award,year
0,thompti01w,All-Star Game Most Valuable Player,1
1,leslili01w,All-Star Game Most Valuable Player,2
2,leslili01w,All-Star Game Most Valuable Player,3
3,teaslni01w,All-Star Game Most Valuable Player,4
4,swoopsh01w,All-Star Game Most Valuable Player,6


In [97]:
# errors='ignore' keeps the cell re-runnable: if 'lgID' was already removed by
# a previous run, the drop is a no-op instead of raising KeyError.
df_teams_post.drop(columns='lgID', errors='ignore', inplace=True)
df_teams_post.head()

Unnamed: 0,year,tmID,W,L
0,1,HOU,6,0
1,1,ORL,1,2
2,1,CLE,3,3
3,1,WAS,0,2
4,1,NYL,4,3


In [98]:
# Collect the columns that contain exactly one distinct value, then remove
# them all with a single in-place drop.
series_constant_cols = [
    name
    for name in df_series_post.columns
    if len(df_series_post[name].unique()) == 1
]
df_series_post.drop(columns=series_constant_cols, inplace=True)

df_series_post.head()

Unnamed: 0,year,round,series,tmIDWinner,tmIDLoser,W,L
0,1,FR,A,CLE,ORL,2,1
1,1,FR,B,NYL,WAS,2,0
2,1,FR,C,LAS,PHO,2,0
3,1,FR,D,HOU,SAC,2,0
4,1,CF,E,HOU,LAS,2,0


In [99]:
# errors='ignore' keeps the cell re-runnable: if 'lgID' was already removed by
# a previous run, the drop is a no-op instead of raising KeyError.
df_players_teams.drop(columns='lgID', errors='ignore', inplace=True)
df_players_teams.head()

Unnamed: 0,playerID,year,stint,tmID,GP,GS,minutes,points,oRebounds,dRebounds,...,PostBlocks,PostTurnovers,PostPF,PostfgAttempted,PostfgMade,PostftAttempted,PostftMade,PostthreeAttempted,PostthreeMade,PostDQ
0,abrossv01w,2,0,MIN,26,23,846,343,43,131,...,0,0,0,0,0,0,0,0,0,0
1,abrossv01w,3,0,MIN,27,27,805,314,45,101,...,0,0,0,0,0,0,0,0,0,0
2,abrossv01w,4,0,MIN,30,25,792,318,44,97,...,1,8,8,22,6,8,8,7,3,0
3,abrossv01w,5,0,MIN,22,11,462,146,17,57,...,2,3,7,23,8,4,2,8,2,0
4,abrossv01w,6,0,MIN,31,31,777,304,29,78,...,0,0,0,0,0,0,0,0,0,0


In [100]:
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before writing (a no-op when it is already there).
os.makedirs('dataset/processed', exist_ok=True)

# Write each cleaned table without its index column.
df_teams.to_csv('dataset/processed/teams_processed.csv', index=False)
df_players.to_csv('dataset/processed/players_processed.csv', index=False)
df_coaches.to_csv('dataset/processed/coaches_processed.csv', index=False)
df_awards_players.to_csv('dataset/processed/awards_processed.csv', index=False)
df_teams_post.to_csv('dataset/processed/teams_post_processed.csv', index=False)
df_series_post.to_csv('dataset/processed/series_post_processed.csv', index=False)
df_players_teams.to_csv('dataset/processed/players_teams_processed.csv', index=False)