## Hypothesis Testing

In [9]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
# package with hypothesis tests
import scipy.stats as st

### Data

You can download the data from [**here**](https://drive.google.com/file/d/19b9lHlkixZhs8yka8zV0QFieao66dUcY/view?usp=sharing). The data contains results of NBA games from seasons 2013/2014 to 2015/2016.

### Task
Split the data into **3** separate files according to the season!

In [5]:
nba_csv = pd.read_csv('nba_games_2013_2015.csv', sep=';')
nba_csv.head()

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,22015,1610612750,MIN,Minnesota Timberwolves,21501226,2016-04-13,MIN vs. NOP,W,240,144,...,0.826,5,38,43,41,14,8,13,20,35.0
1,22015,1610612749,MIL,Milwaukee Bucks,21501225,2016-04-13,MIL vs. IND,L,240,92,...,0.846,7,36,43,23,8,3,15,15,-5.0
2,22015,1610612738,BOS,Boston Celtics,21501217,2016-04-13,BOS vs. MIA,W,240,98,...,0.864,10,29,39,20,7,3,7,20,10.0
3,22015,1610612747,LAL,Los Angeles Lakers,21501228,2016-04-13,LAL vs. UTA,W,239,101,...,0.867,8,39,47,19,6,3,13,17,5.0
4,22015,1610612739,CLE,Cleveland Cavaliers,21501220,2016-04-13,CLE vs. DET,L,265,110,...,0.733,8,35,43,21,4,7,10,23,-2.0


In [6]:
nba_csv['SEASON_ID'].unique()

array([22015, 22014, 22013])

In [33]:
nba_2013 = nba_csv[nba_csv['SEASON_ID']==22013].copy()
nba_2014 = nba_csv[nba_csv['SEASON_ID']==22014].copy()
nba_2015 = nba_csv[nba_csv['SEASON_ID']==22015].copy()
nba_2015.head()

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,22015,1610612750,MIN,Minnesota Timberwolves,21501226,2016-04-13,MIN vs. NOP,W,240,144,56,86,0.651,13,28,0.464,19,23,0.826,5,38,43,41,14,8,13,20,35.0
1,22015,1610612749,MIL,Milwaukee Bucks,21501225,2016-04-13,MIL vs. IND,L,240,92,37,87,0.425,7,23,0.304,11,13,0.846,7,36,43,23,8,3,15,15,-5.0
2,22015,1610612738,BOS,Boston Celtics,21501217,2016-04-13,BOS vs. MIA,W,240,98,37,86,0.43,5,14,0.357,19,22,0.864,10,29,39,20,7,3,7,20,10.0
3,22015,1610612747,LAL,Los Angeles Lakers,21501228,2016-04-13,LAL vs. UTA,W,239,101,41,85,0.482,6,25,0.24,13,15,0.867,8,39,47,19,6,3,13,17,5.0
4,22015,1610612739,CLE,Cleveland Cavaliers,21501220,2016-04-13,CLE vs. DET,L,265,110,46,97,0.474,7,18,0.389,11,15,0.733,8,35,43,21,4,7,10,23,-2.0


In [48]:
CC_filter = (nba_2015['TEAM_NAME'] == 'Cleveland Cavaliers') 
GSW_filter = (nba_2015['TEAM_NAME'] == 'Golden State Warriors')
CC_df15 = nba_2015[CC_filter]
GSW_df15 = nba_2015[GSW_filter]
CC_df15.head()

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
4,22015,1610612739,CLE,Cleveland Cavaliers,21501220,2016-04-13,CLE vs. DET,L,265,110,46,97,0.474,7,18,0.389,11,15,0.733,8,35,43,21,4,7,10,23,-2.0
40,22015,1610612739,CLE,Cleveland Cavaliers,21501203,2016-04-11,CLE vs. ATL,W,240,109,40,83,0.482,11,29,0.379,18,20,0.9,9,38,47,17,9,4,15,14,15.0
78,22015,1610612739,CLE,Cleveland Cavaliers,21501191,2016-04-09,CLE @ CHI,L,242,102,36,83,0.434,19,40,0.475,11,18,0.611,12,30,42,24,5,5,15,18,-3.0
127,22015,1610612739,CLE,Cleveland Cavaliers,21501165,2016-04-06,CLE @ IND,L,239,109,35,74,0.473,8,29,0.276,31,34,0.912,7,26,33,15,7,3,10,19,-14.0
151,22015,1610612739,CLE,Cleveland Cavaliers,21501159,2016-04-05,CLE @ MIL,W,241,109,41,75,0.547,18,36,0.5,9,10,0.9,2,39,41,30,4,8,16,14,29.0


### Task
Test the hypothesis that offensive productions of Cleveland Cavaliers and Golden State Warriors (teams that met in the finals that year) were distributed equally in 2015/2016.

Offensive production consists of two variables: PTS (Points) and FG_PCT (Field Goal Percentage). We need to do two separate hypothesis tests, one for each variable.

In [None]:
# Null Hypothesis 1 PTS(CC) == PTS(GSW)
# Null Hypothesis 2 FG_PCT(CC) == FG_PCT(GSW)

In [49]:
# H1 PTS
CC_pts = CC_df15['PTS'].tolist()
GSW_pts = GSW_df15['PTS'].tolist()
#ttest_pts = st.stats.ttest_ind(CC_pts,GSW_pts)
pval_pts = st.stats.ttest_ind(CC_pts,GSW_pts)[1]
if pval_pts > 0.05:
    print('Accept Null Hypothesis of equivalency in PTS (Points)')
elif pval_pts < 0.05:
    print('Reject Null Hypothesis, PTS (Points) NOT equivalent')

Reject Null Hypothesis, PTS (Points) NOT equivalent


In [50]:
# H2 FG_PCT
CC_fgpct = CC_df15['FG_PCT'].tolist()
GSW_fgpct = GSW_df15['FG_PCT'].tolist()
#ttest_pts = st.stats.ttest_ind(CC_pts,GSW_pts)
pval_fgpct = st.stats.ttest_ind(CC_fgpct,GSW_fgpct)[1]
if pval_fgpct > 0.05:
    print('Accept Null Hypothesis of equivalency in FG_PCT (Field Goal Percentage)')
elif pval_fgpct < 0.05:
    print('Reject Null Hypothesis, FG_PCT (Field Goal Percentage) NOT equivalent')

Reject Null Hypothesis, FG_PCT (Field Goal Percentage) NOT equivalent


### Task
Test the hypothesis that the number of points (PTS) scored by Cleveland Cavaliers changed significantly after the coach change in the season 2015/2016. **Coach Blatt was fired on 24th of Jan, 2016**. Use the data from seasons 2014/2015 and 2015/2016 - those are years when Cleveland was coached by Blatt. 

We have two possible solutions here:
- take the same amount of games from before and after and try t-test.
- take all the games from before and after and look for the right test to compare two samples with different sizes

In [None]:
# Null Hypothesis (PTS beforeJan24,2016) > (PTS after Jan24,2016)


In [73]:
CC_filter = (nba_2014['TEAM_NAME'] == 'Cleveland Cavaliers') 
CC_df14 = nba_2014[CC_filter]
pts_before1 = CC_df14['PTS'].tolist()

before_filter = CC_df15['GAME_DATE'] < '2016-01-24'
after_filter = CC_df15['GAME_DATE'] < '2016-01-24'
CC15_before = CC_df15[before_filter]
pts_before2 = CC15_before['PTS'].tolist()
CC15_after = CC_df15[after_filter]

pts_after = CC15_after['PTS'].tolist()
pts_before = pts_before1 + pts_before2


In [75]:
pval_pts = st.stats.ttest_ind(pts_before,pts_after)[1]
if pval_pts > 0.05:
    print('Accept Null Hypothesis: PTS (Points) before Jan 24, 2016 HIGHER with Coach Blatt')
elif pval_pts < 0.05:
    print('Reject Null Hypothesis: PTS (Points) did not change after firing Coach Blatt')

Accept Null Hypothesis: PTS (Points) before Jan 24, 2016 HIGHER with Coach Blatt


### Task
Download [**the similar dataset**](https://drive.google.com/file/d/1jY57bAOZp9y83b4W2PAoSH1uFARaxxls/view?usp=sharing) with scores from playoff games in 2016.

In [76]:
playoffs16 = pd.read_csv('nba_games_2013_2015.csv', sep=';')
playoffs16.head()

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,22015,1610612750,MIN,Minnesota Timberwolves,21501226,2016-04-13,MIN vs. NOP,W,240,144,56,86,0.651,13,28,0.464,19,23,0.826,5,38,43,41,14,8,13,20,35.0
1,22015,1610612749,MIL,Milwaukee Bucks,21501225,2016-04-13,MIL vs. IND,L,240,92,37,87,0.425,7,23,0.304,11,13,0.846,7,36,43,23,8,3,15,15,-5.0
2,22015,1610612738,BOS,Boston Celtics,21501217,2016-04-13,BOS vs. MIA,W,240,98,37,86,0.43,5,14,0.357,19,22,0.864,10,29,39,20,7,3,7,20,10.0
3,22015,1610612747,LAL,Los Angeles Lakers,21501228,2016-04-13,LAL vs. UTA,W,239,101,41,85,0.482,6,25,0.24,13,15,0.867,8,39,47,19,6,3,13,17,5.0
4,22015,1610612739,CLE,Cleveland Cavaliers,21501220,2016-04-13,CLE vs. DET,L,265,110,46,97,0.474,7,18,0.389,11,15,0.733,8,35,43,21,4,7,10,23,-2.0


### Task
Test the hypothesis that number of blocks (BLK) are from the same distribution in both, in playoffs and in the reguar season 2015/2016 for **Toronto Raptors**. We need to work with two samples with different sizes again.

In [79]:
sn_filter = nba_2015['TEAM_NAME'] == 'Toronto Raptors'
po_filter = playoffs16['TEAM_NAME'] == 'Toronto Raptors'
blk_ssn = nba_2015[sn_filter]['BLK'].tolist()
blk_poff = playoffs16[po_filter]['BLK'].tolist()

# Null Hypothesis: Number of BLK (blocks) in season == playoffs for Toronto Raptors
pval_raptors = st.stats.ttest_ind(blk_ssn,blk_poff)[1]
if pval_raptors > 0.05:
    print('Accept Null Hypothesis: Number of BLK (blocks) in season == playoffs for Toronto Raptors')
elif pval_raptors < 0.05:
    print('Reject Null Hypothesis: Number of BLK (blocks) in season =/= playoffs for Toronto Raptors')

Reject Null Hypothesis: Number of BLK (blocks) in season =/= playoffs for Toronto Raptors


### Task
Test the hypothesis that the number of points (PTS) scored by Cleveland Cavaliers is equally distributed for all 3 seasons. In this case, we need a hypothesis test that compares more than 2 distributions at the same.

In [80]:
CC_13 = nba_2013[nba_2013['TEAM_NAME'] == 'Cleveland Cavaliers']['PTS'].tolist()
CC_14 = nba_2014[nba_2014['TEAM_NAME'] == 'Cleveland Cavaliers']['PTS'].tolist()
CC_15 = nba_2015[nba_2015['TEAM_NAME'] == 'Cleveland Cavaliers']['PTS'].tolist()

In [83]:
# Null Hypothesis: All PTS (Points) in all 3 seasons are equally distributed
pval_CC = st.stats.f_oneway(CC_13, CC_14, CC_15)[1]
if pval_CC > 0.05:
    print('Accept Null Hypothesis: PTS (Points) in all 3 seasons are equally distributed')
elif pval_CC < 0.05:
    print('Reject Null Hypothesis: PTS (Points) in all 3 seasons are NOT equally distributed')

Reject Null Hypothesis: PTS (Points) in all 3 seasons are NOT equally distributed


#### Follow Up
Between which seasons can we see the significant difference?
+ unfortunatelly, this is not the output of the ANOVA test and further tests need to be run.
+ note that Lebron James came back to Cleveland prior to season 2014/2015. We can use this fact to interpret the results correctly.

In [86]:
# testing 13/14 to 14/15
pval_13thru15 = st.stats.ttest_ind(CC_13,CC_14)[1]
if pval_13thru15 > 0.05:
    print('Accept Null Hypothesis: PTS (Points) between 13/14 and 14/15 seasons are equally distributed')
elif pval_13thru15 < 0.05:
    print('Reject Null Hypothesis: PTS (Points) between 13/14 and 14/15 seasons are NOT equally distributed-- Lebron James saved the team from desperation')

Reject Null Hypothesis: PTS (Points) between 13/14 and 14/15 seasons are NOT equally distributed-- Lebron James saved the team from desperation


In [85]:
# testing 14/15 to 15/16
pval_14thru16 = st.stats.ttest_ind(CC_14, CC_15)[1]
if pval_14thru16 > 0.05:
    print('Accept Null Hypothesis: PTS (Points) between 14/15 and 15/16 seasons are equally distributed')
elif pval_14thru16 < 0.05:
    print('Reject Null Hypothesis: PTS (Points) between 14/15 and 15/16 seasons are NOT equally distributed')

Accept Null Hypothesis: PTS (Points) between 14/15 and 15/16 seasons are equally distributed
