# **CBB Statistics**
### Final Project Notebook

### **Hypothesis:**
- *Null Hypothesis:* There are no distinct clusters among NCAA basketball teams based on season statistics.
- *Alternative Hypothesis:* There are distinct clusters among NCAA basketball teams based on season statistics.

## Imports

In [1]:
#imports
import pandas as pd
from sklearn.preprocessing import StandardScaler
from scipy.stats import shapiro
from scipy.stats import kruskal
from scipy.stats import chi2_contingency

## **Dataset**

*explaination of dataset*

In [2]:
basketball_df = pd.read_csv('cbb.csv')
basketball_df.head()

Unnamed: 0,TEAM,CONF,G,W,ADJOE,ADJDE,BARTHAG,EFG_O,EFG_D,TOR,...,FTRD,2P_O,2P_D,3P_O,3P_D,ADJ_T,WAB,POSTSEASON,SEED,YEAR
0,North Carolina,ACC,40,33,123.3,94.9,0.9531,52.6,48.1,15.4,...,30.4,53.9,44.6,32.7,36.2,71.7,8.6,2ND,1.0,2016
1,Wisconsin,B10,40,36,129.1,93.6,0.9758,54.8,47.7,12.4,...,22.4,54.8,44.7,36.5,37.5,59.3,11.3,2ND,1.0,2015
2,Michigan,B10,40,33,114.4,90.4,0.9375,53.9,47.7,14.0,...,30.0,54.7,46.8,35.2,33.2,65.9,6.9,2ND,3.0,2018
3,Texas Tech,B12,38,31,115.2,85.2,0.9696,53.5,43.0,17.7,...,36.6,52.8,41.9,36.5,29.7,67.5,7.0,2ND,3.0,2019
4,Gonzaga,WCC,39,37,117.8,86.3,0.9728,56.6,41.1,16.2,...,26.9,56.3,40.0,38.2,29.0,71.5,7.7,2ND,1.0,2017


In [3]:
# Basketball dataset with only numerical columns
basketball_df_numerical = basketball_df.select_dtypes(include='number')
basketball_df_numerical.head()

Unnamed: 0,G,W,ADJOE,ADJDE,BARTHAG,EFG_O,EFG_D,TOR,TORD,ORB,...,FTR,FTRD,2P_O,2P_D,3P_O,3P_D,ADJ_T,WAB,SEED,YEAR
0,40,33,123.3,94.9,0.9531,52.6,48.1,15.4,18.2,40.7,...,32.3,30.4,53.9,44.6,32.7,36.2,71.7,8.6,1.0,2016
1,40,36,129.1,93.6,0.9758,54.8,47.7,12.4,15.8,32.1,...,36.2,22.4,54.8,44.7,36.5,37.5,59.3,11.3,1.0,2015
2,40,33,114.4,90.4,0.9375,53.9,47.7,14.0,19.5,25.5,...,30.7,30.0,54.7,46.8,35.2,33.2,65.9,6.9,3.0,2018
3,38,31,115.2,85.2,0.9696,53.5,43.0,17.7,22.8,27.4,...,32.9,36.6,52.8,41.9,36.5,29.7,67.5,7.0,3.0,2019
4,39,37,117.8,86.3,0.9728,56.6,41.1,16.2,17.1,30.0,...,39.0,26.9,56.3,40.0,38.2,29.0,71.5,7.7,1.0,2017


**Categorizing the variables into numerical and categorical**

In [4]:
numerical_vars = ["G", "W", "ADJOE", "ADJDE", "BARTHAG", "EFG_O", "EFG_D", "TOR", "TORD", 
                  "ORB", "DRB", "FTR", "FTRD", "2P_O", "2P_D", "3P_O", "3P_D", "ADJ_T", "WAB"]
categorical_vars = ["CONF", "POSTSEASON", "SEED", "YEAR"]

## **Data Preprocessing**

*short explaination of what we did here* 

identification of required features, proper handling of missing
values (removal, imputation, etc.), and supporting documentation for your decisions
such as the number of missing values relative to the size of the dataset.

##### **Missing Values Check** 

Both the Seed and Postseason columns are missing the same amount of values that make up 80.7% of there entires. The missing values occur because the majority of teams don't make the NCAA Tournament. Since only qualifying teams are assigned a tournament seed and have postseason results, the absence of data in these columns is intentional and meaningful. Therefore, for this case, these are no truly "missing" values. For the analysis, these values were replaces with "No Tournament" in Postseason and "None" in Seed.

In [5]:
#Check for missing values
missing_counts = basketball_df.isnull().sum()
missing_percent = (missing_counts / len(basketball_df)) * 100
missing_values = pd.DataFrame({
    'Missing Count': missing_counts, 
    '% Missing': missing_percent
    })
missing_values.head(len(missing_values))

Unnamed: 0,Missing Count,% Missing
TEAM,0,0.0
CONF,0,0.0
G,0,0.0
W,0,0.0
ADJOE,0,0.0
ADJDE,0,0.0
BARTHAG,0,0.0
EFG_O,0,0.0
EFG_D,0,0.0
TOR,0,0.0


In [6]:
#replace missing values
basketball_df['SEED'] = basketball_df['SEED'].fillna("None")
basketball_df['POSTSEASON'] = basketball_df['POSTSEASON'].fillna("No Tournament")
basketball_df.head()

Unnamed: 0,TEAM,CONF,G,W,ADJOE,ADJDE,BARTHAG,EFG_O,EFG_D,TOR,...,FTRD,2P_O,2P_D,3P_O,3P_D,ADJ_T,WAB,POSTSEASON,SEED,YEAR
0,North Carolina,ACC,40,33,123.3,94.9,0.9531,52.6,48.1,15.4,...,30.4,53.9,44.6,32.7,36.2,71.7,8.6,2ND,1.0,2016
1,Wisconsin,B10,40,36,129.1,93.6,0.9758,54.8,47.7,12.4,...,22.4,54.8,44.7,36.5,37.5,59.3,11.3,2ND,1.0,2015
2,Michigan,B10,40,33,114.4,90.4,0.9375,53.9,47.7,14.0,...,30.0,54.7,46.8,35.2,33.2,65.9,6.9,2ND,3.0,2018
3,Texas Tech,B12,38,31,115.2,85.2,0.9696,53.5,43.0,17.7,...,36.6,52.8,41.9,36.5,29.7,67.5,7.0,2ND,3.0,2019
4,Gonzaga,WCC,39,37,117.8,86.3,0.9728,56.6,41.1,16.2,...,26.9,56.3,40.0,38.2,29.0,71.5,7.7,2ND,1.0,2017


##### **Scale Numerical Features**

In [7]:
scaler = StandardScaler()
basketball_df[numerical_vars] = scaler.fit_transform(basketball_df[numerical_vars])
basketball_df.head(len(basketball_df))

Unnamed: 0,TEAM,CONF,G,W,ADJOE,ADJDE,BARTHAG,EFG_O,EFG_D,TOR,...,FTRD,2P_O,2P_D,3P_O,3P_D,ADJ_T,WAB,POSTSEASON,SEED,YEAR
0,North Carolina,ACC,2.407925,2.588173,2.773838,-1.267573,1.798002,0.881130,-0.686500,-1.566916,...,-0.688346,1.408147,-1.413912,-0.544408,0.751759,1.282537,2.374116,2ND,1.0,2016
1,Wisconsin,B10,2.407925,3.044657,3.572315,-1.467233,1.886916,1.595864,-0.824691,-2.992893,...,-1.939157,1.672860,-1.383761,0.848146,1.285596,-2.728767,2.770300,2ND,1.0,2015
2,Michigan,B10,2.407925,2.588173,1.548589,-1.958704,1.736898,1.303473,-0.824691,-2.232372,...,-0.750886,1.643447,-0.750571,0.371746,-0.480172,-0.593718,2.124666,2ND,3.0,2018
3,Texas Tech,B12,1.901727,2.283849,1.658723,-2.757345,1.862631,1.173521,-2.448428,-0.473666,...,0.281033,1.084609,-2.228014,0.848146,-1.917426,-0.076130,2.139340,2ND,3.0,2019
4,Gonzaga,WCC,2.154826,3.196819,2.016661,-2.588402,1.875165,2.180646,-3.104833,-1.186655,...,-1.235576,2.114049,-2.800900,1.471131,-2.204877,1.217839,2.242054,2ND,1.0,2017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3518,Toledo,MAC,0.889331,1.675203,2.305765,0.990122,0.951161,2.083183,0.971785,-2.422502,...,-1.141765,1.614035,0.847480,2.020823,0.710695,0.570854,0.936113,No Tournament,,2023
3519,Liberty,ASun,0.636232,1.675203,1.135583,-0.898970,1.294676,1.823279,-0.271929,-1.281720,...,-1.094859,2.143461,-0.207837,0.811500,-0.315915,-1.078956,0.818725,No Tournament,,2023
3520,Utah Valley,WAC,0.889331,1.827365,0.543609,-1.313648,1.223779,0.588739,-2.102952,0.286855,...,-0.954143,0.996371,-1.956647,-0.287885,-1.342525,0.667902,1.068175,No Tournament,,2023
3521,UAB,CUSA,1.901727,1.979526,1.273252,-0.945045,1.375756,0.133908,-0.962881,-0.663797,...,-0.922873,-0.091894,-0.629963,0.518330,-1.137203,0.959045,1.038828,No Tournament,,2023


## **Data analysis and visualization** 

Quantitative (statistical) and visual analysis of your data.

##### **Shapiro-Wilk Test** 

Testing for normal distribution

In [10]:
#Normality test using Shapiro-Wilk test
shapiro_results = {}
for var in numerical_vars:
    stat, p = shapiro( basketball_df_numerical[var])
    shapiro_results[var] = {
        'Shapiro-Wilk Statistic': stat,
        'p-value': p,
        'Is Normal': p > 0.05
    }
shapiro_df = pd.DataFrame(shapiro_results).T.sort_values('p-value')
shapiro_df.head(len(shapiro_df))

Unnamed: 0,Shapiro-Wilk Statistic,p-value,Is Normal
G,0.91468,0.0,False
BARTHAG,0.958721,0.0,False
FTRD,0.985712,0.0,False
W,0.990524,0.0,False
WAB,0.991769,0.0,False
TORD,0.992044,0.0,False
FTR,0.99519,0.0,False
TOR,0.995449,0.0,False
ADJ_T,0.995644,0.0,False
ADJOE,0.997174,4e-06,False


##### **Kruskal Wallace Test**

In [11]:
kruskal_results = []

for var in numerical_vars:
    samples_by_group = []
    for value in set(basketball_df['POSTSEASON']):
        mask = basketball_df['POSTSEASON'] == value
        samples_by_group.append(basketball_df[var][mask])
    
    stat, p_value = kruskal(*samples_by_group)
    
    kruskal_results_dict = {
        'Variable': var,
        'Test Statistic': stat,
        'p-value': p_value,
        'Significant': p_value < (0.05 / len(numerical_vars))  #bonferroni correction
    }
    
    kruskal_results.append(kruskal_results_dict)

kruskal_df = pd.DataFrame(kruskal_results).sort_values('p-value')
kruskal_df.head(len(kruskal_df))


Unnamed: 0,Variable,Test Statistic,p-value,Significant
18,WAB,1312.405503,4.8945859999999994e-278,True
1,W,1196.2692,6.135774000000001e-253,True
4,BARTHAG,1178.19309,4.934498e-249,True
2,ADJOE,994.786253,1.992874e-209,True
3,ADJDE,894.333056,9.423624e-188,True
0,G,672.041358,7.462822e-140,True
6,EFG_D,572.511733,1.893774e-118,True
14,2P_D,452.118644,1.300098e-92,True
5,EFG_O,440.995752,3.140315e-90,True
13,2P_O,386.292923,1.5989299999999998e-78,True


##### **Chi-Square Test**

In [14]:
#categorical variable and POSTSEASON
chi2_results = []
for var in categorical_vars:
    combo_counts = pd.crosstab(basketball_df[var], basketball_df['POSTSEASON'])
    chi2, p, _, _ = chi2_contingency(combo_counts)
    chi2_results_dict ={
        'Variable': var,
        'Chi2 Statistic': chi2,
        'p-value': p,
        'Significant': p < (0.05 / len(categorical_vars))  #bonferroni correction
    }
    chi2_results.append(chi2_results_dict)

chi2_df = pd.DataFrame(chi2_results).sort_values('p-value')
chi2_df.head(len(chi2_df))

Unnamed: 0,Variable,Chi2 Statistic,p-value,Significant
1,POSTSEASON,28184.0,0.0,True
2,SEED,6449.853096,0.0,True
0,CONF,973.09471,1.471978e-79,True
3,YEAR,0.14196,1.0,False
