In [79]:
import pandas as pd
import json

In [80]:
# Keep the CSV locations as plain path strings instead of calling open().
# pd.read_csv accepts a path and opens/closes the file itself, so no file
# handles stay open for the lifetime of the kernel.
file_2021 = 'wnba_betting_data/2021/player_game_stats_2021.csv'
file_2022 = 'wnba_betting_data/2022/player_game_stats_2022.csv'
file_2023 = 'wnba_betting_data/2023/player_game_stats_2023.csv'
file_2024 = 'wnba_betting_data/2024/player_game_stats_2024.csv'


In [81]:
# Load one DataFrame per season, in year order, from the four sources
# defined in the previous cell.
df_2021, df_2022, df_2023, df_2024 = map(
    pd.read_csv,
    (file_2021, file_2022, file_2023, file_2024),
)


## Summary of 3-point statistics for the 2021 season
**Calculate:**
- Average 3 points made -> in total | Starters only | Non-Starters
- Average 3 points attempted -> in total | Starters only | Non-Starters

- Average time played for starter -> Starters only
- Average time played for non-starters -> Non Starters only


In [82]:
# Averages over every row of the 2021 data, with no starter split:
# first three-pointers made, then three-pointers attempted.
for stat_col in ('three_points_made', 'three_points_att'):
    print(df_2021[stat_col].mean())

0.6219269102990034
1.8110741971207087


In [83]:
# describe() reports count, mean, std, min, quartiles and max for every
# numeric column in one table, so it also covers the two means that were
# printed one by one in the previous cell.
df_2021.describe()


Unnamed: 0,three_points_made,three_points_att,points
count,4515.0,4515.0,4515.0
mean,0.621927,1.811074,6.899668
std,1.068296,2.34402,7.327571
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,1.0,5.0
75%,1.0,3.0,11.0
max,7.0,14.0,37.0


In [84]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the 2022 data.
df_2022.describe()


Unnamed: 0,three_points_made,three_points_att,points
count,5064.0,5064.0,5064.0
mean,0.667457,1.93049,7.066746
std,1.10803,2.42035,7.240643
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,1.0,5.0
75%,1.0,3.0,12.0
max,8.0,17.0,38.0


In [85]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the 2023 data.
df_2023.describe()


Unnamed: 0,three_points_made,three_points_att,points
count,5732.0,5732.0,5732.0
mean,0.647069,1.868109,6.946615
std,1.142524,2.472402,7.7683
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,1.0,4.5
75%,1.0,3.0,11.0
max,10.0,21.0,53.0


In [86]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the 2024 data.
df_2024.describe()

Unnamed: 0,three_points_made,three_points_att,points
count,5661.0,5661.0,5661.0
mean,0.659071,1.948596,6.964494
std,1.168038,2.570629,7.686755
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,1.0,4.0
75%,1.0,3.0,12.0
max,9.0,17.0,42.0


#### Normal boolean indexing to find 3 points made and attempted: 

In [87]:
# Build the starter / non-starter row masks once, then average each stat
# for the selected rows.
starter_mask = df_2021['starter'] == True
non_starter_mask = df_2021['starter'] == False

# Starters: made, then attempted
print(df_2021.loc[starter_mask, 'three_points_made'].mean())
print(df_2021.loc[starter_mask, 'three_points_att'].mean())

print('\n-----------------\n')

# Non-starters: made, then attempted
print(df_2021.loc[non_starter_mask, 'three_points_made'].mean())
print(df_2021.loc[non_starter_mask, 'three_points_att'].mean())



1.0316062176165803
2.981865284974093

-----------------

0.3160541586073501
0.9369439071566731


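Starters make and attempt roughly 3x as many three-pointers per game as non-starters (about 1.03 vs 0.32 made and 2.98 vs 0.94 attempted) -> a large gap in volume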

## Different method using groupby for each year


In [88]:
# 2021: split the rows by the 'starter' flag once, then print the mean of
# three-pointers made and attempted for each group.
starter_groups_2021 = df_2021.groupby('starter')
print(starter_groups_2021['three_points_made'].mean())
print('\n-----------------\n')
print(starter_groups_2021['three_points_att'].mean())

starter
False    0.316054
True     1.031606
Name: three_points_made, dtype: float64

-----------------

starter
False    0.936944
True     2.981865
Name: three_points_att, dtype: float64


In [89]:
# 2022: split the rows by the 'starter' flag once, then print the mean of
# three-pointers made and attempted for each group.
starter_groups_2022 = df_2022.groupby('starter')
print(starter_groups_2022['three_points_made'].mean())
print('\n-----------------\n')
print(starter_groups_2022['three_points_att'].mean())

starter
False    0.311334
True     1.142396
Name: three_points_made, dtype: float64

-----------------

starter
False    0.937111
True     3.255300
Name: three_points_att, dtype: float64


In [90]:
# 2023: split the rows by the 'starter' flag once, then print the mean of
# three-pointers made and attempted for each group.
starter_groups_2023 = df_2023.groupby('starter')
print(starter_groups_2023['three_points_made'].mean())
print('\n-----------------\n')
print(starter_groups_2023['three_points_att'].mean())

starter
False    0.272509
True     1.167083
Name: three_points_made, dtype: float64

-----------------

starter
False    0.843637
True     3.290417
Name: three_points_att, dtype: float64


In [91]:
# 2024: split the rows by the 'starter' flag once, then print the mean of
# three-pointers made and attempted for each group.
starter_groups_2024 = df_2024.groupby('starter')
print(starter_groups_2024['three_points_made'].mean())
print('\n-----------------\n')
print(starter_groups_2024['three_points_att'].mean())

starter
False    0.276530
True     1.175104
Name: three_points_made, dtype: float64

-----------------

starter
False    0.85020
True     3.43029
Name: three_points_att, dtype: float64


In [92]:
# Check how the 'minutes' column is stored before doing arithmetic on it.
df_2021['minutes'].dtype

dtype('O')

## Convert time from string to float to understand how long starters play compared to non-starters

In [93]:
def time_convert(time_str):
    """Convert a 'minutes:seconds' string into decimal minutes.

    Parameters
    ----------
    time_str : str
        Playing time written as two numbers separated by ':', e.g. '14:22'.
        A missing value (None, NaN, pd.NA) is also accepted.

    Returns
    -------
    float
        ``minutes + seconds / 60`` rounded to 2 decimal places, or NaN when
        the input is missing.

    Raises
    ------
    ValueError
        If the string does not split into exactly two numeric parts.
    """
    # A missing cell has no .split(); return NaN rather than raising so that
    # one empty entry does not abort a whole Series.apply(time_convert) call.
    if pd.isna(time_str):
        return float('nan')

    minutes, seconds = map(float, time_str.split(':'))
    return round(minutes + seconds / 60, 2)


In [94]:
# Run time_convert on every 'minutes' entry and display the result; the
# converted Series is not assigned back, so df_2021 itself is unchanged.
df_2021['minutes'].apply(time_convert)


0       14.37
1       28.70
2        0.00
3       36.92
4        8.65
        ...  
4510    22.10
4511     0.00
4512    37.80
4513     0.00
4514    34.70
Name: minutes, Length: 4515, dtype: float64

In [95]:
# How much time did a starter play?
# Select the 'minutes' of starter rows, convert each entry, then average.
(
    df_2021
    .loc[df_2021['starter'] == True, 'minutes']
    .apply(time_convert)
    .mean()
)

np.float64(28.331398963730567)

In [96]:
# How much time did a non-starter play?
# Select the 'minutes' of non-starter rows, convert each entry, then average.
(
    df_2021
    .loc[df_2021['starter'] == False, 'minutes']
    .apply(time_convert)
    .mean()
)

np.float64(8.960255319148937)

### Apply groupby function with lambda to do this faster for all years
- The lambda needs its own inner `apply` because `groupby` hands over the values one group at a time
- The argument x in the lambda function receives each group as a Series and applies the function to every value in that group -> in this case the groups are 'True' and 'False'


In [97]:
# 2021: mean converted playing time for each value of the 'starter' flag.
(
    df_2021
    .groupby('starter')['minutes']
    .apply(lambda x: x.apply(time_convert).mean())
)

starter
False     8.960255
True     28.331399
Name: minutes, dtype: float64

In [98]:
# 2022: mean converted playing time for each value of the 'starter' flag.
(
    df_2022
    .groupby('starter')['minutes']
    .apply(lambda x: x.apply(time_convert).mean())
)

starter
False     9.104036
True     28.130594
Name: minutes, dtype: float64

In [99]:
# 2023: mean converted playing time for each value of the 'starter' flag.
(
    df_2023
    .groupby('starter')['minutes']
    .apply(lambda x: x.apply(time_convert).mean())
)

starter
False     8.059556
True     29.014171
Name: minutes, dtype: float64

In [100]:
# 2024: mean converted playing time for each value of the 'starter' flag.
(
    df_2024
    .groupby('starter')['minutes']
    .apply(lambda x: x.apply(time_convert).mean())
)

starter
False     8.239357
True     29.089340
Name: minutes, dtype: float64

#### From this we can tell that starters play roughly 3 to 3.6 times as many minutes as non-starters in every season (about 28-29 vs 8-9 minutes) and hence get more opportunities

## Plot graphs for visual representations
Graphs to see visually what's going on
- Starters and their time played
- Starters and their 3 points made and attempted
- Teams and their 3PAs and 3PMs
- Create new CSVs with position and add it


### Starters & Time Played