In [None]:
# WNBA basketball players and their metrics for the 2016-2017 season.
# First look at the data: leading rows, trailing rows, and the frame's shape.

import pandas as pd

wnba = pd.read_csv('wnba.csv')
# head()/tail() return 5 rows by default
print(wnba.head(), wnba.tail(), sep='\n')
wnba.shape


In [None]:
# Draw one random sample of 30 players and measure the sampling error of the
# maximum of 'Games Played' (population parameter minus sample statistic).
sample = wnba.sample(n=30, random_state=1)

parameter = wnba['Games Played'].max()    # maximum over the full dataset
statistic = sample['Games Played'].max()  # maximum over the 30-row sample
sampling_error = parameter - statistic

In [None]:
# Simple random sampling (SRS).
# Plot the mean of 'PTS' for 100 different samples of 10 players against the
# population mean to visualise how far a statistic can fall from its parameter.

import pandas as pd
import matplotlib.pyplot as plt

wnba = pd.read_csv('wnba.csv')

# one sample per seed 0..99, so the 100 samples are reproducible
pts_means = [
    wnba['PTS'].sample(10, random_state=seed).mean()
    for seed in range(100)
]

plt.scatter(range(100), pts_means)
# horizontal reference line at the population mean of 'PTS'
plt.axhline(wnba['PTS'].mean())



In [None]:
# Stratified sampling.
# Each player position ('Pos') is a stratum; a simple random sample of 10
# players is drawn from every stratum and summarised by mean points per game.

import operator

# points scored per game played
wnba['points_per_game'] = wnba['PTS'] / wnba['Games Played']

pos_dict = {}
for position in wnba['Pos'].unique():
    stratum = wnba[wnba['Pos'] == position]
    stratum_sample = stratum.sample(10, random_state=0)
    pos_dict[position] = stratum_sample['points_per_game'].mean()

# position whose sample has the highest mean points per game
top_entry = max(pos_dict.items(), key=operator.itemgetter(1))
position_most_points = top_entry[0]

In [None]:
# Stratified sampling with stratum sizes chosen with respect to the
# proportions in the population.

# Percentage of players falling into each of three 'Games Played' bins.
games_share = wnba['Games Played'].value_counts(bins=3, normalize=True)
print(games_share * 100)

# Absolute counts for the same bins; not the cell's last expression, so the
# result is computed but neither stored nor displayed.
wnba['Games Played'].value_counts(bins=3)

# Strata by number of games played: <= 12, 13..22, > 22.
games_played = wnba['Games Played']
stra1 = wnba[games_played <= 12]
stra2 = wnba[(games_played > 12) & (games_played <= 22)]
stra3 = wnba[games_played > 22]

# (stratum, rows drawn from it) -- 1 + 2 + 7 = 10 players per final sample
strata_sizes = ((stra1, 1), (stra2, 2), (stra3, 7))

mean_final_sample = []
for seed in range(100):
    final_sample = pd.concat(
        [stratum.sample(size, random_state=seed) for stratum, size in strata_sizes]
    )
    mean_final_sample.append(final_sample['PTS'].mean())

plt.scatter(range(1, 101), mean_final_sample)
# horizontal reference line at the population mean of 'PTS'
plt.axhline(wnba['PTS'].mean())

In [None]:
# Cluster sampling.
# Teams are the clusters: pick 4 teams at random and keep every player who
# belongs to one of the picked teams.

cluster = pd.Series(wnba['Team'].unique()).sample(4, random_state = 0)

# DataFrame.append was removed in pandas 2.0 (and re-copied the frame on every
# iteration); collect the per-team frames and concatenate them once instead.
# Rows stay in the same order as the original loop: team by team, in the order
# the teams were drawn.
sample = pd.concat([wnba[wnba['Team'] == team] for team in cluster])

# Sampling error of the mean (population mean minus cluster-sample mean)
# for several player metrics.
sampling_error_height = wnba['Height'].mean() - sample['Height'].mean()
sampling_error_age = wnba['Age'].mean() - sample['Age'].mean()
sampling_error_BMI = wnba['BMI'].mean() - sample['BMI'].mean()
sampling_error_points = wnba['PTS'].mean() - sample['PTS'].mean()