In [14]:
# Monte Carlo Simulation
# Repeatedly draws normally distributed "male" and "female" record samples
# and compares summary statistics of the draws across many simulations.

import pandas as pd
import numpy as np
import seaborn as sns
from numpy import nan

# Select seaborn's 'whitegrid' style for any plots drawn later
sns.set_style('whitegrid')

In [15]:
# Simulation parameters: each group's records are drawn from a normal
# distribution with the given mean and standard deviation, with
# `num_reps_*` draws per simulated competition.
avg_male = 10
std_dev_male = .1
num_reps_male = 25

avg_female = 9.95
std_dev_female = .1
num_reps_female = 25

# Number of independent competitions to simulate
num_simulations = 100000

# Seed NumPy's global random generator so that a fresh
# "Restart Kernel -> Run All" reproduces the same draws and results.
random_seed = 42
np.random.seed(random_seed)

In [16]:
# Draw one example sample per group (num_reps_* values each), rounded to two decimals
record_male = np.round(np.random.normal(loc=avg_male, scale=std_dev_male, size=num_reps_male), 2)
record_female = np.round(np.random.normal(loc=avg_female, scale=std_dev_female, size=num_reps_female), 2)

In [17]:
# Peek at the first ten values of each example sample, one array per line
print(record_male[:10], record_female[:10], sep='\n')

[  9.91  10.16   9.97  10.16   9.9    9.98   9.93   9.92   9.91   9.97]
[  9.94  10.     9.94   9.95   9.97   9.75   9.95  10.     9.8    9.9 ]


In [18]:
# Wrap each example sample in a single-column DataFrame indexed 0..num_reps-1
df_male = pd.DataFrame({'record_male': record_male}, index=range(num_reps_male))
df_female = pd.DataFrame({'record_female': record_female}, index=range(num_reps_female))

# Print the first five rows of each frame
print(df_male.head(), df_female.head(), sep='\n')

   record_male
0         9.91
1        10.16
2         9.97
3        10.16
4         9.90
   record_female
0           9.94
1          10.00
2           9.94
3           9.95
4           9.97


In [19]:
# Run all simulations at once: every row of the matrices below is one
# simulated competition with num_reps_* rounded records for that group.
# Separate names are used so the example `record_*` / `df_*` variables
# from the earlier cells are left intact.
sim_male = np.random.normal(avg_male, std_dev_male,
                            (num_simulations, num_reps_male)).round(2)
sim_female = np.random.normal(avg_female, std_dev_female,
                              (num_simulations, num_reps_female)).round(2)

# Per-simulation sample means
mean_male = sim_male.mean(axis=1)
mean_female = sim_female.mean(axis=1)

# Sort each row ascending so the last column holds the largest value and
# the second-to-last column holds the second-largest (the "runner up").
# As with indexing [-2] on a sorted sample, this needs at least 2 reps.
sim_male.sort(axis=1)
sim_female.sort(axis=1)

# Results we want to analyze, one row per simulation, in the order:
# mean male, mean female, best male, best female, runner-up male, runner-up female
all_stats = np.column_stack([mean_male,
                             mean_female,
                             sim_male[:, -1],
                             sim_female[:, -1],
                             sim_male[:, -2],
                             sim_female[:, -2]]).tolist()

In [20]:
# Collect the per-simulation statistics into one DataFrame (one row per
# simulation); the column order mirrors the order used when building all_stats.
results_df = pd.DataFrame.from_records(
    data=all_stats,
    columns=[
        'record_male', 'record_female',
        'best_male', 'best_female',
        'runner_up_male', 'runner_up_female',
    ],
)

In [21]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column of results_df
results_df.describe()

Unnamed: 0,record_male,record_female,best_male,best_female,runner_up_male,runner_up_female
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,9.999974,9.949885,10.196612,10.14647,10.152281,10.102365
std,0.020095,0.020022,0.051056,0.050821,0.038479,0.038372
min,9.9056,9.8708,10.04,9.99,10.02,9.97
25%,9.9864,9.9364,10.16,10.11,10.13,10.08
50%,10.0,9.95,10.19,10.14,10.15,10.1
75%,10.0136,9.9632,10.23,10.18,10.18,10.13
max,10.0872,10.0388,10.54,10.45,10.37,10.34


In [22]:
# Preview only the first ten simulations instead of rendering the whole frame
results_df.head(10)

Unnamed: 0,record_male,record_female,best_male,best_female,runner_up_male,runner_up_female
0,10.0004,9.9516,10.21,10.15,10.19,10.14
1,10.0704,9.9492,10.35,10.16,10.23,10.10
2,9.9780,9.9476,10.21,10.11,10.12,10.09
3,10.0264,9.9356,10.22,10.22,10.17,10.18
4,10.0264,9.9472,10.24,10.15,10.22,10.04
5,9.9908,9.9412,10.14,10.09,10.14,10.07
6,10.0028,9.9248,10.17,10.05,10.17,10.04
7,9.9900,9.9284,10.27,10.15,10.15,10.12
8,10.0012,9.9220,10.18,10.09,10.16,10.05
9,10.0284,9.9472,10.24,10.13,10.19,10.11


In [27]:
# For every simulation keep a group's best value only when that group has the
# strictly larger best; otherwise store NaN (ties leave both columns NaN).
# Whole-column assignment creates the columns if they do not exist yet and
# avoids element-wise chained indexing (`results_df[col][i] = ...`).
male_wins = results_df['best_male'] > results_df['best_female']
female_wins = results_df['best_female'] > results_df['best_male']

results_df['record_male_winner'] = np.where(male_wins, results_df['best_male'], np.nan)
results_df['record_female_winner'] = np.where(female_wins, results_df['best_female'], np.nan)

print (results_df['record_male_winner'], results_df['record_female_winner'])

0        10.21
1        10.35
2        10.21
3          NaN
4        10.24
5        10.14
6        10.17
7        10.27
8        10.18
9        10.24
10       10.19
11         NaN
12         NaN
13         NaN
14       10.24
15       10.15
16       10.19
17       10.18
18       10.20
19         NaN
20       10.18
21         NaN
22       10.19
23       10.22
24       10.28
25       10.28
26         NaN
27       10.17
28       10.23
29       10.16
         ...  
99970      NaN
99971    10.24
99972    10.27
99973    10.16
99974    10.23
99975    10.21
99976      NaN
99977    10.16
99978      NaN
99979      NaN
99980      NaN
99981    10.13
99982    10.20
99983    10.16
99984    10.25
99985    10.12
99986      NaN
99987    10.23
99988    10.20
99989    10.25
99990    10.16
99991      NaN
99992    10.17
99993    10.26
99994    10.26
99995      NaN
99996    10.31
99997      NaN
99998    10.25
99999    10.21
Name: record_male_winner, Length: 100000, dtype: float64 0          NaN
1          Na

In [28]:
# Mean winning value of the male group minus that of the female group;
# NaN rows (simulations the group did not win) are skipped by the mean.
results_df['record_male_winner'].mean() - results_df['record_female_winner'].mean()

0.008353097532463494

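The gap between the mean male and mean female winning records (about 0.008) is much smaller than 0.05, the difference between the two underlying population means (`avg_male - avg_female`).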

In [29]:
# For every simulation, store the overall best value and the overall
# second-best value, but only when they come from different groups, i.e. when
# the losing group's best beats the winning group's own runner-up. Values are
# kept positive when the male best is strictly larger and are negated
# otherwise (ties on the best value fall into the female branch). Every other
# simulation gets NaN in both columns.
# Whole-column assignment creates the columns if they do not exist yet and
# avoids element-wise chained indexing (`results_df[col][i] = ...`).
best_male = results_df['best_male']
best_female = results_df['best_female']
male_wins = best_male > best_female

# True where the top two overall values belong to different groups
mixed_top_two = np.where(male_wins,
                         best_female > results_df['runner_up_male'],
                         best_male > results_df['runner_up_female'])

signed_winner = np.where(male_wins, best_male, -best_female)
signed_runner_up = np.where(male_wins, best_female, -best_male)

results_df['record_winner'] = np.where(mixed_top_two, signed_winner, np.nan)
results_df['record_runner_up'] = np.where(mixed_top_two, signed_runner_up, np.nan)

print (results_df['record_winner'], results_df['record_runner_up'])

0          NaN
1          NaN
2          NaN
3       -10.22
4          NaN
5          NaN
6          NaN
7          NaN
8          NaN
9          NaN
10         NaN
11      -10.19
12      -10.19
13      -10.21
14         NaN
15         NaN
16         NaN
17         NaN
18       10.20
19      -10.24
20         NaN
21      -10.18
22         NaN
23       10.22
24         NaN
25       10.28
26      -10.23
27       10.17
28       10.23
29         NaN
         ...  
99970      NaN
99971      NaN
99972      NaN
99973      NaN
99974    10.23
99975      NaN
99976   -10.22
99977      NaN
99978      NaN
99979   -10.19
99980   -10.22
99981      NaN
99982      NaN
99983      NaN
99984      NaN
99985    10.12
99986   -10.15
99987      NaN
99988      NaN
99989      NaN
99990      NaN
99991      NaN
99992      NaN
99993      NaN
99994      NaN
99995   -10.20
99996      NaN
99997      NaN
99998    10.25
99999      NaN
Name: record_winner, Length: 100000, dtype: float64 0          NaN
1          NaN
2  

In [30]:
# Mean signed gap between overall winner and overall runner-up;
# rows where either column is NaN are skipped by the mean.
(results_df['record_winner'] - results_df['record_runner_up']).mean()

0.005261597240042201