In [57]:
import numpy as np
import pandas as pd

# 1) Difference between marginal and conditional empirical densities

## a) Plot the empirical density (histogram) of the size column, and then the conditional densities of size within each group. What do you see?

In [58]:
def generate_data(size=1000, seed=10, group_prob=(0.4, 0.1, 0.5), group_mean=(5, 10, 15)):
    """Simulate a labelled sample from a mixture of unit-variance normals.

    Each row is assigned to one of ``len(group_prob)`` groups (labelled
    1, 2, ...) with the given probabilities; its ``size`` value is the mean of
    that group plus standard-normal noise.

    Parameters
    ----------
    size : int, default 1000
        Number of rows to generate.
    seed : int, default 10
        Seed passed to ``np.random.seed``. Note that this reseeds NumPy's
        global legacy random state, so later ``np.random`` calls are affected.
    group_prob : sequence of float, default (0.4, 0.1, 0.5)
        Probability of each group, in label order.
    group_mean : sequence of number, default (5, 10, 15)
        Mean of ``size`` within each group, in label order.

    Returns
    -------
    pandas.DataFrame
        Columns ``"group"`` and ``"size"``. Both columns are floats because
        the integer labels are concatenated with the float values.

    Raises
    ------
    ValueError
        If ``group_prob`` and ``group_mean`` are not 1-D sequences of equal
        length.
    """
    group_prob = np.asarray(group_prob, dtype=float)
    group_mean = np.asarray(group_mean)
    if group_prob.ndim != 1 or group_prob.shape != group_mean.shape:
        raise ValueError("group_prob and group_mean must be 1-D sequences of equal length")

    np.random.seed(seed)

    # Labels are 1-based; a column vector keeps them aligned with X below.
    labels = np.arange(1, len(group_prob) + 1)
    groups = np.random.choice(labels, p=group_prob, size=[size, 1])

    # Shift labels to 0-based indices to look up each row's group mean.
    X = group_mean[groups - 1] + np.random.standard_normal([size, 1])

    return pd.DataFrame(data=np.concatenate([groups, X], axis=1), columns=["group", "size"])
    
    
    
# Draw the example data set with the default settings spelled out (1000 rows, seed 10).
data = generate_data(size=1000, seed=10)

## b) Are the size values correlated between different groups? Why or why not? 

## c) How would you replace the missing values in the following DataFrame? 

In [59]:
def generate_data_nas(size=1000, seed=10, group_prob=(0.4, 0.1, 0.5), group_mean=(5, 10, 15), na_prob=0.1):
    """Simulate a labelled normal-mixture sample with randomly missing sizes.

    Each row is assigned to one of ``len(group_prob)`` groups (labelled
    1, 2, ...) with the given probabilities; its ``size`` value is the mean of
    that group plus standard-normal noise. Afterwards each ``size`` value is
    independently replaced by NaN with probability ``na_prob``. The ``group``
    column is never set to NaN.

    Parameters
    ----------
    size : int, default 1000
        Number of rows to generate.
    seed : int, default 10
        Seed passed to ``np.random.seed``. Note that this reseeds NumPy's
        global legacy random state, so later ``np.random`` calls are affected.
    group_prob : sequence of float, default (0.4, 0.1, 0.5)
        Probability of each group, in label order.
    group_mean : sequence of number, default (5, 10, 15)
        Mean of ``size`` within each group, in label order.
    na_prob : float, default 0.1
        Probability that a row's ``size`` value is replaced by NaN.

    Returns
    -------
    pandas.DataFrame
        Columns ``"group"`` and ``"size"``. Both columns are floats because
        the integer labels are concatenated with the float values.

    Raises
    ------
    ValueError
        If ``group_prob`` and ``group_mean`` are not 1-D sequences of equal
        length, or if ``na_prob`` is outside ``[0, 1]``.
    """
    group_prob = np.asarray(group_prob, dtype=float)
    group_mean = np.asarray(group_mean)
    if group_prob.ndim != 1 or group_prob.shape != group_mean.shape:
        raise ValueError("group_prob and group_mean must be 1-D sequences of equal length")
    if not 0.0 <= na_prob <= 1.0:
        raise ValueError("na_prob must be between 0 and 1")

    np.random.seed(seed)

    # Labels are 1-based; a column vector keeps them aligned with X below.
    labels = np.arange(1, len(group_prob) + 1)
    groups = np.random.choice(labels, p=group_prob, size=[size, 1])

    # Shift labels to 0-based indices to look up each row's group mean.
    X = group_mean[groups - 1] + np.random.standard_normal([size, 1])

    # One Bernoulli draw per row decides whether its size value goes missing.
    is_missing = np.random.binomial(1, na_prob, size=size).astype(bool)
    X[is_missing, :] = np.nan

    return pd.DataFrame(data=np.concatenate([groups, X], axis=1), columns=["group", "size"])
    
# Draw the data set with missing size values, default settings spelled out (1000 rows, seed 10).
df = generate_data_nas(size=1000, seed=10)

# 2) Biased vs unbiased estimators of variance (see https://en.wikipedia.org/wiki/Bias_of_an_estimator)

Can you show through a simulation study that 
$$
\hat{\sigma}^2_{biased} = \frac{1}{N} \sum_{i=1}^N (x_i - \hat{\mu}_x)^2, 
$$
with $\hat{\mu}_x = \frac{1}{N} \sum_{i=1}^N x_i$ is a biased estimator of the population variance $\sigma^2=1$ when 
$$
X \sim N(0, \sigma^2)
$$
and $N = 1000$. Then show that 
$$
\hat{\sigma}^2_{unbiased} = \frac{1}{N - 1} \sum_{i=1}^N (x_i - \hat{\mu}_x)^2
$$
is an unbiased estimator of $\sigma^2$.

a) Start with 

In [60]:
# One sample of N standard-normal draws, reproducible through a fixed seed.
N = 1_000
np.random.seed(42)
X = np.random.standard_normal(N)

and calculate the sample variance with the equation (formula) for the BIASED variance estimator 

b) Repeat the previous step a large number of times (say 10000), changing the seed every time you generate a new X array. Save each of the calculated variance estimates in a list (or array).

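c) Visualise the variance estimates in a histogram. Is the mean of the estimates around 1? (Bias is defined through the expected value, so compare the mean of the estimates, not the median, with $\sigma^2 = 1$; for the biased estimator the expected value is $\frac{N-1}{N}\sigma^2 = 0.999$.)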

d) Repeat the previous steps with the UNBIASED variance estimator