In [1]:
import numpy as np
import matplotlib.pyplot as plt

plt.rcParams["figure.figsize"] = (9,4)
%matplotlib widget

# 1) Usage of random seeds

Try to understand the output of the following two cells. What happens if you assign a different integer value to the seed variable?


In [2]:
seed = 42
np.random.seed(seed)
print(np.random.standard_normal(1))

[0.49671415]


In [3]:
print(np.random.standard_normal(1))

[-0.1382643]


# 2) Expectation and variance

The following dictionary describes a (discrete) probability function. 

In [4]:
lottery_outcomes = {
        1000:0.2,
        500:0.4,
        250:0.1,
        150:0.05,
        75:0.2,
        -1000:0.05
    }

## a) Calculate the expectation and variance of the distribution

In [18]:
expected_value = []
for val, prob in lottery_outcomes.items():
    expected_value.append(val * prob)
expected_value = np.sum(expected_value)

variance = []
for val, prob in lottery_outcomes.items():
    variance.append(prob * (val - expected_value) ** 2)
variance = np.sum(variance)
print(expected_value, variance)

397.5 200493.75


In [5]:
mu = np.sum([prob*val for val, prob in lottery_outcomes.items()])
var = np.sum([prob*(val-mu)**2 for val, prob in lottery_outcomes.items()])

print(round(var,2), round(var ** 0.5) ,mu )

200493.75 448 397.5


## b) Use the numpy.random.choice function to draw samples from this distribution

In [44]:
vals = np.array(list(lottery_outcomes.keys()))
probs = np.array(list(lottery_outcomes.values()))


np.random.seed(41)
X = np.random.choice(vals, p=probs, size=10000)
# print(X)
print(np.var(X), np.mean(X))

196575.6544 401.16


# 3) Correlation

## a) Use the following function to simulate data as
$$
Y = X^k + \epsilon,  \quad \epsilon \sim N(0, 1),  X \sim N(0, 100), 
$$
where k is either 1, 2 or 3. Then calculate the linear correlation between Y and X with the numpy.corrcoef(). How does the correlation coefficient change with the choice of the parameter k

In [56]:
fig, ax = plt.subplots(nrows=2)
N = 10000
ax[0].hist(np.random.normal(0, 1, N))
ax[1].hist(np.random.normal(0, 100, N))

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

(array([  18.,   79.,  468., 1481., 2630., 2820., 1710.,  631.,  147.,
          16.]),
 array([-382.74976577, -307.82521157, -232.90065737, -157.97610317,
         -83.05154897,   -8.12699477,   66.79755942,  141.72211362,
         216.64666782,  291.57122202,  366.49577622]),
 <BarContainer object of 10 artists>)

In [57]:
def generate_data(k, size=10000):
    X = 10 * np.random.standard_normal(size).reshape(1, -1)
    Y =  X ** k + 5 * np.random.standard_normal(size).reshape(1, -1)
    
    return X, Y

corrs = [np.corrcoef(*generate_data(k))[0, 1] for k in range(1, 4)]
print(corrs)

[0.8979706920270677, -0.0335509993267708, 0.7765282985584152]


In [58]:
for k in [1, 2, 3]:
    X, Y = generate_data(k)
    corr = np.corrcoef(X, Y)[0, 1]
    print(k, corr)

1 0.8896115708443607
2 0.006758437799195776
3 0.7859932354502629


## b) Visualise the input (X) and output (Y) of the previously defined equation for k=1, 2 and 3. Try to formulate in words what you see and link it to the calculated correlation coefficients.

In [59]:
k_range = list(range(1, 4))
fig, ax = plt.subplots(nrows=len(k_range), sharex=True)
_ = [ax[i].scatter(*generate_data(k)) for i,k in enumerate(k_range)]
plt.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …