# 04 — Joint & conditional distributions, covariance, correlation

We’ll simulate two related random variables and use them to explore and numerically check:
- Conditional distribution ideas
- Cov(X,Y) = E[XY] − E[X]E[Y]
- Var(X+Y) = Var(X) + Var(Y) + 2Cov(X,Y)
- ρ(X,Y) = Cov(X,Y)/(SD(X)SD(Y))


In [None]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt

# Seeded generator used by the cells below; pick a different seed to draw a different sample
rng = np.random.default_rng(seed=42)


## Simulate a simple bivariate model


In [None]:
# Number of simulated (X, Y) pairs
n = 300_000
# X: n draws from a normal with mean 0 and standard deviation 1
X = rng.normal(loc=0, scale=1, size=n)
# Y: twice X plus a fresh N(0, 1) draw, which makes Y correlated with X
Y = 2.0 * X + rng.normal(loc=0, scale=1, size=n)


## Covariance identities


In [None]:
# Sample means standing in for E[X], E[Y] and E[XY]
EX = np.mean(X)
EY = np.mean(Y)
EXY = np.mean(X * Y)

# Covariance from the moment identity E[XY] - E[X]E[Y]
cov2 = EXY - EX * EY
# Covariance from NumPy: [0, 1] picks the cross term of the 2x2 matrix.
# ddof=0 divides by n, the same normalisation as the plain means above.
cov1 = np.cov(X, Y, ddof=0)[0, 1]

print("Cov via np.cov :", cov1)
print("E[XY]-E[X]E[Y] :", cov2)


## Correlation coefficient ρ(X,Y)


In [None]:
# Standard deviations with NumPy's default ddof=0 (divide by n)
sdX = np.std(X)
sdY = np.std(Y)
# rho = Cov(X, Y) / (SD(X) * SD(Y)), reusing the ddof=0 covariance cov1
rho = cov1 / (sdX * sdY)

print("SD(X), SD(Y):", sdX, sdY)
print("rho:", rho)
print("rho must be between -1 and 1:", (rho >= -1) and (rho <= 1))


## Var(X+Y) formula


In [None]:
# Left-hand side: variance of the sum, computed directly (ddof=0)
lhs = (X + Y).var()
# Right-hand side: Var(X) + Var(Y) + 2*Cov(X, Y), every term with ddof=0
rhs = X.var() + Y.var() + 2 * np.cov(X, Y, ddof=0)[0, 1]
print("Var(X+Y):", lhs)
print("RHS     :", rhs)


## Correlation under linear transformations


In [None]:
# Shifts (a, c) and scales (b, d) for the maps X -> b*X + a and Y -> d*Y + c
a, b = 5.0, -3.0
c, d = -2.0, 4.0

X2 = b*X + a
Y2 = d*Y + c

# np.corrcoef returns the 2x2 correlation matrix; [0, 1] is the cross term
rho2 = np.corrcoef(X2, Y2)[0,1]
rho_xy = np.corrcoef(X, Y)[0,1]

# Shifts leave the correlation unchanged; only the sign of b*d can flip it.
# A zero scale turns the transformed variable into a constant, so that case
# is reported explicitly instead of being labelled "negative".
if b*d > 0:
    sign_label = "positive"
elif b*d < 0:
    sign_label = "negative"
else:
    sign_label = "zero (correlation undefined)"

print("rho(X,Y):", rho_xy)
print("rho(bX+a, dY+c):", rho2)
print("sign(b*d) =", sign_label)


## Visual: scatter plot (subsampled)


In [None]:
# Plot a random subset of at most 4000 points instead of all n.
# Capping the size at n keeps the draw without replacement valid
# when n is set below 4000.
idx = rng.choice(n, size=min(4000, n), replace=False)
plt.figure()
plt.scatter(X[idx], Y[idx], s=8)
plt.title("Scatter of (X,Y) (subsampled)")
plt.xlabel("X")
plt.ylabel("Y")
plt.show()
