In [3]:
import csv
import pandas as pd
# Read the tab-separated passenger file into a list of rows.
# newline='' is what the csv module requires of files it reads, so that
# newlines embedded in quoted fields are parsed correctly.
with open('titanic.txt', newline='') as f:
    reader = csv.reader(f, delimiter="\t")
    d = list(reader)
# The first row holds the column names; the remaining rows are the records.
# csv.reader yields every field as a string, so all columns of df are strings.
df = pd.DataFrame(d[1:], columns = d[0])
df

Unnamed: 0,Name,PClass,Age,Sex,Survived
0,"Allen, Miss Elisabeth Walton",1st,29,female,1
1,"Allison, Miss Helen Loraine",1st,2,female,0
2,"Allison, Mr Hudson Joshua Creighton",1st,30,male,0
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25,female,0
4,"Allison, Master Hudson Trevor",1st,0.92,male,1
...,...,...,...,...,...
1308,"Zakarian, Mr Artun",3rd,27,male,0
1309,"Zakarian, Mr Maprieder",3rd,26,male,0
1310,"Zenni, Mr Philip",3rd,22,male,0
1311,"Lievens, Mr Rene",3rd,24,male,0


In [20]:
def calc_table(s1, s2):
  """Build a contingency table of two aligned Series as nested dicts.

  The outer keys are the distinct values of ``s1`` and the inner keys are the
  distinct values of ``s2`` (both in the order ``unique()`` returns them).
  Each entry is the number of positions where ``s1`` equals the outer key and
  ``s2`` equals the inner key.
  """
  row_values = s1.unique()
  col_values = s2.unique()
  table = {}
  for row_value in row_values:
    # Boolean mask of the positions belonging to this row of the table.
    in_row = s1 == row_value
    counts = {}
    for col_value in col_values:
      counts[col_value] = (in_row & (s2 == col_value)).sum()
    table[row_value] = counts
  return table
# Contingency table of Survived (outer keys) against PClass (inner keys).
t = calc_table(df["Survived"], df["PClass"])
t

{'1': {'1st': 193, '2nd': 119, '3rd': 138},
 '0': {'1st': 129, '2nd': 161, '3rd': 573}}

In [59]:
def calc_Q(t):
  """Compute the chi-square statistic of a nested-dict contingency table.

  ``t`` maps each row key to a dict of ``{column key: observed count}``.
  The statistic is the sum over all cells of
  ``(observed - expected) ** 2 / expected``, where ``expected`` is derived
  from the row total, the column total and the grand total.
  """
  total = 0
  row_totals = {}
  col_totals = {}
  # First pass: grand total and the marginal totals of rows and columns.
  for row_key, row in t.items():
    for col_key, count in row.items():
      total += count
      row_totals[row_key] = row_totals.get(row_key, 0) + count
      col_totals[col_key] = col_totals.get(col_key, 0) + count

  # Second pass: accumulate each cell's contribution to the statistic.
  statistic = 0
  for row_key, row in t.items():
    for col_key, observed in row.items():
      # Expected count under independence: n * P(row) * P(column).
      expected = total * (row_totals[row_key] / total) * (col_totals[col_key] / total)
      statistic += ((observed - expected) ** 2) / expected
  return statistic

# Chi-square statistic of the Survived x PClass table built above.
q = calc_Q(t)
q

172.29911827411786

In [34]:
import numpy as np

def flatten_table(t):
  """Convert a nested-dict contingency table into a 2-D float array.

  Rows follow the iteration order of ``t`` and columns follow the iteration
  order of each inner dict. The number of columns is taken from the first
  inner dict.
  """
  first_row = t[next(iter(t))]
  d = np.zeros((len(t), len(first_row)))
  for i, row in enumerate(t.values()):
    for j, nij in enumerate(row.values()):
      d[i, j] = nij
  return d

# Same table as a plain 2-D array of counts (this rebinds the name d).
d = flatten_table(t)
d

array([[193., 119., 138.],
       [129., 161., 573.]])

In [37]:
from scipy.stats import chi2_contingency

# Cross-check: run SciPy's chi-square contingency test on the same count matrix.
chi2_contingency(d)

(172.29911827411786,
 3.852315502424536e-38,
 2,
 array([[110.35795887,  95.9634425 , 243.67859863],
        [211.64204113, 184.0365575 , 467.32140137]]))

In [56]:
import scipy.stats

def chi2_test(s1, s2, alpha):
  """Print the chi-square statistic of two Series and its critical value.

  Builds the contingency table of ``s1`` against ``s2``, prints the statistic
  ``Q`` and then the chi-square critical value at upper-tail probability
  ``alpha`` with ``(rows - 1) * (columns - 1)`` degrees of freedom.
  Nothing is returned.
  """
  table = calc_table(s1, s2)
  rows, cols = flatten_table(table).shape
  statistic = calc_Q(table)
  dof = (rows - 1) * (cols - 1)
  print(f"Q = {statistic}")
  critical = scipy.stats.chi2.isf(alpha, dof)
  print(f"chi2 = {critical}")

# PClass against Survived with alpha = 0.05.
chi2_test(df["PClass"], df["Survived"], 0.05)

Q = 172.2991182741179
chi2 = 5.991464547107983


In [57]:
# Sex against Survived with alpha = 0.05.
chi2_test(df["Sex"], df["Survived"], 0.05)

Q = 332.05702407732946
chi2 = 3.8414588206941285


In [60]:
# Sex against PClass with alpha = 0.05.
chi2_test(df["Sex"], df["PClass"], 0.05)

Q = 22.125247699194297
chi2 = 5.991464547107983


In [170]:
# Small sample used for the bootstrap cells below (rebinds d once more).
d = np.array([2, 2, 3, 3, 5])
d

array([2, 2, 3, 3, 5])

In [171]:
# n is the sample size; n ** n is the number of ordered length-n sequences that
# can be drawn with replacement from n positions.
n = d.shape[0]
rmax = n ** n
print(f"rmax = {rmax}")

rmax = 3125


In [172]:
def bootstrap_sample(d, r, rng=None):
  """Draw ``r`` bootstrap resamples (sampling with replacement) from ``d``.

  Parameters
  ----------
  d : 1-D numeric array-like
      The observed sample; its length ``n`` is also the size of each resample.
  r : int
      Number of resamples to draw.
  rng : numpy.random.Generator, optional
      Source of randomness. When omitted, NumPy's global generator
      (``np.random.randint``) is used, so ``np.random.seed`` still controls
      the result.

  Returns
  -------
  numpy.ndarray
      Float array of shape ``(r, n)``; each row is one resample.
  """
  d = np.asarray(d)
  n = d.shape[0]
  if rng is None:
    indices = np.random.randint(0, n, (r, n))
  else:
    indices = rng.integers(0, n, size=(r, n))
  # Fancy indexing picks every resampled element at once, replacing the
  # element-by-element Python loop; the cast keeps the float result dtype.
  return d[indices].astype(np.float64)
# Seed NumPy's global generator so the resamples, and every number derived
# from them below, are reproducible across runs. The value itself is arbitrary.
np.random.seed(42)
r = 1000  # number of bootstrap resamples
b = bootstrap_sample(d, r)
b

array([[2., 2., 2., 2., 3.],
       [5., 3., 3., 3., 5.],
       [5., 5., 5., 3., 5.],
       ...,
       [2., 3., 3., 2., 2.],
       [3., 5., 5., 3., 3.],
       [3., 5., 3., 2., 5.]])

In [173]:
# Median of each resample (one value per row of b).
m = np.median(b, axis=1)
m.shape

(1000,)

In [174]:
# Distinct values that occur among the resample medians.
values = np.unique(m)
list(values)

[2.0, 3.0, 5.0]

In [175]:
# Relative frequency of each distinct median among the r resamples.
rep = {v:(m == v).sum()/r for v in list(values)}
rep

{2.0: 0.336, 3.0: 0.614, 5.0: 0.05}

In [176]:
# The same relative frequencies as an array, in the order of `values`.
rep_flatten = np.array([(m == v).sum()/r for v in list(values)])
rep_flatten

array([0.336, 0.614, 0.05 ])

In [177]:
# Frequency-weighted mean of the resample medians: sum of value * frequency.
median_est = 0
for k, v in rep.items():
  median_est += k * v
median_est

2.7640000000000002

In [178]:
# Frequency-weighted mean of the squared medians: sum of value**2 * frequency.
median2_est = 0
for k, v in rep.items():
  median2_est += k * k * v
median2_est

8.120000000000001

In [179]:
# Standard deviation of the resample medians via sqrt(E[X^2] - (E[X])^2).
median_s = np.sqrt(median2_est - median_est ** 2)
median_s

0.6930396814035972

In [182]:
# Symmetric interval around median_est built from a Student-t quantile.
# t.isf(0.05, n-1) is the point with 5% of the t distribution (n-1 degrees of
# freedom) above it, so the two-sided interval below leaves 5% in each tail.
# NOTE(review): that makes this a 90% interval; use 0.025 if 95% was intended.
# NOTE(review): median_s is already the spread of the resample medians; dividing
# it by sqrt(n) again may understate the margin -- confirm the intended formula.
tt = scipy.stats.t.isf(0.05, n-1)
margin = tt * median_s / np.sqrt(n)
print(f"[{median_est - margin}, {median_est + margin}]")

[2.1032622992657988, 3.4247377007342017]


In [183]:
# Show the t quantile used for the interval above.
tt

2.1318467813362907