In [116]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from matplotlib import cm
from sklearn.feature_selection import chi2
from scipy import stats

# Chi -squared test
Pearson's chi-squared test is used to determine whether there is a
statistically significant difference between the expected frequencies and
the observed frequencies in one or more categories of a contingency table.

$ {\displaystyle \chi ^{2}=\sum _{i=1}^{n}{\frac {(O_{i}-E_{i})^{2}}{E_{i}}}=N\sum _{i=1}^{n}{\frac {\left(O_{i}/N-p_{i}\right)^{2}}{p_{i}}}}$

$\chi ^{2} $ - Pearson's cumulative test statistic, which asymptotically
approaches a $ \chi ^{2} $ distribution.
${\displaystyle O_{i}}O_{i} $= the number of observations of type i.

$ N $= total number of observations

${\displaystyle E_{i}=Np_{i}}E_{i} $= the expected (theoretical) count
of type i, asserted by the null hypothesis that the fraction of type i in the
 population is ${\displaystyle p_{i}}p_{i}$

$ n $ the number of cells in the table.

Degrees of freedom:
${\displaystyle ({\text{number of rows}}-1)({\text{number of columns}}-1)}$

# Expected:
$E_{ij} = {\frac{O_{i.}O_{.j}} {O_{..}}}$
where:
* Oi. - marginal total of row i
* O.j - marginal total of column j
* O.. - grand total (sum of totals of rows or sum of totals of columns - the
same)

# 1. Prepare data

In [117]:
df = pd.read_csv('../Data/test_Feb22.csv', index_col=0)
train = df[df['target'].isna() == False].copy()
train.shape

(2019, 12)

# 1. Chi-squared
* Generate a contingency table

In [118]:
def contingency_table(train, feature):
    contingency = train[[feature, 'target']].pivot_table(aggfunc='size',
       index='target', columns=[feature])
    return contingency

### Checking p-values

In [119]:
features = ['feature1', 'feature2']
for feature in features:
    # Create contingency table
    cont = contingency_table(train, feature)

    # Check p-value
    chisq, pvalue, df, expected = \
        stats.chi2_contingency(cont, correction=False)
    print(chisq)
    print(f'P-value of the {feature}: {pvalue:.5f}')

292.25294990134813
P-value of the feature1: 0.00000
3.3029644986317175
P-value of the feature2: 0.06915


### Sklearn has a different implementation of chi-2, but the same result:

In [120]:
from sklearn.feature_selection import chi2
chi2_values, p_values = chi2(train[features], train['target'])
for i, feature in enumerate(features):
    print(f'P-value of the {feature}: {p_values[i]:.5f}')

P-value of the feature1: 0.00000
P-value of the feature2: 0.19469


## Insight:
* Feature 1 - low p-value, good feature.
* Feature 2 - p-value is over 0.05 standard, may be removed.

Check chi-2 by hand
1. Generate contingency table
2. Find marginal totals
3. Find expected values
4. Find chi2 (observed-expected)^2/expected

1. Generate contingency table

In [121]:
crosstab = pd.crosstab(columns=train['target'], index=train['feature1'])
crosstab

target,0.0,1.0
feature1,Unnamed: 1_level_1,Unnamed: 2_level_1
0,562,218
1,409,830


3. Find marginal totals

In [130]:
crosstab_marg = crosstab.copy()
crosstab_marg['MarginalTotals'] = pd.Series([crosstab[0].sum(), crosstab[1].sum()])
crosstab_marg.loc['MarginalTotals'] = \
    pd.Series([crosstab.loc[0].sum(), crosstab.loc[1].sum()])
crosstab_marg.loc['MarginalTotals', 'MarginalTotals'] = crosstab.sum().sum()
crosstab_marg

target,0.0,1.0,MarginalTotals
feature1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,562.0,218.0,971.0
1,409.0,830.0,1048.0
MarginalTotals,780.0,1239.0,2019.0


4. Find expected values

In [124]:
total_sum = crosstab.sum().sum()
t_0_f_0_ex = crosstab[0].sum() * crosstab.loc[0].sum() / total_sum
t_1_f_0_ex = crosstab[1].sum() * crosstab.loc[0].sum() / total_sum
t_0_f_1_ex = crosstab[0].sum() * crosstab.loc[1].sum() / total_sum
t_1_t_1_ex = crosstab[1].sum() * crosstab.loc[1].sum() / total_sum
expected_list = [t_0_f_0_ex, t_1_f_0_ex, t_0_f_1_ex, t_1_t_1_ex]
observed_list = [crosstab.loc[0, 0], crosstab.loc[0, 1], crosstab.loc[1, 0],
                 crosstab.loc[1, 1]]
print(t_0_f_0_ex, t_1_f_0_ex, t_0_f_1_ex, t_1_t_1_ex)

375.1263001485884 404.8736998514116 595.8736998514116 643.1263001485884


In [125]:
def chi2_my(observed, expected):
    return np.square(observed - expected) / expected

In [126]:
chi2_value = 0
for i in range(0,4):
    chi2_value += chi2_my(observed_list[i], expected_list[i])
chi2_value

292.25294990134813

In [127]:
from scipy import stats
print(f'P value: {stats.chi2.sf(chi2_value, df=1):.10f}')

P value: 0.0000000000
