# Setup 

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency

# independence test implementation 
def find_value_smaller_than_5(table):
    """Locate the first cell of ``table`` whose count is 5 or less.

    The table is scanned row by row, left to right; the width of the scan is
    the length of the first row.

    Returns:
        ``(True, column_index)`` for the first such cell, or ``(False, 0)``
        when every cell is greater than 5 (or the table is empty).
    """
    for row in table:
        for col in range(len(table[0])):
            if row[col] <= 5:
                return True, col
    return False, 0

def collapse_index(table):
    """Merge sparse columns of a contingency table into a neighbouring column.

    While ``find_value_smaller_than_5`` reports a cell, the column it points
    at is removed from every row and its counts are added to the column that
    followed it, or to the preceding column when it was the last one.

    A table that is down to a single column cannot be collapsed any further
    and is returned as it is (this case used to raise ``IndexError``).

    Args:
        table: list of rows, each a list of counts of the same length.

    Returns:
        The collapsed table. Rows of the input are copied before being
        changed, so the caller's lists are never modified in place.
    """
    exist, collapse_ind = find_value_smaller_than_5(table)
    while exist and len(table[0]) > 1:
        # After the pop below, position ``collapse_ind`` holds the former
        # right-hand neighbour. The last column has no right-hand neighbour,
        # so its counts go to the column on its left instead.
        if collapse_ind < len(table[0]) - 1:
            move_to = collapse_ind
        else:
            move_to = collapse_ind - 1

        new_table = []
        for row in table:
            new_row = row[:]
            val = new_row.pop(collapse_ind)
            new_row[move_to] += val
            new_table.append(new_row)
        table = new_table
        exist, collapse_ind = find_value_smaller_than_5(table)

    return table

def chi_square_analysis(param, treatment):
    """Chi-square test of independence between a discrete property and a treatment.

    Rows in which either value is missing are dropped. A contingency table
    with one row per treatment level and one column per property value is
    built, sparse columns are merged by ``collapse_index``, and the result is
    passed to ``scipy.stats.chi2_contingency``.

    Args:
        param: pandas Series with the discrete property.
        treatment: pandas Series with the treatment indicator, sharing its
            index with ``param``.

    Returns:
        The first two items of the ``chi2_contingency`` result
        (test statistic and p-value).
    """
    frame = pd.DataFrame({'treatment': treatment, 'property': param}).dropna()

    # Take the levels from the rows that survive dropna() and compare against
    # the actual values rather than positional indices, so codings other than
    # 0/1 are counted correctly and no all-zero row can appear. Sorting keeps
    # the row order of the 0/1 (or False/True) case unchanged.
    treatment_levels = sorted(frame["treatment"].unique())
    property_values = frame["property"].unique()

    treatment_options = [
        [((frame["property"] == val) & (frame["treatment"] == level)).sum()
         for val in property_values]
        for level in treatment_levels
    ]

    treatment_options = collapse_index(treatment_options)

    obs = np.array(treatment_options)
    return chi2_contingency(obs)[:2]

## Load Country Data
To see the results for a specific country or case, uncomment the desired block below and go to Kernel --> Restart and run all.

In [2]:
# Read the PISA 2012 data file.
df = pd.read_csv("PISA2012_data.csv")


# Case 1: Canada's student population, treatment is defined as having any pre-primary education

# country = df[(df["CNT"]=="CAN")].copy()
# country["Treated"] = country["ST05Q01"] >1

# Case 2: Canada's students who attained pre-primary education to some degree, 
# treatment is defined as attaining pre-primary education for more than a year

country = df[df["CNT"].eq("CAN") & df["ST05Q01"].gt(1)].copy()
country["Treated"] = country["ST05Q01"].eq(3)

# Gender 
The variable ST04Q01 represents student gender. The value 1 represents a female while 2 represents a male. 

In [3]:
def gender_dependency_test(country, treatment):
    """Chi-square test of the treatment against student gender (``ST04Q01``).

    Prints the test statistic and p-value, then returns one row per gender
    (1 shown as "Female", 2 as "Male") with the sum and the mean of the
    ``treatment`` column.
    """
    statistic, p_value = chi_square_analysis(country["ST04Q01"], country[treatment])
    print("Chi square: {}, p-value:{}".format(statistic, p_value))

    gender_labels = {1: "Female", 2: "Male"}
    column_labels = {
        'sum': '{} No.'.format(treatment),
        'mean': '{} share'.format(treatment),
        'ST04Q01': 'gender',
    }
    summary = country.groupby(["ST04Q01"])[treatment].agg(['sum', 'mean'])
    summary = summary.rename(index=gender_labels)
    return summary.reset_index().rename(columns=column_labels)

In [4]:
# Gender (ST04Q01) vs. the "Treated" flag for the selected sample.
gender_dependency_test(country, "Treated")

Chi square: 9.50073810133194, p-value:0.002053892601660252


Unnamed: 0,gender,Treated No.,Treated share
0,Female,2113,0.491967
1,Male,1798,0.457739


# Year of birth 
This covariate is irrelevant for the Canada sample, as all students in this sample were born in 1996.

In [5]:
def birth_year_dependency_test(country, treatment):
    """Chi-square test of the treatment against year of birth (``ST03Q02``).

    Prints the test statistic and p-value, then returns one row per birth
    year with the sum and the mean of the ``treatment`` column.
    """
    statistic, p_value = chi_square_analysis(country["ST03Q02"], country[treatment])
    print("Chi square: {}, p-value:{}".format(statistic, p_value))

    per_year = country.groupby(["ST03Q02"])[treatment].agg(['sum', 'mean']).reset_index()
    column_labels = {
        'sum': '{} No.'.format(treatment),
        'mean': '{} share'.format(treatment),
        'ST03Q02': 'Year of birth',
    }
    return per_year.rename(columns=column_labels)

In [6]:
# Year of birth (ST03Q02) vs. the "Treated" flag for the selected sample.
birth_year_dependency_test(country, "Treated")

Chi square: 0.0, p-value:1.0


Unnamed: 0,Year of birth,Treated No.,Treated share
0,1996,3911,0.475617


# Month of birth

In [7]:
def birth_month_dependency_test(country, treatment):
    """Chi-square test of the treatment against month of birth (``ST03Q01``).

    Prints the test statistic and p-value, then returns one row per birth
    month with the sum and the mean of the ``treatment`` column.
    """
    statistic, p_value = chi_square_analysis(country["ST03Q01"], country[treatment])
    print("Chi square: {}, p-value:{}".format(statistic, p_value))

    column_labels = {
        'sum': '{} No.'.format(treatment),
        'mean': '{} share'.format(treatment),
        'ST03Q01': 'Month of birth',
    }
    per_month = country.groupby(["ST03Q01"])[treatment].agg(['sum', 'mean'])
    return per_month.reset_index().rename(columns=column_labels)

In [8]:
# Month of birth (ST03Q01) vs. the "Treated" flag for the selected sample.
birth_month_dependency_test(country, 'Treated')

Chi square: 17.380716301973308, p-value:0.09710766349930443


Unnamed: 0,Month of birth,Treated No.,Treated share
0,1,317,0.454155
1,2,327,0.496206
2,3,328,0.453665
3,4,378,0.514986
4,5,402,0.501873
5,6,332,0.456671
6,7,328,0.445652
7,8,372,0.497992
8,9,291,0.454688
9,10,293,0.479542


# Immigration Status

In [9]:
def immigration_dependency_test(country, treatment):
    """Chi-square test of the treatment against immigration status (``IMMIG``).

    Prints the test statistic and p-value, then returns one row per status
    (1 "Native", 2 "Second Generation", 3 "First Generation") with the sum
    and the mean of the ``treatment`` column.
    """
    statistic, p_value = chi_square_analysis(country['IMMIG'], country[treatment])
    print("Chi square: {}, p-value:{}".format(statistic, p_value))

    status_labels = {
        1: "Native",
        2: "Second Generation",
        3: "First Generation",
    }
    column_labels = {
        'sum': '{} No.'.format(treatment),
        'mean': '{} share'.format(treatment),
    }
    per_status = country.groupby(["IMMIG"])[treatment].agg(['sum', 'mean'])
    per_status = per_status.rename(index=status_labels)
    return per_status.reset_index().rename(columns=column_labels)

In [10]:
# Immigration status (IMMIG) vs. the "Treated" flag for the selected sample.
immigration_dependency_test(country, "Treated")

Chi square: 45.94446856296111, p-value:1.0550800580535493e-10


Unnamed: 0,IMMIG,Treated No.,Treated share
0,Native,3150,0.461674
1,Second Generation,384,0.497409
2,First Generation,377,0.600318


# Students International Grade

In [11]:
def grade_dependency_test(country, treatment):
    """Chi-square test of the treatment against the student's grade (``ST01Q01``).

    Prints the test statistic and p-value, then returns one row per grade
    with the sum and the mean of the ``treatment`` column.
    """
    statistic, p_value = chi_square_analysis(country['ST01Q01'], country[treatment])
    print("Chi square: {}, p-value:{}".format(statistic, p_value))

    per_grade = country.groupby(['ST01Q01'])[treatment].agg(['sum', 'mean']).reset_index()
    column_labels = {
        'sum': '{} No.'.format(treatment),
        'mean': '{} share'.format(treatment),
        'ST01Q01': "Grade",
    }
    return per_grade.rename(columns=column_labels)

In [12]:
# Grade (ST01Q01) vs. the "Treated" flag for the selected sample.
grade_dependency_test(country, "Treated")

Chi square: 10.019270634523057, p-value:0.018403036258215953


Unnamed: 0,Grade,Treated No.,Treated share
0,7.0,1,0.5
1,8.0,9,0.333333
2,9.0,417,0.465402
3,10.0,3447,0.479349
4,11.0,37,0.349057
5,12.0,0,0.0


# Mother Education (ISCED level)

In [13]:
def education_independence_test(country, covariate, treatment):
    """Chi-square test of the treatment against a parental education column.

    ``covariate`` names the column to test. Prints the test statistic and
    p-value, then returns one row per education code (0-6, shown with their
    ISCED labels) with the sum and the mean of the ``treatment`` column.
    """
    statistic, p_value = chi_square_analysis(country[covariate], country[treatment])
    print("Chi square: {}, p-value:{}".format(statistic, p_value))

    isced_labels = {
        0: 'None',
        1: 'ISCED 1',
        2: 'ISCED 2',
        3: 'ISCED 3B,C',
        4: 'ISCED 3A, ISCED 4',
        5: 'ISCED 5B',
        6: 'ISCED 5A, 6',
    }
    column_labels = {
        'sum': '{} No.'.format(treatment),
        'mean': '{} share'.format(treatment),
    }
    per_level = country.groupby([covariate])[treatment].agg(['sum', 'mean'])
    per_level = per_level.rename(index=isced_labels)
    return per_level.reset_index().rename(columns=column_labels)

In [14]:
# Education level in the 'misced' column vs. the "Treated" flag.
education_independence_test(country, 'misced', "Treated")

Chi square: 111.9859869903522, p-value:1.5584675127712518e-22


Unnamed: 0,misced,Treated No.,Treated share
0,,10,0.434783
1,ISCED 1,14,0.341463
2,ISCED 2,104,0.409449
3,"ISCED 3A, ISCED 4",1015,0.415813
4,ISCED 5B,887,0.444612
5,"ISCED 5A, 6",1881,0.542231


# Father Education (ISCED level)

In [15]:
# Education level in the 'fisced' column vs. the "Treated" flag.
education_independence_test(country, 'fisced', "Treated")

Chi square: 126.15164904168567, p-value:1.5594161881370471e-25


Unnamed: 0,fisced,Treated No.,Treated share
0,,12,0.428571
1,ISCED 1,39,0.453488
2,ISCED 2,209,0.374552
3,"ISCED 3A, ISCED 4",1258,0.423854
4,ISCED 5B,722,0.464607
5,"ISCED 5A, 6",1671,0.551667


# Language used at home
This measure is based on the question "What language do you speak at home most of the time?". Possible values:
1. Language of test
2. Other language.

In [16]:
def language_independence_test(country, treatment):
    """Chi-square test of the treatment against the language used at home (``ST25Q01``).

    Prints the test statistic and p-value, then returns one row per answer
    (1 "Language of test", 2 "other language") with the sum and the mean of
    the ``treatment`` column.
    """
    statistic, p_value = chi_square_analysis(country['ST25Q01'], country[treatment])
    print("Chi square: {}, p-value:{}".format(statistic, p_value))

    answer_labels = {1: "Language of test", 2: "other language"}
    column_labels = {
        'sum': '{} No.'.format(treatment),
        'mean': '{} share'.format(treatment),
        'ST25Q01': 'Language used at home',
    }
    per_language = country.groupby(['ST25Q01'])[treatment].agg(['sum', 'mean'])
    per_language = per_language.rename(index=answer_labels)
    return per_language.reset_index().rename(columns=column_labels)

# Language used at home (ST25Q01) vs. the "Treated" flag for the selected sample.
language_independence_test(country, 'Treated')

Chi square: 35.882262795988595, p-value:2.0960823233585424e-09


Unnamed: 0,Language used at home,Treated No.,Treated share
0,Language of test,3213,0.461505
1,other language,698,0.553529


# Socio-economic index (ESCS)

In [17]:
def escs_independence_test(country, treatment):
    """Chi-square test of the treatment against quartiles of the ``ESCS`` index.

    ``ESCS`` is cut into four quantile bins labelled Q1-Q4. The bins are kept
    in a local Series named ``ESCS_quartiles`` instead of being written into
    ``country``, so the caller's DataFrame is left untouched (matching the
    other quartile-based tests in this file).

    Prints the test statistic and p-value, then returns one row per quartile
    with the sum and the mean of the ``treatment`` column.
    """
    quartiles = pd.qcut(country["ESCS"], 4, labels=['Q1', 'Q2', 'Q3', 'Q4'])
    # The Series name becomes the name of the first column of the result.
    quartiles = quartiles.rename("ESCS_quartiles")

    chi_square, p_value = chi_square_analysis(quartiles, country[treatment])
    print("Chi square: {}, p-value:{}".format(chi_square, p_value))

    escs = country.groupby([quartiles])[treatment].agg(['sum', 'mean'])
    return escs.reset_index().rename(columns={'sum': '{} No.'.format(treatment),
                                              'mean': '{} share'.format(treatment)})

# ESCS quartiles vs. the "Treated" flag for the selected sample.
escs_independence_test(country, 'Treated')

Chi square: 200.14222479711998, p-value:3.930351630203476e-43


Unnamed: 0,ESCS_quartiles,Treated No.,Treated share
0,Q1,775,0.376031
1,Q2,928,0.446798
2,Q3,1027,0.490918
3,Q4,1181,0.592574


# Primary start age

In [18]:
def primary_start_independence_test(country, treatment):
    """Chi-square test of the treatment against primary school start age (``ST06Q01``).

    Prints the test statistic and p-value, then returns one row per start age
    with the sum and the mean of the ``treatment`` column.
    """
    statistic, p_value = chi_square_analysis(country['ST06Q01'], country[treatment])
    print("Chi square: {}, p-value:{}".format(statistic, p_value))

    column_labels = {
        'sum': '{} No.'.format(treatment),
        'mean': '{} share'.format(treatment),
        'ST06Q01': 'Primary start age',
    }
    per_age = country.groupby(['ST06Q01'])[treatment].agg(['sum', 'mean'])
    return per_age.reset_index().rename(columns=column_labels)
    

In [19]:
# Primary start age (ST06Q01) vs. the "Treated" flag for the selected sample.
primary_start_independence_test(country, "Treated")

Chi square: 83.39190118819967, p-value:7.106574132109099e-16


Unnamed: 0,Primary start age,Treated No.,Treated share
0,4.0,735,0.574668
1,5.0,2009,0.436644
2,6.0,1034,0.496877
3,7.0,97,0.5
4,8.0,12,0.545455
5,9.0,8,0.615385
6,10.0,6,0.428571
7,11.0,0,0.0
8,12.0,6,0.666667
9,13.0,3,0.5


## Occupation Classification - Mother
Occupation is classified by the International Standard Classification of Occupations (https://www.ilo.org/public/english/bureau/stat/isco/). <br> We divided the occupation codes into the following categories:
1. Armed forces occupations & managers
2. Professionals
3. Technicians and associate professionals
4. Clerical support workers
5. Service and sales workers
6. Skilled agricultural, forestry and fishery workers
7. Crafts and related trades workers
8. Plant and machine operators, and assemblers
9. Elementary occupations
10. Housewife and social beneficiary 

In [20]:
def occupation_dependency_test(country, covariate, treatment):
    """Chi-square test of the treatment against a parental occupation class.

    ``covariate`` names the column to test; its letter codes 'A'-'J' are
    shown with their occupation-group labels. Prints the test statistic and
    p-value, then returns one row per group with the sum and the mean of the
    ``treatment`` column. The group column is titled for the mother when
    ``covariate`` contains "1" (``OCOD1``) and for the father otherwise
    (``OCOD2``).
    """
    statistic, p_value = chi_square_analysis(country[covariate], country[treatment])
    print("Chi square: {}, p-value:{}".format(statistic, p_value))

    group_labels = {
        'A': 'Armed forces occupations & Managers',
        'B': 'Professionals',
        'C': 'Technicians and associate professionals',
        'D': 'Clerical support workers',
        'E': 'Service and sales workers',
        'F': 'Skilled agricultural, forestry and fishery workers',
        'G': 'Craft and related trades workers',
        'H': 'Plant and machine operators, and assemblers',
        'I': 'Elementary occupations',
        'J': 'Housewife and social beneficiary',
    }
    per_group = country.groupby([covariate])[treatment].agg(['sum', 'mean'])
    per_group = per_group.rename(index=group_labels).reset_index()

    column_labels = {
        'sum': '{} No.'.format(treatment),
        'mean': '{} share'.format(treatment),
    }
    if "1" in covariate:
        column_labels['OCOD1'] = 'Occupation Classification - Mother'
    else:
        column_labels['OCOD2'] = 'Occupation Classification - Father'
    return per_group.rename(columns=column_labels)

In [21]:
# Mother's occupation class (OCOD1) vs. the "Treated" flag.
occupation_dependency_test(country, 'OCOD1', 'Treated')

Chi square: 92.82077045119273, p-value:1.242459923451628e-16


Unnamed: 0,Occupation Classification - Mother,Treated No.,Treated share
0,Armed forces occupations & Managers,414,0.507975
1,Professionals,1395,0.530015
2,Technicians and associate professionals,619,0.4952
3,Clerical support workers,468,0.457031
4,Service and sales workers,671,0.411656
5,"Skilled agricultural, forestry and fishery wor...",27,0.36
6,Craft and related trades workers,61,0.376543
7,"Plant and machine operators, and assemblers",52,0.33121
8,Elementary occupations,204,0.426778


## Occupation Classification - Father
See details in the previous section. 

In [22]:
# Father's occupation class (OCOD2) vs. the "Treated" flag.
occupation_dependency_test(country, 'OCOD2', 'Treated')

Chi square: 98.69710999928591, p-value:7.879979940713415e-18


Unnamed: 0,Occupation Classification - Father,Treated No.,Treated share
0,Armed forces occupations & Managers,623,0.502419
1,Professionals,956,0.563347
2,Technicians and associate professionals,467,0.484943
3,Clerical support workers,89,0.491713
4,Service and sales workers,281,0.462932
5,"Skilled agricultural, forestry and fishery wor...",202,0.444934
6,Craft and related trades workers,713,0.421893
7,"Plant and machine operators, and assemblers",368,0.429405
8,Elementary occupations,212,0.397004


## ISEI of mother (International Socio-Economic Index of Occupational Status)  

In [23]:
def parent_isei_dependency_test(country, covariate, treatment):
    """Chi-square test of the treatment against quartiles of a parental ISEI column.

    ``covariate`` names the column, which is cut into four quantile bins
    labelled Q1-Q4. Prints the test statistic and p-value, then returns one
    row per quartile with the sum and the mean of the ``treatment`` column.
    The quartile column is titled for the mother when ``covariate`` contains
    '1' (``BMMJ1``) and for the father otherwise (``BFMJ2``).
    """
    bins = pd.qcut(country[covariate], 4, labels=['Q1','Q2','Q3', 'Q4'])
    statistic, p_value = chi_square_analysis(bins, country[treatment])
    print("Chi square: {}, p-value:{}".format(statistic, p_value))

    per_quartile = country.groupby([bins])[treatment].agg(['sum', 'mean']).reset_index()

    column_labels = {
        'sum': '{} No.'.format(treatment),
        'mean': '{} share'.format(treatment),
    }
    if '1' in covariate:
        column_labels['BMMJ1'] = 'ISEI - Mother'
    else:
        column_labels['BFMJ2'] = 'ISEI - Father'
    return per_quartile.rename(columns=column_labels)

In [24]:
# Treatment 1 Analysis
# Quartiles of the mother's ISEI (BMMJ1) vs. the "Treated" flag.
parent_isei_dependency_test(country, 'BMMJ1', 'Treated')

Chi square: 107.34801420766314, p-value:4.0832179315236263e-23


Unnamed: 0,ISEI - Mother,Treated No.,Treated share
0,Q1,864,0.401114
1,Q2,901,0.451403
2,Q3,1008,0.499752
3,Q4,1138,0.553502


# ISEI of Father (International Socio-Economic Index of Occupational Status) 

In [25]:
# Treatment 1 Analysis
# Quartiles of the father's ISEI (BFMJ2) vs. the "Treated" flag.
parent_isei_dependency_test(country, 'BFMJ2', 'Treated')

Chi square: 101.03142934709565, p-value:9.326285632747558e-22


Unnamed: 0,ISEI - Father,Treated No.,Treated share
0,Q1,936,0.415631
1,Q2,819,0.440323
2,Q3,1012,0.489835
3,Q4,1144,0.559413


# Wealth Index

In [26]:
def wealth_index_dependency_test(country, treatment):
    """Chi-square test of the treatment against quartiles of the ``WEALTH`` index.

    ``WEALTH`` is cut into four quantile bins labelled Q1-Q4. Prints the test
    statistic and p-value, then returns one row per quartile with the sum and
    the mean of the ``treatment`` column.
    """
    bins = pd.qcut(country["WEALTH"], 4, labels=['Q1','Q2','Q3', 'Q4'])
    statistic, p_value = chi_square_analysis(bins, country[treatment])
    print("Chi square: {}, p-value:{}".format(statistic, p_value))

    column_labels = {
        'sum': '{} No.'.format(treatment),
        'mean': '{} share'.format(treatment),
        'WEALTH': 'Wealth Index',
    }
    per_quartile = country.groupby([bins])[treatment].agg(['sum', 'mean'])
    return per_quartile.reset_index().rename(columns=column_labels)

In [27]:
# WEALTH quartiles vs. the "Treated" flag for the selected sample.
wealth_index_dependency_test(country, "Treated")

Chi square: 36.587310736091396, p-value:5.6258674472295814e-08


Unnamed: 0,Wealth Index,Treated No.,Treated share
0,Q1,941,0.432445
1,Q2,1016,0.464989
2,Q3,1276,0.493235
3,Q4,678,0.531765


# Family Structure Indicators
PISA2012 asked students whether they have a mother, father, brothers and sisters at home. 

In [28]:
def family_structure_dependency_test(country, covariate, treatment):
    """Chi-square test of the treatment against an "at home" family indicator.

    ``covariate`` names a column coded 1 (shown as "Yes") / 2 (shown as "No").
    Prints the test statistic and p-value, then returns one row per answer
    with the sum and the mean of the ``treatment`` column.

    The indicator column is given a readable title for ``ST11Q01`` (mother),
    ``ST11Q02`` (father), ``ST11Q03`` (brothers), ``ST11Q04`` (sisters) and
    the derived ``siblings`` column; any other column keeps its own name.
    """
    chi_square, p_value = chi_square_analysis(country[covariate], country[treatment])
    print("Chi square: {}, p-value:{}".format(chi_square, p_value))

    family = country.groupby([covariate])[treatment].agg(['sum', 'mean'])
    family = family.rename(index={1: 'Yes', 2: 'No'}).reset_index()

    # Only the column named after ``covariate`` can match one of the indicator
    # keys, so a single mapping covers every case. The 'siblings' entry used
    # to be keyed on 'ST11Q04' by mistake and therefore never applied.
    return family.rename(columns={
        'sum': '{} No.'.format(treatment),
        'mean': '{} share'.format(treatment),
        'ST11Q01': 'At home - mother',
        'ST11Q02': 'At home - father',
        'ST11Q03': 'At home - brothers',
        'ST11Q04': 'At home - sisters',
        'siblings': 'At home - siblings',
    })

In [29]:
# Mother at home (ST11Q01) vs. the "Treated" flag.
family_structure_dependency_test(country, 'ST11Q01', 'Treated')

Chi square: 4.902816336299917, p-value:0.026812932522397553


Unnamed: 0,At home - mother,Treated No.,Treated share
0,Yes,3827,0.477659
1,No,84,0.398104


In [30]:
# Father at home (ST11Q02) vs. the "Treated" flag.
family_structure_dependency_test(country, 'ST11Q02', 'Treated')

Chi square: 0.010636631825355486, p-value:0.9178565828206378


Unnamed: 0,At home - father,Treated No.,Treated share
0,Yes,3521,0.475361
1,No,390,0.477941


In [34]:
# Derived indicator: 1 when ST11Q03 or ST11Q04 equals 1, otherwise 2
# (the same 1/2 coding the family-structure test labels as Yes/No).
country['siblings'] = (
    (country['ST11Q03'].eq(1) | country['ST11Q04'].eq(1))
    .astype(int)
    .replace(0, 2)
)
family_structure_dependency_test(country, 'siblings', 'Treated')

Chi square: 1.019442137545017, p-value:0.3126513722365508


Unnamed: 0,siblings,Treated No.,Treated share
0,Yes,3300,0.478122
1,No,611,0.462528
