## Poststratification weighing and sampling

### 1. Import libraries and dataset

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

In [3]:
train= pd.read_csv('Train_v2.csv')

In [4]:
# Creation of function to convert the age (continuous variable) to categorical variable.
def age_cat(age):
    if age >= 15 and age <=65:
        return "15-65"
    else:
        return "above 65"
train['age_cat'] = train['age_of_respondent'].apply(age_cat)

### 2. Loading in the actual popoulation

In [5]:
Kenya= {'Country': 'Kenya', 'year': '2018', 'Population': 51390000,'Male_percent': 49.684, 'Female_percent': 50.316, 'Median_Age': 19.1, 'Pop_den':90, 'Urban_pop': 26.8, '0-14':39.78, '15-64': 57.88, 'above_65': 2.34}
Rwanda= {'Country': 'Rwanda','year': '2016', 'Population': 11668827,'Male_percent': 49.117, 'Female_percent': 50.883, 'Median_Age': 19.6, 'Pop_den':525, 'Urban_pop': 17.4, '0-14':40.95, '15-64': 56.91, 'above_65': 2.77}
Uganda = {'Country': 'Uganda','year': '2018', 'Population': 42720000,'Male_percent': 49.23, 'Female_percent': 50.78, 'Median_Age': 16.1, 'Pop_den':117, 'Urban_pop': 24.6, '0-14':46.93, '15-64': 51.13, 'above_65': 1.94}
Tanzania = {'Country': 'Tanzania','year': '2017', 'Population': 54660000,'Male_percent': 49.936, 'Female_percent': 50.064, 'Median_Age': 17.6, 'Pop_den':62, 'Urban_pop': 34.4, '0-14':44.32, '15-64': 53.1, 'above_65': 2.58}

In [6]:
# creating dataframe for all values
Dem=pd.DataFrame([Kenya,Rwanda,Uganda,Tanzania])

In [7]:
#visualize
Dem

Unnamed: 0,Country,year,Population,Male_percent,Female_percent,Median_Age,Pop_den,Urban_pop,0-14,15-64,above_65
0,Kenya,2018,51390000,49.684,50.316,19.1,90,26.8,39.78,57.88,2.34
1,Rwanda,2016,11668827,49.117,50.883,19.6,525,17.4,40.95,56.91,2.77
2,Uganda,2018,42720000,49.23,50.78,16.1,117,24.6,46.93,51.13,1.94
3,Tanzania,2017,54660000,49.936,50.064,17.6,62,34.4,44.32,53.1,2.58


In [8]:
# converting from percentage to actual numbers
Dem['Acc_owner']= [0.81686*51390000, 0.5002*11668827, 0.68516*42720000, 0.46752*54660000]
Dem['Male_percent']= Dem['Population']*(Dem['Male_percent']/100)
Dem['Female_percent']= Dem['Population']*(Dem['Female_percent']/100)
Dem['0-14']= Dem['Population']*(Dem['0-14']/100)
Dem['15-64']= Dem['Population']*(Dem['15-64']/100)
Dem['above_65']= Dem['Population']*(Dem['above_65']/100)
Dem['Adult_Population']=Dem['0-14'] + Dem['15-64']

## Poststratification Weighing

In this section, we can weigh each datapoint based on their expected porportion and train a classifier based on these weights

In [9]:
W=[]
# Creation of list to assign weights for all data point
for i in range (len(train)):
    if train.loc[i].country=='Rwanda':
        if train.loc[i].gender_of_respondent=='Female':
            if train.loc[i].bank_account=='Yes':
                P=0.45
                W.append(P)
            else: 
                P= 1-0.45
                W.append(P)
        else:
            if train.loc[i].bank_account=='Yes':
                P=0.557
                W.append(P)
            else: 
                P= 1-0.557
                W.append(P)
    elif train.loc[i].country=='Uganda':
        if train.loc[i].gender_of_respondent=='Female':
            if train.loc[i].bank_account=='Yes':
                P=0.527
                W.append(P)
            else: 
                P= 1-0.527
                W.append(P)
        else:
            if train.loc[i].bank_account=='Yes':
                P=0.661
                W.append(P)
            else: 
                P= 1-0.661
                W.append(P)
    elif train.loc[i].country=='Tanzania':
        if train.loc[i].gender_of_respondent=='Female':
            if train.loc[i].bank_account=='Yes':
                P=0.42
                W.append(P)
            else: 
                P= 1-0.42
                W.append(P)
        else:
            if train.loc[i].bank_account=='Yes':
                P=0.5155
                W.append(P)
            else: 
                P= 1-0.5155
                W.append(P)
    elif train.loc[i].country=='Kenya':
        if train.loc[i].gender_of_respondent=='Female':
            if train.loc[i].bank_account=='Yes':
                P=0.777
                W.append(P)
            else: 
                P= 1-0.777
                W.append(P)
        else:
            if train.loc[i].bank_account=='Yes':
                P=0.858
                W.append(P)
            else: 
                P= 1-0.661
                W.append(P)

In [10]:
train['Postweight']=W

In [11]:
train

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type,age_cat,Postweight
0,Kenya,2018,uniqueid_1,Yes,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed,15-65,0.777
1,Kenya,2018,uniqueid_2,No,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent,above 65,0.223
2,Kenya,2018,uniqueid_3,Yes,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed,15-65,0.858
3,Kenya,2018,uniqueid_4,No,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private,15-65,0.223
4,Kenya,2018,uniqueid_5,No,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed,15-65,0.339
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23519,Uganda,2018,uniqueid_2113,No,Rural,Yes,4,48,Female,Head of Household,Divorced/Seperated,No formal education,Other Income,15-65,0.473
23520,Uganda,2018,uniqueid_2114,No,Rural,Yes,2,27,Female,Head of Household,Single/Never Married,Secondary education,Other Income,15-65,0.473
23521,Uganda,2018,uniqueid_2115,No,Rural,Yes,5,27,Female,Parent,Widowed,Primary education,Other Income,15-65,0.473
23522,Uganda,2018,uniqueid_2116,No,Urban,Yes,7,30,Female,Parent,Divorced/Seperated,Secondary education,Self employed,15-65,0.473


## Poststratification Sampling

### 3. Assigning weights to obtain sample frequency

In [12]:
# Gathering all details
PK={'Name':'Kenya','Exp':0.32,'Obs': 0.258,'Freq':6068}
PR={'Name':'Rwanda','Exp':0.073,'Obs': 0.371,'Freq':8735}
PU={'Name':'Uganda','Exp':0.267,'Obs': 0.089,'Freq':2101}
PT={'Name':'Tanzania','Exp':0.34,'Obs': 0.281,'Freq':6620}

In [13]:
frames=[PK,PR,PU,PT]

In [14]:
Table=pd.DataFrame(frames)

Calculating weight and expected frequency 

In [15]:
Table['Weight']=Table['Exp']/Table['Obs']
Table['TFreq']=Table['Weight']*Table['Freq']
Table['AFreq']= round((Table['TFreq']/sum(Table['TFreq']))*len(train))

In [16]:
# visualize
Table

Unnamed: 0,Name,Exp,Obs,Freq,Weight,TFreq,AFreq
0,Kenya,0.32,0.258,6068,1.24031,7526.20155,7515.0
1,Rwanda,0.073,0.371,8735,0.196765,1718.746631,1716.0
2,Uganda,0.267,0.089,2101,3.0,6303.0,6294.0
3,Tanzania,0.34,0.281,6620,1.209964,8009.964413,7998.0


### obtaining maximum sample from data

In [17]:
#convertion of the columns to numpy array for array comparism
f=np.array(Table['AFreq'])
h=np.array(Table['Freq'])
#geting the ratio of expected samples
g=np.round(f/min(f))

In [30]:
### The function below extracts the maximum sample that can be obtained from any data without duplication
# and absense of representation bias
#Input: ratio of expected frequency (g), and frequency (h)
#Output: maximum sample size (g)
def get_max(g,h):
    for i in range(int(min(h))):
        for (a,b) in zip(g,h):
            if a>b:
                print ('Maximum Sample size that can be obtained is', sum(g), 'from', len(train))
                return g
                break

        g+=1
    print ('Maximum Sample size that can be obtained is', sum(g), 'from', len(train))
    return g

In [19]:
Table['SFreq']=get_max(g,h)

Maximum Sample size that can be obtained is 8406.0 from 23524


In [20]:
Table['new']=Table['SFreq']/sum(Table['SFreq'])

In [21]:
#visualize
Table

Unnamed: 0,Name,Exp,Obs,Freq,Weight,TFreq,AFreq,SFreq,new
0,Kenya,0.32,0.258,6068,1.24031,7526.20155,7515.0,2102.0,0.250059
1,Rwanda,0.073,0.371,8735,0.196765,1718.746631,1716.0,2099.0,0.249703
2,Uganda,0.267,0.089,2101,3.0,6303.0,6294.0,2102.0,0.250059
3,Tanzania,0.34,0.281,6620,1.209964,8009.964413,7998.0,2103.0,0.250178


### 4. Select sample based on the actual frequency of female and ages

In [22]:
K=train.loc[train['country'] == 'Kenya']
R=train.loc[train['country'] == 'Rwanda']
U=train.loc[train['country'] == 'Uganda']
T=train.loc[train['country'] == 'Tanzania']

In [23]:
MS1=K.loc[K['gender_of_respondent'] == 'Male']
FS1=K.loc[K['gender_of_respondent'] == 'Female']
MS2=R.loc[R['gender_of_respondent'] == 'Male']
FS2=R.loc[R['gender_of_respondent'] == 'Female']
MS3=U.loc[U['gender_of_respondent'] == 'Male']
FS3=U.loc[U['gender_of_respondent'] == 'Female']
MS4=T.loc[T['gender_of_respondent'] == 'Male']
FS4=T.loc[T['gender_of_respondent'] == 'Female']

In [24]:
# selecting sample by statifing based on actual proportion
S1=pd.concat([FS1.iloc[0:(int(round(0.503*2101)))],MS1.iloc[0:(int(round(0.497*2101)))]])
S2=pd.concat([FS2.iloc[0:(int(round(0.51*526)))],MS2.iloc[0:(int(round(0.49*526)))]])
S3=pd.concat([FS3.iloc[0:(int(round(0.508*2101)))],MS3.iloc[0:(int(round(0.492*2101)))]])
S4=pd.concat([FS4.iloc[0:(int(round(0.50*2626)))],MS4.iloc[0:(int(round(0.50*2626)))]])

In [25]:
Sample=pd.concat([S1,S2,S3,S4])

### 5. Results

In [27]:
pd.value_counts(Sample['country'].values, sort=True)

Tanzania    2626
Kenya       2101
Uganda      1783
Rwanda       526
dtype: int64

In [28]:
pd.value_counts(Sample['age_cat'].values, sort=True)

15-65       6492
above 65     544
dtype: int64

In [29]:
pd.value_counts(Sample['gender_of_respondent'].values, sort=True)

Female    3705
Male      3331
dtype: int64

In [60]:
gen=[{'attr': 'Female', 'afD': 0.527, 'bfD': 0.59, 'exp': 0.504 },{'attr': 'Male', 'afD': 0.473, 'bfD': 0.41, 'exp': 0.496 }]
age=[{'attr': '15-65', 'afD': 0.923, 'bfD': 0.916, 'exp': 0.959 },{'attr': 'above 65', 'afD': 0.077, 'bfD': 0.084, 'exp': 0.041 }]