In [74]:
import pandas as pd
import numpy as np
import os
# Root folder holding one sub-folder per country (A, B, C). It can be overridden
# with the POVERTY_DATA_DIR environment variable so the notebook is not tied to
# one machine; the original location is kept as the default.
DATA_DIR = os.environ.get("POVERTY_DATA_DIR",
                          "/Users/abhayranjan/scripts/GA/predicting-poverty")

# data_paths[country][split] -> path of that country's "<country>_hhold_<split>.csv"
data_paths = {country: {split: os.path.join(DATA_DIR, country,
                                            '{}_hhold_{}.csv'.format(country, split))
                        for split in ('train', 'test')}
              for country in ('A', 'B', 'C')}

In [75]:
def _read_split(country, split):
    """Read one country's train or test CSV, using its 'id' column as the index."""
    return pd.read_csv(data_paths[country][split], index_col='id')

# Load the train and test tables of each country (train first, then test).
a_train, a_test = _read_split('A', 'train'), _read_split('A', 'test')
b_train, b_test = _read_split('B', 'train'), _read_split('B', 'test')
c_train, c_test = _read_split('C', 'train'), _read_split('C', 'test')

In [76]:
# Show how many rows fall in each class of the 'poor' target, per country.
for train_frame in (a_train, b_train, c_train):
    print(train_frame.poor.value_counts())

False    4500
True     3703
Name: poor, dtype: int64
False    3004
True      251
Name: poor, dtype: int64
False    5496
True      973
Name: poor, dtype: int64


In [77]:
def standardize(df,numeric_only=True):
    """Return a copy of ``df`` whose int64/float64 columns are z-scored.

    Each selected column becomes ``(value - column mean) / column std``.
    Columns of any other dtype are passed through unchanged, and the input
    frame itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to standardize.
    numeric_only : bool, default True
        Not used by the implementation; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``df``.
    """
    numeric = df.select_dtypes(include=['int64','float64'])
    standardized = df.copy()
    if numeric.shape[1] == 0:
        # Nothing to scale.
        return standardized
    spread = numeric.std()
    # A constant column has std 0 and a single-row column has std NaN; dividing
    # by either would yield NaN. Dividing by 1 instead leaves such columns at
    # their centred value of 0.
    spread = spread.where(spread > 0, 1.0)
    standardized[numeric.columns] = (numeric - numeric.mean()) / spread
    return standardized

def pre_process(df,enforce_cols=None):
    """Standardize numeric columns, one-hot encode the rest, and fill NaN with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame to transform.
    enforce_cols : sequence of column labels, optional
        When given, the result has exactly these columns, in this order:
        columns not listed are dropped and listed columns that are missing are
        added and filled with 0.

    Returns
    -------
    pandas.DataFrame
        The transformed frame with no missing values.
    """
    df = standardize(df)
    df = pd.get_dummies(df)
    if enforce_cols is not None:
        # One reindex drops the extra columns, adds the missing ones as 0 and
        # puts the columns in the same order as enforce_cols.
        df = df.reindex(columns=enforce_cols, fill_value=0)
    return df.fillna(0)

def treat_missing_data(df):
    """Return a copy of ``df`` with missing values imputed.

    * int64/float64 columns: missing values become 0.
    * object columns: missing values become the column's most frequent value
      (the first one returned by ``Series.mode()`` when there are ties). A
      column with no non-missing value at all is left as is.

    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
    """
    fill_values = {col: 0 for col in df.select_dtypes(include=['int64','float64']).columns}
    for col in df.select_dtypes(include=['object']).columns:
        modes = df[col].mode()
        # mode() returns a Series labelled 0..k-1. Passing that Series to
        # fillna would align it on the frame's index instead of filling every
        # gap, so take the scalar value.
        if not modes.empty:
            fill_values[col] = modes.iloc[0]
    if not fill_values:
        return df.copy()
    return df.fillna(value=fill_values)

In [78]:

from sklearn.utils import resample

# Process the training and test data for Country A; the test frame is forced
# onto the training frame's columns.
a_train = pre_process(treat_missing_data(a_train))
a_test = pre_process(treat_missing_data(a_test), enforce_cols=a_train.columns)
# Report the shapes first, then the column counts, of both processed frames.
for processed in (a_train, a_test):
    print(processed.shape)
for processed in (a_train, a_test):
    print(len(processed.columns.values))


# Process the training data for Country B
b_train = treat_missing_data(b_train)
b_train = pre_process(b_train)
b_minority = b_train[b_train.poor == True]
b_majority = b_train[b_train.poor == False]
# Upsample the minority class (sampling with replacement) until it is as large
# as the majority class. The target size is taken from the data instead of a
# hard-coded count, so it stays right if the training file changes.
# NOTE(review): these duplicated rows exist before the later train_test_split,
# so its hold-out part can contain copies of rows used for fitting.
b_minority_upsampled = resample(b_minority,replace=True,n_samples=len(b_majority),random_state=123)
b_train = pd.concat([b_majority,b_minority_upsampled])
# Process the test data and force it onto the training frame's columns.
b_test = treat_missing_data(b_test)
b_test = pre_process(b_test,enforce_cols=b_train.columns)
print(b_train.shape)
print(b_test.shape)
print(len(b_train.columns.values))
print(len(b_test.columns.values))


# Process the training data for Country C
c_train = treat_missing_data(c_train)
c_train = pre_process(c_train)
c_minority = c_train[c_train.poor == True]
c_majority = c_train[c_train.poor == False]
# Upsample the minority class (sampling with replacement) until it is as large
# as the majority class. The target size is taken from the data instead of a
# hard-coded count, so it stays right if the training file changes.
# NOTE(review): these duplicated rows exist before the later train_test_split,
# so its hold-out part can contain copies of rows used for fitting.
c_minority_upsampled = resample(c_minority,replace=True,n_samples=len(c_majority),random_state=123)
c_train = pd.concat([c_majority,c_minority_upsampled])
# Process the test data and force it onto the training frame's columns.
c_test = treat_missing_data(c_test)
c_test = pre_process(c_test,enforce_cols=c_train.columns)
print(c_train.shape)
print(c_test.shape)
print(len(c_train.columns.values))
print(len(c_test.columns.values))

(8203, 860)
(4041, 860)
860
860
(6008, 1433)
(1604, 1433)
1433
1433
(10992, 796)
(3187, 796)
796
796


In [79]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

# Country A: hold out 25% of the training rows, fit a default XGBoost
# classifier on the remaining 75%, then predict class probabilities for the
# test frame. The held-out part (aX_test1 / ay_test1) is not used in this cell.
aX_train = a_train.drop(labels='poor', axis=1)
ay_train = a_train.poor
aX_train1,aX_test1,ay_train1,ay_test1 = train_test_split(
    aX_train, ay_train, test_size=0.25, random_state=243)
model = XGBClassifier().fit(aX_train1, ay_train1)
# Keep only the training feature columns, in training order (this removes the
# 'poor' column from the test frame).
a_test = a_test[aX_train.columns]
ay_pred1 = model.predict_proba(a_test)
for shown in (type(ay_pred1), ay_pred1.shape, ay_pred1):
    print(shown)

# Country B: hold out 25% of the training rows, fit a default XGBoost
# classifier on the remaining 75%, then predict class probabilities for the
# test frame. The held-out part (bX_test1 / by_test1) is not used in this cell.
bX_train = b_train.drop(labels='poor', axis=1)
by_train = b_train.poor
bX_train1,bX_test1,by_train1,by_test1 = train_test_split(
    bX_train, by_train, test_size=0.25, random_state=243)
model = XGBClassifier().fit(bX_train1, by_train1)
# Keep only the training feature columns, in training order (this removes the
# 'poor' column from the test frame).
b_test = b_test[bX_train.columns]
by_pred1 = model.predict_proba(b_test)
for shown in (type(by_pred1), by_pred1.shape, by_pred1):
    print(shown)

# Country C: hold out 25% of the training rows, fit an XGBoost classifier
# (200 trees, depth 5) on the remaining 75%, then predict class probabilities
# for the test frame. The held-out part (cX_test1 / cy_test1) is not used in
# this cell.
cX_train = c_train.drop(labels='poor', axis=1)
cy_train = c_train.poor
cX_train1,cX_test1,cy_train1,cy_test1 = train_test_split(
    cX_train, cy_train, test_size=0.25, random_state=243)
model = XGBClassifier(n_estimators=200,max_depth=5).fit(cX_train1, cy_train1)
# Keep only the training feature columns, in training order (this removes the
# 'poor' column from the test frame).
c_test = c_test[cX_train.columns]
cy_pred1 = model.predict_proba(c_test)
for shown in (type(cy_pred1), cy_pred1.shape, cy_pred1):
    print(shown)

<class 'numpy.ndarray'>
(4041, 2)
[[ 0.134045    0.865955  ]
 [ 0.99322724  0.00677275]
 [ 0.24383765  0.75616235]
 ..., 
 [ 0.06526494  0.93473506]
 [ 0.89070171  0.1092983 ]
 [ 0.14841592  0.85158408]]
<class 'numpy.ndarray'>
(1604, 2)
[[ 0.89526951  0.10473049]
 [ 0.72878712  0.27121288]
 [ 0.9410373   0.05896272]
 ..., 
 [ 0.84740418  0.15259582]
 [ 0.73501045  0.26498955]
 [ 0.70341682  0.29658321]]
<class 'numpy.ndarray'>
(3187, 2)
[[  9.99794781e-01   2.05233577e-04]
 [  9.99953508e-01   4.64758159e-05]
 [  9.99988019e-01   1.19749948e-05]
 ..., 
 [  9.99865890e-01   1.34134607e-04]
 [  9.99972820e-01   2.71671106e-05]
 [  9.99864042e-01   1.35939728e-04]]


In [80]:
# Saving the submission file.

def make_country_sub(preds,test_feat,country):
    """Build one country's part of the submission table.

    Parameters
    ----------
    preds : array-like
        One value per row of ``test_feat``; stored in the 'poor' column.
    test_feat : pandas.DataFrame
        Test features; only its index is used, as the index of the result.
    country : str
        Country code written to every row of the 'country' column.

    Returns
    -------
    pandas.DataFrame
        Columns ['country', 'poor'], indexed like ``test_feat``.
    """
    country_sub = pd.DataFrame(data=preds,columns=['poor'],index=test_feat.index)
    # Add the country code; the per-country tables are concatenated later.
    country_sub["country"] = country
    return country_sub[["country","poor"]]

In [82]:
# Turn the second predict_proba column of each country into a submission
# table, stack the three tables and write them to disk.
a_sub, b_sub, c_sub = (
    make_country_sub(probabilities[:, 1], features, code)
    for probabilities, features, code in (
        (ay_pred1, a_test, 'A'),
        (by_pred1, b_test, 'B'),
        (cy_pred1, c_test, 'C'),
    )
)

submission = pd.concat([a_sub, b_sub, c_sub])
print(submission.head(5))
submission.to_csv('submission.csv')

      country      poor
id                     
418         A  0.865955
41249       A  0.006773
16205       A  0.756162
97501       A  0.009254
67756       A  0.924030


In [91]:
# For every column of a_train, print the text of an expression that would count
# the rows where that column equals 1. The expression is only printed, not
# evaluated.
count_expressions = ["a_train[a_train." + column_name + "==1].shape[0]"
                     for column_name in a_train.columns.values]
for expression in count_expressions:
    print(":", expression)

: a_train[a_train.nEsgxvAq==1].shape[0]
: a_train[a_train.OMtioXZZ==1].shape[0]
: a_train[a_train.YFMZwKrU==1].shape[0]
: a_train[a_train.poor==1].shape[0]
: a_train[a_train.TiwRslOh==1].shape[0]
: a_train[a_train.wBXbHZmp_DkQlr==1].shape[0]
: a_train[a_train.wBXbHZmp_JhtDR==1].shape[0]
: a_train[a_train.SlDKnCuu_GUusz==1].shape[0]
: a_train[a_train.SlDKnCuu_alLXR==1].shape[0]
: a_train[a_train.KAJOWiiw_BIZns==1].shape[0]
: a_train[a_train.KAJOWiiw_TuovO==1].shape[0]
: a_train[a_train.KAJOWiiw_rqUAG==1].shape[0]
: a_train[a_train.DsKacCdL_QGgpH==1].shape[0]
: a_train[a_train.DsKacCdL_ZYabk==1].shape[0]
: a_train[a_train.rtPrBBPl_feupP==1].shape[0]
: a_train[a_train.rtPrBBPl_uxuSS==1].shape[0]
: a_train[a_train.tMJrvvut_PHMVg==1].shape[0]
: a_train[a_train.tMJrvvut_VYpgU==1].shape[0]
: a_train[a_train.jdetlNNF_NDTCU==1].shape[0]
: a_train[a_train.jdetlNNF_PUBDZ==1].shape[0]
: a_train[a_train.maLAYXwi_TcuXF==1].shape[0]
: a_train[a_train.maLAYXwi_cLAGr==1].shape[0]
: a_train[a_train.maLA