### Imports

In [11]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

### Plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.figure_factory as ff
import plotly.plotly as py
from plotly import tools
init_notebook_mode(connected=True)

### Suppresses warnings that occasionally show up
import warnings
warnings.filterwarnings('ignore')

### Load data

In [12]:
DATA_PATH = "../../data/costa_rica/"

# Read both splits from the shared data directory.
train, test = (pd.read_csv(DATA_PATH + file_name) for file_name in ("train.csv", "test.csv"))

# Partition the training columns by dtype in a single pass: object-typed
# columns are listed as categorical, everything else as numerical.
cat_columns, num_columns = [], []
for col, col_dtype in train.dtypes.items():
    (cat_columns if col_dtype == object else num_columns).append(col)

print("Categorical columns:")
print(" --- ".join(cat_columns))

### Numerical columns
print("Numerical columns:")
print(" --- ".join(num_columns))

print()
for label, frame in (("Shape of train:", train), ("Shape of test:", test)):
    print(label, frame.shape)

Categorical columns:
Id --- idhogar --- dependency --- edjefe --- edjefa
Numerical columns:
v2a1 --- hacdor --- rooms --- hacapo --- v14a --- refrig --- v18q --- v18q1 --- r4h1 --- r4h2 --- r4h3 --- r4m1 --- r4m2 --- r4m3 --- r4t1 --- r4t2 --- r4t3 --- tamhog --- tamviv --- escolari --- rez_esc --- hhsize --- paredblolad --- paredzocalo --- paredpreb --- pareddes --- paredmad --- paredzinc --- paredfibras --- paredother --- pisomoscer --- pisocemento --- pisoother --- pisonatur --- pisonotiene --- pisomadera --- techozinc --- techoentrepiso --- techocane --- techootro --- cielorazo --- abastaguadentro --- abastaguafuera --- abastaguano --- public --- planpri --- noelec --- coopele --- sanitario1 --- sanitario2 --- sanitario3 --- sanitario5 --- sanitario6 --- energcocinar1 --- energcocinar2 --- energcocinar3 --- energcocinar4 --- elimbasu1 --- elimbasu2 --- elimbasu3 --- elimbasu4 --- elimbasu5 --- elimbasu6 --- epared1 --- epared2 --- epared3 --- etecho1 --- etecho2 --- etecho3 --- evi

In [13]:
train.head()

Unnamed: 0,Id,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q,v18q1,r4h1,...,SQBescolari,SQBage,SQBhogar_total,SQBedjefe,SQBhogar_nin,SQBovercrowding,SQBdependency,SQBmeaned,agesq,Target
0,ID_279628684,190000.0,0,3,0,1,1,0,,0,...,100,1849,1,100,0,1.0,0.0,100.0,1849,4
1,ID_f29eb3ddd,135000.0,0,4,0,1,1,1,1.0,0,...,144,4489,1,144,0,1.0,64.0,144.0,4489,4
2,ID_68de51c94,,0,8,0,1,1,0,,0,...,121,8464,1,0,0,0.25,64.0,121.0,8464,4
3,ID_d671db89c,180000.0,0,5,0,1,1,1,1.0,0,...,81,289,16,121,4,1.777778,1.0,121.0,289,4
4,ID_d56d6f5f5,180000.0,0,5,0,1,1,1,1.0,0,...,121,1369,16,121,4,1.777778,1.0,121.0,1369,4


### Fix outliers

In [14]:
# The test set contains rez_esc values of 99.0, which are treated as outliers
# here and overwritten with 5; every other value (including NaN) is kept.
is_outlier = test['rez_esc'] == 99.0
test['rez_esc'] = test['rez_esc'].mask(is_outlier, 5)

### Split into id, target, and predictors

In [15]:
# Separate the label and the row identifier from the training predictors.
train_y = train["Target"]
train_id = train["Id"]
train_x = train.drop(["Target", "Id"], axis=1)

# The test split has no label; only the identifier is set aside.
test_id = test["Id"]
test_x  = test.drop("Id", axis=1)

# Stack both splits so cleaning and feature engineering are applied once.
# ignore_index=True gives the stacked frame unique row labels; without it the
# labels of train and test (both starting at 0 from read_csv) are duplicated,
# which makes any label-aligned operation on `full` ambiguous.
full    = pd.concat([train_x, test_x], ignore_index=True)

# Number of leading rows of `full` that belong to the training split; used
# later to slice `full` back apart by position.
train_N = len(train_x)

### Missing values

In [16]:
#Fill na
def repalce_v18q1(x):
    """Return the value to store in ``v18q1`` for a single row.

    Parameters
    ----------
    x : mapping-like row
        Must provide ``x['v18q']``; ``x['v18q1']`` is only read when
        ``x['v18q']`` is not equal to 0.

    Returns
    -------
    ``x['v18q']`` itself (i.e. 0) when it equals 0, otherwise ``x['v18q1']``.
    """
    flag = x['v18q']
    return flag if flag == 0 else x['v18q1']

# Wherever v18q is 0, overwrite v18q1 with that 0; otherwise keep v18q1 as is.
# Vectorised equivalent of applying repalce_v18q1 to every row, without the
# Python-level loop that DataFrame.apply(axis=1) performs.
full['v18q1'] = full['v18q1'].mask(full['v18q'] == 0, full['v18q'])

# Missing v2a1 values take the tipovivi3 value of the same row.
# NOTE(review): this writes a 0/1-looking flag into what appears to be a
# monetary column — confirm that this is the intended imputation.
full['v2a1'] = full['v2a1'].fillna(value=full['tipovivi3'])

### Feature Engineering

In [17]:
def temp_map(x):
    """Translate the literals ``"no"``/``"yes"`` to ``0``/``1``.

    Any other value is returned unchanged (no type conversion is attempted).
    """
    for literal, code in (("no", 0), ("yes", 1)):
        if x == literal:
            return code
    return x

# Recode the "yes"/"no" literals in both columns via temp_map; other values
# pass through unchanged.
cols = ['edjefe', 'edjefa']
full = full.assign(**{name: full[name].map(temp_map) for name in cols})

def fill_roof_exception(x):
    """Return 1 when all four roof indicators of the row are 0, else 0.

    The indicators read are ``techozinc``, ``techoentrepiso``, ``techocane``
    and ``techootro``; evaluation stops at the first one that is not 0.
    """
    roof_flags = ('techozinc', 'techoentrepiso', 'techocane', 'techootro')
    return int(all(x[flag] == 0 for flag in roof_flags))
    
def fill_no_electricity(x):
    """Return 1 when all four electricity indicators of the row are 0, else 0.

    The indicators read are ``public``, ``planpri``, ``noelec`` and
    ``coopele``; evaluation stops at the first one that is not 0.
    """
    electricity_flags = ('public', 'planpri', 'noelec', 'coopele')
    return int(all(x[flag] == 0 for flag in electricity_flags))

# 1 where none of the four roof indicators is set, else 0. Computed column-wise
# instead of calling a Python function once per row, and without the
# placeholder NaN column that was overwritten immediately anyway.
roof_cols = ['techozinc', 'techoentrepiso', 'techocane', 'techootro']
full['roof_waste_material'] = (full[roof_cols] == 0).all(axis=1).astype(int)

# 1 where none of the four electricity indicators is set, else 0.
electricity_cols = ['public', 'planpri', 'noelec', 'coopele']
full['electricity_other'] = (full[electricity_cols] == 0).all(axis=1).astype(int)

# Household composition counts that later ratios build on.
full['adult'] = full['hogar_adul'].sub(full['hogar_mayor'])
full['dependency_count'] = full['hogar_nin'].add(full['hogar_mayor'])

# (new column, numerator, denominator) — created in this exact order.
# Note: 'dependency' overwrites the column of the same name that came with
# the raw data.
composition_ratios = [
    ('dependency',      'dependency_count', 'adult'),
    ('child_percent',   'hogar_nin',        'hogar_total'),
    ('elder_percent',   'hogar_mayor',      'hogar_total'),
    ('adult_percent',   'hogar_adul',       'hogar_total'),
    ('rent_per_adult',  'v2a1',             'hogar_adul'),
    ('rent_per_person', 'v2a1',             'hhsize'),
]
for new_col, numerator, denominator in composition_ratios:
    full[new_col] = full[numerator].div(full[denominator])

# Mean of the two overcrowding indicators.
full['overcrowding_room_and_bedroom'] = full['hacdor'].add(full['hacapo']).div(2)
# Despite the 'no_' prefix this is the sum of the three appliance columns.
full['no_appliances'] = full['refrig'] + full['computer'] + full['television']

# Per-room / per-bedroom / per-person ratios.
# NOTE(review): 'room_per_person_household' and 'bedrooms_per_person_household'
# are computed as hhsize / rooms and hhsize / bedrooms, i.e. the inverse of
# what the names suggest; 'edler_per_bedroom' is misspelled. The names are
# kept as they are because they are the feature names of the resulting frame.
dwelling_ratios = [
    ('r4h1_percent_in_male',          'r4h1',         'r4h3'),
    ('r4m1_percent_in_female',        'r4m1',         'r4m3'),
    ('r4h1_percent_in_total',         'r4h1',         'hhsize'),
    ('r4m1_percent_in_total',         'r4m1',         'hhsize'),
    ('r4t1_percent_in_total',         'r4t1',         'hhsize'),
    ('rent_per_room',                 'v2a1',         'rooms'),
    ('bedroom_per_room',              'bedrooms',     'rooms'),
    ('elder_per_room',                'hogar_mayor',  'rooms'),
    ('adults_per_room',               'adult',        'rooms'),
    ('child_per_room',                'hogar_nin',    'rooms'),
    ('male_per_room',                 'r4h3',         'rooms'),
    ('female_per_room',               'r4m3',         'rooms'),
    ('room_per_person_household',     'hhsize',       'rooms'),
    ('rent_per_bedroom',              'v2a1',         'bedrooms'),
    ('edler_per_bedroom',             'hogar_mayor',  'bedrooms'),
    ('adults_per_bedroom',            'adult',        'bedrooms'),
    ('child_per_bedroom',             'hogar_nin',    'bedrooms'),
    ('male_per_bedroom',              'r4h3',         'bedrooms'),
    ('female_per_bedroom',            'r4m3',         'bedrooms'),
    ('bedrooms_per_person_household', 'hhsize',       'bedrooms'),
    ('tablet_per_person_household',   'v18q1',        'hhsize'),
    ('phone_per_person_household',    'qmobilephone', 'hhsize'),
]
for new_col, numerator, denominator in dwelling_ratios:
    full[new_col] = full[numerator].div(full[denominator])

full['age_12_19'] = full['hogar_nin'].sub(full['r4t1'])

# Ratios built on escolari and rez_esc.
schooling_ratios = [
    ('escolari_age',     'escolari', 'age'),
    ('rez_esc_escolari', 'rez_esc',  'escolari'),
    ('rez_esc_r4t1',     'rez_esc',  'r4t1'),
    ('rez_esc_r4t2',     'rez_esc',  'r4t2'),
    ('rez_esc_r4t3',     'rez_esc',  'r4t3'),
    ('rez_esc_age',      'rez_esc',  'age'),
]
for new_col, numerator, denominator in schooling_ratios:
    full[new_col] = full[numerator].div(full[denominator])

# Only 'dependency' has its +inf values (division by a zero 'adult' count)
# replaced with 0; the other ratio columns are left untouched.
full['dependency'] = full['dependency'].replace(to_replace=np.inf, value=0)

### Drop some features

In [18]:
# Columns removed before modelling: the household id, the two edjefe/edjefa
# columns, and several columns (tamhog, agesq, hogar_adul, SQB*) that are
# not passed on to the model.
feats = [
    'idhogar', 'tamhog', 'agesq', "edjefe", "edjefa",
    'hogar_adul',
    'SQBescolari', 'SQBage', 'SQBhogar_total', 'SQBedjefe',
    'SQBhogar_nin', 'SQBovercrowding', 'SQBdependency', 'SQBmeaned',
]
full = full.drop(columns=feats)

### Split full back into train and test

In [19]:
# The first train_N rows of `full` are the training rows, the remainder the
# test rows; slice by position to undo the earlier concatenation.
train_x, test_x = full.iloc[:train_N], full.iloc[train_N:]

### Processed data look
train_x.head()

Unnamed: 0,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q,v18q1,r4h1,r4h2,...,bedrooms_per_person_household,tablet_per_person_household,phone_per_person_household,age_12_19,escolari_age,rez_esc_escolari,rez_esc_r4t1,rez_esc_r4t2,rez_esc_r4t3,rez_esc_age
0,190000.0,0,3,0,1,1,0,0.0,0,1,...,1.0,0.0,1.0,0,0.232558,,,,,
1,135000.0,0,4,0,1,1,1,1.0,0,1,...,1.0,1.0,1.0,0,0.179104,,,,,
2,0.0,0,8,0,1,1,0,0.0,0,0,...,0.5,0.0,0.0,0,0.119565,,,,,
3,180000.0,0,5,0,1,1,1,1.0,0,2,...,1.333333,0.25,0.75,1,0.529412,0.111111,1.0,0.333333,0.25,0.058824
4,180000.0,0,5,0,1,1,1,1.0,0,2,...,1.333333,0.25,0.75,1,0.297297,,,,,


### Modeling

In [20]:
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMRegressor

# LightGBM regressor with library-default hyperparameters, fitted on the
# processed training predictors against the Target column.
model = LGBMRegressor()
model.fit(X=train_x, y=train_y)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.1, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
       n_jobs=-1, num_leaves=31, objective=None, random_state=None,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)

### Predictions

In [21]:
# The regressor returns continuous scores. Round them to the nearest integer
# label, then clip to the label range observed in the training target so a
# score that rounds below the smallest or above the largest label cannot end
# up in the submission as an invalid class.
raw_predictions = model.predict(test_x)
lowest_label, highest_label = train_y.min(), train_y.max()
predictions = np.clip(raw_predictions.round(), lowest_label, highest_label).astype(int)

# One row per test Id, written without the index column.
pd.DataFrame({
    "Id": test_id,
    "Target": predictions
}).to_csv("more_feats_gaxx.csv", index=False)