### Imports

In [15]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

### Plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.figure_factory as ff
import plotly.plotly as py
from plotly import tools
init_notebook_mode(connected=True)

### Altair
import altair as alt
alt.renderers.enable('notebook')

### Removes warnings that occasionally show up
import warnings
warnings.filterwarnings('ignore')

### Load data

In [2]:
DATA_PATH = "../../data/costa_rica/"

# Read the train and test splits from the shared data directory.
train = pd.read_csv(DATA_PATH + "train.csv")
test = pd.read_csv(DATA_PATH + "test.csv")

# One boolean per column of `train`: True where the column has object dtype.
is_object_col = (train.dtypes == object).values

### Categorical columns
cat_columns = list(train.columns[is_object_col])
print("Categorical columns:")
print(" --- ".join(cat_columns))

### Numerical columns
num_columns = list(train.columns[~is_object_col])
print("Numerical columns:")
print(" --- ".join(num_columns))

# Blank separator line, then the (rows, columns) shape of each split.
print()
print("Shape of train:", train.shape)
print("Shape of test:", test.shape)

Categorical columns:
Id --- idhogar --- dependency --- edjefe --- edjefa
Numerical columns:
v2a1 --- hacdor --- rooms --- hacapo --- v14a --- refrig --- v18q --- v18q1 --- r4h1 --- r4h2 --- r4h3 --- r4m1 --- r4m2 --- r4m3 --- r4t1 --- r4t2 --- r4t3 --- tamhog --- tamviv --- escolari --- rez_esc --- hhsize --- paredblolad --- paredzocalo --- paredpreb --- pareddes --- paredmad --- paredzinc --- paredfibras --- paredother --- pisomoscer --- pisocemento --- pisoother --- pisonatur --- pisonotiene --- pisomadera --- techozinc --- techoentrepiso --- techocane --- techootro --- cielorazo --- abastaguadentro --- abastaguafuera --- abastaguano --- public --- planpri --- noelec --- coopele --- sanitario1 --- sanitario2 --- sanitario3 --- sanitario5 --- sanitario6 --- energcocinar1 --- energcocinar2 --- energcocinar3 --- energcocinar4 --- elimbasu1 --- elimbasu2 --- elimbasu3 --- elimbasu4 --- elimbasu5 --- elimbasu6 --- epared1 --- epared2 --- epared3 --- etecho1 --- etecho2 --- etecho3 --- evi

In [14]:
# Preview the first five rows of the training frame.
# NOTE(review): the output recorded under this cell is a normalized value
# count of Target, not a head() preview -- re-run the cell to refresh it.
train.head(n=5)

4    0.627394
2    0.167103
3    0.126504
1    0.079000
Name: Target, dtype: float64

### Split into id, target, and predictors

In [4]:
# Training split: label, row identifier, and everything else as predictors.
train_y = train.loc[:, "Target"]
train_id = train.loc[:, "Id"]
train_x = train.drop(columns=["Target", "Id"])

# Test split: only the identifier is separated out before the predictors.
test_id = test.loc[:, "Id"]
test_x = test.drop(columns="Id")

# Stack train predictors on top of test predictors; `train_N` records how many
# leading rows of `full` came from the training split.
full = pd.concat([train_x, test_x], axis=0)
train_N = train_x.shape[0]

### Target

In [5]:
# Number of training rows per Target class.
vc = train_y.value_counts()

# Pie slices show the raw count; hovering reveals the label and percentage.
slice_outline = {"color": "black", "width": 2}
trace = go.Pie(
    labels=vc.index,
    values=vc.values,
    hoverinfo='label+percent',
    textinfo='value',
    textfont={"size": 20},
    marker={"line": slice_outline},
)

layout = go.Layout(title="Target")
fig = go.Figure(data=[trace], layout=layout)
iplot(fig);

### Shared households

There aren't any shared households between train and test

In [28]:
# Distinct household identifiers present in each split.
house_train = set(train["idhogar"].unique())
house_test = set(test["idhogar"].unique())

# Household ids that occur in both splits (an empty set means no overlap).
house_train & house_test

set()

### Number of people in household

In [30]:
# For each split: count the rows belonging to every household (`idhogar`),
# then count how many households have each size. The resulting Series is
# indexed by household size and holds the number of households of that size.
train_houses = train.assign(n=1).groupby("idhogar").n.sum().value_counts()
test_houses  = test.assign(n=1).groupby("idhogar").n.sum().value_counts()

# Plot the household-size distributions computed above. The previous version
# of this cell plotted `feat_importance`, which is only defined in a later
# cell (NameError on a fresh kernel) and is unrelated to this section.
data = [
    go.Bar(x=train_houses.index, y=train_houses.values, name="train"),
    go.Bar(x=test_houses.index, y=test_houses.values, name="test"),
]
layout = go.Layout(
    title   = "Number of people in household",
    xaxis   = dict(title="People in household"),
    yaxis   = dict(title="Number of households"),
    barmode = "group",
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)

# Also show the raw training counts as the cell's text output.
train_houses

3     755
2     688
4     609
1     398
5     315
6     136
7      52
8      12
9      10
10      6
11      4
12      2
13      1
Name: n, dtype: int64

### Correlation Map

In [6]:
# Columns whose name contains "parentesco". They are collected here but are
# NOT currently removed from the correlation input below.
ignore_feats = [col for col in full.columns if "parentesco" in col]

# Correlate only numeric/boolean columns, selected explicitly. `full` also
# holds object (string) columns; older pandas dropped them silently inside
# `corr()`, while pandas >= 2.0 raises on them because `numeric_only` now
# defaults to False. Selecting the columns up front behaves the same on both.
numeric_full = full.select_dtypes(include=["number", "bool"])
corr         = numeric_full.corr().round(3)

trace = go.Heatmap(
    x = corr.columns,
    y = corr.index,
    z = corr.values,
)

# Placeholder list; not referenced anywhere else in the visible notebook.
buttons = []

layout = dict(title = 'Correlation plots')

fig = dict(data=[trace], layout=layout)
iplot(fig)

### Feature Importance

In [12]:
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMRegressor

# Columns left out of the model: the identifiers, the target, and the
# object-typed columns.
ignore = ["idhogar", "dependency", "edjefe", "edjefa", "Target", "Id"]
excluded = set(ignore)
feats = [col for col in train.columns if col not in excluded]

# Fit a default-parameter LightGBM regressor on the selected predictors.
model = LGBMRegressor()
model.fit(train_x[feats], train_y)

# Pair every feature with its importance score and rank from most to least
# important.
feat_importance = pd.DataFrame(
    list(zip(feats, model.feature_importances_)),
    columns=["Feature", "Importance"],
)
feat_importance = feat_importance.sort_values("Importance", ascending=False)

# Bar chart of the ranked importances.
trace = go.Bar(x=feat_importance["Feature"], y=feat_importance["Importance"])
data = [trace]
layout = go.Layout(
    title="LGBM Feature importance",
    yaxis={"title": "Importance"},
)

fig = go.Figure(data=data, layout=layout)
iplot(fig);

### meaneduc

Defined as `average years of education for adults (18+)`.  The LGB model considers this the most important feature.

In [26]:
target = "Target"
feature = "meaneduc"

# Keep only the rows where the feature is present.
temp = train[train[feature].notnull()]

# One array of feature values per Target class, in order of first appearance.
class_labels = temp[target].unique()
values_by_class = [
    temp.loc[temp[target] == label, feature].values for label in class_labels
]

fig = ff.create_distplot(
    values_by_class,
    class_labels,
    show_hist=False,
    show_rug=False,
)

# Shade the area under every density curve.
for density_trace in fig['data']:
    density_trace.update(fill='tozeroy')

layout = go.Layout(
    title="Average years of education",
    xaxis={"title": "Years of education"},
    yaxis={"title": "Density"},
)

fig["layout"] = layout
iplot(fig)

### SQBdependency

`dependency squared`

In [27]:
target = "Target"
feature = "SQBdependency"

# Keep only the rows where the feature is present.
temp = train[train[feature].notnull()]

# One density curve per Target class (histogram and rug plot disabled).
fig = ff.create_distplot(
    [temp[temp[target] == y][feature].values for y in temp[target].unique()], 
    temp[target].unique(), 
    show_hist=False,
    show_rug=False,
)

# Shade the area under every density curve.
for d in fig['data']:
    d.update({'fill' : 'tozeroy'})

# Labels describe the feature actually plotted here; the earlier text was
# copied from the meaneduc figure ("Average years of education").
layout = go.Layout(
    title   = "Dependency squared (SQBdependency)",
    xaxis   = dict(title = "Dependency squared"),
    yaxis   = dict(title = "Density"),
)

fig["layout"] = layout
iplot(fig)

### Rooms

In [7]:
target = "Target"
feature = "rooms"

# One array of room counts per Target class, in order of first appearance.
class_labels = train[target].unique()
rooms_by_class = [
    train.loc[train[target] == label, feature].values for label in class_labels
]

fig = ff.create_distplot(
    rooms_by_class,
    class_labels,
    show_hist=False,
    show_rug=False,
)

# Shade the area under every density curve.
for density_trace in fig['data']:
    density_trace.update(fill='tozeroy')

layout = go.Layout(
    title="Rooms Distributions",
    xaxis={"title": "Rooms"},
    yaxis={"title": "Density"},
)

fig["layout"] = layout
iplot(fig)