### Imports

In [2]:
### Numerical / tabular libraries
import numpy as np
import pandas as pd

### Static plotting; sns.set() switches matplotlib to seaborn's default theme
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

### Plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.figure_factory as ff
# NOTE(review): `plotly.plotly` moved to the separate chart-studio package in
# plotly 4, and `py` / `tools` are not referenced in the cells visible here.
# Confirm they are still needed before upgrading plotly.
import plotly.plotly as py
from plotly import tools
# Sets up plotly's offline notebook mode used by the iplot() calls below.
init_notebook_mode(connected=True)

### Suppresses warnings that occasionally show up
# NOTE: 'ignore' silences every warning for the rest of the session,
# including deprecation notices.
import warnings
warnings.filterwarnings('ignore')

### Load data

In [3]:
### Directory holding the CSV files, relative to this notebook
DATA_PATH = "../../data/costa_rica/"

train, test = (pd.read_csv(DATA_PATH + name) for name in ("train.csv", "test.csv"))

### Split the column names of `train` by dtype: object columns are reported as
### categorical, every other column as numerical
is_object   = train.dtypes == object
cat_columns = train.columns[is_object].tolist()
num_columns = train.columns[~is_object].tolist()

for heading, names in (("Categorical columns:", cat_columns),
                       ("Numerical columns:", num_columns)):
    print(heading)
    print(" --- ".join(names))

print()
print("Shape of train:", train.shape)
print("Shape of test:",  test.shape)

Categorical columns:
Id --- idhogar --- dependency --- edjefe --- edjefa
Numerical columns:
v2a1 --- hacdor --- rooms --- hacapo --- v14a --- refrig --- v18q --- v18q1 --- r4h1 --- r4h2 --- r4h3 --- r4m1 --- r4m2 --- r4m3 --- r4t1 --- r4t2 --- r4t3 --- tamhog --- tamviv --- escolari --- rez_esc --- hhsize --- paredblolad --- paredzocalo --- paredpreb --- pareddes --- paredmad --- paredzinc --- paredfibras --- paredother --- pisomoscer --- pisocemento --- pisoother --- pisonatur --- pisonotiene --- pisomadera --- techozinc --- techoentrepiso --- techocane --- techootro --- cielorazo --- abastaguadentro --- abastaguafuera --- abastaguano --- public --- planpri --- noelec --- coopele --- sanitario1 --- sanitario2 --- sanitario3 --- sanitario5 --- sanitario6 --- energcocinar1 --- energcocinar2 --- energcocinar3 --- energcocinar4 --- elimbasu1 --- elimbasu2 --- elimbasu3 --- elimbasu4 --- elimbasu5 --- elimbasu6 --- epared1 --- epared2 --- epared3 --- etecho1 --- etecho2 --- etecho3 --- evi

In [4]:
# Preview the first rows of the training frame (head() defaults to 5 rows)
train.head()

Unnamed: 0,Id,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q,v18q1,r4h1,...,SQBescolari,SQBage,SQBhogar_total,SQBedjefe,SQBhogar_nin,SQBovercrowding,SQBdependency,SQBmeaned,agesq,Target
0,ID_279628684,190000.0,0,3,0,1,1,0,,0,...,100,1849,1,100,0,1.0,0.0,100.0,1849,4
1,ID_f29eb3ddd,135000.0,0,4,0,1,1,1,1.0,0,...,144,4489,1,144,0,1.0,64.0,144.0,4489,4
2,ID_68de51c94,,0,8,0,1,1,0,,0,...,121,8464,1,0,0,0.25,64.0,121.0,8464,4
3,ID_d671db89c,180000.0,0,5,0,1,1,1,1.0,0,...,81,289,16,121,4,1.777778,1.0,121.0,289,4
4,ID_d56d6f5f5,180000.0,0,5,0,1,1,1,1.0,0,...,121,1369,16,121,4,1.777778,1.0,121.0,1369,4


### Split into id, target, and predictors

In [5]:
### Identifier and label columns, kept aside as separate Series
train_id, train_y = train["Id"], train["Target"]
test_id = test["Id"]

### Predictors: every column except the id (and, for train, the label)
train_x = train.drop(columns=["Target", "Id"])
test_x  = test.drop(columns="Id")

### Number of training rows, then train and test predictors stacked row-wise.
### The original row indices are kept, so index labels repeat in `full`.
train_N = train_x.shape[0]
full    = pd.concat([train_x, test_x])

### Target

In [6]:
### Number of training rows per target class
vc = train_y.value_counts()

### Pie chart with one slice per class: the slice text shows the raw count,
### the hover box shows the class label and its percentage
outline = dict(color="black", width=2)
trace = go.Pie(
    labels=vc.index,
    values=vc.values,
    hoverinfo='label+percent',
    textinfo='value',
    textfont=dict(size=20),
    marker=dict(line=outline),
)

target_layout = go.Layout(title="Target")
fig = go.Figure(data=[trace], layout=target_layout)
iplot(fig)

### Correlation Map

In [9]:
### Columns whose name contains "parentesco". They are NOT excluded below; to
### leave them out of the map, start from full.drop(ignore_feats, axis=1).
ignore_feats = [col for col in full.columns if "parentesco" in col]

### Pairwise correlations between columns, rounded to 3 decimals.
### `full` still contains object (string) columns. Select the numeric/bool
### columns explicitly: older pandas dropped the others silently inside
### corr(), but pandas >= 2.0 raises on them instead.
numeric_full = full.select_dtypes(include=["number", "bool"])
corr         = numeric_full.corr().round(3)

### Heatmap of the correlation matrix, labelled with the column names on both axes
trace = go.Heatmap(
    x = corr.columns,
    y = corr.index,
    z = corr.values,
)

layout = dict(title = 'Correlation plots')

fig = dict(data=[trace], layout=layout)
iplot(fig)

### Rooms distribution by target class

In [10]:
target = "Target"
feature = "rooms"

### One array of `feature` values per distinct target class; unique() keeps the
### classes in order of first appearance, and the same order labels the curves
classes = train[target].unique()
samples = [train.loc[train[target] == cls, feature].values for cls in classes]

### Density curves only: histogram bars and rug marks are switched off
fig = ff.create_distplot(samples, classes, show_hist=False, show_rug=False)

### Fill the area under every curve down to the x-axis
for curve in fig['data']:
    curve.update({'fill' : 'tozeroy'})

### Replace the generated layout with a titled one
layout = go.Layout(
    title = "Rooms Distributions",
    xaxis = dict(title = "Rooms"),
    yaxis = dict(title = "Density"),
)
fig["layout"] = layout
iplot(fig)