In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

## Load data

In [None]:
# Read the training set from the ../input directory into a DataFrame.
train = pd.read_csv("../input/train.csv")

In [None]:
# Read the test set from the ../input directory into a DataFrame.
test = pd.read_csv("../input/test.csv")

## First look

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Summarise the training frame (dtype counts, non-null counts, memory usage).
train.info()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Summarise the test frame (dtype counts, non-null counts, memory usage).
test.info()

### So we have float64(8), int64(129), object(5)

## Let's take a closer look at type 'object'

In [None]:
# Show only the object-dtype columns of the training frame (first rows).
train.select_dtypes(include='object').head()

In [None]:
# Row count next to the number of distinct ids: the two are equal exactly
# when 'Id' contains no duplicates (.size alone only counts rows).
train['Id'].size, train['Id'].nunique()

### We have 9557 unique ids, with no duplicates

In [None]:
# Number of distinct household identifiers (a missing id would count as one value).
train['idhogar'].nunique(dropna=False)

In [None]:
# Household size = number of rows (members) sharing the same 'idhogar'.
households = train.groupby('idhogar').size()
print(households.describe())

# One unit-wide bin per household size, derived from the data. The previous
# fixed range(1, 13) produced edges 1..12 only, so sizes 11 and 12 shared the
# last bin and households with 13 members were left out of the plot.
max_members = int(households.max())
plt.hist(households, bins=range(1, max_members + 2), align='left')
plt.xlabel("Number of household's members")
plt.ylabel('Number of households')
plt.grid(True)
# With align='left' each bar is centred on its integer size, so pad the limits
# by half a bar to keep the first and last bars fully visible.
plt.xlim([0.5, max_members + 0.5])
plt.xticks(range(1, max_members + 1))
plt.show()

### We have 2988 households in our dataset, with about 3 members each on average and a maximum of 13 members.

### Explore 'dependency', 'edjefe', 'edjefa'

In [None]:
# List the distinct raw values of each mixed-type column in the training frame.
for column_name in ('dependency', 'edjefe', 'edjefa'):
    print(train[column_name].unique())

### Map 'no' to 0 and 'yes' to 1, then convert to float

In [None]:
def change_and_convert_object(df):
    """Recode 'yes'/'no' in the mixed-type columns and cast them to float.

    The 'dependency', 'edjefe' and 'edjefa' columns of ``df`` are rewritten:
    'yes' becomes 1.0, 'no' becomes 0.0, and every other value is cast to
    float as-is.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the three columns; it is modified in place.

    Returns
    -------
    None
    """
    # Replacements are kept as strings so each column stays homogeneous until
    # the single float cast below; the resulting values are 1.0 and 0.0.
    di = {"yes": "1", "no": "0"}
    for col in ('dependency', 'edjefe', 'edjefa'):
        # Assign the result back to the frame. Calling
        # ``df[col].replace(..., inplace=True)`` acts on an intermediate
        # Series, which is deprecated and does not update ``df`` when pandas
        # Copy-on-Write is active.
        df[col] = df[col].replace(di).astype(float)

In [None]:
# Apply the same recoding to both data sets, training frame first.
for frame in (train, test):
    change_and_convert_object(frame)

### Check 'train' data

In [None]:
# Distinct values of the three columns in the training frame after conversion.
for column_name in ('dependency', 'edjefe', 'edjefa'):
    print(train[column_name].unique())

### Let's draw boxplot for 'dependency', 'edjefe', 'edjefa' in 'train' data

In [None]:
# Three stacked panels, one boxplot per converted column of the training frame.
fig, (ax1, ax2, ax3) = plt.subplots(nrows=3, ncols=1, figsize=(12, 8))

# (column, colour, axes); None keeps seaborn's default colour for the last panel.
panels = [
    ('dependency', 'g', ax1),
    ('edjefe', 'r', ax2),
    ('edjefa', None, ax3),
]
for column, colour, axis in panels:
    sns.boxplot(x=column, data=train, color=colour, ax=axis)

plt.tight_layout()

### Check 'test' data

In [None]:
# Distinct values of the three converted columns in the test frame.
for column_name in ('dependency', 'edjefe', 'edjefa'):
    print(test[column_name].unique())

## Let's take a closer look at numerical columns

In [None]:
# Descriptive statistics for the numeric columns of the training frame.
train.describe()

## Missing data

In [None]:
# Names of all non-object columns in the training frame.
numerical = train.select_dtypes(exclude='object').columns
# Per-column count of missing values, largest first; show the ten worst columns.
(
    train[numerical]
    .isna()
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

### Explore 'SQBmeaned', 'meaneduc'

In [None]:
# Descriptive statistics for the two columns inspected in this section.
train[['SQBmeaned', 'meaneduc']].describe()

### Fill 'meaneduc' with its mean value and 'SQBmeaned' with its median value

In [None]:
def fill_meaneduc_and_SQBmeaned(df):
    """Impute missing 'meaneduc' and 'SQBmeaned' values on ``df`` in place.

    Missing 'meaneduc' entries are filled with that column's mean and missing
    'SQBmeaned' entries with that column's median, both computed from ``df``
    itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns; it is modified in place.

    Returns
    -------
    None
    """
    # Assign the filled Series back to the frame: a chained
    # ``df[col].fillna(..., inplace=True)`` acts on an intermediate Series and
    # does not update ``df`` when pandas Copy-on-Write is active.
    df['meaneduc'] = df['meaneduc'].fillna(df['meaneduc'].mean())
    df['SQBmeaned'] = df['SQBmeaned'].fillna(df['SQBmeaned'].median())

In [None]:
# Impute both data sets, each from its own column statistics, training frame first.
for frame in (train, test):
    fill_meaneduc_and_SQBmeaned(frame)

### Fill the other missing values with 0

In [None]:
def fill_with_zero(df):
    """Replace missing values in 'rez_esc', 'v18q1' and 'v2a1' with 0, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the three columns; it is modified in place.

    Returns
    -------
    None
    """
    for col in ('rez_esc', 'v18q1', 'v2a1'):
        # Assign back to the frame: a chained
        # ``df[col].fillna(0, inplace=True)`` acts on an intermediate Series
        # and does not update ``df`` when pandas Copy-on-Write is active.
        df[col] = df[col].fillna(0)

In [None]:
# Zero-fill the remaining gaps in both data sets, training frame first.
for frame in (train, test):
    fill_with_zero(frame)

### Check 'train' data

In [None]:
# Total number of missing values left across the columns listed in `numerical`.
train[numerical].isnull().sum().sum()

### Check 'test' data

In [None]:
# Rebind `numerical` to the non-object columns of the test frame
# (this overwrites the list previously built from the training frame).
numerical = test.select_dtypes(exclude='object').columns
# Total number of missing values left across those test columns.
test[numerical].isnull().sum().sum()

## Let's take a closer look at 'Target'

### In progress...

## Feature Engineering

In [None]:
def add_features(df):
    """Append ratio features to ``df`` in place.

    Fourteen new columns are added, each the quotient of two existing
    columns (or, for 'v2a1_to_r4t3', of 'v2a1' and ``r4t3 - r4t1``). Rows
    whose denominator is exactly 0 get 0.0 instead of the inf/NaN that plain
    division would produce, because scikit-learn estimators reject
    non-finite inputs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding 'bedrooms', 'rooms', 'v2a1', 'tamhog', 'r4t3', 'r4t1',
        'hhsize' and 'qmobilephone'; it is modified in place.

    Returns
    -------
    None
    """
    def _safe_ratio(numerator, denominator):
        # Element-wise division; a zero denominator maps to 0.0 rather than inf/NaN.
        return (numerator / denominator).where(denominator != 0, 0.0)

    # (new column, numerator, denominator), in the order the columns are added.
    # Every input is an original column, so building the list up front is safe.
    ratio_specs = [
        ('bedrooms_to_rooms', df['bedrooms'], df['rooms']),
        ('rent_to_rooms', df['v2a1'], df['rooms']),
        ('rent_to_bedrooms', df['v2a1'], df['bedrooms']),
        ('tamhog_to_rooms', df['tamhog'], df['rooms']),  # tamhog - size of the household
        ('tamhog_to_bedrooms', df['tamhog'], df['bedrooms']),
        ('r4t3_to_tamhog', df['r4t3'], df['tamhog']),  # r4t3 - Total persons in the household
        ('r4t3_to_rooms', df['r4t3'], df['rooms']),
        ('r4t3_to_bedrooms', df['r4t3'], df['bedrooms']),
        ('rent_to_r4t3', df['v2a1'], df['r4t3']),
        # Despite its name, this one divides by (r4t3 - r4t1), not by r4t3.
        ('v2a1_to_r4t3', df['v2a1'], df['r4t3'] - df['r4t1']),
        ('hhsize_to_rooms', df['hhsize'], df['rooms']),
        ('hhsize_to_bedrooms', df['hhsize'], df['bedrooms']),
        ('rent_to_hhsize', df['v2a1'], df['hhsize']),
        ('qmobilephone_to_r4t3', df['qmobilephone'], df['r4t3']),
    ]
    for new_col, numerator, denominator in ratio_specs:
        df[new_col] = _safe_ratio(numerator, denominator)

In [None]:
# Add the engineered ratio columns to both data sets, training frame first.
for frame in (train, test):
    add_features(frame)

### Explore households

## Prediction

### Save Id from 'test' data for future

In [None]:
# Keep the test ids as a one-column DataFrame; they are needed for the
# submission after 'Id' is dropped from `test` further down.
Id = test[['Id']]

 ### Select X, y from 'train' data

In [None]:
# Target vector.
y = train['Target']
# Feature matrix: everything except the target and the two identifier columns.
# It is built as a new frame rather than by dropping from `train` in place, so
# `train` stays intact and this cell can be re-run without a KeyError.
X = train.drop(columns=['Target', 'Id', 'idhogar'])

### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Fit a random forest on the full training set; random_state fixes the seed
# and n_jobs=-1 uses all available cores.
rf = RandomForestClassifier(random_state=17, n_jobs=-1).fit(X, y)

In [None]:
# NOTE(review): this scores the model on the same X, y it was fitted on, so it
# is training accuracy and says nothing about performance on unseen data.
accuracy_score(y, rf.predict(X))

In [None]:
# Remove the identifier columns from `test` in place. Re-running this cell
# raises KeyError because the columns are already gone after the first run.
test.drop(['Id', 'idhogar'], axis=1, inplace=True)

In [None]:
# Select exactly the columns the model was trained on, in the same order,
# instead of relying on `test` happening to have an identical layout.
X_test = test[X.columns]

In [None]:
# Predict a target class for every row of the test feature matrix.
rf_pred = rf.predict(X_test)
# rf_pred  (uncomment to inspect the predictions)

In [None]:
# Pair the saved test ids with the predicted classes.
d = {'Id': Id['Id'], 'Target': rf_pred}

In [None]:
# Build the two-column submission frame ('Id', 'Target').
submission_df = pd.DataFrame(data=d)
# submission_df  (uncomment to inspect the frame)

In [None]:
# Write the submission as a comma-separated file without the index column.
submission_df.to_csv('submission.csv', sep=',', index=False)