In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file in it.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Loading the data

In [None]:
# Raw training table; kept untouched so later steps can always start again from it.
data = pd.read_csv('../input/costa-rican-household-poverty-prediction/train.csv')

In [None]:
data.head()

Let's take a look at different dtypes in the dataframe.

In [None]:
data.info()

Hmmm, looks like there are 5 objects, 130 integers, and 8 floats.

Let's take a look at our categorical features.

In [None]:
# Collect the names of the columns pandas treats as string-typed, then list them.
string_columns = [
    name for name, column in data.items()
    if pd.api.types.is_string_dtype(column)
]
for name in string_columns:
    print(name)

There are 5 categorical features, including `Id`, which we will set as index later.

This is a very important step: always make a copy of your original dataframe, so that if something goes wrong we still have the untouched data as a backup.

In [None]:
df = data.copy()

Let's look at the distribution of our target variable.

In [None]:
# Count the rows in each Target class once and reuse the result for the table and the chart.
target_counts = df['Target'].value_counts()
print(target_counts)
target_counts.plot(kind='bar')

OK, it looks like our target variable is imbalanced.

One of the main habits a data scientist should have is writing clean, readable blocks of code.

We can do that by defining functions, like this:

In [None]:
def preprocess_data(df):
    """Clean a raw train/test frame in place and return it.

    Steps, applied directly to ``df``:
      1. Move the ``Id`` column into the index, if it is still a column.
      2. Fill missing values in every numeric column with that column's median.
      3. Replace every non-numeric column with integer category codes shifted
         by one, so a missing value becomes 0 and real categories start at 1.

    The category codes are built from the values present in ``df`` itself, so
    two different frames (e.g. train and test) are not guaranteed to assign
    the same code to the same raw value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # Guarded so the function can be re-run on a frame whose Id is already the
    # index (an unguarded call would raise KeyError the second time).
    if 'Id' in df.columns:
        df.set_index('Id', inplace=True)

    # Snapshot the labels so columns can be reassigned safely while looping.
    for label in list(df.columns):
        content = df[label]
        if pd.api.types.is_numeric_dtype(content):
            # Only touch numeric columns that actually contain missing values.
            if content.isna().any():
                df[label] = content.fillna(content.median())
        else:
            # pandas encodes a missing category as -1, so +1 maps missing to 0.
            df[label] = pd.Categorical(content).codes + 1

    return df
# Cleans df in place (Id becomes the index); the returned frame is shown as the cell output.
preprocess_data(df)

This looks clean enough. There are different approaches to any given problem, so always try several before settling on one.

## Feature Engineering

Let's split our data into X & Y, so that we can later split it into train and validation sets.

In [None]:
# Label: the Target column on its own. Features: every remaining column.
y = df['Target']
X = df.drop(columns='Target')

In [None]:
X

In [None]:
y

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

After trying different ML algorithms, I feel that these two work like a charm on this dataset.

## Random Forest

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so the fitted model and every score below are reproducible.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rf_pred = rf.predict(X_test)
# accuracy_score is the fraction of correctly classified rows, not an R2 value,
# so the printed label names the metric that is actually computed.
print('Accuracy score is : {:.2f}'.format(accuracy_score(y_test, rf_pred)))
print('\n')
print("Classification Report : ")
print(classification_report(y_test, rf_pred))

Hmmm, this looks good. The model seems to be learning well, as you can see from the f1-scores for each class.

Let's take a look at other algorithms before jumping to conclusions.

## ExtraTreesClassifier

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Seed the ensemble so the fitted model and every score below are reproducible.
etc = ExtraTreesClassifier(random_state=42).fit(X_train, y_train)
etc_pred = etc.predict(X_test)
# accuracy_score is the fraction of correctly classified rows, not an R2 value,
# so the printed label names the metric that is actually computed.
print('Accuracy score is : {:.2f}'.format(accuracy_score(y_test, etc_pred)))
print('\n')
print("Classification Report : ")
print(classification_report(y_test, etc_pred))

WOWW ! There's an improvement. I mean just look at the f1-scores.

I've tried different models, but this one seems to work really well, so I'll keep it.

Wait a minute, our work isn't done yet.

A good data scientist should be able to build a model that produces strong results even with less data.
As we can see, `ExtraTreesClassifier` did really well, so let's look at its top 10 most important features and see if we can achieve comparable results using only those instead of all the features.

In [None]:
etc.feature_importances_

I know this is not readable, so let's visualize it.

In [None]:
import seaborn as sns

# Helper function for plotting feature importance
def plot_features(columns, importances, n=10):
    """Draw a horizontal bar chart of the ``n`` largest feature importances.

    Parameters
    ----------
    columns : sequence
        Feature names, in the same order as ``importances``.
    importances : sequence
        One importance value per feature name.
    n : int, default 10
        Number of top-ranked features to plot.
    """
    importance_table = pd.DataFrame({"features": columns,
                                     "feature_importance": importances})
    # Rank from most to least important and renumber rows from zero.
    ranked = importance_table.sort_values("feature_importance", ascending=False)
    ranked = ranked.reset_index(drop=True)
    top_rows = ranked[:n]

    sns.barplot(data=top_rows,
                x="feature_importance",
                y="features",
                orient="h")
plot_features(X_train.columns, etc.feature_importances_)

Hmmm, top 10 features according to our model are : 
                                                    
`'meaneduc',
'SQBmeaned',
'hogar_nin',
'SQBhogar_nin',
'cielorazo',
'qmobilephone',
'idhogar',
'overcrowding',
'r4t1',
'SQBdependency'`.
                                                    
So let's use just these 10 features and see if the model still works well.

In [None]:
# The ten columns listed above as the most important features.
top_features = [
    'meaneduc',
    'SQBmeaned',
    'hogar_nin',
    'SQBhogar_nin',
    'cielorazo',
    'qmobilephone',
    'idhogar',
    'overcrowding',
    'r4t1',
    'SQBdependency',
]
new_data = df[top_features]

In [None]:
new_data

In [None]:
# Re-split using only the selected columns; same test fraction and seed as the first split.
X_train, X_test, y_train, y_test = train_test_split(
    new_data,
    y,
    random_state=42,
    test_size=0.2,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

In [None]:
# Refit on the reduced feature set; seeded so the comparison with the
# all-features run is reproducible.
etc = ExtraTreesClassifier(random_state=42).fit(X_train, y_train)
etc_pred = etc.predict(X_test)
# accuracy_score is the fraction of correctly classified rows, not an R2 value,
# so the printed label names the metric that is actually computed.
print('Accuracy score is : {:.2f}'.format(accuracy_score(y_test, etc_pred)))
print('\n')
print("Classification Report : ")
print(classification_report(y_test, etc_pred))

Nice, still works like a charm.

Now it's time to predict on the test data.

In [None]:
# Raw test table; kept untouched because its Id column is read again when building the submission.
test = pd.read_csv('../input/costa-rican-household-poverty-prediction/test.csv')

Again, making a copy of test set.

In [None]:
test_df = test.copy()

Looking at the information of test data...

In [None]:
test_df.info()

It's similar to our training data.

Looking at categorical features.

In [None]:
# Collect the names of the string-typed columns in the test frame, then list them.
test_string_columns = [
    name for name, column in test_df.items()
    if pd.api.types.is_string_dtype(column)
]
for name in test_string_columns:
    print(name)

OK, we have 5 categorical features; let's set `Id` as the index so that it is no longer a feature column.

Better yet, since we've already done this while training our model using `preprocess_data`, let's use the same function here.

In [None]:
# Applies the same cleaning to the test frame, in place. Category codes are
# derived from the test values alone, so they are not guaranteed to match the
# codes assigned to the same raw values in the training frame.
preprocess_data(test_df)

Beautiful, everything looks good.

Now let's predict on our test set.

In [None]:
# rf is the model that was fit on the full feature set, which matches the
# columns of test_df; etc was later refit on ten columns only.
pred = rf.predict(test_df)
pred = pd.DataFrame(pred)
# Pair every prediction with its Id (the index of test_df after
# preprocess_data) so the saved file has the Id and Target columns; writing
# pred alone would produce a file with no Id column.
pd.DataFrame({'Id': test_df.index.to_numpy(),
              'Target': pred[0].to_numpy()}).to_csv('submission.csv', index=False)

In [None]:
pred

In [None]:
# One-column frame holding the original test Ids, in file order.
Id = test['Id'].to_frame()

In [None]:
Id

In [None]:
# Join the Id frame and the predictions side by side. The variable is `Id`;
# lowercase `id` is Python's builtin function and makes pd.concat raise.
subs = pd.concat([Id, pred], ignore_index=True, axis=1)

In [None]:
subs

In [None]:
# concat(..., ignore_index=True) labels the columns with the integers 0 and 1,
# so the mapping keys must be ints; string keys '0'/'1' match nothing.
subs.rename(columns={0: 'Id', 1: 'Target'}, inplace=True)

In [None]:
subs.columns = ['Id','Target']

In [None]:
# Preview the submission frame; a bare `subs.drop` only echoes the method
# object and changes nothing.
subs.head()

In [None]:
s = subs.copy()

In [None]:
s.reset_index(drop=True, inplace=True)

In [None]:
s.set_index('Id', inplace=True)

In [None]:
subs = s

In [None]:
subs

In [None]:
# Id is the index of subs at this point, so it is written as the first CSV column next to Target.
subs.to_csv('submission.csv')