### **Importando as bibliotecas e bases**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
caminhos = (
    os.path.join(raiz, nome)
    for raiz, _, nomes in os.walk('/kaggle/input')
    for nome in nomes
)
for caminho in caminhos:
    print(caminho)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### **Carregando os dados**

In [None]:
# Read the competition's train and test CSV files into separate frames.
# NOTE: the name `test` is rebound later, when the combined frame is split again.
caminho_treino = '/kaggle/input/costa-rican-household-poverty-prediction/train.csv'
caminho_teste = '/kaggle/input/costa-rican-household-poverty-prediction/test.csv'
df, test = pd.read_csv(caminho_treino), pd.read_csv(caminho_teste)

# Displayed as the cell output: (rows, columns) of each frame.
(df.shape, test.shape)

In [None]:
# Stack the train and test rows into a single frame so that the cleaning steps
# below are applied to both at once.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. As with append's default, the original row labels are
# kept, so labels from `df` and `test` repeat in the combined index.
df_all = pd.concat([df, test])

df_all.shape

In [None]:
# Print pandas' summary of the combined train+test frame.
df_all.info()

In [None]:
# Same summary with max_cols raised to 145 so that each column is listed
# individually (assumes the frame has at most 145 columns — TODO confirm).
df_all.info(max_cols=145)

In [None]:
# Preview the columns stored with the generic `object` dtype; these need to be
# encoded before they can be used as numeric features.
df_all.select_dtypes(include='object').head()

In [None]:
# Inspect the raw values of `dependency` before the yes/no encoding applied further down.
coluna_dependency = df_all['dependency']
coluna_dependency.value_counts()

In [None]:
# Inspect the raw values of `edjefa` before the yes/no encoding applied in the next cells.
coluna_edjefa = df_all['edjefa']
coluna_edjefa.value_counts()

In [None]:
# Inspect the raw values of `edjefe` before the yes/no encoding applied in the next cell.
coluna_edjefe = df_all['edjefe']
coluna_edjefe.value_counts()

In [None]:
# Translate the textual 'yes'/'no' markers into 1/0. The mapping is kept at
# module level because the `dependency` column reuses it in a later cell.
mapeamento = {'yes': 1, 'no': 0}

# Apply the same encoding to both columns, then cast every value to int.
for coluna in ('edjefa', 'edjefe'):
    df_all[coluna] = df_all[coluna].replace(mapeamento).astype(int)

In [None]:
# Re-check which columns are still `object`-typed after casting edjefa/edjefe to int.
df_all.select_dtypes(include='object').head()

In [None]:
# Encode 'yes'/'no' in `dependency` with the shared mapping, then cast to float
# (float rather than int, unlike edjefa/edjefe).
dependency_codificada = df_all['dependency'].replace(mapeamento)
df_all['dependency'] = dependency_codificada.astype(float)

In [None]:
# Re-check the remaining `object`-typed columns after casting `dependency` to float.
df_all.select_dtypes(include='object').head()

In [None]:
# Missing-value count per column, sorted ascending so the worst columns appear last.
df_all.isna().sum().sort_values()

In [None]:
# Count missing `v2a1` values among the rows where `parentesco1` equals 1.
# (presumably household heads and monthly rent, per the competition's data
# dictionary — verify)
linhas_parentesco1 = df_all['parentesco1'].eq(1)
df_all.loc[linhas_parentesco1, 'v2a1'].isna().sum()

In [None]:
# Distribution of `v18q`, checked before filling the gaps in `v18q1` in a later cell.
coluna_v18q = df_all['v18q']
coluna_v18q.value_counts()

In [None]:
# Replace missing `v2a1` values with the sentinel -1.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on `df_all['v2a1']`: that form is chained in-place
# assignment, which pandas 2.x warns about and which leaves `df_all` unchanged
# when Copy-on-Write is enabled.
df_all['v2a1'] = df_all['v2a1'].fillna(-1)

In [None]:
# Replace missing `v18q1` values with 0 (presumably "no tablets" for rows where
# v18q is 0 — verify against the data dictionary).
# Assigned back rather than using fillna(inplace=True) on the column selection,
# which is chained in-place assignment: it warns in pandas 2.x and has no
# effect on `df_all` under Copy-on-Write.
df_all['v18q1'] = df_all['v18q1'].fillna(0)

In [None]:
# Recount missing values per column after filling `v2a1` and `v18q1`.
df_all.isna().sum().sort_values()

In [None]:
# Replace the remaining missing values in these three columns with the sentinel -1.
# Each filled column is assigned back instead of calling fillna(inplace=True)
# on a column selection; the in-place form is chained assignment, which warns
# in pandas 2.x and does not modify `df_all` under Copy-on-Write.
for coluna in ('SQBmeaned', 'meaneduc', 'rez_esc'):
    df_all[coluna] = df_all[coluna].fillna(-1)


In [None]:
# Final missing-value check after filling SQBmeaned, meaneduc and rez_esc.
df_all.isna().sum().sort_values()


## **Preparação dos dados para o treinamento**

In [None]:
# Feature list: every column except the row/household identifiers and the label.
# Column order from `df_all` is preserved.
colunas_excluidas = ('Id', 'idhogar', 'Target')
feats = [nome for nome in df_all.columns if nome not in colunas_excluidas]

In [None]:
# Split the combined frame back into its two parts: rows with a Target value
# form the training set, rows without one form the test set.
# .copy() makes both results independent frames, so assigning to them later
# (e.g. writing predictions into `test`) does not raise SettingWithCopyWarning.
sem_target = df_all['Target'].isnull()
train, test = df_all[~sem_target].copy(), df_all[sem_target].copy()

train.shape, test.shape

In [None]:
# Histogram of the training labels, to visualise how unevenly the classes are represented.
sns.histplot(train, x="Target", bins=8)
plt.show()

In [None]:
# Absolute number of training rows per Target class.
alvo_treino = train['Target']
alvo_treino.value_counts()

In [None]:
# Share of training rows per Target class (fractions that sum to 1).
alvo_treino = train['Target']
alvo_treino.value_counts(normalize=True)

## **Feature Engineering e oversampling**

In [None]:
# Feature matrix: the selected feature columns of the training rows.
X = train[feats]
# Label kept as a one-column DataFrame (double brackets), since the resampled
# label is later accessed by column name.
y = train[['Target']]

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Balance the classes by random over-sampling; the fixed seed keeps the
# resampled set reproducible between runs.
ros = RandomOverSampler(random_state=42)
reamostrado = ros.fit_resample(X, y)
X_ros, y_ros = reamostrado

# Class counts after resampling, to confirm the result is balanced.
y_ros['Target'].value_counts()

## **Treinando com XGBoost**

In [None]:
from xgboost import XGBClassifier

# Hyper-parameters gathered in one place; random_state fixes the seed.
parametros_xgb = dict(n_estimators=250, learning_rate=0.09, random_state=42)
xgb = XGBClassifier(**parametros_xgb)

# NOTE(review): the Target labels appear to start at 1 rather than 0. Recent
# XGBoost releases expect class labels 0..n_classes-1 and raise ValueError
# otherwise — confirm against the installed version before re-running.
# Train on the over-sampled data.
xgb.fit(X_ros, y_ros)

## **Verificando as previsões**

In [None]:
# NOTE(review): `test` holds the rows whose Target is null, and no prediction
# has been assigned yet at this point, so this count is expected to be empty
# (value_counts skips NaN by default). Presumably meant to run after the
# prediction cell — confirm.
test['Target'].value_counts()

In [None]:
# NOTE(review): as with the absolute counts, Target is still all-null in `test`
# when this runs, so the normalised distribution is expected to be empty.
# Presumably meant to run after the prediction cell — confirm.
test['Target'].value_counts(normalize=True)

In [None]:
# Predict the class of every test row and cast the labels to int.
previsao = xgb.predict(test[feats]).astype(int)

# Store the predictions as the Target column. assign() builds a new frame, so
# this does not write into a filtered view of `df_all` (the original chained
# `a = b['col'] = ...` form could raise SettingWithCopyWarning).
test = test.assign(Target=previsao)

In [None]:
# Display the array of predicted labels for the test rows.
previsao

## **Arquivo para submissão** 

In [None]:
# Write the submission file with only the row identifier and the predicted
# class; the DataFrame index is omitted.
submissao = test[['Id', 'Target']]
submissao.to_csv('submission.csv', index=False)

## **Avaliar a importância de cada coluna**

In [None]:
# Horizontal bar chart of the fitted model's feature importances, sorted
# ascending so the most important features end up at the top of the chart.
# An explicit Axes is created and passed to pandas so the tall figure size is
# applied to this plot and the chart can be labelled.
fig, ax = plt.subplots(figsize=(25, 30))

importancias = pd.Series(xgb.feature_importances_, index=feats).sort_values()
importancias.plot.barh(ax=ax)
ax.set(
    title='Importância das features (XGBoost)',
    xlabel='Importância',
    ylabel='Feature',
)
plt.show()