# IESB - Miner II - Aula 05 - Random Forest
Por: Ana Souza Matricula: 1931133141

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Load the train and test CSV files from the competition input directory.
INPUT_DIR = '/kaggle/input/costa-rican-household-poverty-prediction'
train = pd.read_csv(f'{INPUT_DIR}/train.csv')
test = pd.read_csv(f'{INPUT_DIR}/test.csv')

# Show (rows, columns) for both frames.
train.shape, test.shape

In [None]:
# Stack train and test into one frame for the exploration cells below.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. The original row labels are kept (no ignore_index),
# exactly as append did.
df_all = pd.concat([train, test])
df_all.shape

In [None]:
# Distinct dtypes found across the columns of the combined frame
df_all.dtypes.unique()

Temos três tipos de dados na planilha

In [None]:
# Preview the columns of the combined frame whose dtype is object
df_all.select_dtypes('object').head()

5 colunas são do tipo object: 
> 
    Id: a unique identifier for each individual.
    idhogar: a unique identifier for each household. Group individuals by household.
    parentesco1: indicates if person is the head of the household or not. 
    Target: the label, which should be equal for all members in a household

> 
    dependency: Dependency rate, calculated = (number of members of the household younger than 19 or older than 64)/(number of member of household between 19 and 64)
    edjefe: years of education of male head of household, based on the interaction of escolari (years of education), head of household and gender, yes=1 and no=0
    edjefa: years of education of female head of household, based on the interaction of escolari (years of education), head of household and gender, yes=1 and no=0


Substituindo valores yes e no por 1 e 0

In [None]:
mapping = {"yes": 1, "no": 0}
yes_no_columns = ['dependency', 'edjefa', 'edjefe']

# Recode the yes/no strings and cast to float64, on train and test alike
for df in (train, test):
    for column in yes_no_columns:
        df[column] = df[column].replace(mapping).astype(np.float64)

train[['dependency', 'edjefa', 'edjefe']].describe()

In [None]:
# Preview the columns of the combined frame selected by the 'int' dtype
df_all.select_dtypes('int').head()

129 colunas são do tipo int

In [None]:
# Bar chart: how many int64 columns have each number of distinct values
unique_per_column = df_all.select_dtypes(np.int64).nunique()
unique_per_column.value_counts().sort_index().plot.bar(
    color = 'pink', figsize = (8, 6), edgecolor = 'k', linewidth = 2)
plt.xlabel('Number of Unique Values')
plt.ylabel('Count')
plt.title('Count of Unique Values in Integer Columns');

É possível notar que grande parte das colunas int são binarias

In [None]:
# Preview the columns of the combined frame selected by the 'float' dtype
df_all.select_dtypes('float').head()

In [None]:
from collections import OrderedDict

plt.figure(figsize = (20, 16))
plt.style.use('fivethirtyeight')

# Line colour and display name for each value of Target
colors = OrderedDict({1: 'red', 2: 'orange', 3: 'blue', 4: 'green'})
poverty_mapping = OrderedDict({1: 'extreme', 2: 'moderate', 3: 'vulnerable', 4: 'non vulnerable'})

# One panel per float column, two panels per row. The number of rows is
# derived from the number of float columns instead of being fixed at 4:
# plt.subplot raises once the panel index exceeds rows * cols, and the
# yes/no recoding earlier in the notebook casts three more columns of train
# to float64.
float_columns = list(train.select_dtypes('float'))
n_rows = max(1, (len(float_columns) + 1) // 2)

for i, col in enumerate(float_columns):
    ax = plt.subplot(n_rows, 2, i + 1)
    # One density curve per poverty level, drawn on the same axes
    for poverty_level, color in colors.items():
        sns.kdeplot(train.loc[train['Target'] == poverty_level, col].dropna(),
                    ax = ax, color = color, label = poverty_mapping[poverty_level])

    plt.title(f'{col.capitalize()} Distribution'); plt.xlabel(f'{col}'); plt.ylabel('Density')

plt.subplots_adjust(top = 2)

meaneduc distribution (media de escolaridade) parece estar mais relacionada ao nivel de pobreza

Precisamos adicionar a coluna de valores nulos para nossa target no dataset de teste

In [None]:
# Add a Target column filled with NaN to test
test['Target'] = np.nan
# Stack train and test with a fresh 0..n-1 row index. pd.concat replaces
# DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
data = pd.concat([train, test], ignore_index = True)

Distribuição dos dados

In [None]:
from collections import OrderedDict

# Bar colour and display name for each poverty level
colors = OrderedDict([(1, 'red'), (2, 'orange'), (3, 'blue'), (4, 'green')])
poverty_mapping = OrderedDict([(1, 'extreme'), (2, 'moderate'),
                               (3, 'vulnerable'), (4, 'non vulnerable')])

# Rows where parentesco1 == 1 (heads of household)
is_head = data['parentesco1'] == 1
heads = data.loc[is_head].copy()

# Target and idhogar for the heads that have a non-null Target
has_label = data['Target'].notnull()
train_labels = data.loc[has_label & is_head, ['Target', 'idhogar']]

# Number of labelled heads per Target value, ordered by Target
label_counts = train_labels['Target'].value_counts().sort_index()

# One bar per Target value
label_counts.plot.bar(figsize = (8, 6),
                      color = colors.values(),
                      edgecolor = 'k', linewidth = 2)

# Axis labels; bars sit at positions 0..3, so each tick is the level minus 1
plt.xlabel('Poverty Level')
plt.ylabel('Count')
tick_positions = [level - 1 for level in poverty_mapping.keys()]
plt.xticks(tick_positions, list(poverty_mapping.values()), rotation = 60)
plt.title('Poverty Level Breakdown');

label_counts

A maior parte dos nossos dados apresenta o valor non vulnerable

In [None]:
# Frequency of each distinct value in the dependency column
df_all['dependency'].value_counts()

In [None]:
# Frequency of each distinct value in the edjefa column
df_all['edjefa'].value_counts()

In [None]:
# Frequency of each distinct value in the edjefe column
df_all['edjefe'].value_counts()

Nas colunas edjefe, edjefa e dependency, temos valores yes e no que gostariamos de substituir por valores int

In [None]:
# Recode 'yes' as 1 and 'no' as 0 in edjefa, edjefe and dependency,
# then cast each of those columns to float
mapeamento = {'yes': 1, 'no': 0}

for coluna in ('edjefa', 'edjefe', 'dependency'):
    df_all[coluna] = df_all[coluna].replace(mapeamento).astype(float)

In [None]:
# Summary statistics of the three recoded columns
df_all[['dependency', 'edjefa', 'edjefe']].describe()

Para garantir que nossos dados de treino e teste estarao ok, vou separa-los somente apos o tratamento do df_all

Existem alguns problemas de labels errados que podemos encontrar em datasets reais. Neste caso, devemos utilizar o head of household as true label. Dessa forma, pessoas com o mesmo head devem ter o mesmo label. 

In [None]:
# Split the combined frame again: rows with a Target go to train,
# rows whose Target is null go to test
target_missing = df_all['Target'].isnull()
train = df_all[~target_missing]
test = df_all[target_missing]

train.shape, test.shape

In [None]:
# For each household (idhogar), check whether its rows share a single Target value
all_equal = train.groupby('idhogar')['Target'].apply(lambda targets: targets.nunique() == 1)

# Keep only the households whose Target values are not all equal
not_equal = all_equal[all_equal.ne(True)]
message = 'There are {} households where the family members do not all have the same target.'
print(message.format(len(not_equal)))

Para corrigir o erro em que pessoas de uma mesma casa têm diferentes valores de target, podemos reatribuir os valores de acordo com o valor da linha onde parentesco1 == 1.

Outro erro que pode ocorrer é o caso de households sem uma head, o que nesse caso, dificultaria a chance de ter uma target. O caderno "A complete introduction and walkthrough" apresenta uma forma de resolver o problema.

In [None]:
# Sum of parentesco1 per household; a sum of 0 means no row is flagged as head
households_leader = train.groupby('idhogar')['parentesco1'].sum()

# Rows of train belonging to a household whose sum is 0
headless_ids = households_leader[households_leader == 0].index
households_no_head = train.loc[train['idhogar'].isin(headless_ids), :]

n_headless = households_no_head['idhogar'].nunique()
print('There are {} households without a head.'.format(n_headless))

No caso do código acima, a estratégia utilizada envolve somar os valores de parentesco1, sabendo que se a soma for 0, significa que não houve designação de head. Caso os labels fossem diferentes para casa sem head, não saberíamos como reassing a nova target.

Outra necessidade, ainda dentro da limpeza de dados, é o tratamento de valores nulos. Primeiramente é necessário verificar a quantidade de valores nulos, o que eu gosto de fazer pela porcentagem. Depois, é necessário escolher um método de acordo com o problema que estamos resolvendo - e decidir se iremos substituir os valores missing ou retirá-los do dataset.

In [None]:
# Per-column count of null values in data, and that count as a fraction of the rows
missing = pd.DataFrame(data.isnull().sum()).rename(columns = {0: 'total'})
missing['percent'] = missing['total'] / len(data)
# Show the ten columns with the highest missing fraction, leaving Target out.
# errors='ignore' keeps the cell from raising KeyError when Target is not
# among those ten rows.
missing.sort_values('percent', ascending = False).head(10).drop('Target', errors = 'ignore')

De acordo com o resultado, as principais colunas que devemos tratar sao as rez_esc, v18q1, v2a1, SQBmeaned e meaneduc. Como as duas ultimas tem uma porcentagem baixa de nulos, as que realmente precisam de tratamento seriam as tres primeiras.

> v18q1: Numero de tablets por familia. 
v2a1: Pagamento de aluguel mensal.
rez_esc: anos atrasados na escolaridade.

In [None]:
def plot_value_counts(df, col, heads_only = False):
    """Draw a bar chart of how often each value of `col` occurs in `df`.

    When `heads_only` is true, only the rows with parentesco1 == 1 are counted.
    """
    subset = df.loc[df['parentesco1'] == 1].copy() if heads_only else df

    plt.figure(figsize = (8, 6))
    # Frequencies ordered by the value itself rather than by count
    frequencies = subset[col].value_counts().sort_index()
    frequencies.plot.bar(color = 'blue', edgecolor = 'k', linewidth = 2)
    plt.xlabel(f'{col}')
    plt.title(f'{col} Value Counts')
    plt.ylabel('Count')
    plt.show()

In [None]:
# Value counts of v18q1 among the heads of household
plot_value_counts(heads, col = 'v18q1')

A grande maioria das casas tem 1 tablet. Entretanto, talvez o esperado fosse que a maioria ou uma parte proxima disto, teria nenhum tablet. O que não é apresentado aqui. Isto pode indicar então, que a quantidade de valores missing são, na verdade, valores 0. Assim, irei substituir valores NaN por zeros.

In [None]:
# Replace missing v18q1 values with 0
data['v18q1'] = data['v18q1'].fillna(0)

Para tratar da próxima coluna, referente ao aluguel, precisamos selecionar casas que sejam alugadas. A coluna para isso é a tipovivi_. Também podemos suspeitar, de início, que valores NaN indiquem casas que não precisam de aluguel.

In [None]:
# Home-ownership indicator columns: every column whose name starts with 'tipo'
own_variables = [column for column in data if column.startswith('tipo')]

# For the rows with no v2a1 value, total each ownership column and plot the totals
missing_rent_rows = data['v2a1'].isnull()
ownership_totals = data.loc[missing_rent_rows, own_variables].sum()
ownership_totals.plot.bar(figsize = (10, 8), color = 'green',
                          edgecolor = 'k', linewidth = 2);
plt.xticks([0, 1, 2, 3, 4],
           ['Owns and Paid Off', 'Owns and Paying', 'Rented', 'Precarious', 'Other'],
           rotation = 60)
plt.title('Home Ownership Status for Households Missing Rent Payments', size = 18);

> tipovivi1, =1 own and fully paid house
tipovivi2, "=1 own,  paying in installments"
tipovivi3, =1 rented
tipovivi4, =1 precarious
tipovivi5, "=1 other(assigned,  borrowed)"

Para casas que sao owned, podemos substituir o valor missing. Ja para casas que nao sao owned e tem valores missing, podemos adicionar uma indicacao de que ela nao tem valor.

In [None]:
# Rows flagged tipovivi1 == 1 get a v2a1 of 0
owns_outright = data['tipovivi1'] == 1
data.loc[owns_outright, 'v2a1'] = 0

# Boolean flag marking the rows whose v2a1 is still null
data['v2a1-missing'] = data['v2a1'].isnull()

data['v2a1-missing'].value_counts()

Valores nulos em rez_esc podem indicar casas sem crianças.

In [None]:
# Summary statistics of age for the rows that have a rez_esc value
data.loc[data['rez_esc'].notnull(), 'age'].describe()

A idade mais velha que temos de nao missings é de 17, ou seja, até 17 anos temos nao missing e depois começamos a ter.

In [None]:
# For anyone older than 19 or younger than 7, a missing rez_esc becomes 0
outside_range = (data['age'] > 19) | (data['age'] < 7)
data.loc[outside_range & data['rez_esc'].isnull(), 'rez_esc'] = 0

# Boolean flag marking the rows whose rez_esc is still null
data['rez_esc-missing'] = data['rez_esc'].isnull()

In [None]:
# Cap rez_esc at 5: every value above 5 is set to 5
above_cap = data['rez_esc'] > 5
data.loc[above_cap, 'rez_esc'] = 5

Podemos querer remover variáveis que são muito redundantes para nosso modelo

In [None]:
# Pairwise correlations between the numeric and boolean columns of heads.
# Those dtypes are selected explicitly because DataFrame.corr no longer drops
# non-numeric columns silently on pandas >= 2.0 and would raise on the
# string identifier columns.
corr_matrix = heads.select_dtypes(include = ['number', 'bool']).corr()

# Keep only the strict upper triangle so each pair of columns appears once.
# The builtin bool replaces the np.bool alias, which was removed in NumPy 1.24.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Columns whose absolute correlation with some earlier column exceeds 0.95
to_drop = [column for column in upper.columns if any(abs(upper[column]) > 0.95)]

to_drop


In [None]:
# Sub-matrix of the columns whose absolute correlation with tamhog exceeds 0.9
corr_matrix.loc[corr_matrix['tamhog'].abs() > 0.9, corr_matrix['tamhog'].abs() > 0.9]

In [None]:
# Remove three columns from heads
redundant_columns = ['tamhog', 'hogar_total', 'r4t3']
heads = heads.drop(columns = redundant_columns)

Para feature, podemos observar a diferença do tamanho da casa e da quantidade de pessoas dentro do household 

In [None]:
def plot_categoricals(x, y, data, annotate = True):
    """Plot counts of two categoricals.

    Draws one marker per observed (x, y) combination. Marker size reflects the
    raw number of rows in that combination; the optional text label is the
    share of that combination within its y group.

    Parameters
    ----------
    x, y : str
        Names of the columns of `data` shown on the horizontal and vertical axis.
    data : pandas.DataFrame
        Frame containing both columns.
    annotate : bool, default True
        Whether to write the percentage next to each marker.
    """
    grouped = data.groupby(y)[x]

    # Raw counts. The Series is named explicitly instead of renaming a column
    # called `x`: since pandas 2.0 value_counts names its result 'count' or
    # 'proportion', so the old rename silently did nothing.
    raw_counts = grouped.value_counts(normalize = False).rename('raw_count').to_frame()

    # Share of each x value within its y group, as a flat frame with columns y, x
    counts = grouped.value_counts(normalize = True).rename('normalized_count').reset_index()
    counts['percent'] = 100 * counts['normalized_count']

    # Add the raw count (both value_counts calls order their rows identically)
    counts['raw_count'] = list(raw_counts['raw_count'])

    plt.figure(figsize = (14, 10))
    # Scatter plot sized by the square root of the raw count
    plt.scatter(counts[x], counts[y], edgecolor = 'k', color = 'lightgreen',
                s = 100 * np.sqrt(counts['raw_count']), marker = 'o',
                alpha = 0.6, linewidth = 1.5)

    if annotate:
        # Annotate the plot with text
        for _, row in counts.iterrows():
            # The label is passed positionally: matplotlib renamed the `s`
            # keyword to `text` and later dropped `s`, while the positional
            # form works on every version.
            plt.annotate(f"{round(row['percent'], 1)}%",
                         xy = (row[x] - (1 / counts[x].nunique()),
                               row[y] - (0.15 / counts[y].nunique())),
                         color = 'navy')

    # Set tick marks
    plt.yticks(counts[y].unique())
    plt.xticks(counts[x].unique())

    # Transform min and max to evenly space in square root domain
    sqr_min = int(np.sqrt(raw_counts['raw_count'].min()))
    sqr_max = int(np.sqrt(raw_counts['raw_count'].max()))

    # About 5 sizes for the legend. The step is clamped to at least 1 because
    # range() rejects a step of 0, which happened whenever the span was below 5.
    step = max(1, int((sqr_max - sqr_min) / 5))
    # With sqr_min == sqr_max the range is empty; show a single entry instead
    msizes = list(range(sqr_min, sqr_max, step)) or [sqr_max]
    markers = []

    # Markers for legend
    for size in msizes:
        markers.append(plt.scatter([], [], s = 100 * size,
                                   label = f'{int(round(np.square(size) / 100) * 100)}',
                                   color = 'lightgreen',
                                   alpha = 0.6, edgecolor = 'k', linewidth = 1.5))

    # Legend and formatting
    plt.legend(handles = markers, title = 'Counts',
               labelspacing = 3, handletextpad = 2,
               fontsize = 16,
               loc = (1.10, 0.19))

    plt.annotate('* Size represents raw count while % is for a given y value.',
                 xy = (0, 1), xycoords = 'figure points', size = 10)

    # Adjust axes limits
    plt.xlim((counts[x].min() - (6 / counts[x].nunique()),
              counts[x].max() + (6 / counts[x].nunique())))
    plt.ylim((counts[y].min() - (4 / counts[y].nunique()),
              counts[y].max() + (4 / counts[y].nunique())))
    plt.grid(None)
    plt.xlabel(f"{x}"); plt.ylabel(f"{y}"); plt.title(f"{y} vs {x}");


In [None]:
# New feature: tamviv minus hhsize for each head of household
heads['hhsize-diff'] = heads['tamviv'] - heads['hhsize']

In [None]:
# Sub-matrix of the columns whose absolute correlation with coopele exceeds 0.9
corr_matrix.loc[corr_matrix['coopele'].abs() > 0.9, corr_matrix['coopele'].abs() > 0.9]

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22;
# sklearn.impute.SimpleImputer is its replacement. It is bound to the old name
# so the Imputer(strategy = 'median') call further down keeps working, with a
# fallback to the old class for installs that predate sklearn.impute.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer

# Custom scorer for cross validation: macro-averaged F1, higher is better
scorer = make_scorer(f1_score, greater_is_better=True, average = 'macro')

In [None]:
# Rows of train that have a Target, and rows of test that do not
labelled_rows = train[train['Target'].notnull()]
unlabelled_rows = test[test['Target'].isnull()]

# Labels for training, as a uint8 array
train_labels = np.array(list(labelled_rows['Target'].astype(np.uint8)))

# Feature matrices: everything except the identifiers and the label
non_feature_columns = ['Id', 'idhogar', 'Target']
train_set = labelled_rows.drop(columns = non_feature_columns)
test_set = unlabelled_rows.drop(columns = non_feature_columns)

# Identifier columns of test, kept aside as the base for a submission file
submission_base = test[['Id', 'idhogar']].copy()

In [None]:
# Remember the feature names before the frames are turned into arrays
features = list(train_set.columns)

# Median imputation followed by min-max scaling
preprocessing_steps = [('imputer', Imputer(strategy = 'median')),
                       ('scaler', MinMaxScaler())]
pipeline = Pipeline(preprocessing_steps)

# Fit on the training features, then apply the fitted steps to the test features
train_set = pipeline.fit_transform(train_set)
test_set = pipeline.transform(test_set)

In [None]:
model = RandomForestClassifier(n_estimators = 100, random_state = 10, n_jobs = -1)

# Score the model with the custom scorer over 10 cross-validation folds
cv_score = cross_val_score(model, train_set, train_labels, cv = 10, scoring = scorer)

mean_f1 = round(cv_score.mean(), 4)
std_f1 = round(cv_score.std(), 4)
print(f'10 Fold Cross Validation F1 Score = {mean_f1} with std = {std_f1}')