In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import scipy.stats as stats

In [None]:
# Load the labelled training split and show its (rows, columns).
train_path = '../input/train.csv'
df_train = pd.read_csv(train_path)
df_train.shape

In [None]:
# Load the test split and show its (rows, columns).
test_path = '../input/test.csv'
df_test = pd.read_csv(test_path)
df_test.shape

In [None]:
# Stack train on top of test with a fresh 0..n-1 index so that cleaning and
# feature engineering are applied to both splits at once.
df = pd.concat([df_train, df_test], sort=False, ignore_index=True)
df.shape

Check how many household heads exist in the train and test datasets.

In [None]:
# Train split: compare the number of distinct households (`idhogar`) with the
# number of rows flagged as household head (`parentesco1 == 1`).
# NOTE(review): the difference below assumes at most one head row per household.
is_head = df_train['parentesco1'] == 1
unique_hh_heads = is_head.sum()
unique_hh = df_train['idhogar'].unique().size

print('There are {} unique households and the dataset contain {} records of household heads'.format(unique_hh, unique_hh_heads))
print('There is {} households without household head'.format(unique_hh - unique_hh_heads))

In [None]:
# Test split: same household vs. household-head comparison as for train.
# NOTE(review): the difference below assumes at most one head row per household.
is_head = df_test['parentesco1'] == 1
unique_hh_heads = is_head.sum()
unique_hh = df_test['idhogar'].unique().size

print('There are {} unique households and the dataset contain {} records of household heads'.format(unique_hh, unique_hh_heads))
print('There is {} households without household head'.format(unique_hh - unique_hh_heads))

In [None]:
# Compact summary of the combined frame (dtype counts and memory only).
df.info(verbose=False)

We can see that there are nine columns with dtype float, five with dtype object and 129 with dtype int.

In [None]:
# Names of the columns that pandas parsed as generic Python objects (strings).
column_dtypes = df.dtypes
column_dtypes[column_dtypes == object].index

`dependency`, `edjefe` and `edjefa` should be `numeric`, rather than `object`

In [None]:
# Names of the floating-point columns.
column_dtypes = df.dtypes
column_dtypes[column_dtypes == float].index

In [None]:
# Count missing values per column and keep only the columns that have any.
null_counts = df.isnull().sum()
null_counts[null_counts > 0]

In [None]:
# Relative frequency of each Target class (rows with a missing Target are ignored).
df['Target'].value_counts(normalize = True)

In [None]:
# Bar chart of the number of rows per Target class.
sns.countplot(x = 'Target', data = df)

In [None]:
# Household ids that have a head-of-household row, and all household ids.
head_rows = df['parentesco1'] == 1
hh_heads = set(df.loc[head_rows, 'idhogar'])
households = set(df['idhogar'].unique())

In [None]:
# Disabled experiment: the code is wrapped in a triple-quoted string, so it is
# only displayed, never executed. If enabled it would drop every row that
# belongs to a household without a head-of-household record.
'''
missing_hh =  households.difference(hh_heads)
rows_to_delete = df[df['idhogar'].isin(missing_hh)].index
df.drop(index= rows_to_delete, inplace = True) '''

### EDA

#### Monthly Rent

In [None]:
# How many rows have no monthly rent (`v2a1`) recorded.
rent_missing = df['v2a1'].isna()
rent_missing.sum()

Whether or not rent is applicable for a household depends on the ownership type of the house. Let's see how home ownership is distributed in the combined dataset.

In [None]:
# Collect the one-hot home-ownership columns and count the rows in each.
col = [name for name in df.columns if name.startswith('tipovivi')]
df[col].sum()

In [None]:
# Collapse the one-hot ownership flags into one label column: for each row,
# the name of the first flag column holding the row maximum.
df['temp_tipovivi'] = df.loc[:, col].idxmax(axis='columns')

In [None]:
## Ownership type of the rows whose rent amount is missing
rent_missing = df['v2a1'].isnull()
df.loc[rent_missing, 'temp_tipovivi'].value_counts()

The NAs only occur when the house ownership is own and fully paid house (`tipovivi1`), other, e.g. assigned or borrowed (`tipovivi5`), or precarious (`tipovivi4`). We can guess that the NAs are due to no rent being applicable for such households, so we can fill the NA values with 0.

In [None]:
# Missing rent means no rent is paid, so store it as 0.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the chained in-place form is deprecated and does not modify `df`
# when pandas Copy-on-Write is active.
df['v2a1'] = df['v2a1'].fillna(value=0)

We should also verify that zero rent only occurs in households whose ownership type is one of 'tipovivi1', 'tipovivi5' or 'tipovivi4'.

In [None]:
# Ownership type of the rows that now have zero rent.
zero_rent = df['v2a1'] == 0
df.loc[zero_rent, 'temp_tipovivi'].value_counts()

As the above output shows, there are 97 households that have zero rent but whose house ownership is recorded as 'tipovivi2' or 'tipovivi3'. Zero rent implies that the household does not pay rent, so we should change the home ownership type to be consistent.

In [None]:
# Rows whose ownership flag contradicts a zero rent amount.
no_rent = df['v2a1'].eq(0)
tipovivi2 = no_rent & df['tipovivi2'].eq(1)
tipovivi3 = no_rent & df['tipovivi3'].eq(1)

In [None]:
# Re-label the inconsistent rows as 'tipovivi1' ...
df.loc[tipovivi2 | tipovivi3, 'tipovivi1'] = 1

# ... and clear the flag each of them carried before.
for flag_name, rows in (('tipovivi2', tipovivi2), ('tipovivi3', tipovivi3)):
    df.loc[rows, flag_name] = 0

In [None]:
## Recompute the ownership label so it reflects the corrected flags
df['temp_tipovivi'] = df.loc[:, col].idxmax(axis='columns')

In [None]:
# Per-flag totals among the zero-rent rows, to confirm the correction.
zero_rent = df['v2a1'] == 0
df.loc[zero_rent, col].sum()

In [None]:
# Histogram of monthly rent with a fitted normal curve overlaid.
# NOTE(review): `distplot` is deprecated in recent seaborn releases.
sns.distplot(df['v2a1'],fit = stats.norm)

In [None]:
## Flag the rows where the household actually pays rent
pays_rent = df['v2a1'] > 0
df['RentPaying'] = pays_rent * 1

## Log-transform the rent to make its distribution closer to normal.
## This overwrites `v2a1`, so re-running the cell applies log1p a second time.
df['v2a1'] = np.log1p(df['v2a1'])
sns.distplot(df.loc[df['RentPaying'] == 1, 'v2a1'], fit=stats.norm)

In [None]:
# Row counts by Target class (rows) and ownership type (columns), with totals in the margins.
df.pivot_table(values = 'idhogar' , index = 'Target', columns = 'temp_tipovivi', aggfunc= 'count', margins= True)

In [None]:
# Share of each Target class within every ownership type, household heads only.
heads = df.loc[df['parentesco1'] == 1]
temp = heads.pivot_table(index='Target', columns='temp_tipovivi', values='idhogar', aggfunc='count')
cat = heads.loc[heads['Target'].notnull(), 'temp_tipovivi'].value_counts()

# Dividing by the Series aligns on column labels, so each column is scaled by its own total.
sns.heatmap(temp / cat, vmin=0, vmax=1, cmap='viridis', annot=True)

This suggests that households that own a fully paid house are more likely (86%) to be 'non-vulnerable households', and only 2% of families with 'own and fully paid' houses live in extreme poverty.

#### Tablet Ownership

In [None]:
# Value of the tablet-ownership flag on the rows where the tablet count is missing.
count_missing = df['v18q1'].isnull()
df.loc[count_missing, 'v18q'].value_counts()

We can see that `NA` occurs in `v18q1` only when the household does not own any tablets. We can safely fill these `NA`s with zeros.

In [None]:
# A missing tablet count means the household owns no tablet, so store 0.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the chained in-place form is deprecated and does not modify `df`
# when pandas Copy-on-Write is active.
df['v18q1'] = df['v18q1'].fillna(value=0)

In [None]:
# Fraction of household heads whose household owns a tablet, per Target class.
heads = df.loc[df['parentesco1'] == 1]
heads.groupby('Target')['v18q'].mean()

We can see that the likelihood of a household owning a tablet increases as their income level increases.

In [None]:
# Share of each Target class for every tablet count, labelled household heads only.
labelled_heads = df.loc[(df['parentesco1'] == 1) & df['Target'].notnull()]
temp = labelled_heads.pivot_table(values='idhogar', index='Target', columns='v18q1', aggfunc='count')
cat = labelled_heads['v18q1'].value_counts()

# Dividing by the Series aligns on column labels, so each column is scaled by its own total.
sns.heatmap(temp / cat, vmin=0, vmax=1, cmap='viridis', annot=True)

#### Schooling 

In [None]:
#### Years behind in school: frequency of every value, including missing (NaN)
df['rez_esc'].value_counts(dropna = False)

For a vast majority of instances the 'rez_esc' is null. We can guess that the years behind in school is related to the age of the individual, so let's check when this value is applicable.

In [None]:
# Age distribution of the rows that do have a years-behind-in-school value.
has_rez_esc = df['rez_esc'].notna()
df.loc[has_rez_esc, 'age'].value_counts().sort_index()

So the years-behind-in-school variable is only recorded for children aged 7 up to (but not including) 18, i.e. the typical school-going age range. For everyone else we can simply fill the NAs with 0.

In [None]:
# Years-behind-in-school is missing outside school age; store 0 there.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the chained in-place form is deprecated and does not modify `df`
# when pandas Copy-on-Write is active.
df['rez_esc'] = df['rez_esc'].fillna(value=0)

In [None]:
## Treat implausibly large years-behind values (> 50) as data errors and reset them to 0
too_large = df['rez_esc'] > 50
df['rez_esc'] = df['rez_esc'].mask(too_large, 0)

#### Mean Education

In [None]:
## Household ids that have at least one row with a missing `meaneduc`
meaneduc_missing = df['meaneduc'].isna()
na_mean_households = df.loc[meaneduc_missing, 'idhogar'].unique()

In [None]:
## Oldest member per household among rows with missing `meaneduc`
## (shows whether the household has anyone aged 18 or over)
rows_missing_meaneduc = df.loc[df['meaneduc'].isna()]
rows_missing_meaneduc.groupby('idhogar')['age'].max()

The `meaneduc` is calculated by taking the mean of individuals aged 18 or above. Out of the households that have missing `meaneduc` values all except two households have individuals aged 18 or above. For the households c31f9f3a0 and c49af2e64 we do not have any individuals over the age of 18. So we'll set the `meaneduc` value to zero for these households. For the rest we can recompute the applicable value.

In [None]:
## Mean years of schooling of the members aged 18+, keyed by household id
adults = df.loc[df['age'] >= 18]
adult_schooling = adults.groupby('idhogar')['escolari'].mean()
mapper = adult_schooling.to_dict()

In [None]:
# Fill missing `meaneduc` with the household's recomputed adult mean from
# `mapper`, falling back to 0 for households with no member aged 18 or over.
# The original looked up the literal key 'idhogar' instead of the row's
# household id, so every missing value became 0 and the means were never used.
household_mean = df['idhogar'].map(mapper)
df['meaneduc'] = df['meaneduc'].fillna(household_mean).fillna(0)

In [None]:
df['meaneduc'].isna().sum()

In [None]:
# Keep the squared feature in step with the repaired `meaneduc`.
df['SQBmeaned'] = df['meaneduc'].pow(2)

#### Overcrowding

In [None]:
# Target distribution restricted to rows flagged `hacdor == 1`.
overcrowded = df[df['hacdor'] == 1]
sns.countplot(x='Target', data=overcrowded)

In [None]:
# Mean of the `hacdor` flag per Target class, household heads only.
heads = df.loc[df['parentesco1'] == 1]
heads.groupby('Target')['hacdor'].mean()

In [None]:
## Since overcrowding occurs only ~3% of the time, these columns are possible candidates for deletion.
## df.drop(columns = ['hacdor', 'hacapo'])

#### Rooms

In [None]:
### IGNORE !!!
### Placeholder cell kept from an earlier draft.
### NOTE(review): it sits under "Rooms" but pivots on `v18q1`, and it does not filter on
### `parentesco1`, so it counts every labelled row rather than household heads only.
temp = df[df['Target'].notnull()].pivot_table(index = 'Target', columns = 'v18q1', values = 'idhogar', aggfunc='count')
cat = df['v18q1'][df['Target'].notnull()].value_counts()

##np.divide(temp, cat.values)
sns.heatmap(temp/(cat.T), vmin= 0, vmax= 1, cmap = 'viridis', annot= True)

#### Household Members

The data set has several features that give the number of individuals in the household:
* tamhog (size of the household)
* tamviv (number of persons living in the household)
* r4t3 (Total persons in the household)
* hhsize (household size)
* hogar_total (# of total individuals in the household)

In [None]:
# First rows where `r4t3` and `hhsize` disagree, shown next to the other size columns.
size_columns = ['tamhog', 'tamviv', 'r4t3', 'hhsize', 'hogar_total']
sizes_differ = df['r4t3'] != df['hhsize']
df.loc[sizes_differ, size_columns].head()

In [None]:
# Drop three of the household-size columns; `tamviv` and `hhsize` are kept.
# In-place drop: re-running this cell raises KeyError because the columns are already gone.
df.drop(columns= ['tamhog', 'hogar_total', 'r4t3'], inplace = True)

#### Bathroom

In [None]:
# Mean of the `v14a` (bathroom) flag across household heads.
is_head = df['parentesco1'] == 1
df.loc[is_head, 'v14a'].mean()

In [None]:
# Mean of the `v14a` (bathroom) flag per Target class, household heads only.
heads = df.loc[df['parentesco1'] == 1]
heads.groupby('Target')['v14a'].mean()

If we look at each income level, the likelihood of the household having a bathroom changes only slightly between the income levels.

#### Refrigerator 

In [None]:
# Mean of the `refrig` flag across household heads.
is_head = df['parentesco1'] == 1
df.loc[is_head, 'refrig'].mean()

In [None]:
# Mean of the `refrig` flag per Target class, household heads only.
heads = df.loc[df['parentesco1'] == 1]
heads.groupby('Target')['refrig'].mean()

#### Tablet Ownership

#### outside wall

In [None]:
# One-hot outside-wall columns and the number of rows in each.
col = [name for name in df.columns if name.startswith('pared')]
df[col].sum()

In [None]:
# Collapse the one-hot outside-wall flags into one label column.
df['temp_pared'] = df.loc[:, col].idxmax(axis='columns')

In [None]:
# Share of each Target class within every outside-wall type, household heads only.
heads = df.loc[df['parentesco1'] == 1]
temp = heads.pivot_table(index='Target', columns='temp_pared', values='idhogar', aggfunc='count')
cat = heads.loc[heads['Target'].notnull(), 'temp_pared'].value_counts()

# Dividing by the Series aligns on column labels, so each column is scaled by its own total.
sns.heatmap(temp / cat, vmin=0, vmax=1, cmap='viridis', annot=True)

#### Floor

In [None]:
# One-hot floor columns and the number of rows in each.
col = [name for name in df.columns if name.startswith('piso')]
df[col].sum()

In [None]:
# Collapse the one-hot floor flags into one label column.
df['temp_piso'] = df.loc[:, col].idxmax(axis='columns')

In [None]:
# Share of each Target class within every floor type, household heads only.
heads = df.loc[df['parentesco1'] == 1]
temp = heads.pivot_table(index='Target', columns='temp_piso', values='idhogar', aggfunc='count')
cat = heads.loc[heads['Target'].notnull(), 'temp_piso'].value_counts()

# Dividing by the Series aligns on column labels, so each column is scaled by its own total.
sns.heatmap(temp / cat, vmin=0, vmax=1, cmap='viridis', annot=True)

#### Roof

In [None]:
# One-hot roof columns and the number of rows in each.
col = [name for name in df.columns if name.startswith('techo')]
df[col].sum()

In [None]:
# Collapse the one-hot roof flags into one label column.
df['temp_techo'] = df.loc[:, col].idxmax(axis='columns')

In [None]:
# Share of each Target class within every roof type, household heads only.
heads = df.loc[df['parentesco1'] == 1]
temp = heads.pivot_table(index='Target', columns='temp_techo', values='idhogar', aggfunc='count')
cat = heads.loc[heads['Target'].notnull(), 'temp_techo'].value_counts()

# Dividing by the Series aligns on column labels, so each column is scaled by its own total.
sns.heatmap(temp / cat, vmin=0, vmax=1, cmap='viridis', annot=True)

#### ceiling

In [None]:
# Mean of the `cielorazo` (ceiling) flag per Target class, household heads only.
heads = df.loc[df['parentesco1'] == 1]
heads.groupby('Target')['cielorazo'].mean()

#### Plumbing

In [None]:
# One-hot plumbing (`abastagua*`) columns and the number of rows in each.
col = [name for name in df.columns if name.startswith('abastagua')]
df[col].sum()

In [None]:
# Collapse the one-hot plumbing flags into one label column.
df['temp_abastagua'] = df.loc[:, col].idxmax(axis='columns')

In [None]:
# Share of each Target class within every plumbing type, household heads only.
heads = df.loc[df['parentesco1'] == 1]
temp = heads.pivot_table(index='Target', columns='temp_abastagua', values='idhogar', aggfunc='count')
cat = heads.loc[heads['Target'].notnull(), 'temp_abastagua'].value_counts()

# Dividing by the Series aligns on column labels, so each column is scaled by its own total.
sns.heatmap(temp / cat, vmin=0, vmax=1, cmap='viridis', annot=True)

#### Electricity

In [None]:
# Collapse the electricity-source flags into one label column.
# NOTE(review): idxmax returns the first column ('public') for a row where no
# flag is set; confirm every row has exactly one source flagged.
electricity_flags = ['public', 'planpri', 'noelec', 'coopele']
df['temp_electricity'] = df[electricity_flags].idxmax(axis='columns')

In [None]:
# Share of each Target class within every electricity source, household heads only.
heads = df.loc[df['parentesco1'] == 1]
temp = heads.pivot_table(index='Target', columns='temp_electricity', values='idhogar', aggfunc='count')
cat = heads.loc[heads['Target'].notnull(), 'temp_electricity'].value_counts()

# Dividing by the Series aligns on column labels, so each column is scaled by its own total.
sns.heatmap(temp / cat, vmin=0, vmax=1, cmap='viridis', annot=True)

#### Toilet

In [None]:
# One-hot toilet (`sanit*`) columns and the number of rows in each.
col = [name for name in df.columns if name.startswith('sanit')]
df[col].sum()

In [None]:
# Collapse the one-hot toilet flags into one label column.
df['temp_sanitario'] = df.loc[:, col].idxmax(axis='columns')

In [None]:
# Share of each Target class within every toilet type, household heads only.
heads = df.loc[df['parentesco1'] == 1]
temp = heads.pivot_table(index='Target', columns='temp_sanitario', values='idhogar', aggfunc='count')
cat = heads.loc[heads['Target'].notnull(), 'temp_sanitario'].value_counts()

# Dividing by the Series aligns on column labels, so each column is scaled by its own total.
sns.heatmap(temp / cat, vmin=0, vmax=1, cmap='viridis', annot=True)

#### Cooking

In [None]:
# One-hot cooking-energy columns and the number of rows in each.
col = [name for name in df.columns if name.startswith('energcocinar')]
df[col].sum()

In [None]:
# Collapse the one-hot cooking-energy flags into one label column.
df['temp_energcocinar'] = df.loc[:, col].idxmax(axis='columns')

In [None]:
# Share of each Target class within every cooking-energy type, household heads only.
heads = df.loc[df['parentesco1'] == 1]
temp = heads.pivot_table(index='Target', columns='temp_energcocinar', values='idhogar', aggfunc='count')
cat = heads.loc[heads['Target'].notnull(), 'temp_energcocinar'].value_counts()

# Dividing by the Series aligns on column labels, so each column is scaled by its own total.
sns.heatmap(temp / cat, vmin=0, vmax=1, cmap='viridis', annot=True)

In [None]:
# 1 when the row is flagged `energcocinar1` or `energcocinar4`, else 0.
uses_type1 = df['energcocinar1'].eq(1)
uses_type4 = df['energcocinar4'].eq(1)
df['cooking_lowEng'] = (uses_type1 | uses_type4) * 1

#### Garbage Disposal

In [None]:
# One-hot garbage-disposal columns and the number of rows in each.
col = [name for name in df.columns if name.startswith('elimbasu')]
df[col].sum()

In [None]:
# Collapse the one-hot garbage-disposal flags into one label column.
df['temp_elimbasu'] = df.loc[:, col].idxmax(axis='columns')

In [None]:
# Share of each Target class within every garbage-disposal type, household heads only.
heads = df.loc[df['parentesco1'] == 1]
temp = heads.pivot_table(index='Target', columns='temp_elimbasu', values='idhogar', aggfunc='count')
cat = heads.loc[heads['Target'].notnull(), 'temp_elimbasu'].value_counts()

# Dividing by the Series aligns on column labels, so each column is scaled by its own total.
sns.heatmap(temp / cat, vmin=0, vmax=1, cmap='viridis', annot=True)

#### Wall Condition

In [None]:
# One-hot wall-condition columns and the number of rows in each.
col = [name for name in df.columns if name.startswith('epared')]
df[col].sum()

In [None]:
# Collapse the one-hot wall-condition flags into one label column.
df['temp_epared'] = df.loc[:, col].idxmax(axis='columns')

In [None]:
# Share of each Target class within every wall-condition level, household heads only.
heads = df.loc[df['parentesco1'] == 1]
temp = heads.pivot_table(index='Target', columns='temp_epared', values='idhogar', aggfunc='count')
cat = heads.loc[heads['Target'].notnull(), 'temp_epared'].value_counts()

# Dividing by the Series aligns on column labels, so each column is scaled by its own total.
sns.heatmap(temp / cat, vmin=0, vmax=1, cmap='viridis', annot=True)

#### Roof Quality

In [None]:
# One-hot roof-quality columns and the number of rows in each.
col = [name for name in df.columns if name.startswith('etecho')]
df[col].sum()

In [None]:
# Collapse the one-hot roof-quality flags into one label column.
df['temp_etecho'] = df.loc[:, col].idxmax(axis='columns')

In [None]:
# Share of each Target class within every roof-quality level, household heads only.
heads = df.loc[df['parentesco1'] == 1]
temp = heads.pivot_table(index='Target', columns='temp_etecho', values='idhogar', aggfunc='count')
cat = heads.loc[heads['Target'].notnull(), 'temp_etecho'].value_counts()

# Dividing by the Series aligns on column labels, so each column is scaled by its own total.
sns.heatmap(temp / cat, vmin=0, vmax=1, cmap='viridis', annot=True)

#### Floor Quality

In [None]:
# One-hot floor-quality columns and the number of rows in each.
col = [name for name in df.columns if name.startswith('eviv')]
df[col].sum()

In [None]:
# Collapse the one-hot floor-quality flags into one label column.
df['temp_eviv'] = df.loc[:, col].idxmax(axis='columns')

In [None]:
# Share of each Target class within every floor-quality level, household heads only.
heads = df.loc[df['parentesco1'] == 1]
temp = heads.pivot_table(index='Target', columns='temp_eviv', values='idhogar', aggfunc='count')
cat = heads.loc[heads['Target'].notnull(), 'temp_eviv'].value_counts()

# Dividing by the Series aligns on column labels, so each column is scaled by its own total.
sns.heatmap(temp / cat, vmin=0, vmax=1, cmap='viridis', annot=True)

#### Disability and Gender

In [None]:
# Mean of the `dis` flag per Target class, household heads only.
heads = df.loc[df['parentesco1'] == 1]
heads.groupby('Target')['dis'].mean()

In [None]:
# Mean of the `male` flag per Target class, household heads only.
heads = df.loc[df['parentesco1'] == 1]
heads.groupby('Target')['male'].mean()

In [None]:
# Drop `female`; `male` is kept (presumably its complement -- verify against the data dictionary).
# In-place drop: re-running this cell raises KeyError because the column is already gone.
df.drop(columns= 'female', inplace = True)

#### Civil Status

In [None]:
# One-hot civil-status columns and the number of rows in each.
col = [name for name in df.columns if name.startswith('estadocivil')]
df[col].sum()

In [None]:
# Collapse the one-hot civil-status flags into one label column.
df['temp_estadocivil'] = df.loc[:, col].idxmax(axis='columns')

In [None]:
# Share of each Target class within every civil status, household heads only.
heads = df.loc[df['parentesco1'] == 1]
temp = heads.pivot_table(index='Target', columns='temp_estadocivil', values='idhogar', aggfunc='count')
cat = heads.loc[heads['Target'].notnull(), 'temp_estadocivil'].value_counts()

# Dividing by the Series aligns on column labels, so each column is scaled by its own total.
sns.heatmap(temp / cat, vmin=0, vmax=1, cmap='viridis', annot=True)

#### Level of Education

In [None]:
# One-hot education-level columns and the number of rows in each.
col = [name for name in df.columns if name.startswith('instlevel')]
df[col].sum()

In [None]:
# Collapse the one-hot education-level flags into one label column.
df['temp_instlevel'] = df.loc[:, col].idxmax(axis='columns')

In [None]:
# Share of each Target class within every education level, household heads only.
heads = df.loc[df['parentesco1'] == 1]
temp = heads.pivot_table(index='Target', columns='temp_instlevel', values='idhogar', aggfunc='count')
cat = heads.loc[heads['Target'].notnull(), 'temp_instlevel'].value_counts()

# Dividing by the Series aligns on column labels, so each column is scaled by its own total.
sns.heatmap(temp / cat, vmin=0, vmax=1, cmap='viridis', annot=True)

#### Region

In [None]:
# One-hot region columns and the number of rows in each.
col = [name for name in df.columns if name.startswith('lugar')]
df[col].sum()

In [None]:
# Collapse the one-hot region flags into one label column.
df['temp_lugar'] = df.loc[:, col].idxmax(axis='columns')

In [None]:
# Share of each Target class within every region, household heads only.
heads = df.loc[df['parentesco1'] == 1]
temp = heads.pivot_table(index='Target', columns='temp_lugar', values='idhogar', aggfunc='count')
cat = heads.loc[heads['Target'].notnull(), 'temp_lugar'].value_counts()

# Dividing by the Series aligns on column labels, so each column is scaled by its own total.
sns.heatmap(temp / cat, vmin=0, vmax=1, cmap='viridis', annot=True)

In [None]:
# Mean of the `area1` flag per Target class, household heads only.
heads = df.loc[df['parentesco1'] == 1]
heads.groupby('Target')['area1'].mean()

In [None]:
# Drop `area2`; `area1` is kept (presumably its complement -- verify against the data dictionary).
# In-place drop: re-running this cell raises KeyError because the column is already gone.
df.drop(columns= 'area2', inplace = True)

#### Dependency

In [None]:
# Working-age members = adults minus `hogar_mayor`;
# dependents = `hogar_nin` plus `hogar_mayor`.
df['hogar_workingAge'] = df['hogar_adul'].sub(df['hogar_mayor'])
df['hogar_dependent'] = df['hogar_nin'].add(df['hogar_mayor'])

Encoding of the `dependency` feature:
* `yes` = 1, i.e. `hogar_workingAge == hogar_dependent`
* `no` = 0, i.e. `hogar_dependent == 0`
* `8` = infinity, i.e. `hogar_workingAge == 0`

In [None]:
## df[['hogar_nin', 'hogar_adul','hogar_mayor', 'hogar_workingAge', 'hogar_dependent','dependency']][df['dependency'] == 'no']

# Household composition of the rows whose raw `dependency` value is the string '8'.
composition_columns = ['hogar_nin', 'hogar_adul', 'hogar_mayor', 'hogar_workingAge', 'hogar_dependent', 'dependency']
df.loc[df['dependency'] == '8', composition_columns]

In [None]:
# Map the textual values to numbers, then make the whole column float.
dependency_codes = {'yes': 1, 'no': 0}
recoded_dependency = df['dependency'].replace(dependency_codes)
df['dependency'] = recoded_dependency.astype(float)

#### Years of Education

In [None]:
# Replace the textual 'no'/'yes' entries in both columns with 0/1.
yes_no_codes = {'no': 0, 'yes': 1}
for head_edu in ('edjefe', 'edjefa'):
    df[head_edu] = df[head_edu].replace(yes_no_codes)

In [None]:
# Both columns now hold only numbers, so cast them to integers.
for head_edu in ('edjefe', 'edjefa'):
    df[head_edu] = df[head_edu].astype(int)

In [None]:
# Median and maximum years of schooling within each household, broadcast to every member row.
schooling_by_household = df.groupby('idhogar')['escolari']
df['median_schooling'] = schooling_by_household.transform('median')
df['max_schooling'] = schooling_by_household.transform('max')

In [None]:
# Years of schooling on the household-head rows, 0 on every other row.
is_head = df['parentesco1'] == 1
df['eduForHeadofHH'] = df['escolari'].where(is_head, 0)

In [None]:
# Broadcast the per-household maximum of `eduForHeadofHH` to every member of that household.
df['eduForHeadofHH'] = df.groupby('idhogar')['eduForHeadofHH'].transform('max')

In [None]:
# Flags for members older than 19, split by which `instlevel` group is set:
# levels 1-4 -> SecondaryEduLess, levels 5-9 -> SecondaryEduMore.
over_19 = df['age'] > 19
lower_levels = ['instlevel1', 'instlevel2', 'instlevel3', 'instlevel4']
higher_levels = ['instlevel5', 'instlevel6', 'instlevel7', 'instlevel8', 'instlevel9']

df['SecondaryEduLess'] = (df[lower_levels].eq(1).any(axis=1) & over_19) * 1
df['SecondaryEduMore'] = (df[higher_levels].eq(1).any(axis=1) & over_19) * 1

In [None]:
# Number of flagged members per household, broadcast to every member row.
by_household = df.groupby('idhogar')
df['MembersWithSecEdu'] = by_household['SecondaryEduMore'].transform('sum')
df['MembersWithPrimEdu'] = by_household['SecondaryEduLess'].transform('sum')

In [None]:
# Difference between the two per-household member counts.
df['Educated_Gap'] = df['MembersWithSecEdu'].sub(df['MembersWithPrimEdu'])

In [None]:
# 1 on head rows flagged `estadocivil3` or `estadocivil4`, then broadcast to the whole household.
status_3_or_4 = df['estadocivil3'].eq(1) | df['estadocivil4'].eq(1)
df['marital_status'] = (status_3_or_4 & df['parentesco1'].eq(1)) * 1

df['marital_status'] = df.groupby('idhogar')['marital_status'].transform('max')

In [None]:
# 1 when the household head has `male == 0`, broadcast to the whole household.
head_not_male = df['male'].eq(0) & df['parentesco1'].eq(1)
df['FemaleHousehold'] = head_not_male * 1
df['FemaleHousehold'] = df.groupby('idhogar')['FemaleHousehold'].transform('max')

In [None]:
# Per-person versions of household-level quantities (divided by `tamviv`).
# `v2a1` has already been log-transformed above, so `rent_percap` is log-rent per person.
per_capita_sources = {
    'phones_percap': 'qmobilephone',
    'tablets_percap': 'v18q1',
    'rooms_percap': 'rooms',
    'rent_percap': 'v2a1',
}
for feature_name, source_name in per_capita_sources.items():
    df[feature_name] = df[source_name] / df['tamviv']

In [None]:
# `hogar_nin` and `hogar_mayor` as a share of `tamviv`.
df['minors_ratio'] = df['hogar_nin'].div(df['tamviv'])
df['elder_ratio'] = df['hogar_mayor'].div(df['tamviv'])

In [None]:
# `r4t1` as a share of `tamviv`.
df['child_ratio'] = df['r4t1'].div(df['tamviv'])
# Despite the name this is a difference (`r4h3` minus `r4m3`), not a ratio.
df['malefemale_ratio'] = df['r4h3'].sub(df['r4m3'])

In [None]:
# 0/1 flags that are set when the named count column is zero.
zero_count_flags = {
    'ismale_only': 'r4m3',
    'isfemale_only': 'r4h3',
    'no_adultmale': 'r4h2',
    'no_adultfemale': 'r4m2',
}
for flag_name, count_name in zero_count_flags.items():
    df[flag_name] = (df[count_name] == 0) * 1

In [None]:
# (Log-transformed) rent and bedroom count relative to the number of rooms.
df['rent_per_room'] = df['v2a1'].div(df['rooms'])
df['bedroom_per_room'] = df['bedrooms'].div(df['rooms'])

In [None]:
# NOTE(review): duplicate of the `rent_per_room` assignment in the previous cell; recomputes the same values.
df['rent_per_room'] = df['v2a1'] / df['rooms']

In [None]:
# Sum of the `dis` flag per household, broadcast to every member row.
df['total_disabled'] = df.groupby('idhogar')['dis'].transform('sum')

In [None]:
# Mean age per household, broadcast to every member row.
df['average_age'] = df.groupby('idhogar')['age'].transform('mean')

In [None]:
# `total_disabled` as a share of `tamviv`.
df['disable_ratio'] = df['total_disabled'].div(df['tamviv'])

In [None]:
# 1 when the row has any of: mobile phone, television, computer, tablet.
# Stored as 0/1 integers like the other engineered flags: a bool column is not
# matched by select_dtypes(include='number') below and would be silently left
# out of the model features.
info_sources = ['mobilephone', 'television', 'computer', 'v18q']
df['info_accessibility'] = df[info_sources].any(axis=1) * 1

In [None]:
df.select_dtypes(include = 'number').columns

In [None]:
# Persist the cleaned and feature-engineered frame (train and test rows together; the index is written too).
df.to_csv('./processed.csv')

### Model Creation

In [None]:
# Drop the `SQB*` columns (presumably squared copies of existing features, as `SQBmeaned` is above).
# In-place drop: re-running this cell raises KeyError because the columns are already gone.
df.drop(columns=['SQBescolari', 'SQBage', 'SQBhogar_total', 'SQBedjefe', 'SQBhogar_nin', 'SQBovercrowding', 'SQBdependency', 'SQBmeaned'], inplace = True)

In [None]:
# Keep only numeric columns, then split on whether Target is known:
# labelled rows become the training set, unlabelled rows the test set.
numeric_df = df.select_dtypes(include='number')
has_label = df['Target'].notnull()
training_df = numeric_df[has_label]
test_df = numeric_df[~has_label]

In [None]:
training_df.shape

In [None]:
# Every numeric column except the label is a feature.
features = [name for name in training_df.columns if name != 'Target']
X = training_df[features]
y = training_df['Target']

In [None]:
# Remove the all-NaN label column so the test frame has the same columns as X.
# `test_df` is a slice of `df`, so rebind to a new frame instead of dropping
# in place, which triggers SettingWithCopyWarning.
test_df = test_df.drop(columns='Target')

In [None]:
from sklearn.metrics import f1_score, make_scorer, confusion_matrix
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier

In [None]:
# Fit a single class-balanced decision tree on the full training set.
# NOTE(review): no random_state is set, and `tree` is not referenced again in the cells shown here.
tree = DecisionTreeClassifier(max_features= 75, class_weight='balanced')
tree.fit(X,y)

In [None]:
# Mean macro-F1 over 10-fold cross-validation for a class-balanced random forest.
model = RandomForestClassifier(n_estimators=100, random_state=10, max_features= 75, n_jobs = -1 ,class_weight= 'balanced')
cv_score = cross_val_score(model, X, y, cv = 10, scoring = 'f1_macro')
cv_score.mean()

In [None]:
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier


In [None]:
# Ten class-balanced random forests that differ only in their seed (217..226),
# as (name, estimator) pairs ready for a VotingClassifier.
ets = [
    (
        'rf{}'.format(seed_offset),
        RandomForestClassifier(
            random_state=217 + seed_offset,
            n_jobs=4,
            n_estimators=700,
            min_impurity_decrease=1e-3,
            min_samples_leaf=2,
            verbose=0,
            class_weight='balanced',
        ),
    )
    for seed_offset in range(10)
]

In [None]:
# Soft voting over the ten forests in `ets`.
vclf = VotingClassifier(ets, voting= 'soft')

In [None]:
### Score CV results
cv_score = cross_val_score(vclf, X, y, cv= 5, scoring = 'f1_macro')

In [None]:
cv_score.mean()

In [None]:
# Out-of-fold predictions for every training row (5-fold), used for the confusion matrix below.
cv_predict = cross_val_predict(vclf, X, y, cv = 5)

In [None]:
# Rows are true classes, columns are the out-of-fold predicted classes.
confusion_matrix(y, cv_predict)

In [None]:
# Macro-F1 of the out-of-fold predictions, matching the 'f1_macro' scoring
# used for cross-validation above. The original call passed no arguments and
# raised TypeError.
f1_score(y, cv_predict, average='macro')

In [None]:
# Same ensemble with hard (majority) voting; per-fold macro-F1 over 5 folds.
# Rebinds `vclf`, so the later fit/predict cell uses this hard-voting ensemble.
vclf = VotingClassifier(ets, voting= 'hard')
cv_score = cross_val_score(vclf, X, y, cv = 5, scoring = 'f1_macro')
cv_score

In [None]:
cv_score.mean()

In [None]:
# Train the hard-voting ensemble on all labelled rows and predict the test rows.
vclf.fit(X,y)
vclf_hardvoting = vclf.predict(test_df)

In [None]:
len(vclf_hardvoting)


### LGB 

In [None]:
import lightgbm as lgb

In [None]:
## Disabled LightGBM experiment. Every line of the call is commented out; the
## original commented only the first line, leaving the continuation lines as an
## IndentationError.
## clf = lgb.LGBMClassifier(max_depth=-1, learning_rate=0.1, objective='multiclass',
##                          random_state=None, silent=True, metric='None',
##                          n_jobs=4, n_estimators=500, class_weight='balanced',
##                          colsample_bytree =  0.89, min_child_samples = 90, num_leaves = 56, subsample = 0.96)

In [None]:
## cv_score = cross_val_score(clf, X, y, cv = 3, scoring = 'f1_macro')
## cv_score

In [None]:
### prediction = [model].predict(test_df)

In [None]:
# Pair the Id of every unlabelled (test) row with its predicted class.
test_ids = df.loc[df['Target'].isna(), 'Id']
predicted_target = vclf_hardvoting.astype(int)
submit = pd.DataFrame({'Id': test_ids, 'Target': predicted_target})

In [None]:
submit['Target'].value_counts(normalize = True)

In [None]:
# Write the submission file without the DataFrame index.
submit.to_csv('./submission.csv', index= False)

In [None]:
##training_df_hhO = df.select_dtypes(include = 'number')[(df['Target'].notnull())&(df['parentesco1'] == 1)]
##test_df_hhO = df.select_dtypes(include = 'number')[(df['Target'].isnull())&(df['parentesco1'] == 1)]

In [None]:
##features = [col for col in training_df.columns if col != 'Target']
##X_hhO, y_hhO = training_df_hhO[features], training_df_hhO['Target']

In [None]:
##cv_score = cross_val_score(vclf, X_hhO, y_hhO, cv = 5, scoring = 'f1_macro')

In [None]:
##cv_score

In [None]:
## np.array([0.4596173 , 0.42617731, 0.38660971, 0.35653499, 0.37714824]).mean()