# Titanic - Machine Learning from Disaster

https://www.kaggle.com/c/titanic/overview

# Importando bibliotecas

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

warnings.filterwarnings('ignore')
sns.set()

In [2]:
print(f'Pandas: {pd.__version__}')
print(f'Numpy: {np.__version__}')
print(f'Seaborn: {sns.__version__}')

Pandas: 1.0.1
Numpy: 1.18.1
Seaborn: 0.10.0


# Importando dados

In [3]:
dir_path = '../data/input/'
train_file = 'train.csv'
test_file = 'test.csv'

In [4]:
df_train = pd.read_csv(dir_path + train_file, index_col = 'PassengerId')
df_test = pd.read_csv(dir_path + test_file, index_col = 'PassengerId')

In [5]:
df_train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
df_train.describe(include='all')

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,891,2,,,,681.0,,147,3
top,,,"Kimball, Mr. Edwin Nelson Jr",male,,,,1601.0,,G6,S
freq,,,1,577,,,,7.0,,4,644
mean,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


In [7]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


# Funções

In [8]:
def separar_nome(df):
    df['Title'] = df.Name.str.extract(r' ([A-Za-z]+)\. ', expand = False)
    df['FamilyName'] = df.Name.str.extract(r'([A-Za-z]+),', expand = False)
    return df

def preencher_idade(df):
    df['Age'] = df['Age'].groupby([df['Sex'], df['Title']]).apply(lambda x: x.fillna( np.round(x.mean()) ))
    df['Age'][df.Age.isna() == True] = df['Age'].value_counts().index[0]
    df['Age'] = df['Age'].round(0)
    return df

def preencher_embarque(df):
    df['Embarked'][df.Embarked.isna() == True] = df['Embarked'].value_counts().index[0]
    return df

def preencher_tarifa(df):
    df['Fare'][df.Fare.isna() == True] = df['Fare'].value_counts().index[0]
    return df

def preencher_tam_familia(df):
    df['FamilySize'] = df['Parch'] + df['SibSp'] + 1
    return df

def remover_colunas(df):
    return df.drop(['Ticket','Cabin', 'SibSp', 'Parch', 'Name', 'FamilyName', 'Fare'], axis=1)

def preencher_intervalo_idade(df):
    bins = [0, 12, 18, 45, 60, 80]
    df['AgeBins'] = pd.cut(x = df.Age, bins=bins)
    return df

## Separando o nome

In [9]:
df_train.iloc[0:4].Name

PassengerId
1                              Braund, Mr. Owen Harris
2    Cumings, Mrs. John Bradley (Florence Briggs Th...
3                               Heikkinen, Miss. Laina
4         Futrelle, Mrs. Jacques Heath (Lily May Peel)
Name: Name, dtype: object

In [10]:
# Title
df_train.Name.str.extract(r' ([A-Za-z]+)\. ', expand=False)

PassengerId
1        Mr
2       Mrs
3      Miss
4       Mrs
5        Mr
       ... 
887     Rev
888    Miss
889    Miss
890      Mr
891      Mr
Name: Name, Length: 891, dtype: object

In [11]:
# Family Name
df_train.Name.str.extract(r'([A-Za-z]+),', expand=False)

PassengerId
1         Braund
2        Cumings
3      Heikkinen
4       Futrelle
5          Allen
         ...    
887     Montvila
888       Graham
889     Johnston
890         Behr
891       Dooley
Name: Name, Length: 891, dtype: object

In [12]:
df_train = separar_nome(df_train)
df_train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,FamilyName
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr,Braund
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,Cumings
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,Heikkinen
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs,Futrelle
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr,Allen


In [13]:
pd.DataFrame(df_train.groupby(['Title'])['Survived'].count())

Unnamed: 0_level_0,Survived
Title,Unnamed: 1_level_1
Capt,1
Col,2
Countess,1
Don,1
Dr,7
Jonkheer,1
Lady,1
Major,2
Master,40
Miss,182


In [14]:
df_test = separar_nome(df_test)
df_test.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,FamilyName
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,Mr,Kelly
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,Mrs,Wilkes
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,Mr,Myles
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,Mr,Wirz
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,Mrs,Hirvonen


## Preenchendo as idades nulas

In [15]:
df_train.Age.isna().sum()

177

In [16]:
# Qual agrupamento não possui idade?
df_train[df_train.Age.isna() == True].groupby(by=['Sex','Title'])['Survived'].count().reset_index()

Unnamed: 0,Sex,Title,Survived
0,female,Miss,36
1,female,Mrs,17
2,male,Dr,1
3,male,Master,4
4,male,Mr,119


In [17]:
df_train = preencher_idade(df_train)
df_train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,FamilyName
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr,Braund
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,Cumings
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,Heikkinen
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs,Futrelle
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr,Allen


In [18]:
df_test = preencher_idade(df_test)
df_test.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,FamilyName
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
892,3,"Kelly, Mr. James",male,34.0,0,0,330911,7.8292,,Q,Mr,Kelly
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,Mrs,Wilkes
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,Mr,Myles
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,Mr,Wirz
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,Mrs,Hirvonen


In [19]:
df_train.Age.isna().sum()

0

In [20]:
df_test.Age.isna().sum()

0

## Preenchendo os Embarques nulos

In [21]:
df_train.Embarked.isna().sum()

2

In [22]:
df_train = preencher_embarque(df_train)
df_train.Embarked.isna().sum()

0

In [23]:
df_test = preencher_embarque(df_test)
df_test.Embarked.isna().sum()

0

## Preenchendo as Tarifas nulas

In [24]:
df_train.Fare.isna().sum()

0

In [25]:
df_test.Fare.isna().sum()

1

In [26]:
df_test = preencher_tarifa(df_test)

In [27]:
df_test.Fare.isna().sum()

0

## Adicionando o tamanho da família

In [28]:
df_train = preencher_tam_familia(df_train)

In [29]:
df_train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,FamilyName,FamilySize
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr,Braund,2
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,Cumings,2
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,Heikkinen,1
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs,Futrelle,2
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr,Allen,1


In [30]:
df_test = preencher_tam_familia(df_test)

In [31]:
df_test.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,FamilyName,FamilySize
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
892,3,"Kelly, Mr. James",male,34.0,0,0,330911,7.8292,,Q,Mr,Kelly,1
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,Mrs,Wilkes,2
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,Mr,Myles,1
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,Mr,Wirz,1
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,Mrs,Hirvonen,3


## Removendo algumas colunas não necessárias

In [32]:
df_train = remover_colunas(df_train)

In [33]:
df_train.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,Embarked,Title,FamilySize
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0,3,male,22.0,S,Mr,2
2,1,1,female,38.0,C,Mrs,2
3,1,3,female,26.0,S,Miss,1
4,1,1,female,35.0,S,Mrs,2
5,0,3,male,35.0,S,Mr,1


In [34]:
df_test = remover_colunas(df_test)

In [35]:
df_test.head()

Unnamed: 0_level_0,Pclass,Sex,Age,Embarked,Title,FamilySize
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
892,3,male,34.0,Q,Mr,1
893,3,female,47.0,S,Mrs,2
894,2,male,62.0,Q,Mr,1
895,3,male,27.0,S,Mr,1
896,3,female,22.0,S,Mrs,3


## Criando  bins para a idade

In [36]:
df_train = preencher_intervalo_idade(df_train)

In [37]:
df_train.AgeBins.value_counts()

(18, 45]    645
(45, 60]     81
(0, 12]      72
(12, 18]     70
(60, 80]     22
Name: AgeBins, dtype: int64

In [38]:
df_test = preencher_intervalo_idade(df_test)

In [39]:
df_test.AgeBins.value_counts()

(18, 45]    305
(45, 60]     42
(12, 18]     32
(0, 12]      27
(60, 80]     10
Name: AgeBins, dtype: int64

# Encoding Dados

In [40]:
df_all = pd.concat([df_train.drop('Survived', axis=1), df_test])

In [41]:
df_all.head()

Unnamed: 0_level_0,Pclass,Sex,Age,Embarked,Title,FamilySize,AgeBins
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,3,male,22.0,S,Mr,2,"(18, 45]"
2,1,female,38.0,C,Mrs,2,"(18, 45]"
3,3,female,26.0,S,Miss,1,"(18, 45]"
4,1,female,35.0,S,Mrs,2,"(18, 45]"
5,3,male,35.0,S,Mr,1,"(18, 45]"


In [42]:
encoder = LabelEncoder()
encoder.fit(df_all.Sex)
df_all.Sex = encoder.transform(df_all.Sex)

In [43]:
encoder = LabelEncoder()
encoder.fit(df_all.Embarked)
df_all.Embarked = encoder.transform(df_all.Embarked)

In [44]:
encoder = LabelEncoder()
encoder.fit(df_all.Title)
df_all.Title = encoder.transform(df_all.Title)

In [45]:
df_all.head()

Unnamed: 0_level_0,Pclass,Sex,Age,Embarked,Title,FamilySize,AgeBins
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,3,1,22.0,2,13,2,"(18, 45]"
2,1,0,38.0,0,14,2,"(18, 45]"
3,3,0,26.0,2,10,1,"(18, 45]"
4,1,0,35.0,2,14,2,"(18, 45]"
5,3,1,35.0,2,13,1,"(18, 45]"


In [46]:
df_all = pd.concat((df_all,
                    pd.get_dummies(df_all.Sex, prefix = 'Sex'),
                    pd.get_dummies(df_all.Pclass, prefix = 'Pclass'),
                    pd.get_dummies(df_all.Embarked, prefix = 'Embarked'),
                    pd.get_dummies(df_all.Title, prefix = 'Title'),
                    pd.get_dummies(df_all.AgeBins, prefix = 'AgeBin')), axis=1)

In [47]:
df_all.drop(['Pclass', 'Embarked', 'Title', 'AgeBins', 'Age', 'Sex'], axis=1, inplace=True)

In [48]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 1 to 1309
Data columns (total 32 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   FamilySize       1309 non-null   int64
 1   Sex_0            1309 non-null   uint8
 2   Sex_1            1309 non-null   uint8
 3   Pclass_1         1309 non-null   uint8
 4   Pclass_2         1309 non-null   uint8
 5   Pclass_3         1309 non-null   uint8
 6   Embarked_0       1309 non-null   uint8
 7   Embarked_1       1309 non-null   uint8
 8   Embarked_2       1309 non-null   uint8
 9   Title_0          1309 non-null   uint8
 10  Title_1          1309 non-null   uint8
 11  Title_2          1309 non-null   uint8
 12  Title_3          1309 non-null   uint8
 13  Title_4          1309 non-null   uint8
 14  Title_5          1309 non-null   uint8
 15  Title_6          1309 non-null   uint8
 16  Title_7          1309 non-null   uint8
 17  Title_8          1309 non-null   uint8
 18  Title_9 

# Nomeando os dados para treino

In [49]:
x_train = df_all.iloc[:891,]
x_test = df_all.iloc[891:,]

y_train = df_train.Survived

# Encontrando a melhor configuração para o Random Forest Classifier

In [50]:
rf = RandomForestClassifier(max_features='auto', 
                            oob_score=True, 
                            random_state=0, 
                            n_jobs=-1)

In [51]:
param_grid = { "criterion" : ["gini", "entropy"], 
              "min_samples_leaf" : [1, 5, 10], 
              "min_samples_split" : [2, 4, 10, 12, 16], 
              "n_estimators": [50, 100, 400, 700, 1000]}

In [52]:
gs = GridSearchCV(estimator=rf, param_grid=param_grid, scoring='accuracy', cv=3, n_jobs=-1)
gs = gs.fit(x_train, y_train)

In [53]:
print(gs.best_score_)
print(gs.best_params_)
print(gs.cv_results_)

0.8327721661054994
{'criterion': 'gini', 'min_samples_leaf': 5, 'min_samples_split': 12, 'n_estimators': 100}
{'mean_fit_time': array([0.22216098, 0.38784814, 1.25788816, 2.2530748 , 3.70146942,
       0.21144756, 0.4046874 , 1.56276735, 3.13927666, 4.06273826,
       0.26916901, 0.47348984, 1.81981746, 3.42363389, 4.25310946,
       0.20511778, 0.45837442, 2.00673564, 2.97311242, 4.24121213,
       0.1938146 , 0.44082038, 2.00408101, 3.60518416, 4.30780848,
       0.21809793, 0.44520831, 2.00672054, 3.49613031, 4.59204451,
       0.29225818, 0.4659605 , 1.91329638, 3.39519477, 4.3961587 ,
       0.21442691, 0.46457458, 1.82478595, 3.38431708, 4.41856901,
       0.20212611, 0.48902504, 1.98036949, 3.37471159, 4.64302039,
       0.24966566, 0.48144984, 1.94712512, 3.38760559, 4.40969944,
       0.26122006, 0.51462301, 1.91329734, 3.36577169, 3.90987523,
       0.25498454, 0.46907814, 1.68116975, 3.23056952, 4.48334138,
       0.25564949, 0.45777567, 1.88628793, 3.3550698 , 4.58075913,
 

# Fazendo a classificação

In [54]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=100,
                             min_samples_split=12,
                             min_samples_leaf=5,
                             max_features='auto',
                             oob_score=True,
                             random_state=0,
                             n_jobs=-1)

In [55]:
rf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=12,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=True, random_state=0, verbose=0,
                       warm_start=False)

In [56]:
print("%.4f" % rf.oob_score_)

0.8361


In [57]:
pd.concat((pd.DataFrame(x_train.columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:20]

Unnamed: 0,variable,importance
1,Sex_0,0.202643
2,Sex_1,0.174728
22,Title_13,0.174274
5,Pclass_3,0.098606
0,FamilySize,0.076857
23,Title_14,0.061183
3,Pclass_1,0.051531
19,Title_10,0.050116
4,Pclass_2,0.02304
27,"AgeBin_(0, 12]",0.020715


# Predições

In [58]:
y_pred = rf.predict(x_test)

In [59]:
prediction_df = pd.DataFrame({
    'PassengerId': x_test.index,
    'Survived': y_pred
})

In [60]:
prediction_df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


# Arquivo de saída

In [61]:
saida = '../data/output/submission.csv'

In [62]:
prediction_df.to_csv(saida, index=False)

print(f'File {saida} generated')

File ../data/output/submission.csv generated


# Pontuação

Scoring: 0.78229

Pos: 3758