# Titanic Kaggle
Nessa arquivo vamos criar um algoritmo para tentar prever a sobrevivência de alguns passageiros do Titanic.<br>
Vamos utilizar 2 datasets disponíveis no kaggle, um para treinamento da rede neural e outro para teste, para resolver esse problema vamos testar 3 tecnicas, Regressão logística, Gradient boosting, Decision Tree e Random Forest. <br><br>
## Importando Bibliotecas 

In [1]:
#importa pandas
import pandas as pd
#importa matplotlib (graficos)
import matplotlib.pyplot as plt
#importa numpy
import numpy as np

#importa modelos
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

  from numpy.core.umath_tests import inner1d


## importando Datasets

In [2]:
df = pd.read_csv("train.csv")
teste = pd.read_csv("test.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Preparando dados

In [3]:
#descrição basica dos dados
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
#conta missings 
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
#preenchendo idade com idade média
df['Age'] = df['Age'].fillna(df['Age'].median())
#conta missings 
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
#extraindo titulos dos nomes
titulo = set()
for name in df['Name']:
    titulo.add(name.split(',')[1].split('.')[0].strip())
#mostra titulos
titulo

{'Capt',
 'Col',
 'Don',
 'Dr',
 'Jonkheer',
 'Lady',
 'Major',
 'Master',
 'Miss',
 'Mlle',
 'Mme',
 'Mr',
 'Mrs',
 'Ms',
 'Rev',
 'Sir',
 'the Countess'}

Temos muitos titulos diferentes, vamos diminuir esse numero agrupando alguns deles

In [7]:
#dicionario para titulos
grupo_titulo= {
    "Capt": "Oficial",
    "Col": "Oficial",
    "Major": "Oficial",
    "Jonkheer": "Nobre",
    "Don": "Nobre",
    "Sir" : "Nobre",
    "Dr": "Oficial",
    "Rev": "Oficial",
    "the Countess":"Nobre",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr" : "Mr",
    "Mrs" : "Mrs",
    "Miss" : "Miss",
    "Master" : "Master",
    "Lady" : "Nobre",
    "Dona" : "Mrs"
}

#extrai titulos e cria nova coluna
df['Titulo'] = df['Name'].map(lambda name:name.split(',')[1].split('.')[0].strip())

#altera titulos
df['Titulo'] = df.Titulo.map(grupo_titulo)

In [8]:
#mostra Dataset
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Titulo
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr


Pronto, nova coluna com os titulos criada, agora vamos criar mais algumas colunas e transformar o titulo em uma variavel dummy.

In [9]:
#cria uma coluna para cada titulo
titulo_dummies = pd.get_dummies(df['Titulo'], prefix='Titulo')
df = pd.concat([df, titulo_dummies], axis=1)

In [10]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Titulo,Titulo_Master,Titulo_Miss,Titulo_Mr,Titulo_Mrs,Titulo_Nobre,Titulo_Oficial
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr,0,0,1,0,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,0,0,0,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,0,1,0,0,0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs,0,0,0,1,0,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr,0,0,1,0,0,0


Agora podemos remover as colunas que não iremos usar mais

In [11]:
df.drop(['Name','Titulo'], axis=1, inplace=True)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Titulo_Master,Titulo_Miss,Titulo_Mr,Titulo_Mrs,Titulo_Nobre,Titulo_Oficial
0,1,0,3,male,22.0,1,0,A/5 21171,7.25,,S,0,0,1,0,0,0
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C,0,0,0,1,0,0
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,1,0,0,0,0
3,4,1,1,female,35.0,1,0,113803,53.1,C123,S,0,0,0,1,0,0
4,5,0,3,male,35.0,0,0,373450,8.05,,S,0,0,1,0,0,0


Agora vamos fazer algo parecido com o que fizemos para o titulo com o local de embarque, criar novas colunas com variaveis dummys

In [12]:
#preenche embarque nulos com S
df['Embarked'] = df['Embarked'].fillna("S")
#cria uma coluna para cada local embarque
Emb_dummies = pd.get_dummies(df['Embarked'], prefix='Emb')
df = pd.concat([df, Emb_dummies], axis=1)
#apaga coluna "Embarked"
df.drop('Embarked', axis=1, inplace=True)

In [13]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Titulo_Master,Titulo_Miss,Titulo_Mr,Titulo_Mrs,Titulo_Nobre,Titulo_Oficial,Emb_C,Emb_Q,Emb_S
0,1,0,3,male,22.0,1,0,A/5 21171,7.25,,0,0,1,0,0,0,0,0,1
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,0,0,0,1,0,0,1,0,0
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,0,1,0,0,0,0,0,0,1
3,4,1,1,female,35.0,1,0,113803,53.1,C123,0,0,0,1,0,0,0,0,1
4,5,0,3,male,35.0,0,0,373450,8.05,,0,0,1,0,0,0,0,0,1


Como temos poucos dados relacionados a cabine dos passageiros, vou remover eles do nosso modelo.<br>
Também vamos desconsiderar o Ticket para esse modelo.

In [14]:
#apaga coluna "cabin"
df.drop('Cabin', axis=1, inplace=True)
#apaga coluna "Ticket"
df.drop('Ticket', axis=1, inplace=True)

Agora vamos substituir o Sex por um valor numerico.

In [15]:
df['Sex'] = df['Sex'].map({'male':1, 'female':0})

In [16]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Titulo_Master,Titulo_Miss,Titulo_Mr,Titulo_Mrs,Titulo_Nobre,Titulo_Oficial,Emb_C,Emb_Q,Emb_S
0,1,0,3,1,22.0,1,0,7.25,0,0,1,0,0,0,0,0,1
1,2,1,1,0,38.0,1,0,71.2833,0,0,0,1,0,0,1,0,0
2,3,1,3,0,26.0,0,0,7.925,0,1,0,0,0,0,0,0,1
3,4,1,1,0,35.0,1,0,53.1,0,0,0,1,0,0,0,0,1
4,5,0,3,1,35.0,0,0,8.05,0,0,1,0,0,0,0,0,1


Da mesma forma que criamos colunas com variaveis dummys para o tiulo e local de embarque, vamos fazer isso para a classe.

In [17]:
#cria uma coluna para cada local embarque
Classe_dummies = pd.get_dummies(df['Pclass'], prefix='Classe')
df = pd.concat([df, Classe_dummies], axis=1)

In [18]:
#Apaga a coluna "Pclass"
df.drop('Pclass', axis=1, inplace=True)

Vamos criar uma nova coluna com o tamanho da familia, para isso vamos utilizar a coluna "SibSp" e "Parch"

In [19]:
#soma coluna "SibSp" com coluna "Parch"  e adicona 1 para criar o tamanho da familia
df['Tamanho_Familia'] = df['SibSp'] +  df['Parch'] + 1

In [20]:
#Apaga a coluna "SibSp" e "Parch"
df.drop(['SibSp', 'Parch'], axis=1, inplace=True)
df.head()

Unnamed: 0,PassengerId,Survived,Sex,Age,Fare,Titulo_Master,Titulo_Miss,Titulo_Mr,Titulo_Mrs,Titulo_Nobre,Titulo_Oficial,Emb_C,Emb_Q,Emb_S,Classe_1,Classe_2,Classe_3,Tamanho_Familia
0,1,0,1,22.0,7.25,0,0,1,0,0,0,0,0,1,0,0,1,2
1,2,1,0,38.0,71.2833,0,0,0,1,0,0,1,0,0,1,0,0,2
2,3,1,0,26.0,7.925,0,1,0,0,0,0,0,0,1,0,0,1,1
3,4,1,0,35.0,53.1,0,0,0,1,0,0,0,0,1,1,0,0,2
4,5,0,1,35.0,8.05,0,0,1,0,0,0,0,0,1,0,0,1,1


Vamos criar mais uma coluna para verificar se a pessoa esta viajando sozinha ou não.

In [21]:
#cria coluna
df['Sozinho'] = 0

df.loc[df['Tamanho_Familia'] == 1, 'Sozinho'] = 1

In [22]:
df.head()

Unnamed: 0,PassengerId,Survived,Sex,Age,Fare,Titulo_Master,Titulo_Miss,Titulo_Mr,Titulo_Mrs,Titulo_Nobre,Titulo_Oficial,Emb_C,Emb_Q,Emb_S,Classe_1,Classe_2,Classe_3,Tamanho_Familia,Sozinho
0,1,0,1,22.0,7.25,0,0,1,0,0,0,0,0,1,0,0,1,2,0
1,2,1,0,38.0,71.2833,0,0,0,1,0,0,1,0,0,1,0,0,2,0
2,3,1,0,26.0,7.925,0,1,0,0,0,0,0,0,1,0,0,1,1,1
3,4,1,0,35.0,53.1,0,0,0,1,0,0,0,0,1,1,0,0,2,0
4,5,0,1,35.0,8.05,0,0,1,0,0,0,0,0,1,0,0,1,1,1


In [23]:
#preenchendo idade com idade média
teste['Age'] = teste['Age'].fillna(teste['Age'].median())
#extraindo titulos dos nomes
titulo = set()
for name in teste['Name']:
    titulo.add(name.split(',')[1].split('.')[0].strip())

grupo_titulo= {
    "Capt": "Oficial",
    "Col": "Oficial",
    "Major": "Oficial",
    "Jonkheer": "Nobre",
    "Don": "Nobre",
    "Sir" : "Nobre",
    "Dr": "Oficial",
    "Rev": "Oficial",
    "the Countess":"Nobre",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr" : "Mr",
    "Mrs" : "Mrs",
    "Miss" : "Miss",
    "Master" : "Master",
    "Lady" : "Nobre",
    "Dona" : "Mrs"
}

#extrai titulos e cria nova coluna
teste['Titulo'] = teste['Name'].map(lambda name:name.split(',')[1].split('.')[0].strip())

#altera titulos
teste['Titulo'] = teste.Titulo.map(grupo_titulo)

#cria uma coluna para cada titulo
titulo_dummies = pd.get_dummies(teste['Titulo'], prefix='Titulo')
teste = pd.concat([teste, titulo_dummies], axis=1)
teste['Titulo_Nobre'] = 0

#remove colunas
teste.drop(['Name','Titulo'], axis=1, inplace=True)

#preenche embarque nulos com S
teste['Embarked'] = teste['Embarked'].fillna("S")

#cria uma coluna para cada local embarque
Emb_dummies = pd.get_dummies(teste['Embarked'], prefix='Emb')
teste = pd.concat([teste, Emb_dummies], axis=1)

#apaga coluna "Embarked"
teste.drop('Embarked', axis=1, inplace=True)

#apaga coluna "cabin"
teste.drop('Cabin', axis=1, inplace=True)

#apaga coluna "Ticket"
teste.drop('Ticket', axis=1, inplace=True)

teste['Sex'] = teste['Sex'].map({'male':1, 'female':0})

#cria uma coluna para cada local embarque
Classe_dummies = pd.get_dummies(teste['Pclass'], prefix='Classe')
teste = pd.concat([teste, Classe_dummies], axis=1)

#Apaga a coluna "Pclass"
teste.drop('Pclass', axis=1, inplace=True)

#soma coluna "SibSp" com coluna "Parch"  e adicona 1 para criar o tamanho da familia
teste['Tamanho_Familia'] = teste['SibSp'] +  teste['Parch'] + 1

#Apaga a coluna "SibSp" e "Parch"
teste.drop(['SibSp', 'Parch'], axis=1, inplace=True)

#cria coluna
teste['Sozinho'] = 0
teste.loc[df['Tamanho_Familia'] == 1, 'Sozinho'] = 1

#Preenche Fare com média
teste['Fare'] = teste['Fare'].fillna(teste['Fare'].median())

teste.head()

Unnamed: 0,PassengerId,Sex,Age,Fare,Titulo_Master,Titulo_Miss,Titulo_Mr,Titulo_Mrs,Titulo_Oficial,Titulo_Nobre,Emb_C,Emb_Q,Emb_S,Classe_1,Classe_2,Classe_3,Tamanho_Familia,Sozinho
0,892,1,34.5,7.8292,0,0,1,0,0,0,0,1,0,0,0,1,1,0
1,893,0,47.0,7.0,0,0,0,1,0,0,0,0,1,0,0,1,2,0
2,894,1,62.0,9.6875,0,0,1,0,0,0,0,1,0,0,1,0,1,1
3,895,1,27.0,8.6625,0,0,1,0,0,0,0,0,1,0,0,1,1,0
4,896,0,22.0,12.2875,0,0,0,1,0,0,0,0,1,0,0,1,3,1


## Modelagem
Após fazermos todas essas alterações no nosso dataset ele finalmente esta pronto para passar pelos algoritmos.<br>
Vamos testar qual deles se sai melhor com a nossa amostra.

In [24]:
X_train = df.drop(['Survived', 'PassengerId'], axis=1)
y_train = df['Survived']
X_test = teste.drop("PassengerId", axis=1).copy()

X_train.shape, y_train.shape, X_test.shape

((891, 17), (891,), (418, 17))

### Regressão Logística

In [25]:
clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred_log_reg = clf.predict(X_test)
acc_log_reg = round( clf.score(X_train, y_train) * 100, 2)
print (str(acc_log_reg) + ' percent')

82.94 percent


### Gradient boosting

In [26]:
clf = GradientBoostingClassifier()
clf.fit(X_train, y_train)
y_pred_log_reg = clf.predict(X_test)
acc_log_gbc = round( clf.score(X_train, y_train) * 100, 2)
print (str(acc_log_reg) + ' percent')

82.94 percent


### Decision Tree

In [27]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred_decision_tree = clf.predict(X_test)
acc_decision_tree = round(clf.score(X_train, y_train) * 100, 2)
print (acc_decision_tree)

98.2


### Random Forest

In [28]:
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)
y_pred_random_forest = clf.predict(X_test)
acc_random_forest = round(clf.score(X_train, y_train) * 100, 2)
print (acc_random_forest)

98.09


In [29]:
resultado = pd.DataFrame({ "PassengerId": teste["PassengerId"], "Survived": y_pred_random_forest})
resultado.to_csv('resultado.csv', index=False)