# Data Science Brasil - Desafio Kaggle - Titanic

#### Equipe:
    * Ricardo Galiardi 
    * Wanderson Henrique dos Santos
    * Neri Silvestre Filho

### Datasets

#### Treino
    Columns
        * PassengerId: type should be integers
        * Survived: Survived or Not
        * Pclass: Class of Travel
        * Name: Name of Passenger
        * Sex: Gender
        * Age
        * SibSp: Number of Sibling/Spouse aboard
        * Parch: Number of Parent/Child aboard
        * Ticket
        * Fare
        * Cabin
        * Embarked: The port at which the passenger embarked. C = Cherbourg, S = Southampton, Q = Queenstown
            
#### Teste
    Columns
        * PassengerId
        * Pclass
        * Name
        * Sex
        * Age
        * SibSp
        * Parch
        * Ticket
        * Fare
        * Cabin
        * Embarked
        
#### Conversões
    Columns
        * Sex:      {'female': 0, 'male': 1}
        * Age:      {"Missing": 0, "Infant": 1, "Child": 2, "Teenager": 3, "Adult": 4, "Senior": 5}
        * Fare:     {"Inferior": 0, "Basic": 1, "Superior": 2, "Executive": 3}
        * Title:    {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
        * Embarked: {'S': 0, 'C': 1, 'Q': 2}
        
#### Envio
    Columns
        * PassengerId: integer
        * Survived: binary

##### Importando xgboost

### Variable Notes
    * survival
        * 0 = No
        * 1 = Yes
    * pclass: A proxy for socio-economic status (SES)
        * 1: 1st = Upper
        * 2: 2nd = Middle
        * 3: 3rd = Lower
    * age: Age is fractional if less than 1. If the age is estimated, it is in the form of xx.5
    * sibsp: The dataset defines family relations in this way...
        * Sibling = brother, sister, stepbrother, stepsister
        * Spouse = husband, wife (mistresses and fiancés were ignored)
    * parch: The dataset defines family relations in this way...
        * Parent = mother, father
        * Child = daughter, son, stepdaughter, stepson
        * Some children travelled only with a nanny, therefore parch=0 for them.
    * embarked: 
        * C = Cherbourg
        * Q = Queenstown
        * S = Southampton  

In [None]:
#!pip3 install xgboost
#!pip3 install --force-reinstall scikit-learn==0.20rc1

In [None]:
# List the entries of the current working directory (where the CSV files are expected)

import os
print(os.listdir('.'))

##### Preparação dos Dados

In [None]:
# Importando os módulos

import numpy as np
import pandas as pd      
import matplotlib.pyplot as plt   
import seaborn as sns; sns.set(style="ticks", color_codes=True)
import Functions as fn
%matplotlib inline       

In [None]:
# Load the train and test datasets

# The test file has the same layout as the training file, minus the label column.
train_columns = ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
test_columns = [column for column in train_columns if column != 'Survived']

# header=0 combined with names= replaces the header row found in the file with
# the names given here; Age is forced to float64.
read_options = dict(sep=',', header=0, dtype={'Age': np.float64})

dstrain = pd.read_csv('train.csv', names=train_columns, **read_options)
dstest = pd.read_csv('test.csv', names=test_columns, **read_options)

# Both frames kept together so that the same transformation can be looped over them.
dsfull = [dstrain, dstest]

In [None]:
# Snapshot copies of the two frames as they are at this point of the notebook

dstraincopy, dstestcopy = (frame.copy() for frame in (dstrain, dstest))

In [None]:
# Inspect the first 5 records of the training set

dstrain.head(n=5)

In [None]:
# Check for null values and look at how each variable relates to survival.

#Pclass: mean of Survived (i.e. survival rate) per travel class
print('#Pclass')
pclass_rates = dstrain[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean()
print(pclass_rates)
print('\n')

#Sex: mean of Survived per gender
print('#Sex')
sex_rates = dstrain[["Sex", "Survived"]].groupby(['Sex'], as_index=False).mean()
print(sex_rates)
print('\n')

#SibSp and Parch: derive the family size and an "is alone" flag on both frames
print('#SibSp and #Parch')
for frame in dsfull:
    # Siblings/spouses + parents/children + the passenger themself
    frame['FamilySize'] = frame['SibSp'] + frame['Parch'] + 1

print('Size')
family_rates = dstrain[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean()
print(family_rates)

for frame in dsfull:
    # 1 when the passenger has no relatives aboard, 0 otherwise
    frame['IsAlone'] = (frame['FamilySize'] == 1).astype('int64')

print('Alone')
alone_rates = dstrain[['IsAlone', 'Survived']].groupby(['IsAlone'], as_index=False).mean()
print(alone_rates)
print('\n')

#Embarked: missing ports are filled with 'S' on both frames
print('#Embarked')
for frame in dsfull:
    frame['Embarked'] = frame['Embarked'].fillna(value='S')

embarked_rates = dstrain[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean()
print(embarked_rates)
print('\n')

#Fare: missing fares on both frames are filled with the median fare of the training set
print('#Fare')
train_fare_median = dstrain['Fare'].median()
for frame in dsfull:
    frame['Fare'] = frame['Fare'].fillna(train_fare_median)

# Quartile bins of the fare, used only for the survival-rate table below
dstrain['CategoricalFare'] = pd.qcut(dstrain['Fare'], 4)
fare_rates = dstrain[['CategoricalFare', 'Survived']].groupby(['CategoricalFare'], as_index=False).mean()
print(fare_rates)
print('\n')

#Age: missing ages are imputed with random integers drawn from
# [mean - std, mean + std) of the frame being processed, then the column is cast to int
print('#Age')
for dataset in dsfull:
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_is_null = dataset['Age'].isnull()
    age_null_count = age_is_null.sum()

    if age_null_count > 0:
        # randint works on integer bounds, so truncate them explicitly
        age_null_random_list = np.random.randint(int(age_avg - age_std), int(age_avg + age_std), size=age_null_count)
        # Use .loc rather than chained indexing (dataset['Age'][mask] = ...):
        # the chained form may assign to a temporary copy and leave the frame unchanged.
        dataset.loc[age_is_null, 'Age'] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)

# Five equal-width age bins, used only for the survival-rate table below
dstrain['CategoricalAge'] = pd.cut(dstrain['Age'], 5)
print (dstrain[['CategoricalAge', 'Survived']].groupby(['CategoricalAge'], as_index=False).mean())
print('\n')

#Name: derive a Title column from the passenger name via the project helper fn.getTitle
print('#Name')
for frame in dsfull:
    frame['Title'] = frame['Name'].apply(fn.getTitle)

title_by_sex = pd.crosstab(dstrain['Title'], dstrain['Sex'])
print(title_by_sex)

# Infrequent titles are collapsed into 'Rare'; alternative spellings are folded
# into their common equivalent.
rare_titles = ['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_aliases = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
for frame in dsfull:
    grouped_titles = frame['Title'].replace(rare_titles, 'Rare')
    frame['Title'] = grouped_titles.replace(title_aliases)

title_rates = dstrain[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()
print(title_rates)
print('\n')

In [None]:
# Normalize: encode the categorical columns of both frames

# (helper, column) pairs, applied in this order to each frame. The helpers come
# from the project's Functions module and their return values are not used here
# (presumably they modify the frame in place - confirm in Functions).
column_encoders = (
    (fn.setSex, 'Sex'),            # Mapping Sex
    (fn.setTitle, 'Title'),        # Mapping titles
    (fn.setEmbarked, 'Embarked'),  # Mapping Embarked
    (fn.setFare, 'Fare'),          # Mapping Fare
    (fn.setAge, 'Age'),            # Mapping Age
)

for dataset in dsfull:
    for encode, column in column_encoders:
        encode(dataset, column)

# Copy Normalization
dscopy = dstrain.copy()

# Feature Selection
# PassengerId has to be dropped as well: the modelling cells take column 0 of the
# training matrix as the target and every other column as a feature. With
# PassengerId left in, the id would be used as the label and Survived would leak
# into the features. The ids remain available in dstraincopy / dstestcopy.
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch', 'FamilySize']
dstrain = dstrain.drop(drop_elements, axis = 1)
dstrain = dstrain.drop(['CategoricalAge', 'CategoricalFare'], axis = 1)
dstest  = dstest.drop(drop_elements, axis = 1)

# Make the layout explicit: label first, then the features in the same order
# for train and test (a KeyError here means the two frames diverged).
feature_columns = [column for column in dstrain.columns if column != 'Survived']
dstrain = dstrain[['Survived'] + feature_columns]
dstest  = dstest[feature_columns]

print (dstrain.head(10))

# DataFrame version kept for the correlation table and plots; the models use plain arrays
dsresult = dstrain.copy()
dstrain = dstrain.values
dstest  = dstest.values

In [None]:
# View the correlations as a table (rendered by the project helper fn.corr)

# Correlation coefficient:
# +1  = strong positive correlation
# 0   = no correlation
# -1  = strong negative correlation
fn.corr(dsresult)

##### Exploração dos Dados

In [None]:
# Check the relationship between the target variable (Survived) and each independent variable.
# Only one plot is active at a time; uncomment another line to plot a different variable.

#fn.plot(dsresult, 'Survived', 'Pclass')
#fn.plot(dsresult, 'Survived', 'Sex')
#fn.plot(dsresult, 'Survived', 'Age')
#fn.plot(dsresult, 'Survived', 'Fare')
#fn.plot(dsresult, 'Survived', 'Embarked')
fn.plot(dsresult, 'Survived', 'IsAlone')
#fn.plot(dsresult, 'Survived', 'Title')

In [None]:
# Import the classifiers

# Silence all Python warnings for the rest of the session
# (note: this hides warnings, not errors)
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import KFold, StratifiedShuffleSplit, StratifiedKFold
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

In [None]:
# Classifier comparison: candidate models and evaluation inputs

# Every candidate uses its library defaults, except KNN (3 neighbours) and
# SVC (probability estimates enabled).
classifiers = [
    KNeighborsClassifier(n_neighbors=3),
    SVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    LogisticRegression(),
    XGBClassifier(),
]

splits = 12                           # number of cross-validation folds
cols = ["Classifier", "Accuracy"]     # layout of the result table
acc_dict = {}                         # classifier name -> accumulated accuracy
log = pd.DataFrame(columns=cols)

# The first column of the training matrix is taken as the target and all the
# remaining columns as features.
# NOTE(review): this relies on the label being column 0 of dstrain - confirm
# against the feature-selection cell.
X = dstrain[:, 1:]
y = dstrain[:, 0]

# Shuffled k-fold cross-validation with a fixed seed so the folds are reproducible
kfold = KFold(n_splits=splits, random_state=42, shuffle=True)

for train_index, test_index in kfold.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Refit every candidate on this fold and accumulate its held-out accuracy
    for clf in classifiers:
        name = clf.__class__.__name__
        clf.fit(X_train, y_train)
        fold_predictions = clf.predict(X_test)
        acc = accuracy_score(y_test, fold_predictions)
        acc_dict[name] = acc_dict.get(name, 0.0) + acc

# Turn the accumulated sums into means. The divisor must be the number of folds
# actually run (it used to be a hard-coded 10.0 while 12 folds were evaluated,
# which inflated every reported accuracy).
for name in acc_dict:
    acc_dict[name] = acc_dict[name] / splits

# Build the result table in one go; DataFrame.append no longer exists in pandas 2.x.
log = pd.DataFrame(list(acc_dict.items()), columns=cols)

In [None]:
# Plot Classifier Accuracy

# Horizontal bars: one per classifier, bar length = accuracy stored in `log`
sns.set(style="darkgrid")
sns.barplot(data=log, y='Classifier', x='Accuracy')

# Classifier/accuracy pairs listed from the highest accuracy to the lowest
accuracy_ranking = log.groupby(['Classifier', 'Accuracy']).count()
accuracy_ranking.sort_values(by=['Accuracy'], ascending=False)

In [None]:
# Prediction

# Fit a gradient boosting model on the whole training matrix and predict the test matrix.
# The first column of dstrain is taken as the target, the remaining columns as features.
# NOTE(review): this relies on the label being column 0 of dstrain - confirm
# against the feature-selection cell.
train_features = dstrain[:, 1:]
train_target = dstrain[:, 0]

classifier = GradientBoostingClassifier()
classifier.fit(train_features, train_target)
predictions = classifier.predict(dstest)

In [None]:
# Sanity checks before building the submission

# Number of predictions produced for the test set
print(len(predictions))

# Row index of the modelling frame (DataFrame has no `rowid` attribute, so the
# previous `dsresult.rowid` raised AttributeError)
print(dsresult.index)

# dsfull is a plain list of frames, so preview each frame individually instead
# of wrapping the list in a DataFrame
for frame in dsfull:
    print(frame.head())

In [None]:
# Create csv to upload to Kaggle

# Create a DataFrame with the test passenger ids and our prediction of whether
# each of them survived. The ids are read from dstestcopy, the untouched copy of
# the test set (the previously referenced name `dstra` is not defined anywhere).
# Survived is cast to int so the file contains 0/1 rather than 0.0/1.0.
dssubmission = pd.DataFrame({'PassengerId': dstestcopy['PassengerId'], 'Survived': predictions.astype(int)})

In [None]:
# Preview the first 5 rows of the submission
dssubmission.head(n=5)

In [None]:
# Number of rows in the submission for each Survived value
survived_groups = dssubmission.groupby(['Survived'])['Survived']
survived_groups.count()

In [None]:
# Plot the Survived column of the submission with the project helper fn.plot.
# NOTE(review): the same column is passed for both column arguments - confirm this is intended.
fn.plot(dssubmission, 'Survived', 'Survived')

In [None]:
# Write the submission DataFrame to a CSV file that can be uploaded.
# The file name is relative, so the file is created in the current working
# directory; the row index is left out of the file.
filename = 'kaggle_titanic.csv'

dssubmission.to_csv(path_or_buf=filename, index=False)