In [18]:
import os
#IMPORT DATASET FROM KAGGLE
os.environ['KAGGLE_USERNAME'] = "XXX" # username from the json file
os.environ['KAGGLE_KEY'] = "XXXX" # key from the json file
#!kaggle competitions download -c titanic

In [1]:
import pandas as pd
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
X = train[list(test.columns)]
y = train[train.columns[~train.columns.isin(test.columns)]]


In [2]:
from sklearn.base import BaseEstimator, TransformerMixin

def extraiPronome(nome):
    return nome.split(',')[1].split('.')[0].strip()

def extraiFamilia(nome):
    return nome.split(',')[0].strip()    

class AtributosDesejados(BaseEstimator, TransformerMixin):
    def __init__(self, excluirName=True, excluirFamily=True):
        self.excluirName = excluirName
        self.excluirFamily = excluirFamily
    def fit(self, X, y=None):
        #Identificação da familia
        X["Family"] = X["Name"]

        self.colunasIndesejadas = ['PassengerId', 'Ticket', 'Cabin']
        
        if self.excluirFamily:
            self.colunasIndesejadas.append('Family')
        if self.excluirName:
            self.colunasIndesejadas.append('Name')
        return self
    def transform(self, X, y=None):
        Xdrop = X.drop(self.colunasIndesejadas,axis=1)
        if 'Name' not in self.colunasIndesejadas:
            Xdrop['Name'] = Xdrop['Name'].apply(extraiPronome)
        if 'Family' not in self.colunasIndesejadas:
            Xdrop['Family'] = Xdrop['Family'].apply(extraiFamilia)
        return Xdrop


In [6]:
#Teste
atributosDesejados = AtributosDesejados(excluirName=False,excluirFamily=False)
a = atributosDesejados.fit_transform(X)
a.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Family
0,3,Mr,male,22.0,1,0,7.25,S,Braund
1,1,Mrs,female,38.0,1,0,71.2833,C,Cumings
2,3,Miss,female,26.0,0,0,7.925,S,Heikkinen
3,1,Mrs,female,35.0,1,0,53.1,S,Futrelle
4,3,Mr,male,35.0,0,0,8.05,S,Allen


In [7]:
from sklearn.base import BaseEstimator, TransformerMixin

class AtributosNumericos(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.colunasNumericas = X.select_dtypes(include='number').columns
        return self
    def transform(self, X, y=None):
        return X[self.colunasNumericas].to_numpy()


In [8]:
from sklearn.base import BaseEstimator, TransformerMixin

class AtributosCategoricos(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.colunasCategoricas = X.select_dtypes(include='object').columns
        return self
    def transform(self, X, y=None):
        return X[self.colunasCategoricas].to_numpy()


In [9]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion

trataAtributos = Pipeline([
    ('unecaracteristicas', FeatureUnion([
        ('pipenum', Pipeline([
            ('atributos_numericos', AtributosNumericos()),
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])),
        ('pipecat', Pipeline([
            ('atributos_categoricos', AtributosCategoricos()),
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]))
    ])),
])


In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_validate, RepeatedKFold
import numpy as np

pipetotal = Pipeline([
    ('atributosDesejados', AtributosDesejados()),
    ('trataAtributos', trataAtributos),
    ('classificador', RandomForestClassifier(n_jobs=-1))
])

parametros = {
    'atributosDesejados__excluirName': [True, False],
    'atributosDesejados__excluirFamily': [True, False],
    'classificador__max_depth': [15],
    #'classificador__criterion': ['gini', 'entropy'],
    'classificador__n_estimators': [100]
}
modelo = GridSearchCV(pipetotal, param_grid=parametros)

scores = cross_validate(modelo, X, y, cv=RepeatedKFold(),scoring="roc_auc")
scores['test_score'], np.mean(scores['test_score']), np.std(scores['test_score'])

[1;30;43mA saída de streaming foi truncada nas últimas 5000 linhas.[0m


(array([0.87380051, 0.88043053, 0.87126437, 0.87106753, 0.86475989,
        0.85775862, 0.90459046, 0.86181006, 0.87703092, 0.84902686,
        0.86370224, 0.91852058, 0.87482419, 0.89706463, 0.8160992 ,
        0.85659824, 0.89457908, 0.87358747, 0.84869033, 0.86614481,
        0.87817965, 0.88910824, 0.84414239, 0.8980615 , 0.85802219,
        0.84287302, 0.82473262, 0.92062656, 0.90397609, 0.88221925,
        0.8327654 , 0.86944901, 0.86339744, 0.89026915, 0.86997445,
        0.85434783, 0.87581169, 0.84331551, 0.88785461, 0.86461864,
        0.86299975, 0.89001323, 0.89669421, 0.84840027, 0.83261579,
        0.85022057, 0.84770408, 0.92732285, 0.83495551, 0.88870607]),
 0.8698945617526356,
 0.024502125266948345)

In [None]:
test["Family"] = test["Name"].map(lambda x: x.split(',')[0].strip())

In [15]:
modelo.fit(X,y)
y_pred = modelo.predict(test)
result = test[['PassengerId']]
result['Survived'] = y_pred
result.to_csv('submission.csv',index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
  self._final_estimator.fit(Xt, y, **fit_params)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
  self._final_estimator.fit(Xt, y, **fit_params)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernela

In [19]:
!kaggle competitions submit -c titanic -f submission.csv -m "new submission"

100% 2.77k/2.77k [00:01<00:00, 2.43kB/s]
Successfully submitted to Titanic - Machine Learning from Disaster

Foram feitas diversas tentativas em outro notebook, no qual adicionei novas colunas calculadas, sendo elas, família, quantidade de parentes, se pessoa sozinha e o deck da cabine da pessoa. Os resultados foram:

Adicionado colunas alone e relativies:
- média test-score: 0.8306295901073381
- Kaggle submission: 0.78468


Adicionando coluna Deck:
- média test-score: 0.8280679178959263
- Kaggle submission: 0.78708


Porém, as opções que adicionei ao GridSearchCV tornaram o processo muito lento para novas tentativas. Diante disso preparei este notebook e fazendo a criação da coluna Família, bem como a inclusão de novos parâmetros no GridSearchCV (scoring) bem como parâmetros de busca no modelo, cheguei ao resultado:

- média test-score: 0.8698945617526356
- Kaggle submission: 0.79425

