In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
#Import packages
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Exploratory data analysis

In [None]:
# Load the Titanic train / test CSV files into DataFrames.
train = pd.read_csv('../input/titanic/train.csv')
test = pd.read_csv('../input/titanic/test.csv')
# Show the first rows of the training set.
train.head()

In [None]:
# Show the first rows of the test set.
test.head()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the training set.
train.describe()

In [None]:
# Fraction of missing values per column (the mean of a boolean mask is the
# share of True entries).
train.isna().mean()

In [None]:
# Data type of every column of the training set.
train.dtypes

In [None]:
# Number of non-missing values in every column, split by the value of Survived.
train.groupby('Survived').count()

Feature extractor

In [None]:
def neighbours_survived_percentage(passager, n_neighbours, df):
    """Share of survivors among the ticket-number neighbours of a passenger.

    The rows of ``df`` are ordered by their numeric ticket number (tickets
    that are not purely numeric are coerced to NaN) and a window of
    ``n_neighbours`` rows next to the passenger is inspected: the rows that
    follow the passenger when enough of them exist, otherwise the rows that
    precede it.

    Parameters
    ----------
    passager : int
        Zero-based position of the passenger in the ticket-sorted frame.
    n_neighbours : int
        Number of neighbouring rows to look at.
    df : pandas.DataFrame
        Frame with at least the columns ``Ticket`` and ``Survived``.
        It is left untouched; the function works on a sorted copy.

    Returns
    -------
    float
        ``survived / (survived + died)`` over the neighbours whose
        ``Survived`` value is 0 or 1, or ``nan`` when the window contains no
        such row.
    """
    # Work on a copy: coercing and sorting the caller's frame in place would
    # destroy the original ticket strings and row order.
    ordered = df.assign(
        Ticket=pd.to_numeric(df['Ticket'], errors='coerce')
    ).sort_values(by=['Ticket'])

    if passager + n_neighbours + 1 < ordered.shape[0]:
        df_neighbours = ordered.iloc[passager + 1:passager + n_neighbours + 1]
    else:
        # Backward window of exactly n_neighbours rows. The start is clamped
        # at 0 because a negative start would be read as "from the end" and
        # produce an empty (or wrong) window.
        start = max(passager - n_neighbours, 0)
        df_neighbours = ordered.iloc[start:passager]

    survived = int((df_neighbours['Survived'] == 1).sum())
    died = int((df_neighbours['Survived'] == 0).sum())
    known = survived + died

    # No neighbour with a known outcome: the ratio is undefined.
    if known == 0:
        return float('nan')
    return survived / known

In [None]:
from sklearn.preprocessing import LabelEncoder

class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns in one call.

    Each selected column is replaced by the integer codes produced by a
    fresh ``LabelEncoder``. The encoders are re-fitted on every call to
    ``transform`` and are not stored, so codes obtained from two different
    frames are only comparable if both frames contain the same set of values.
    """

    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode (None = all columns)

    def fit(self,X,y=None):
        """No-op kept for scikit-learn style chaining; returns ``self``."""
        return self # not relevant here

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X. X itself is not modified; an encoded copy is returned.
        '''
        output = X.copy()
        # Iterate over column labels rather than DataFrame.iteritems(), which
        # was removed in pandas 2.0.
        columns = output.columns if self.columns is None else self.columns
        for col in list(columns):
            output[col] = LabelEncoder().fit_transform(output[col])
        return output

    def fit_transform(self,X,y=None):
        """Equivalent to ``fit(X, y).transform(X)``."""
        return self.fit(X,y).transform(X)

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

def features_engineering(df):
    """Build the model feature table from a raw passenger frame.

    Steps: drop the ``Name`` and ``Cabin`` columns, impute missing values
    (most frequent value for object columns, median for int64/float64
    columns), then label-encode the object columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw passenger data. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Imputed and encoded features, categorical columns first, numeric
        columns after, on a fresh default index. Every other column of
        ``df`` with one of the selected dtypes is kept (including
        ``PassengerId``, which an earlier version dropped, and ``Survived``
        when present).
    """
    # Non-mutating drop: an in-place drop would alter the caller's frame and
    # make a second call fail with KeyError. errors='ignore' also accepts
    # frames from which the columns are already absent.
    df = df.drop(columns=['Name', 'Cabin'], errors='ignore')
    # keep features by dtypes
    num_cols = list(df.select_dtypes(include=['int64', 'float64']).columns)
    cat_cols = list(df.select_dtypes(include=['object']).columns)

    # imputation categorical features : most_frequent
    cat_transformer = SimpleImputer(strategy = 'most_frequent')
    cat_cols_impute = pd.DataFrame(cat_transformer.fit_transform(df[cat_cols]), columns = cat_cols)

    # imputation numerical features : median
    num_transformer = SimpleImputer(strategy = 'median')
    num_cols_impute = pd.DataFrame(num_transformer.fit_transform(df[num_cols]), columns = num_cols)

    # merge numeric and categorical after transformation
    X_features = pd.concat([cat_cols_impute, num_cols_impute], axis=1, join='inner')

    # Encode categorical features
    X_features = MultiColumnLabelEncoder(columns = cat_cols).fit_transform(X_features)

    return X_features

In [None]:
# Build the feature tables from copies of the raw frames so that `train` and
# `test` keep all their original columns and this cell can be re-run safely.
train_features = features_engineering(train.copy())
test_features = features_engineering(test.copy())

In [None]:
# Show the first 5 rows of the engineered training features.
train_features.head(5)

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the predictors; the test features are used as they are.
y_train = train_features['Survived']
X_train = train_features.drop(columns=['Survived'])
X_test = test_features

In [None]:
# Standardise the features. The scaler is fitted on the training set only and
# the same fitted transformation is then applied to the test set; re-fitting
# on the test set would scale the two sets with different means/variances.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

> ****

Classifier

In [None]:
## Support Vector Machine ##
# scikit-learn documentation for this estimator: https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
from sklearn.svm import SVC    
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt
# Baseline RBF-kernel SVM: fit on the training set, report the accuracy
# measured on that same training set, then write the test-set predictions
# to a submission CSV.
svc = SVC(kernel='rbf', gamma='auto', C=1.5, probability=True)
svc.fit(X_train, y_train)
train_accuracy_pct = round(svc.score(X_train, y_train) * 100, 2)
print("Accuracy du SVM : %s" % train_accuracy_pct)
print("\n ------------------------------------------------------------ \n")
a = svc.predict(X_test).astype(int)
submission = pd.DataFrame({"PassengerId": test["PassengerId"], "Survived": a})
submission.to_csv('Soumission_SVM.csv', index=False)
# This model scores 78.94 % on the test sample. It therefore outperforms the regression even though its parameters have not been tuned. We will now try to improve it by customising its parameters.

In [None]:
# Grid search over the SVC hyper-parameters (kernel, gamma, C).
# Every combination is fitted and scored; a candidate replaces the current best
# only when BOTH its accuracy and its AUC are strictly higher.
# NOTE(review): both metrics are measured on the training data itself, so this
# selection favours over-fitted models; consider cross-validation.
params = {"n" : None, "parametres" : None, "AUC" : 0, "Acc" : 0, "Soumission" : None}
for kernel in ['linear', 'poly', 'rbf', 'sigmoid']:
    for gamma in ['auto', 'scale']:
        for c in [0.5, 0.8, 1, 1.2, 1.5]:
            svc_cust = SVC(probability=True, C = c, gamma = gamma, kernel = kernel)
            svc_cust.fit(X_train, y_train)
            # Evaluate inside the C loop so that every value of C is compared,
            # not only the last one; each metric is computed once.
            acc_cust = round(svc_cust.score(X_train, y_train) * 100, 2)
            auc_cust = round(roc_auc_score(y_train, svc_cust.predict_proba(X_train)[:, 1]), 2)
            if acc_cust > params['Acc'] and auc_cust > params['AUC']:
                params['parametres'] = [kernel, gamma, c]
                params['AUC'] = auc_cust
                params['Acc'] = acc_cust
                # Cast to int so the labels match the format of the baseline submission.
                params['Soumission'] = pd.DataFrame({"PassengerId": test["PassengerId"], "Survived": svc_cust.predict(X_test).astype(int)})
                print('Accuracy améliorée : %s'%params['Acc'])

print("\n ------------------------------------------------------------ \n")
print("Le meilleur modèle à l'issue du grid search utilise n = %s et obtient en score un AUC de %s et une Accuracy de %s"%(params['parametres'], params['AUC'], params['Acc']))