In [52]:
import pandas as pd
# from dataprep.eda import create_report

#formatting
from sklearn.preprocessing import OrdinalEncoder #1st attempt to improve results
# from sklearn.preprocessing import OneHotEncoder #1st attempt to improve results

#models
#1.st attempt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

#automl
from pycaret.classification import *

#metrics
from sklearn.metrics import accuracy_score

## READ DATA
## FORMATTING
- Key points/conclusions from the EDA Jupyter notebook
- Functions used

## PREDICTION ANALYSIS
- PYCARET
- SHAP

## RESULTS
- VIZ
## CONCLUSION

***

# Reading data

In [2]:
# Load the raw train and test CSV files (train first, then test)
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

# Formatting

### Functions used

In [3]:
def convert_data_type(df: pd.DataFrame) -> pd.DataFrame:
    """Cast the text columns of ``df`` to explicit pandas dtypes.

    ``Name`` is cast to ``string``; ``Sex``, ``Ticket``, ``Cabin`` and
    ``Embarked`` are cast to ``category``. All other columns keep their dtype.

    Args:
        df (pd.DataFrame): frame containing the five columns listed above.

    Returns:
        pd.DataFrame: the frame produced by ``DataFrame.astype``.
    """
    categorical_columns = ('Sex', 'Ticket', 'Cabin', 'Embarked')

    target_dtypes = {'Name': 'string'}
    target_dtypes.update(dict.fromkeys(categorical_columns, 'category'))

    return df.astype(target_dtypes)

In [40]:
ordinal_encoder = OrdinalEncoder()

def format_features(df: pd.DataFrame) -> pd.DataFrame:
    """Impute missing values and ordinal-encode the categorical columns.

    Steps:
        1. ``Age`` and ``Fare``: missing values are replaced by the mean of
           the same column.
        2. ``Cabin`` and ``Embarked``: missing values are replaced by the most
           frequent value of the column (temporary approach: easiest and
           fastest).
        3. ``Sex``, ``Ticket``, ``Cabin`` and ``Embarked`` are encoded with the
           module-level ``ordinal_encoder`` into ``_sex_encoded``,
           ``_ticket_encoded``, ``_cabin_encoded`` and ``_embarked_encoded``,
           which are then cast to ``category``.
        4. ``Name`` (currently not used as a feature) and the four original
           categorical columns are dropped.

    The input frame is left untouched; all work happens on a copy.

    Note:
        The encoder is re-fitted for every column on every call, so the codes
        are only meaningful within the frame passed in. Frames encoded in
        separate calls get unrelated codes for any column whose set of values
        differs between them.

    Args:
        df (pd.DataFrame): frame containing ``Age``, ``Fare``, ``Name``,
            ``Sex``, ``Ticket``, ``Cabin`` and ``Embarked``.

    Returns:
        pd.DataFrame: formatted copy of ``df``.
    """
    # Work on a copy so the caller's frame is never modified
    df = df.copy()

    # Handling missing numeric values: each column is filled with its OWN mean
    for numeric_column in ('Age', 'Fare'):
        df[numeric_column] = df[numeric_column].fillna(df[numeric_column].mean())

    # Handling missing categorical values with the most frequent value.
    # The result is assigned back instead of using inplace=True on a column
    # selection, which is chained assignment and may not update the frame.
    for categorical_column in ('Cabin', 'Embarked'):
        most_frequent = df[categorical_column].value_counts().index[0]
        df[categorical_column] = df[categorical_column].fillna(most_frequent)

    # Ordinal encoding: source column -> new encoded column (insertion order
    # defines the order of the new columns in the result)
    encoded_columns = {
        'Sex': '_sex_encoded',
        'Ticket': '_ticket_encoded',
        'Cabin': '_cabin_encoded',
        'Embarked': '_embarked_encoded',
    }
    for source_column, encoded_column in encoded_columns.items():
        # fit_transform returns one column of codes as a 2-D array; flatten it
        df[encoded_column] = ordinal_encoder.fit_transform(df[[source_column]]).ravel()

    df = df.astype({encoded_column: 'category'
                    for encoded_column in encoded_columns.values()})

    # Dropping the raw text/categorical columns
    return df.drop(columns=['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'])

In [41]:
def format_test_data(df_test: pd.DataFrame) -> pd.DataFrame:
    """Prepare the test dataset with the two formatting steps of this notebook.

    Args:
        df_test (pd.DataFrame): test dataset.

    Returns:
        pd.DataFrame: formatted test dataset, i.e. the output of
            ``format_features`` applied to the output of ``convert_data_type``.
    """
    return format_features(convert_data_type(df_test))


### Train 

In [42]:
# Cast dtypes first, then impute/encode; the intermediate frame keeps its own name
converted_train_data = train_data.pipe(convert_data_type)
formated_train_data = converted_train_data.pipe(format_features)

In [43]:
# Inspect the formatted training frame (the rendered output elides the middle rows)
formated_train_data

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,_sex_encoded,_ticket_encoded,_cabin_encoded,_embarked_encoded
0,1,0,3,22.000000,1,0,7.2500,1.0,523.0,63.0,2.0
1,2,1,1,38.000000,1,0,71.2833,0.0,596.0,81.0,0.0
2,3,1,3,26.000000,0,0,7.9250,0.0,669.0,63.0,2.0
3,4,1,1,35.000000,1,0,53.1000,0.0,49.0,55.0,2.0
4,5,0,3,35.000000,0,0,8.0500,1.0,472.0,63.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,27.000000,0,0,13.0000,1.0,101.0,63.0,2.0
887,888,1,1,19.000000,0,0,30.0000,0.0,14.0,30.0,2.0
888,889,0,3,29.699118,1,2,23.4500,0.0,675.0,63.0,2.0
889,890,1,1,26.000000,0,0,30.0000,1.0,8.0,60.0,0.0


### Test

In [44]:
# Apply the dtype conversion and feature formatting to the test set
formated_test_data = format_test_data(test_data)

In [45]:
# Preview the first rows of the formatted test set
formated_test_data.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,_sex_encoded,_ticket_encoded,_cabin_encoded,_embarked_encoded
0,892,3,34.5,0,0,7.8292,1.0,152.0,15.0,1.0
1,893,3,47.0,1,0,7.0,0.0,221.0,15.0,2.0
2,894,2,62.0,0,0,9.6875,1.0,73.0,15.0,1.0
3,895,3,27.0,0,0,8.6625,1.0,147.0,15.0,2.0
4,896,3,22.0,1,1,12.2875,0.0,138.0,15.0,2.0


### Creating model
A simple model, built after some cleaning based on the EDA.

#### quick/test model 

In [46]:
# Check column dtypes and non-null counts of the formatted training data before modelling
formated_train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   PassengerId        891 non-null    int64   
 1   Survived           891 non-null    int64   
 2   Pclass             891 non-null    int64   
 3   Age                891 non-null    float64 
 4   SibSp              891 non-null    int64   
 5   Parch              891 non-null    int64   
 6   Fare               891 non-null    float64 
 7   _sex_encoded       891 non-null    category
 8   _ticket_encoded    891 non-null    category
 9   _cabin_encoded     891 non-null    category
 10  _embarked_encoded  891 non-null    category
dtypes: category(4), float64(2), int64(5)
memory usage: 81.0 KB


In [47]:
# Split the formatted training frame into the feature matrix and the target.
# The target is kept 1-D (a Series): scikit-learn estimators expect y of shape
# (n_samples,) and emit a DataConversionWarning for a single-column DataFrame.
x_train = formated_train_data.drop(columns=['Survived'])
y_train = formated_train_data['Survived']

In [26]:
#quick model
# decision_tree = DecisionTreeClassifier(random_state=0, max_depth=2)
# decision_tree = decision_tree.fit(x_train, y_train)

#### Pycaret

In [48]:
# AutoML with PyCaret: register the formatted training data and the target column.
# session_id pins the experiment's random seed so the run is reproducible;
# 8554 is the seed shown in the recorded setup() output and the one reused as
# random_state for the hand-built GradientBoostingClassifier later on.
s = setup(formated_train_data, target='Survived', session_id=8554)

Unnamed: 0,Description,Value
0,session_id,8554
1,Target,Survived
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(891, 11)"
5,Missing Values,False
6,Numeric Features,3
7,Categorical Features,7
8,Ordinal Features,False
9,High Cardinality Features,False


In [49]:
# Compare the candidate models and keep the top-ranked one
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.8187,0.8717,0.6442,0.8381,0.7267,0.5953,0.6078,0.296
ridge,Ridge Classifier,0.8186,0.0,0.7418,0.7755,0.7557,0.6119,0.615,0.035
rf,Random Forest Classifier,0.8123,0.8611,0.6826,0.7983,0.7312,0.5893,0.5975,0.258
lr,Logistic Regression,0.8122,0.8706,0.7246,0.7698,0.7438,0.5962,0.5995,0.896
et,Extra Trees Classifier,0.8091,0.8592,0.6781,0.7963,0.7284,0.5832,0.5913,0.245
lightgbm,Light Gradient Boosting Machine,0.7929,0.8432,0.6817,0.7505,0.7124,0.5515,0.5548,0.138
dt,Decision Tree Classifier,0.7914,0.7754,0.6909,0.7465,0.7141,0.5509,0.555,0.04
ada,Ada Boost Classifier,0.7898,0.8521,0.6529,0.7563,0.6984,0.5393,0.5443,0.175
lda,Linear Discriminant Analysis,0.7451,0.8176,0.5476,0.719,0.6164,0.4321,0.4443,0.141
knn,K Neighbors Classifier,0.6887,0.7199,0.5085,0.6158,0.5542,0.3185,0.3241,0.067


In [64]:
# Show the estimator returned by compare_models(), including its hyperparameters
best

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=8554, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [61]:
# Open PyCaret's interactive evaluation widget for the selected model
evaluate_model(best)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [54]:
# Rebuild the GradientBoostingClassifier reported by PyCaret as a plain
# scikit-learn estimator. The hyperparameters are copied from the repr of `best`.
# `loss='deviance'`, `min_impurity_split=None` and `presort='deprecated'` are
# intentionally not passed: they only restated the library defaults and are
# rejected by scikit-learn releases that removed them from the constructor.
model = GradientBoostingClassifier(
    ccp_alpha=0.0,
    criterion='friedman_mse',
    init=None,
    learning_rate=0.1,
    max_depth=3,
    max_features=None,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_samples_leaf=1,
    min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    n_estimators=100,
    n_iter_no_change=None,
    random_state=8554,
    subsample=1.0,
    tol=0.0001,
    validation_fraction=0.1,
    verbose=0,
    warm_start=False,
)

In [55]:
# Fit the hand-built classifier on every training row (no hold-out split here)
model_classifier = model.fit(x_train, y_train)

***

### Writing the test-set predictions for submission

In [56]:
# Predict on a copy of the formatted test set; its columns match x_train
# (PassengerId is kept as a feature in both frames)
x_test = formated_test_data.copy()
y_pred = model_classifier.predict(x_test)

In [57]:
# Pair every test PassengerId with its predicted Survived label
submission_file = test_data[['PassengerId']].assign(Survived=y_pred)

In [58]:
# Preview the first rows of the submission frame
submission_file.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [59]:
# Sanity check: how many passengers are predicted for each class
submission_file.Survived.value_counts()

0    289
1    129
Name: Survived, dtype: int64

In [60]:
# Write the submission CSV without the index column.
# NOTE(review): to_csv does not create missing folders — '../data/submission/' must already exist.
submission_file.to_csv('../data/submission/submission.csv', index=False)