In [1]:
import pandas as pd
# from dataprep.eda import create_report

#formatting
from sklearn.preprocessing import OrdinalEncoder #1st attempt to improve results
# from sklearn.preprocessing import OneHotEncoder #1st attempt to improve results

#models
#1.st attempt
from sklearn.tree import DecisionTreeClassifier
# from sklearn.linear_model import SGDClassifier

#automl
from pycaret.classification import *

#metrics
from sklearn.metrics import accuracy_score

## READ DATA
## FORMATTING
- Points/conclusions from the EDA Jupyter notebook
- Functions used

## PREDICTION ANALYSIS
- PYCARET
- SHAP

## RESULTS
- VIZ
## CONCLUSION

***

# Reading data

In [26]:
# Load the train and test splits (in that order) from the shared data folder.
train_data, test_data = map(pd.read_csv, ('../data/train.csv', '../data/test.csv'))

# Formatting

### Functions used

In [27]:
def convert_data_type(df: pd.DataFrame) -> pd.DataFrame:
    """Cast the text columns of a passenger frame to their working dtypes.

    ``Name`` becomes a pandas ``string`` column; ``Sex``, ``Ticket``,
    ``Cabin`` and ``Embarked`` become ``category`` columns. Every other
    column keeps its dtype.

    Args:
        df: frame containing at least the five columns listed above.

    Returns:
        pd.DataFrame: the frame returned by ``DataFrame.astype`` with the
        new dtypes applied.
    """
    categorical_columns = ('Sex', 'Ticket', 'Cabin', 'Embarked')

    target_dtypes = {'Name': 'string'}
    target_dtypes.update({column: 'category' for column in categorical_columns})

    return df.astype(target_dtypes)

In [158]:
# Shared encoder instance. It is refit on every column and on every call of
# format_features, so it carries no learned state between uses.
ordinal_encoder = OrdinalEncoder()

def format_features(df: pd.DataFrame) -> pd.DataFrame:
    """Impute missing values and ordinal-encode the categorical columns.

    Steps, in order:
        1. Fill missing ``Age`` and ``Fare`` with the mean of their own column.
        2. Fill missing ``Cabin`` and ``Embarked`` with the most frequent
           value of that column (temporary approach: easiest and fastest).
        3. Ordinal-encode ``Sex``, ``Ticket``, ``Cabin`` and ``Embarked`` into
           ``_sex_encoded``, ``_ticket_encoded``, ``_cabin_encoded`` and
           ``_embarked_encoded``.
        4. Drop ``Name`` (currently not considered) and the four source
           columns that were replaced by their encoded versions.

    Args:
        df: frame containing at least ``Age``, ``Fare``, ``Name``, ``Sex``,
            ``Ticket``, ``Cabin`` and ``Embarked``.

    Returns:
        pd.DataFrame: a new, formatted frame. The input frame is not modified.

    Raises:
        IndexError: if ``Cabin`` or ``Embarked`` contains no non-null value,
            because no most-frequent value can be determined.
    """
    # Work on a copy so the caller's frame stays untouched and the calling
    # cell can be re-run without compounding changes.
    df = df.copy()

    # handling missing values (numeric): each column uses its OWN mean
    for column in ('Age', 'Fare'):
        df[column] = df[column].fillna(df[column].mean())

    # handling missing values (with mode) -> temporary approach
    # Assign the result back instead of using inplace=True on a selected
    # column, which is chained assignment and unreliable in recent pandas.
    for column in ('Cabin', 'Embarked'):
        most_frequent = df[column].value_counts().index[0]
        df[column] = df[column].fillna(most_frequent)

    # OrdinalEncoder: source column -> encoded column
    # NOTE(review): the encoder is refit on whatever frame is passed in, so
    # the codes from separate calls (e.g. train vs. test) come from
    # independent fits and are not guaranteed to map the same value to the
    # same number -- confirm this is acceptable for the model.
    encoded_columns = {
        'Sex': '_sex_encoded',
        'Ticket': '_ticket_encoded',
        'Cabin': '_cabin_encoded',
        'Embarked': '_embarked_encoded',
    }
    for source, target in encoded_columns.items():
        df[target] = ordinal_encoder.fit_transform(df[[source]])

    # dropping columns: 'Name' plus every column replaced by its encoding
    return df.drop(columns=['Name', *encoded_columns])

In [159]:
def format_test_data(df_test: pd.DataFrame) -> pd.DataFrame:
    """Format the test dataset with the same two steps used for training.

    Args:
        df_test (pd.DataFrame): raw test dataset.

    Returns:
        pd.DataFrame: the test dataset after ``convert_data_type`` followed
        by ``format_features``.
    """
    return format_features(convert_data_type(df_test))


### Train 

In [160]:
# Step 1: cast the raw training columns to their working dtypes.
converted_train_data = convert_data_type(train_data)
# Step 2: impute missing values, encode the categorical columns and drop the originals.
formated_train_data = format_features(converted_train_data)

In [161]:
formated_train_data

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,_sex_encoded,_ticket_encoded,_cabin_encoded,_embarked_encoded
0,1,0,3,22.000000,1,0,7.2500,1.0,523.0,63.0,2.0
1,2,1,1,38.000000,1,0,71.2833,0.0,596.0,81.0,0.0
2,3,1,3,26.000000,0,0,7.9250,0.0,669.0,63.0,2.0
3,4,1,1,35.000000,1,0,53.1000,0.0,49.0,55.0,2.0
4,5,0,3,35.000000,0,0,8.0500,1.0,472.0,63.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,27.000000,0,0,13.0000,1.0,101.0,63.0,2.0
887,888,1,1,19.000000,0,0,30.0000,0.0,14.0,30.0,2.0
888,889,0,3,29.699118,1,2,23.4500,0.0,675.0,63.0,2.0
889,890,1,1,26.000000,0,0,30.0000,1.0,8.0,60.0,0.0


### Test

In [162]:
# Run the test split through the dtype conversion and feature formatting wrapped by format_test_data.
formated_test_data = format_test_data(test_data)

In [163]:
formated_test_data

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,_sex_encoded,_ticket_encoded,_cabin_encoded,_embarked_encoded
0,892,3,34.50000,0,0,7.8292,1.0,152.0,15.0,1.0
1,893,3,47.00000,1,0,7.0000,0.0,221.0,15.0,2.0
2,894,2,62.00000,0,0,9.6875,1.0,73.0,15.0,1.0
3,895,3,27.00000,0,0,8.6625,1.0,147.0,15.0,2.0
4,896,3,22.00000,1,1,12.2875,0.0,138.0,15.0,2.0
...,...,...,...,...,...,...,...,...,...,...
413,1305,3,30.27259,0,0,8.0500,1.0,267.0,15.0,2.0
414,1306,1,39.00000,0,0,108.9000,0.0,324.0,22.0,0.0
415,1307,3,38.50000,0,0,7.2500,1.0,346.0,15.0,2.0
416,1308,3,30.27259,0,0,8.0500,1.0,220.0,15.0,2.0


### Creating model
A simple baseline model, trained after the cleaning steps suggested by the EDA.

In [33]:
# train_data.head()

In [164]:
# Features: every formatted column except the target.
x_train = formated_train_data.drop(columns=['Survived'])
# Target: kept as a single-column DataFrame.
y_train = formated_train_data.loc[:, ['Survived']]

In [165]:
# Shallow tree (depth 2) with a fixed seed so the fit is reproducible;
# fit() returns the fitted estimator itself.
decision_tree = DecisionTreeClassifier(random_state=0, max_depth=2).fit(x_train, y_train)

***

### Writing test dataset to submit

In [166]:
# Predict on a copy of the formatted test frame so the formatted frame itself is not handed to the model.
x_test = formated_test_data.copy()
# NOTE(review): assumes x_test has the same columns, in the same order, as x_train -- confirm.
y_pred = decision_tree.predict(x_test)

In [167]:
# Pair each passenger id from the raw test split with its predicted label.
submission_file = pd.DataFrame(
    {
        'PassengerId': test_data['PassengerId'],
        'Survived': y_pred,
    }
)

In [168]:
submission_file.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [169]:
submission_file.Survived.value_counts()

0    330
1     88
Name: Survived, dtype: int64

In [148]:
# Write the submission as CSV without the DataFrame index column.
# NOTE(review): this cell's execution count (148) is lower than the cells above it,
# so the file on disk may predate the current predictions -- re-run after Restart & Run All.
submission_file.to_csv('../data/submission/submission.csv', index=False)