In [22]:
import pandas as pd
# from dataprep.eda import create_report

#formatting
from sklearn.preprocessing import OrdinalEncoder #1st attempt to improve results

#models
#1.st attempt
from sklearn.tree import DecisionTreeClassifier
# from sklearn.linear_model import SGDClassifier

#metrics
from sklearn.metrics import accuracy_score

## READ DATA
## FORMATTING
- Points/conclusions from the EDA Jupyter notebook
- Functions used

## PREDICTION ANALYSIS
- PYCARET
- SHAP

## RESULTS
- VIZ
## CONCLUSION

***

# Reading data

In [92]:
# Load the train and test CSVs; both paths are relative to the notebook's directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

# Formatting

### Functions used

In [5]:
def convert_data_type(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame with the text-like columns cast to explicit dtypes.

    ``Name`` is cast to pandas' ``string`` dtype; ``Sex``, ``Ticket``,
    ``Cabin`` and ``Embarked`` are cast to ``category``. Every other column
    keeps its dtype.

    Args:
        df: Frame containing the five columns listed above.

    Returns:
        The converted DataFrame; ``df`` itself is not modified.
    """
    categorical_columns = ('Sex', 'Ticket', 'Cabin', 'Embarked')
    dtype_by_column = {
        'Name': 'string',
        **{column: 'category' for column in categorical_columns},
    }
    return df.astype(dtype_by_column)

In [83]:
# Single module-level encoder shared by format_features, which calls
# fit_transform on it once per encoded column (so it is refit every time).
ordinal_encoder = OrdinalEncoder()

def format_features(df: pd.DataFrame) -> pd.DataFrame:
    """Impute missing values and ordinal-encode the categorical columns.

    Steps, applied to a copy of ``df`` (the caller's frame is left untouched):

    1. ``Age`` and ``Fare``: missing values are replaced by that column's
       own mean.
    2. ``Cabin`` and ``Embarked``: missing values are replaced by the
       column's most frequent value (temporary approach: easiest and fastest).
    3. ``Sex``, ``Ticket``, ``Cabin`` and ``Embarked`` are encoded with the
       module-level ``ordinal_encoder`` into ``_sex_encoded``,
       ``_ticket_encoded``, ``_cabin_encoded`` and ``_embarked_encoded``.
    4. ``Name`` (currently not considered) and the four raw categorical
       columns are dropped.

    Args:
        df: Frame containing at least the columns ``Age``, ``Fare``,
            ``Name``, ``Sex``, ``Ticket``, ``Cabin`` and ``Embarked``.

    Returns:
        A new DataFrame with the imputed numeric columns, the four
        ``_*_encoded`` columns appended, and the five raw columns removed.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # Work on a copy so re-running the cell never sees a half-formatted input.
    df = df.copy()

    # Numeric columns: each one is imputed with its OWN mean.
    for numeric_column in ('Age', 'Fare'):
        df[numeric_column] = df[numeric_column].fillna(df[numeric_column].mean())

    # Categorical columns: impute with the most frequent value. The result is
    # assigned back instead of using fillna(inplace=True) on a column
    # selection, which does not reliably update the parent frame.
    for categorical_column in ('Cabin', 'Embarked'):
        counts = df[categorical_column].value_counts()
        # An all-null column has no usable mode; leave it as it is.
        if not counts.empty and counts.iloc[0] > 0:
            df[categorical_column] = df[categorical_column].fillna(counts.index[0])

    # NOTE(review): the encoder is refit for every column and on every call,
    # so codes produced for two different frames (e.g. train vs. test) are
    # not guaranteed to map the same category to the same number.
    encoded_sources = {
        '_sex_encoded': 'Sex',
        '_ticket_encoded': 'Ticket',
        '_cabin_encoded': 'Cabin',
        '_embarked_encoded': 'Embarked',
    }
    for encoded_column, source_column in encoded_sources.items():
        df[encoded_column] = ordinal_encoder.fit_transform(df[[source_column]])

    # The raw columns are replaced by their encoded counterparts.
    return df.drop(columns=['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'])

In [96]:
def format_test_data(df_test: pd.DataFrame) -> pd.DataFrame:
    """Run the test dataset through dtype conversion and feature formatting.

    Args:
        df_test (pd.DataFrame): test dataset

    Returns:
        pd.DataFrame: the output of ``format_features`` applied to the
        result of ``convert_data_type(df_test)``
    """
    return format_features(convert_data_type(df_test))


### Train 

In [102]:
# Two explicit stages so the dtype-converted frame stays available for inspection.
converted_train_data = train_data.pipe(convert_data_type)
formated_train_data = converted_train_data.pipe(format_features)

### Test

In [101]:
# Same preparation as the training data, via the test-specific wrapper.
formated_test_data = test_data.pipe(format_test_data)

### Creating model
A simple baseline model, fitted after the cleaning steps derived from the EDA.

In [104]:
# train_data.head()

In [107]:
# Target stays a one-column DataFrame; the features are every remaining column.
y_train = formated_train_data.loc[:, ['Survived']]
x_train = formated_train_data.drop(columns=['Survived'])

In [108]:
# Shallow tree (depth 2) with a fixed seed; fit() returns the fitted estimator itself.
decision_tree = DecisionTreeClassifier(max_depth=2, random_state=0).fit(x_train, y_train)

***

### Writing the test-set predictions to the submission file

In [110]:
# Predict on an independent copy of the formatted test frame.
x_test = formated_test_data.copy(deep=True)
y_pred = decision_tree.predict(X=x_test)

In [111]:
# Two-column frame: passenger ids from the raw test data next to the predictions.
submission_file = pd.DataFrame(
    data={'PassengerId': test_data['PassengerId'], 'Survived': y_pred}
)

In [112]:
# Preview the first five rows before writing the file.
submission_file.head(n=5)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [113]:
# Write without the index column so the file holds only PassengerId and Survived.
submission_file.to_csv(
    path_or_buf='../data/submission/submission-12-12-2022.csv',
    index=False,
)