# Modelling

In [1]:
import pandas as pd
from joblib import dump, load

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

In [3]:
def replace_titles(x):
    title=x['title']
    if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
        return 'Mr'
    elif title in ['Countess', 'Mme']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title =='Dr':
        if x['sex']=='Male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title

def clean_df(df, verbose=False):
    if not isinstance(df, pd.DataFrame):
        df = pd.DataFrame(df, columns = ["pclass", "name", "sex", "age", "sibsp", "parch", "fare", "cabin", "embarked"])
    df.columns = df.columns.str.lower()
    # Drop "boat", "body", "home.dest" and "ticket". The first two hold information if the passenger survived (boat) or if it didn't and the body was recovered (body).
    ### Why is this not working??
    try:
        df = df.drop(["boat", "body", "home.dest", "ticket"], axis=1)
    except KeyError:
        if verbose:
            print("Any of these features are not in the dataframe: boat, body, home.dest, ticket")

    # Just a few observations, drop them
    df = df.drop(df[(pd.isnull(df["embarked"]))].index)

    # Replace NULL with median
    df["fare"] = df["fare"].fillna(df["fare"].median())
    df["age"] = df["age"].fillna(df["age"].median())

    # from: https://triangleinequality.wordpress.com/2013/09/08/basic-feature-engineering-with-the-titanic-data/

    # Extract titles
    df["title"] = df["name"].str.extract(r'(Mrs|Mr|Master|Miss|Major|Rev|Dr|Ms|Mlle|Col|Capt|Mme|Countess|Don|Jonkheer)')
    df['title']=df.apply(replace_titles, axis=1)
    df = df.drop("name", axis=1)

    # Extract Deck
    df['deck'] = df["cabin"].str[0].fillna("Unknown")
    df = df.drop("cabin", axis=1)

    # Create new family_size column
    df['family_size'] = df['sibsp']+df['parch'] + 1 #counting the passenger itself
    df['fare_per_person'] = df['fare']/df['family_size']
    df['alone'] = df['family_size'].apply(lambda x: 1 if x==1 else 0)

    # Because why not
    df['age*class'] = df['age']*df['pclass']
####################################### TEST BENCHMARK #######################################
    # dummy categories: "sex", "embarked", "title", "deck"
    sex = ["male", "female"]
    embarked = ["S", "C", "Q"]
    title = ["Mr", "Mrs", "Miss", "Master"]
    deck = ["A", "B", "C", "D", "E", "F", "G", "T", "Unknown"]    

    df["sex"] = df["sex"].astype(pd.CategoricalDtype(sex))
    df["embarked"] = df["embarked"].astype(pd.CategoricalDtype(embarked))
    df["title"] = df["title"].astype(pd.CategoricalDtype(title))
    df["deck"] = df["deck"].astype(pd.CategoricalDtype(deck))
###############################################################################################

    # Create final dataframe with dummies
    df2 = pd.get_dummies(df, columns=["sex", "embarked", "title", "deck"], prefix="dummy")


    return df2

# Fit the model

In [4]:
# Import
df = pd.read_csv("../dataset/titanic_data.csv")
df2 = clean_df(df)

In [5]:
# Split and transform input
sc = StandardScaler()

X = df2.drop(["survived"], axis=1)
sc.fit(X)
X = sc.transform(X)

y = df2["survived"]

In [12]:
# Models

gnb = GaussianNB()

svc = SVC(probability=True)

best_params_rfc = {'bootstrap': True,
                   'max_depth': 25,
                   'max_features': 'auto',
                   'min_samples_leaf': 4,
                   'min_samples_split': 10,
                   'n_estimators': 40}

rfc = RandomForestClassifier(n_jobs=-1,
                             bootstrap=best_params_rfc["bootstrap"],
                             max_depth=best_params_rfc["max_depth"],
                             max_features=best_params_rfc[ "max_features"],
                             min_samples_leaf=best_params_rfc["min_samples_leaf"],
                             min_samples_split=best_params_rfc["min_samples_split"],
                             n_estimators=best_params_rfc[ "n_estimators"])

best_params_lr = {'C': 0.01,
                  'class_weight': 'None',
                  'fit_intercept': True,
                  'max_iter': 50,
                  'penalty': 'l2'}

lr = LogisticRegression(n_jobs=-1,
                       C=best_params_lr["C"],
                       class_weight=best_params_lr["class_weight"],
                       fit_intercept=best_params_lr["fit_intercept"],
                       max_iter=best_params_lr["max_iter"],
                       penalty=best_params_lr["penalty"])

In [14]:
# Voting Classifier Production

estimators = [("Gaussian Naive Bayes", gnb),
              ("Random Forest Classifier", rfc),
              ("Logistic Regression", lr),
              ("Support Vector Classifier", svc)]


# Pipeline

In [15]:
# define the stages of the pipeline

### PIPELINE A

pipeline = Pipeline(steps= [('StandardScaler', StandardScaler()),
                            ('model', VotingClassifier(estimators=estimators, voting='soft'))])

# fit the pipeline model with the training data                            
pipeline.fit(X, y)

Pipeline(memory=None,
         steps=[('StandardScaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('model',
                 VotingClassifier(estimators=[('Gaussian Naive Bayes',
                                               GaussianNB(priors=None,
                                                          var_smoothing=1e-09)),
                                              ('Random Forest Classifier',
                                               RandomForestClassifier(bootstrap=True,
                                                                      ccp_alpha=0.0,
                                                                      class_weight=None,
                                                                      criterion='gini',
                                                                      max_depth=25,
                                                                      max_features='aut...
                    

In [6]:
X[7][0]

-1.5501180117967508

In [7]:
sc.transform([df2.drop("survived", axis=1).iloc[7].values])[0][0]

-1.5501180117967508

In [8]:
obs = [1, 39, 0, 0, 0, 1, 0, 1, 39.0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]

In [9]:
sc.transform([obs])[0][0]

-1.5501180117967508

In [10]:
sc2 = load("../modelling/outcomes/scaler.joblib")

In [11]:
sc2.transform([obs])[0][0]

-1.5501180117967508

In [12]:
pipeline = load('./pipeline_model/titanic_pipeline.joblib')

In [26]:
pipeline.predict([df2.drop("survived", axis=1).iloc[6].values])

array([0], dtype=int64)

In [24]:
pipeline.predict([X[6]])

array([1], dtype=int64)

In [23]:
df2["survived"].iloc[6]

1

In [27]:
df2.drop("survived", axis=1).iloc[6]

pclass              1.00000
age                63.00000
sibsp               1.00000
parch               0.00000
fare               77.95830
family_size         2.00000
fare_per_person    38.97915
alone               0.00000
age*class          63.00000
dummy_male          0.00000
dummy_female        1.00000
dummy_S             1.00000
dummy_C             0.00000
dummy_Q             0.00000
dummy_Mr            0.00000
dummy_Mrs           0.00000
dummy_Miss          1.00000
dummy_Master        0.00000
dummy_A             0.00000
dummy_B             0.00000
dummy_C             0.00000
dummy_D             1.00000
dummy_E             0.00000
dummy_F             0.00000
dummy_G             0.00000
dummy_T             0.00000
dummy_Unknown       0.00000
Name: 6, dtype: float64