In [1]:
import pandas as pd
from joblib import dump

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

In [38]:
def replace_titles(x):
    title=x['title']
    if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
        return 'Mr'
    elif title in ['Countess', 'Mme']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title =='Dr':
        if x['sex']=='Male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title

def clean_df(df, verbose=False):
    if not isinstance(df, pd.DataFrame):
        df = pd.DataFrame(df, columns = ["pclass", "name", "sex", "age", "sibsp", "parch", "fare", "cabin", "embarked"])
    df.columns = df.columns.str.lower()
    # Drop "boat", "body", "home.dest" and "ticket". The first two hold information if the passenger survived (boat) or if it didn't and the body was recovered (body).
    ### Why is this not working??
    try:
        df = df.drop(["boat", "body", "home.dest", "ticket"], axis=1)
    except KeyError:
        if verbose:
            print("Any of these features are not in the dataframe: boat, body, home.dest, ticket")

    # Just a few observations, drop them
    df = df.drop(df[(pd.isnull(df["embarked"]))].index)

    # Replace NULL with median
    df["fare"] = df["fare"].fillna(df["fare"].median())
    df["age"] = df["age"].fillna(df["age"].median())

    # from: https://triangleinequality.wordpress.com/2013/09/08/basic-feature-engineering-with-the-titanic-data/

    # Extract titles
    df["title"] = df["name"].str.extract(r'(Mrs|Mr|Master|Miss|Major|Rev|Dr|Ms|Mlle|Col|Capt|Mme|Countess|Don|Jonkheer)')
    df['title']=df.apply(replace_titles, axis=1)
    df = df.drop("name", axis=1)

    # Extract Deck
    df['deck'] = df["cabin"].str[0].fillna("Unknown")
    df = df.drop("cabin", axis=1)

    # Create new family_size column
    df['family_size'] = df['sibsp']+df['parch'] + 1 #counting the passenger itself
    df['fare_per_person'] = df['fare']/df['family_size']
    df['alone'] = df['family_size'].apply(lambda x: 1 if x==1 else 0)

    # Because why not
    df['age*class'] = df['age']*df['pclass']
####################################### TEST BENCHMARK #######################################
    # dummy categories: "sex", "embarked", "title", "deck"
    sex = ["male", "female"]
    embarked = ["S", "C", "Q"]
    title = ["Mr", "Mrs", "Miss", "Master"]
    deck = ["A", "B", "C", "D", "E", "F", "G", "T", "Unknown"]    

    df["sex"] = df["sex"].astype(pd.CategoricalDtype(sex))
    df["embarked"] = df["embarked"].astype(pd.CategoricalDtype(embarked))
    df["title"] = df["title"].astype(pd.CategoricalDtype(title))
    df["deck"] = df["deck"].astype(pd.CategoricalDtype(deck))
###############################################################################################

    # Create final dataframe with dummies
    df2 = pd.get_dummies(df, columns=["sex", "embarked", "title", "deck"], prefix="dummy")


    return df2

# Fit the model

In [3]:
# Import
df = pd.read_csv("../dataset/titanic_data.csv")
#df2 = clean_df(df)

In [10]:
# Split and transform input
sc = StandardScaler()

X = df2.drop(["survived", "boat", "body", "home.dest", "ticket"], axis=1)
sc.fit(X)
X = sc.transform(X)

y = df2["survived"]

NameError: name 'df2' is not defined

In [7]:
# Models

gnb = GaussianNB()

svc = SVC(probability=True)

best_params_rfc = {'bootstrap': True,
                   'max_depth': 25,
                   'max_features': 'auto',
                   'min_samples_leaf': 4,
                   'min_samples_split': 10,
                   'n_estimators': 40}

rfc = RandomForestClassifier(n_jobs=-1,
                             bootstrap=best_params_rfc["bootstrap"],
                             max_depth=best_params_rfc["max_depth"],
                             max_features=best_params_rfc[ "max_features"],
                             min_samples_leaf=best_params_rfc["min_samples_leaf"],
                             min_samples_split=best_params_rfc["min_samples_split"],
                             n_estimators=best_params_rfc[ "n_estimators"])

best_params_lr = {'C': 0.01,
                  'class_weight': 'None',
                  'fit_intercept': True,
                  'max_iter': 50,
                  'penalty': 'l2'}

lr = LogisticRegression(n_jobs=-1,
                       C=best_params_lr["C"],
                       class_weight=best_params_lr["class_weight"],
                       fit_intercept=best_params_lr["fit_intercept"],
                       max_iter=best_params_lr["max_iter"],
                       penalty=best_params_lr["penalty"])

In [8]:
# Voting Classifier Production

estimators = [("Gaussian Naive Bayes", gnb),
              ("Random Forest Classifier", rfc),
              ("Logistic Regression", lr),
              ("Support Vector Classifier", svc)]

#vc_prod = VotingClassifier(estimators=estimators, voting='soft')

#### Pipeline

In [8]:
# define the stages of the pipeline

### PIPELINE A

pipeline = Pipeline(steps= [('StandardScaler', StandardScaler()),
                            #('CleanDF', clean_df(x)),
                            ('model', VotingClassifier(estimators=estimators, voting='soft'))])

# fit the pipeline model with the training data                            
pipeline.fit(X, y)

Pipeline(memory=None,
         steps=[('StandardScaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('model',
                 VotingClassifier(estimators=[('Gaussian Naive Bayes',
                                               GaussianNB(priors=None,
                                                          var_smoothing=1e-09)),
                                              ('Random Forest Classifier',
                                               RandomForestClassifier(bootstrap=True,
                                                                      ccp_alpha=0.0,
                                                                      class_weight=None,
                                                                      criterion='gini',
                                                                      max_depth=25,
                                                                      max_features='aut...
                    

In [42]:
### PIPELINE B

X = df.drop(["survived", "boat", "body", "home.dest", "ticket"], axis=1)
y = df["survived"]

pipeB = Pipeline(steps = [("PreProcess", clean_df(X.values)),
                          ("StandardScaler", StandardScaler()),
                          ("Model", VotingClassifier(estimators=estimators, voting='soft'))
                         ])

# fit the pipeline model with the training data                            
pipeB.fit(X, y)

ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [42]:
# user_input = [[1, "Andrews, Mr. Thomas Jr", "male",39.0,0,0,0.0,"A36","S"]]

# X_input = clean_df(user_input).values

# print(pipeline.predict(X_input))
# print(df.iloc[i]["survived"])

# pred = pipeline.predict(X_input)
# y = df.iloc[i]["survived"]

[0]
0


# Export the pipeline

In [46]:
dump(pipeline, filename="./pipeline_model/titanic_pipeline.joblib")

['./pipeline_model/titanic_pipeline.joblib']

In [34]:
for var, coef in zip(list(df2.drop("survived", axis=1).columns), list(lr.coef_[0])):
    print(var, coef)

pclass -0.23156540020526195
age -0.09291260314186896
sibsp -0.19467973450666795
parch -0.04156462680125151
fare 0.05997498009387026
family_size -0.15078079734018335
fare_per_person 0.05497008142222499
alone -0.09593557494537071
age*class -0.200746172837031
dummy_male -0.308056186738117
dummy_female 0.3080561867381171
dummy_S -0.07627704258582577
dummy_C 0.10048223124827664
dummy_Q -0.019532055572217388
dummy_Mr -0.3310382301588715
dummy_Mrs 0.23371456272179644
dummy_Miss 0.10299936576467876
dummy_Master 0.16956775938402702
dummy_A 0.020605462010975056
dummy_B 0.06154515700276333
dummy_C 0.006790457867818288
dummy_D 0.08302648247068586
dummy_E 0.12419546535999203
dummy_F 0.07533874391532254
dummy_G -0.026431717625191038
dummy_T -0.03571321765514116
dummy_Unknown -0.14720359384563173


In [44]:
feature_imp = pd.Series(rfc.feature_importances_,index=list(df2.drop("survived", axis=1).columns)).sort_values(ascending=False)
feature_imp

dummy_female       0.157080
dummy_male         0.122606
dummy_Mr           0.107268
age*class          0.104286
fare_per_person    0.087265
fare               0.082521
age                0.057946
pclass             0.057525
dummy_Mrs          0.042855
dummy_Unknown      0.037416
family_size        0.033465
sibsp              0.028605
dummy_Miss         0.021712
dummy_S            0.010046
parch              0.009407
dummy_C            0.009082
dummy_Master       0.007188
alone              0.006662
dummy_E            0.005409
dummy_B            0.003016
dummy_Q            0.002409
dummy_C            0.002387
dummy_D            0.002343
dummy_A            0.000972
dummy_F            0.000466
dummy_G            0.000063
dummy_T            0.000000
dtype: float64