### Step 1: Read & Split DataFrame

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
import re
import numpy as np

In [17]:
df = pd.read_csv('titanic_data/train.csv')

In [18]:
df.shape
df.columns
#df.head()

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [19]:
X_col = ['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

In [20]:
X = df[X_col]
y = df[['Survived']]
X.shape, y.shape

((891, 9), (891, 1))

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X,y)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((668, 9), (223, 9), (668, 1), (223, 1))

In [22]:
pd.options.mode.chained_assignment=None

### Step 2: Feature Engineering

In [114]:
def titanic_features(X_train):
    X_train['Titles'] = X_train['Name'].str.extract('([A-Z]\w{0,}\.)', expand=True)
    title_ages = X_train[['Age', 'Titles']]
    title_ages = title_ages.groupby('Titles')[['Age']].mean()
    new_ages = []
    for i, row in X_train.iterrows():
        if pd.isna(row['Age']):
            val = title_ages.loc[row['Titles']]['Age']
            new_ages.append(val)
        else:
            val = row['Age']
            new_ages.append(val)

    new_ages = [round(i, ndigits=0) for i in new_ages]
    X_train = X_train.assign(New_Age = new_ages)
    X_train['Name'].str.extract('([A-Z]\w{0,}\.)', expand=True)[0].value_counts()
    embark_cols = pd.get_dummies(X_train['Embarked'])
    X_train = pd.merge(left=X_train, right=embark_cols, how='inner', on=X_train.index).set_index('key_0')
    embark_cols2 = pd.get_dummies(X_train['Titles'])
    X_train = pd.merge(left=X_train, right=embark_cols2, how='inner', on=X_train.index).set_index('key_0')
    X_train['Gender'] = X_train.Sex.map({'female':1, 'male':0})
    
    titles = ['Col.', 'Dona.', 'Capt.', 'Countess.', 'Dr.', 'Jonkheer.', 'Lady.', 'Major.', 'Master.', 'Miss.', 'Mlle.', 'Mme.', 'Mr.', 'Mrs.', 'Ms.', 'Rev.', 'Sir.']
    for i in titles:
        if i not in X_train.columns:
            X_train[f'{i}'] = np.zeros(shape= (len(X_train),))
        else:
            print(i +' already in df')
    X_train.drop(['Name', 'Age', 'Embarked', 'Titles', 'Sex'], axis=1, inplace=True)
    X_train = X_train[['PassengerId', 'Pclass', 'SibSp', 'Parch', 'New_Age', 'Gender', 'Fare', 'S', 'Q', 'C', 'Col.', 'Dona.', 'Capt.', 'Countess.', 'Dr.', 'Jonkheer.', 'Lady.', 'Major.', 'Master.', 'Miss.', 'Mlle.', 'Mme.', 'Mr.', 'Mrs.', 'Ms.', 'Rev.', 'Sir.']]
    
   # X_train = [X_train[i]=25 for i in X_train]
   # X_train = X_train.dropna(axis=0)
    X_train = X_train.fillna(method= 'bfill')
    del X_train.index.name
    print('Engineering done.')
    return X_train

In [110]:
X_train.shape

(668, 27)

In [25]:
X_train=titanic_features(X_train)

Col. already in df
Capt. already in df
Countess. already in df
Dr. already in df
Lady. already in df
Major. already in df
Master. already in df
Miss. already in df
Mlle. already in df
Mme. already in df
Mr. already in df
Mrs. already in df
Ms. already in df
Rev. already in df
Engineering done.


In [27]:
y_train.head()

Unnamed: 0,Survived
520,1
470,0
102,0
426,1
179,0


### Step 3: Train & Test ML model

In [28]:
# initialize logistic regression function
m = LogisticRegression()

In [129]:
from sklearn.tree import DecisionTreeClassifier
m_tree = DecisionTreeClassifier()

In [130]:
m.fit(X_train, y_train)
m_tree.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [132]:
m.coef_, m.intercept_

(array([[ 1.72418846e-04, -9.29738810e-01, -5.88323242e-01,
         -2.75187761e-01, -2.76490844e-02,  1.65545537e+00,
          4.34113241e-03,  2.51405813e-01,  5.14347268e-01,
          8.45776959e-01, -3.64714397e-03,  0.00000000e+00,
         -1.17440697e-01,  4.78658273e-02, -1.57225078e-02,
          0.00000000e+00,  8.53051951e-02,  2.36444060e-02,
          1.66222152e+00,  3.39654720e-01,  6.06927449e-02,
          3.21108683e-02, -8.97249287e-01,  9.66876896e-01,
          1.22949120e-01, -5.92374933e-01,  0.00000000e+00]]),
 array([1.71488673]))

In [31]:
m.score(X_train, y_train)

0.8368263473053892

### Step 4: Prediction

In [133]:
# read test data set
df_test = pd.read_csv('titanic_data/test.csv')

In [134]:
#df_test['PassengerId']
df_test.shape

(418, 11)

In [135]:
# use Feature Engineering steps on test-data
# use titanic_features function for it
dfx = titanic_features(df_test)
#dfx.isna().values.any()
dfx.isna().sum()
dfx[dfx['New_Age'].isna()]

Col. already in df
Dona. already in df
Dr. already in df
Master. already in df
Miss. already in df
Mr. already in df
Mrs. already in df
Ms. already in df
Rev. already in df
Engineering done.


Unnamed: 0,PassengerId,Pclass,SibSp,Parch,New_Age,Gender,Fare,S,Q,C,...,Major.,Master.,Miss.,Mlle.,Mme.,Mr.,Mrs.,Ms.,Rev.,Sir.


In [136]:
dfx.head()

Unnamed: 0,PassengerId,Pclass,SibSp,Parch,New_Age,Gender,Fare,S,Q,C,...,Major.,Master.,Miss.,Mlle.,Mme.,Mr.,Mrs.,Ms.,Rev.,Sir.
0,892,3,0,0,34.0,0,7.8292,0,1,0,...,0.0,0,0,0.0,0.0,1,0,0,0,0.0
1,893,3,1,0,47.0,1,7.0,1,0,0,...,0.0,0,0,0.0,0.0,0,1,0,0,0.0
2,894,2,0,0,62.0,0,9.6875,0,1,0,...,0.0,0,0,0.0,0.0,1,0,0,0,0.0
3,895,3,0,0,27.0,0,8.6625,1,0,0,...,0.0,0,0,0.0,0.0,1,0,0,0,0.0
4,896,3,1,1,22.0,1,12.2875,1,0,0,...,0.0,0,0,0.0,0.0,0,1,0,0,0.0


In [137]:
test_pass_ids = dfx['PassengerId']
#test_pass_ids

In [121]:
predict = m.predict(dfx)
len(predict)

418

In [139]:
predict_tree = m_tree.predict(dfx)
len(predict_tree)

418

In [140]:
kaggle_df = pd.DataFrame(dfx['PassengerId'])

In [141]:
kaggle_df.shape

(418, 1)

In [142]:
#kaggle_df['Survived'] = predict
kaggle_df['Survived'] = predict_tree

In [143]:
kaggle_df.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,1
4,896,0


In [144]:
kaggle_df.set_index('PassengerId', inplace=True)

In [145]:
#del kaggle_df.index.name
kaggle_df.head()

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,1
894,0
895,1
896,0


In [146]:
kaggle_df.to_csv('kaggel_submission_tree.csv')