In [1]:
from modin import pandas as pd
import numpy as np
from collections import Counter
from copy import deepcopy
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn import preprocessing

### Data Pre-processing and Feature Engineering

 - Gender has been converted into a Numerical attribute (Female - 0, Male - 1)
 
 
 - Embarked is the port at which the passenger boarded; it is likewise converted into a numerical attribute (Q - 1, C - 2, S - 3, others - 0)
 
 
 - Cabin - The first letter of the cabin indicates the deck. It turns out that the deck matters when it comes to survival.
 - The deck is taken from the first character of Cabin (passengers with no cabin recorded get the placeholder deck "M"). These letters are later label encoded.
 
 
 - SibSp - This is the total number of siblings/spouses on board
 - Parch - This is the total number of parents and children on board
 - These two are summed to form `Family_Size`
 
 
 - Fare, Age and Pclass have been left as is. 

In [2]:
# Load the raw train and test CSV files.
train = pd.read_csv("~/Downloads/train.csv")
test = pd.read_csv("~/Downloads/test.csv")

# Per-column count of missing values: train first, then test.
print(train.isna().sum(), test.isna().sum(), sep="\n")


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


To request implementation, send an email to feature_requests@modin.org.


In [3]:
# Pairwise correlation of the numeric columns only. The text columns are
# excluded explicitly because recent pandas versions no longer drop them
# silently inside corr() and raise instead.
train.select_dtypes(include="number").corr()



Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,0.036847,-0.057527,-0.001652,0.012658
Survived,-0.005007,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.035144,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,0.036847,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.057527,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0


### Filling the missing values
 - In both the train and test datasets, Age and Cabin are the two features with the most missing values
 - One way to fill this data is to fill it with the mean() or median()
 - However for Cabin, it'd help if we create a new cabin called "Missing"/"M"
 - But for Age, it can also help to look at its correlation with the other variables so that we can fill the data accordingly.
 - We can see that both Fare and Age are most strongly correlated with Pclass (about -0.55 and -0.37 respectively), so instead of filling with the overall mean or median we fill each missing value with the mean of the corresponding Pclass.

In [4]:
# Mean age within each passenger class (1, 2, 3), computed on the training set.
class_1_age, class_2_age, class_3_age = (
    train.loc[train["Pclass"] == pclass, "Age"].mean() for pclass in (1, 2, 3)
)
print(class_1_age, class_2_age, class_3_age)

38.233440860215055 29.87763005780347 25.14061971830986


In [5]:
# Mean fare within each passenger class (1, 2, 3), computed on the training set.
class_1_fare, class_2_fare, class_3_fare = (
    train.loc[train["Pclass"] == pclass, "Fare"].mean() for pclass in (1, 2, 3)
)
print(class_1_fare, class_2_fare, class_3_fare)

84.1546875 20.662183152173913 13.675550101832993


In [6]:
def fill_age(age, pclass):
    """Return ``age``, replacing a missing value with the mean age of its class.

    Parameters
    ----------
    age : float
        Passenger age; NaN marks a missing value.
    pclass : int
        Passenger class. 1, 2 and 3 select the module-level means
        ``class_1_age``, ``class_2_age`` and ``class_3_age``.

    Returns
    -------
    float
        ``age`` itself when it is not NaN, otherwise the mean age of the
        matching class. A missing age with any other ``pclass`` is returned
        unchanged (still NaN).
    """
    if not np.isnan(age):
        return age
    if pclass == 1:
        return class_1_age
    if pclass == 2:
        return class_2_age
    if pclass == 3:
        return class_3_age
    # Unrecognised class: hand back the NaN rather than falling off the end of
    # the function and returning an implicit None.
    return age

def fill_fare(fare, pclass):
    """Return ``fare``, replacing a missing value with the mean fare of its class.

    Parameters
    ----------
    fare : float
        Ticket fare; NaN marks a missing value.
    pclass : int
        Passenger class. 1, 2 and 3 select the module-level means
        ``class_1_fare``, ``class_2_fare`` and ``class_3_fare``.

    Returns
    -------
    float
        ``fare`` itself when it is not NaN, otherwise the mean fare of the
        matching class. A missing fare with any other ``pclass`` is returned
        unchanged (still NaN).
    """
    if not np.isnan(fare):
        return fare
    if pclass == 1:
        return class_1_fare
    if pclass == 2:
        return class_2_fare
    if pclass == 3:
        return class_3_fare
    # Unrecognised class: hand back the NaN rather than falling off the end of
    # the function and returning an implicit None.
    return fare
    

In [7]:
def get_feature(entry):
    """Turn one passenger record into a dict of model features.

    ``entry`` is any mapping-like row that supports ``entry["column"]``.

    Encodings:
      * Sex         - 1 when the value is "male" (case-insensitive), else 0.
                      Only this one binary column is produced; a separate
                      "female" indicator would duplicate the same information.
      * Embarked    - Q -> 1, C -> 2, S -> 3, anything else -> 0.
      * Cabin       - first character of the cabin string, or "M" when the
                      value is not a string.
      * Family_Size - SibSp + Parch.
      * Age, Pclass, Fare - copied through unchanged.
      * Survived    - copied through when the row has it, otherwise omitted.
    """
    embarked_codes = {"Q": 1, "C": 2, "S": 3}
    cabin = entry["Cabin"]

    # The key order here is the order the keys appear in the returned dict.
    feature = {
        "Sex": 1 if entry["Sex"].lower() == "male" else 0,
        "Embarked": embarked_codes.get(entry["Embarked"], 0),
        "Cabin": cabin[0] if isinstance(cabin, str) else "M",
        "Age": entry["Age"],
        "Pclass": entry["Pclass"],
        "Family_Size": entry["SibSp"] + entry["Parch"],
        "Fare": entry["Fare"],
    }

    # Rows without a "Survived" value simply do not get the key.
    try:
        feature["Survived"] = entry["Survived"]
    except KeyError:
        pass
    return feature

In [8]:
# Work on a separate frame so `train` stays untouched; the identifier-like
# columns are not used as features.
dt_df = train.drop(columns=["Name", "Ticket", "PassengerId"])

# Impute missing ages with the mean age of the passenger's class.
dt_df["Age"] = dt_df.apply(lambda row: fill_age(row["Age"], row["Pclass"]), axis=1)

# One feature dict per passenger.
all_features = dt_df.apply(get_feature, axis=1)

# NOTE(review): `all_features[0]` is treated here as the full collection of
# per-row dicts, which presumably relies on how modin shapes the result of a
# row-wise apply that returns dicts - confirm before switching to plain pandas.
dt_df = pd.DataFrame.from_dict(list(all_features[0]))

# Deck letters -> integer codes. `le` keeps the fitted mapping.
le = preprocessing.LabelEncoder()
dt_df["Cabin"] = le.fit_transform(dt_df["Cabin"])

# Every column except the target is a model input.
features_cols = [column for column in dt_df.columns if column != "Survived"]

X = dt_df[features_cols]  # Features
y = dt_df["Survived"]  # Target variable

# Hold out 25% of the rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1
)
print(X_train.head())



   Sex  Embarked  Cabin   Age  Pclass  Family_Size     Fare  Survived
0    1         3      7  22.0       3            1   7.2500         0
1    0         2      2  38.0       1            1  71.2833         1
2    0         3      7  26.0       3            0   7.9250         1
3    0         3      2  35.0       1            1  53.1000         1
4    1         3      7  35.0       3            0   8.0500         0
     Sex  Embarked  Cabin       Age  Pclass  Family_Size     Fare
35     1         3      7  42.00000       1            1  52.0000
46     1         1      7  25.14062       3            1  15.5000
453    1         2      2  49.00000       1            1  89.1042
291    0         2      1  19.00000       1            1  91.0792
748    1         3      3  19.00000       1            1  53.1000


In [None]:
# Bag 100 decision trees.
# `lr` holds the decision-tree base estimator despite its name; the name is
# kept as-is in case other cells refer to it.
lr = DecisionTreeClassifier()

# The base estimator is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, and the
# positional form is accepted by both. `random_state` makes the bootstrap
# samples, and therefore the reported accuracy, reproducible across runs.
clf = BaggingClassifier(lr, n_estimators=100, random_state=1)
clf = clf.fit(X_train, y_train)

# Predict the response for the held-out split
y_pred = clf.predict(X_test)

# Model accuracy: how often is the classifier correct?
print("Model Trained on", len(X_train), "entries.")
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print(y_pred)

### Testing Module

In [None]:
# Build the feature matrix for the unlabeled test set using the same steps as
# for training, then write the predictions to a CSV file.
dt_df = deepcopy(test)
del dt_df["Name"], dt_df["Ticket"], dt_df["PassengerId"]

# Fill missing values from the class means, using each row's own Pclass.
# Fare is imputed with fill_fare (class fare means), not fill_age.
dt_df["Age"] = dt_df.apply(lambda x: fill_age(x["Age"], x["Pclass"]), axis=1)
dt_df["Fare"] = dt_df.apply(lambda x: fill_fare(x["Fare"], x["Pclass"]), axis=1)

all_features = dt_df.apply(lambda entry: get_feature(entry), axis=1)
dt_df = pd.DataFrame.from_dict(list(all_features[0]))

# Reuse the encoder fitted on the training data so each deck letter maps to
# the same integer the model was trained on. A deck letter that never occurred
# in training makes transform() raise instead of silently receiving a code
# with a different meaning.
dt_df["Cabin"] = le.transform(dt_df["Cabin"])

# Select the training feature columns explicitly so the column order matches
# what the classifier was fitted on.
survived = clf.predict(dt_df[features_cols])
passenger_id = test["PassengerId"]
result_df = pd.DataFrame({"PassengerId": passenger_id, "Survived": survived})
result_df.to_csv("titanic_results.csv", index=False)