In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
import seaborn as sns

# Load the three CSV files from the current working directory.
survival = pd.read_csv("gender_submission.csv")  # not referenced again in the cells shown here
test = pd.read_csv("test.csv")  # receives the same feature engineering as `train` further down
train = pd.read_csv("train.csv")  # contains the "Survived" column used as the model target

Extract each passenger's title (Mr, Mrs, Miss, ...) by splitting the `Name` string:

In [2]:
def get_title(data_frame):
    """Add a "Title" column parsed from "Name" and list the distinct titles.

    Names are expected to look like "Surname, Title. Given names": the title
    is the text between the first ", " and the following ".".

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame with a "Name" column of strings. It is modified in place.

    Returns
    -------
    (pandas.DataFrame, list)
        The same frame with the new "Title" column, and the distinct titles
        in order of first appearance.
    """
    def extract(full_name):
        # Drop the surname, then keep everything up to the first period.
        after_surname = full_name.split(", ", 1)[1]
        return after_surname.split(".", 1)[0]

    data_frame["Title"] = [extract(full_name) for full_name in data_frame["Name"]]

    # unique() keeps first-appearance order, like a manual "seen" list would.
    distinct_titles = list(data_frame["Title"].unique())

    return data_frame, distinct_titles

train, titles = get_title(train)
print(titles)


['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms', 'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'the Countess', 'Jonkheer']


In [3]:
def title2int(data):
    """Collapse the "Title" column into five groups and encode them as integers.

    Rare titles are first folded into a common group and the group is then
    replaced by its code: Mr=0, Miss=1, Mrs=2, Master=3, Other=4.

    The result is assigned back to the column instead of calling
    ``data["Title"].replace(..., inplace=True)``: an in-place update on a
    selected column is deprecated in recent pandas and has no effect on the
    frame once Copy-on-Write is enabled.

    Values that are not in the lookup (including codes produced by an earlier
    call) are left untouched, so re-running the cell is harmless.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a "Title" column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame with "Title" encoded.
    """
    title_groups = {
        "Major": "Mr", "Capt": "Mr", "Sir": "Mr", "Dr": "Mr", "Don": "Mr",
        "Mlle": "Miss", "Mme": "Miss", "Ms": "Miss",
        "Dona": "Mrs", "Lady": "Mrs", "the Countess": "Mrs",
        "Jonkheer": "Other", "Col": "Other", "Rev": "Other",
    }
    group_codes = {"Mr": 0, "Miss": 1, "Mrs": 2, "Master": 3, "Other": 4}

    def encode(title):
        # Fall back to the value itself at each step so unknown entries
        # (and already-encoded integers) pass through unchanged.
        group = title_groups.get(title, title)
        return group_codes.get(group, group)

    data["Title"] = data["Title"].map(encode)
    return data

train = title2int(train)

In [4]:
def faregroup(data, threshold=65):
    """Add a "Fare_group" column: 2 for fares below ``threshold``, 1 otherwise.

    Missing fares yield a real missing value (NaN) rather than the string
    "NaN", so they are visible to ``isnull()`` and the column stays numeric.
    When no fare is missing the column has an integer dtype.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a numeric "Fare" column. It is modified in place.
    threshold : number, optional
        Fares greater than or equal to this value fall in group 1
        (default 65).

    Returns
    -------
    pandas.DataFrame
        The same frame with the new "Fare_group" column.
    """
    fare = data["Fare"]
    missing = fare.isnull().values

    groups = np.where(fare < threshold, 2.0, 1.0)
    if missing.any():
        # NaN compares False above; reset those rows to "unknown".
        groups[missing] = np.nan
    else:
        groups = groups.astype(int)

    data["Fare_group"] = groups
    return data

train=faregroup(train)

In [5]:
# Encode the embarkation port (S=1, Q=2, C=3) and assign the result back to the
# column. Calling .replace(..., inplace=True) on a selected column is deprecated
# and leaves the frame unchanged under pandas Copy-on-Write.
# Values that are not in the lookup (missing ports, already-encoded integers)
# pass through untouched, so the cell can be re-run safely.
embarked_codes = {"S": 1, "Q": 2, "C": 3}
train["Embarked"] = train["Embarked"].map(lambda port: embarked_codes.get(port, port))

In [6]:
def Cabin_type(data):
    """Overwrite "Cabin" with "Known" / "Unknown" depending on whether it is set.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a "Cabin" column. It is modified in place; the original
        cabin values are lost.

    Returns
    -------
    pandas.DataFrame
        The same frame with the recoded "Cabin" column.
    """
    # Take the mask once, before the column starts being overwritten.
    has_cabin = data["Cabin"].notnull()

    data.loc[has_cabin, "Cabin"] = "Known"
    data.loc[~has_cabin, "Cabin"] = "Unknown"

    return data

train = Cabin_type(train)

In [7]:
def cab2int(data):
    """Encode the "Cabin" column produced by ``Cabin_type``: Known=2, Unknown=1.

    The column is rebuilt with a single lookup instead of writing integers
    into a string column through boolean masks. Masked writes leave the dtype
    dependent on the pandas version and are rejected by strict string dtypes.
    Values other than "Known"/"Unknown" (including codes from an earlier
    call) are kept as they are.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a "Cabin" column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame with "Cabin" encoded.
    """
    cabin_codes = {"Known": 2, "Unknown": 1}
    data["Cabin"] = data["Cabin"].map(lambda status: cabin_codes.get(status, status))
    return data

train = cab2int(train)

In [8]:
# Encode sex (male=1, female=2) and assign the result back to the column.
# Calling .replace(..., inplace=True) on a selected column is deprecated and
# leaves the frame unchanged under pandas Copy-on-Write.
# Values outside the lookup pass through, so re-running the cell is safe.
sex_codes = {"male": 1, "female": 2}
train["Sex"] = train["Sex"].map(lambda sex: sex_codes.get(sex, sex))

Some ages are missing from the CSV, so we estimate them with a random forest regressor trained on the passengers whose age is known.

In [9]:
from sklearn.ensemble import RandomForestRegressor

def assign_missing_ages(data_frame, features):
    """Fill missing "Age" values with predictions from a random forest regressor.

    The regressor is fitted on the rows whose age is known and then used to
    predict the age of the remaining rows.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame containing every column named in ``features``. Its "Age"
        column is filled in place.
    features : list of str
        Column names with "Age" FIRST: column 0 is used as the regression
        target and the remaining columns as predictors. All columns must
        already be numeric.

    Returns
    -------
    (pandas.DataFrame, RandomForestRegressor)
        The same frame with no missing ages, and the fitted regressor so it
        can be reused on other data.
    """
    age_data = data_frame[features]
    missing_age = age_data["Age"].isnull()

    # .values replaces DataFrame.as_matrix(), which is deprecated and was
    # removed in later pandas releases.
    known_ages = age_data[~missing_age].values
    unknown_ages = age_data[missing_age].values

    # Column 0 is the target (Age); the rest are the predictors.
    target = known_ages[:, 0]
    predictors = known_ages[:, 1:]

    # Fixed seed so the imputed ages are reproducible.
    rfr = RandomForestRegressor(random_state = 0, n_estimators = 2000, n_jobs = -1)
    rfr.fit(predictors, target)

    # predict() rejects an empty array, so skip it when nothing is missing.
    if len(unknown_ages):
        Age_predictions = rfr.predict(unknown_ages[:, 1:])
        data_frame.loc[missing_age, "Age"] = Age_predictions

    return data_frame, rfr

# "Age" must stay first: the imputation code treats column 0 as the target
# and the remaining columns as predictors.
age_features = ["Age", "Sex", "SibSp", "Parch", "Pclass"]
train, rfr = assign_missing_ages(train, age_features)
train["Age"] = train["Age"].astype(int) # truncate the float ages (including the imputed ones) to whole years

  from numpy.core.umath_tests import inner1d
  """
  


New feature - age groups

In [10]:
def agegroup(data, boundaries=(13, 20, 32, 48, 65)):
    """Add an "Age_group" column that buckets "Age" into six ordered groups.

    With the default boundaries the groups are:
        1: under 13   (young kids)
        2: 13 to 19   (teens and young adults)
        3: 20 to 31   (adults)
        4: 32 to 47   (middle aged)
        5: 48 to 64   (initial elderly)
        6: 65 and up  (elderly)

    The bins are contiguous. The previous version used "< 10" for group 1
    and ">= 13" for group 2, so ages 10, 11 and 12 matched no group and kept
    the placeholder string "NaN". They now belong to group 1.

    A missing age yields a real missing value (NaN) rather than the string
    "NaN"; when no age is missing the column has an integer dtype.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a numeric "Age" column. It is modified in place.
    boundaries : sequence of numbers, optional
        Increasing lower bounds of groups 2, 3, ... (each bound is inclusive
        for the group it opens).

    Returns
    -------
    pandas.DataFrame
        The same frame with the new "Age_group" column.
    """
    edges = [-np.inf] + list(boundaries) + [np.inf]

    # right=False makes every bin [lower, upper); labels=False returns the
    # 0-based bin index, shifted here to the 1-based group number.
    data["Age_group"] = pd.cut(data["Age"], bins=edges, right=False, labels=False) + 1
    return data

train=agegroup(train)   

New feature - flag each passenger as a child (18 or younger, encoded 0) or an adult (encoded 1).

In [11]:
def childgroup(data, max_child_age=18):
    """Add a "Child" column: 0 for children, 1 for adults.

    Note the encoding: 0 means child (Age <= ``max_child_age``) and 1 means
    adult (Age > ``max_child_age``).

    A missing age yields a real missing value (NaN) rather than the string
    "NaN"; when no age is missing the column has an integer dtype.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a numeric "Age" column. It is modified in place.
    max_child_age : number, optional
        Highest age still counted as a child (default 18).

    Returns
    -------
    pandas.DataFrame
        The same frame with the new "Child" column.
    """
    age = data["Age"]
    missing = age.isnull().values

    flags = np.where(age > max_child_age, 1.0, 0.0)
    if missing.any():
        # NaN compares False above; reset those rows to "unknown".
        flags[missing] = np.nan
    else:
        flags = flags.astype(int)

    data["Child"] = flags
    return data

train = childgroup(train)

New feature representing family size, where Parch is the number of parents and children aboard the Titanic, and SibSp is the number of siblings and spouses aboard. The extra 1 counts the passenger themselves.

In [12]:
train["FamSize"] = train["SibSp"] + train["Parch"] + 1

New feature - flag each passenger as travelling alone (0) or with family (1).

In [13]:
def family(data):
    """Add a "Fam_group" column: 0 when travelling alone, 1 when with family.

    A "FamSize" of exactly 1 gives 0 (single) and anything greater than 1
    gives 1 (family). A missing size, or a size below 1, yields a real
    missing value (NaN) rather than the string "NaN". When every row has a
    valid size the column has an integer dtype.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a numeric "FamSize" column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame with the new "Fam_group" column.
    """
    size = data["FamSize"]
    # Neither "== 1" nor "> 1" holds for NaN or for sizes below 1.
    undefined = (~(size >= 1)).values

    flags = np.where(size > 1, 1.0, 0.0)
    if undefined.any():
        flags[undefined] = np.nan
    else:
        flags = flags.astype(int)

    data["Fam_group"] = flags
    return data

train = family(train)

In [14]:
train_one = train[:]

Make a new dataframe that keeps only the passenger id, the target and the features selected for modelling.

In [15]:
# Keep the identifier, the target and the encoded inputs. Engineered columns
# such as Age_group, Fare_group and FamSize are deliberately not in this list.
columns_titles = ["PassengerId", "Survived", "Pclass", "Title", "Sex", "Child", "Fam_group", "Fare", "Cabin", "Embarked"]
train_one = train_one[columns_titles]
train_one.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Title,Sex,Child,Fam_group,Fare,Cabin,Embarked
0,1,0,3,0,1,1,1,7.25,1,1
1,2,1,1,2,2,1,1,71.2833,2,3
2,3,1,3,1,2,1,0,7.925,1,1
3,4,1,1,2,2,1,1,53.1,2,1
4,5,0,3,0,1,1,0,8.05,1,1
5,6,0,3,0,1,1,0,8.4583,1,2
6,7,0,1,0,1,1,0,51.8625,2,1
7,8,0,3,3,1,0,1,21.075,1,1
8,9,1,3,2,2,1,1,11.1333,1,1
9,10,1,2,2,2,0,1,30.0708,1,3


Build the same features for the test dataset, reusing the age regressor that was fitted on the training data.

In [16]:
# Encode the categorical columns by assigning the result back to the frame.
# Calling .replace(..., inplace=True) on a selected column is deprecated and
# leaves the frame unchanged under pandas Copy-on-Write. Values outside each
# lookup pass through untouched, so the cell can be re-run safely.
embarked_codes = {"S": 1, "Q": 2, "C": 3}
test["Embarked"] = test["Embarked"].map(lambda port: embarked_codes.get(port, port))
test["Fare"] = test["Fare"].fillna(test["Fare"].median())

test, test_titles = get_title(test)
test = title2int(test)
sex_codes = {"male": 1, "female": 2}
test["Sex"] = test["Sex"].map(lambda sex: sex_codes.get(sex, sex))
test = Cabin_type(test)
test = cab2int(test)
test = faregroup(test)

# Impute the missing ages with the regressor fitted on the training set.
# Column 0 of age_features is "Age" itself, so it is sliced off before predicting.
# .values replaces the deprecated (later removed) DataFrame.as_matrix().
temp_test = test[age_features]
test_missing_age = test["Age"].isnull()
test_unknown_ages = temp_test[test_missing_age].values
if len(test_unknown_ages):  # predict() rejects an empty array
    test_Age_predictions = rfr.predict(test_unknown_ages[:, 1:])
    test.loc[test_missing_age, "Age"] = test_Age_predictions
test["Age"] = test["Age"].astype(int)  # truncate to whole years, as done for train

test = agegroup(test)
test = childgroup(test)
test["FamSize"] = test["SibSp"] + test["Parch"] + 1
test = family(test)

# Same columns as the training frame, minus the "Survived" target.
test_one = test[:]
test_columns_titles = ["PassengerId", "Pclass", "Title", "Sex", "Child", "Fam_group", "Fare", "Cabin", "Embarked"]
test_one = test_one[test_columns_titles]
test_one.info()
test_one.head()

  if sys.path[0] == '':


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 9 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Title          418 non-null int64
Sex            418 non-null int64
Child          418 non-null int64
Fam_group      418 non-null int64
Fare           418 non-null float64
Cabin          418 non-null int64
Embarked       418 non-null int64
dtypes: float64(1), int64(8)
memory usage: 29.5 KB


Unnamed: 0,PassengerId,Pclass,Title,Sex,Child,Fam_group,Fare,Cabin,Embarked
0,892,3,0,1,1,0,7.8292,1,2
1,893,3,2,2,1,1,7.0,1,1
2,894,2,0,1,1,0,9.6875,1,2
3,895,3,0,1,1,0,8.6625,1,1
4,896,3,2,2,1,1,12.2875,1,1


Train three different classifiers (decision tree, random forest and gradient boosting) and compare their accuracy on a held-out part of the training data.

In [17]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

def my_models(model, X_train, Y_train, X_test, Y_test):
    """Fit ``model``, print its diagnostics and return its held-out accuracy.

    Prints the fitted model's ``feature_importances_`` followed by its
    accuracy on the training data, then scores it on the held-out data.

    Single-column 2-D targets (for example a one-column DataFrame) are
    flattened to 1-D first; passing them straight to ``fit`` triggers
    scikit-learn's "column-vector y was passed" conversion warning.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn style estimator that exposes
        ``feature_importances_`` once fitted.
    X_train, X_test : array-like
        Feature matrices for fitting and for evaluation.
    Y_train, Y_test : array-like
        Matching targets (1-D, or 2-D with a single column).

    Returns
    -------
    (float, estimator)
        Accuracy on ``X_test`` / ``Y_test`` and the fitted model.
    """
    def as_1d(labels):
        # Only flatten genuine column vectors; leave anything else alone.
        values = np.asarray(labels)
        if values.ndim == 2 and values.shape[1] == 1:
            return values.ravel()
        return labels

    y_train = as_1d(Y_train)
    y_test = as_1d(Y_test)

    my_model = model.fit(X_train, y_train)

    print(my_model.feature_importances_)
    print(my_model.score(X_train, y_train))

    model_prediction = my_model.predict(X_test)
    # accuracy_score expects (y_true, y_pred); the result is the same either
    # way, but the documented order keeps the call unambiguous.
    acc = metrics.accuracy_score(y_test, model_prediction)

    return acc, my_model

In [18]:
final_features = ["Pclass", "Title", "Sex", "Child", "Fam_group", "Fare", "Cabin", "Embarked"]
final_data = train_one[["Survived"] + final_features]

# Hold out 30% of the rows, stratified on the target so both parts keep the
# same survival rate; the fixed seed makes the split reproducible.
training, testing = train_test_split(final_data, test_size = 0.3, random_state = 0, stratify = final_data["Survived"])

# Select by column name rather than by position, and keep the target as a
# 1-D Series: a one-column DataFrame target makes scikit-learn emit a
# "column-vector y was passed" warning on fit.
X_train = training[final_features]
Y_train = training["Survived"]
X_test = testing[final_features]
Y_test = testing["Survived"]

In [19]:
# All three estimators share random_state = 0 so that repeated runs give the
# same fitted models.
tree_model = tree.DecisionTreeClassifier(
    max_depth = 8,
    max_leaf_nodes = 7,
    min_samples_leaf = 10,
    random_state = 0,
)
forest_model = RandomForestClassifier(
    max_depth = 8,
    max_leaf_nodes = 9,
    n_estimators = 300,
    random_state = 0,
)
gradboost_model = GradientBoostingClassifier(
    learning_rate = 0.01,
    max_depth = 7,
    max_features = 1.0,
    n_estimators = 200,
    subsample = 1.0,
    random_state = 0,
)

In [20]:
# Fit each model and report its accuracy on the held-out split.
# The message is built with str.format so that it prints as plain text on
# both Python 2 and Python 3; print("label", value) shows a tuple on Python 2.
tree_acc, my_tree = my_models(tree_model, X_train, Y_train, X_test, Y_test)
print("Decision Tree Accuracy {}".format(tree_acc))

forest_acc, my_forest = my_models(forest_model, X_train, Y_train, X_test, Y_test)
print("Random Forest Accuracy {}".format(forest_acc))

gradboost_acc, my_gradboost = my_models(gradboost_model, X_train, Y_train, X_test, Y_test)
print("Gradient Boosting Accuracy {}".format(gradboost_acc))

[0.17282957 0.63346797 0.027044   0.         0.         0.16665846
 0.         0.        ]
0.8282504012841091
('Decision Tree Accuracy', 0.8283582089552238)


  


[0.13131036 0.28242942 0.31809998 0.01663327 0.02454316 0.1464216
 0.06272762 0.01783459]
0.841091492776886
('Random Forest Accuracy', 0.8395522388059702)


  y = column_or_1d(y, warn=True)


[0.09625649 0.3958207  0.01048569 0.02133019 0.01740859 0.39603976
 0.02670432 0.03595427]
0.9213483146067416
('Gradient Boosting Accuracy', 0.8208955223880597)
