In [169]:
# Import all libraries needed
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score


In [170]:
# Import data needed and separate them into training and test dataset
train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
train = pd.read_csv(train_url)
test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv"
test = pd.read_csv(test_url)

# Just to be careful, we will copy them here
original_train =  train.copy()
original_test = test.copy()
PassengerId = test['PassengerId']

# Check the columns
print(train.columns.values)

['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
 'Ticket' 'Fare' 'Cabin' 'Embarked']


In [171]:
# Check the sample data
print(train.head())
print(test.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
  

In [172]:
# Get the statistics
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [173]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [174]:
def get_title(name):
    if '.' in name:
        return name.split(',')[1].split('.')[0].strip()
    else:
        return "Unknown"

data = [train, test]

# Create new features
for partial in data: 
    partial["Title"] = "Unknown"
    partial["Title"] = partial["Name"].apply(get_title)



In [175]:
# Find the proportion of survived based on features
train["Survived"][train["Age"] < 18].value_counts(normalize = True, dropna = True)
train["Survived"][train["Age"] >= 18].value_counts(normalize = True, dropna = True)

# Adult tends to not survived

0    0.618968
1    0.381032
Name: Survived, dtype: float64

In [176]:
import numpy as np

data = [train, test]

# Remove NA values
for partial in data:
    partial["Fare"] = partial["Fare"].fillna(train.Fare.median())
    partial["Embarked"] = partial["Embarked"].fillna('S')
    # Fill age by using random number
    avg_age = partial["Age"].mean()
    var_age = partial["Age"].std()
    null_count = partial["Age"].isnull().sum()
    age_null_random_list = np.random.randint(avg_age - var_age, avg_age + var_age, size=null_count)
    partial.loc[np.isnan(partial['Age']), 'Age'] = age_null_random_list
    partial['Age'] = partial['Age'].astype(int)
    
    # Group data
    partial['Title'] = partial['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    partial['Title'] = partial['Title'].replace('Mlle', 'Miss')
    partial['Title'] = partial['Title'].replace('Ms', 'Miss')
    partial['Title'] = partial['Title'].replace('Mme', 'Mrs')
    
    # Mapping titles
    title_mapping = {"Mr": 1, "Master": 2, "Mrs": 3, "Miss": 4, "Rare": 5}
    partial['Title'] = partial['Title'].map(title_mapping)
    partial['Title'] = partial['Title'].fillna(0)
    
    # Mapping Embarked
    partial['Embarked'] = partial['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
    
    # Mapping Sex
    partial['Sex'] = partial['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
    
     # Mapping Fare
    partial.loc[ partial['Fare'] <= 7.91, 'Fare'] 						        = 0
    partial.loc[(partial['Fare'] > 7.91) & (partial['Fare'] <= 14.454), 'Fare'] = 1
    partial.loc[(partial['Fare'] > 14.454) & (partial['Fare'] <= 31), 'Fare']   = 2
    partial.loc[ partial['Fare'] > 31, 'Fare'] 							        = 3
    partial['Fare'] = partial['Fare'].astype(int)
    
    # Mapping Age
    partial.loc[ partial['Age'] <= 16, 'Age'] 					       = 0
    partial.loc[(partial['Age'] > 16) & (partial['Age'] <= 32), 'Age'] = 1
    partial.loc[(partial['Age'] > 32) & (partial['Age'] <= 48), 'Age'] = 2
    partial.loc[(partial['Age'] > 48) & (partial['Age'] <= 64), 'Age'] = 3
    partial.loc[ partial['Age'] > 64, 'Age'] ;

    
train.isnull().sum()
    
    


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
Title            0
dtype: int64

In [177]:
test.isnull().sum()


PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
Title            0
dtype: int64

In [178]:
# Convert categorical variables to factor
train["Embarked"] = train["Embarked"].astype("category")
train["Title"] = train["Title"].astype("category")
train["Sex"] = train["Sex"].astype("category")

In [179]:
# Cleaning the data
# Preparing the data
# Drop unused columns
train = train.drop(columns = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp'])
test = test.drop(columns = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp'])

In [180]:
# Calculate Gini impurity
def get_gini_impurity(survived_count, total):
    survived_prob = survived_count / total
    survived_prob_c  = (1 -  survived_prob)
    rand_survived_prob = survived_prob
    rand_survived_prob_c = (1 - rand_survived_prob)
    mislabelling_survived_prob = survived_prob_c * rand_survived_prob
    mislabelling_not_survived_prob = survived_prob * rand_survived_prob_c
    gini_impurity = mislabelling_not_survived_prob + mislabelling_survived_prob
    return gini_impurity
    

In [181]:
train[['Sex', 'Survived']].groupby(['Sex'], as_index=False).agg(['mean', 'count', 'sum'])

Unnamed: 0_level_0,Survived,Survived,Survived
Unnamed: 0_level_1,mean,count,sum
Sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,0.742038,314,233
1,0.188908,577,109


In [182]:
# Let's use our 'original_train' dataframe to check the sex distribution for each title.
# We use copy() again to prevent modifications in out original_train dataset
title_and_sex = original_train.copy()[['Name', 'Sex']]

# Create 'Title' feature
title_and_sex['Title'] = title_and_sex['Name'].apply(get_title)

# Map 'Sex' as binary feature
title_and_sex['Sex'] = title_and_sex['Sex'].map( {'female': 0, 'male': 1} ).astype(int)

# Table with 'Sex' distribution grouped by 'Title'
title_and_sex[['Title', 'Sex']].groupby(['Title'], as_index=False).agg(['mean', 'count', 'sum'])


Unnamed: 0_level_0,Sex,Sex,Sex
Unnamed: 0_level_1,mean,count,sum
Title,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Capt,1.0,1,1
Col,1.0,2,2
Don,1.0,1,1
Dr,0.857143,7,6
Jonkheer,1.0,1,1
Lady,0.0,1,0
Major,1.0,2,2
Master,1.0,40,40
Miss,0.0,182,0
Mlle,0.0,2,0


In [183]:
# Gini Impurity of starting node
gini_impurity_starting_node = get_gini_impurity(342, 891)
gini_impurity_starting_node

0.47301295786144265

In [184]:
gini_impurity_men = get_gini_impurity(109, 577)
gini_impurity_men

0.3064437162277843

In [185]:
# Gini Impurity decrease if node splited for 'female' observations
gini_impurity_women = get_gini_impurity(233, 314)
gini_impurity_women

0.3828350034484158

In [186]:
# Gini Impurity decrease if node splited by Sex
men_weight = 577/891
women_weight = 314/891
weighted_gini_impurity_sex_split = (gini_impurity_men * men_weight) + (gini_impurity_women * women_weight)

sex_gini_decrease = weighted_gini_impurity_sex_split - gini_impurity_starting_node
sex_gini_decrease


-0.13964795747285214

In [187]:
# Gini Impurity decrease of node for observations with Title == 1 == Mr
gini_impurity_title_1 = get_gini_impurity(81, 517)
gini_impurity_title_1

0.26425329886377663

In [188]:
# Gini Impurity decrease if node splited for observations with Title != 1 != Mr
gini_impurity_title_others = get_gini_impurity(261, 374)
gini_impurity_title_others

0.42170207898424317

In [189]:
# Gini Impurity decrease if node splited for observations with Title == 1 == Mr
title_1_weight = 517/891
title_others_weight = 374/891
weighted_gini_impurity_title_split = (gini_impurity_title_1 * title_1_weight) + (gini_impurity_title_others * title_others_weight)

title_gini_decrease = weighted_gini_impurity_title_split - gini_impurity_starting_node
title_gini_decrease

-0.14267004758907514

In [190]:
train.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
Parch       0
Fare        0
Embarked    0
Title       0
dtype: int64

In [None]:
# Validate with KFold
cv = KFold(n_splits=10)            # Desired number of Cross Validation folds
accuracies = list()
max_attributes = len(list(test))
depth_range = range(1, max_attributes + 1)

# Testing max_depths from 1 to max attributes
# Uncomment prints for details about each Cross Validation pass
for depth in depth_range:
    fold_accuracy = []
    tree_model = RandomForestClassifier(n_estimators = 300, max_depth = depth)
    # print("Current max depth: ", depth, "\n")
    for train_fold, valid_fold in cv.split(train):
        f_train = train.loc[train_fold] # Extract train data with cv indices
        f_valid = train.loc[valid_fold] # Extract valid data with cv indices

        model = tree_model.fit(X = f_train.drop(['Survived'], axis=1), 
                               y = f_train["Survived"]) # We fit the model with the fold train data
        valid_acc = model.score(X = f_valid.drop(['Survived'], axis=1), 
                                y = f_valid["Survived"])# We calculate accuracy with the fold validation data
        fold_accuracy.append(valid_acc)

    avg = sum(fold_accuracy)/len(fold_accuracy)
    accuracies.append(avg)
    # print("Accuracy per fold: ", fold_accuracy, "\n")
    # print("Average accuracy: ", avg)
    # print("\n")
    
# Just to show results conveniently
df = pd.DataFrame({"Max Depth": depth_range, "Average Accuracy": accuracies})
df = df[["Max Depth", "Average Accuracy"]]
print(df.to_string(index=False))

Max Depth  Average Accuracy
        1          0.783421
        2          0.795718
        3          0.802459
        4          0.814844
        5          0.818252
        6          0.814869
        7          0.813758


In [192]:
# Create Numpy arrays of train, test and target (Survived) dataframes to feed into our models
y_train = train['Survived']
x_train = train.drop(['Survived'], axis=1).values 
x_test = test.values

# Create Decision Tree with max_depth = 3
decision_tree = RandomForestClassifier(n_estimators = 300, max_depth = 6)
decision_tree.fit(x_train, y_train)

# Predicting results for test dataset
y_pred = decision_tree.predict(x_test)
submission = pd.DataFrame({
        "PassengerId": PassengerId,
        "Survived": y_pred
    })
submission.to_csv('submission.csv', index=False)