In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/test.csv
/kaggle/input/titanic/train.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
train_data = pd.read_csv("../input/titanic/train.csv")
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
test_data = pd.read_csv("../input/titanic/test.csv")
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
#Checking the percentage survived of each variable to decide whether it is a large indicator in survival rate

#Women
women = train_data.loc[train_data.Sex == 'female']["Survived"]
rate_women=sum(women)/len(women)
print("% of women who survived:", rate_women)

#Men
men = train_data.loc[train_data.Sex == 'male']["Survived"]
rate_men=sum(men)/len(men)
print("% of men who survived:", rate_men)

print()
#Ticket Class
for x in [1,2,3]:
    temp_class = train_data.loc[train_data.Pclass == x]["Survived"]
    rate_class = sum(temp_class)/len(temp_class)
    print(f"% of Ticket Class {x} who survived:", rate_class)
    
print()

#Port of Embarkation
for x in ["C","Q","S"]:
    port = train_data.loc[train_data.Embarked == x]["Survived"]
    rate_port = sum(port)/len(port)
    print(f"% from Port {x} who survived:", rate_port)
    
print()

% of women who survived: 0.7420382165605095
% of men who survived: 0.18890814558058924

% of Ticket Class 1 who survived: 0.6296296296296297
% of Ticket Class 2 who survived: 0.47282608695652173
% of Ticket Class 3 who survived: 0.24236252545824846

% from Port C who survived: 0.5535714285714286
% from Port Q who survived: 0.38961038961038963
% from Port S who survived: 0.33695652173913043



In [5]:
#Checking to see for NaN values
train_data.isnull()
train_data.isnull().sum()

def cleaning_data(df):
    #Cabin Column is missing majority of the data --> 1014/1308 so dropping it
    df = df.drop(["Cabin"],axis=1)
    
    #Same for Ticket
    df = df.drop(["Ticket"],axis=1)
    
    #Imputing Missing Values in Age Column using median
    df["Age"] = df["Age"].fillna(df["Age"].median())
    
    return df

train_data = cleaning_data(train_data)
test_data = cleaning_data(test_data)

#Imputing the one Fare Value in test_data (as test_data needs all of its rows)
test_data["Fare"] = test_data["Fare"].fillna(test_data["Fare"].mean())

#Removing Unecessary Columns and Getting Dummy Variables
def dummy(df):
    sex_df = pd.get_dummies(df["Sex"], drop_first="True")
    p_class_df = pd.get_dummies(df["Pclass"], drop_first="True")
    embarked_df = pd.get_dummies(df["Embarked"], drop_first="True")
    df = pd.concat([df, sex_df, p_class_df, embarked_df],axis=1)

    df = df.drop(["Sex", "Pclass", "Embarked"],axis=1)
    
    return df

train_data = dummy(train_data)
test_data = dummy(test_data)

In [6]:
#Splits up family data into columns based on family size
def family(df):
    df["FamilySize"] = df["Parch"] + train_data["SibSp"]

    df["Singleton"] = df["FamilySize"].map(lambda s: 1 if s ==1 else 0)
    df["SmallFamily"] = df["FamilySize"].map(lambda s: 1 if 2 <= s <= 4 else 0)
    df["LargeFamily"] = df["FamilySize"].map(lambda s: 1 if 5 <= s else 0)
    
    return df
    
train_data = family(train_data)
test_data = family(test_data)

train_data.head()
test_data.head()

Unnamed: 0,PassengerId,Name,Age,SibSp,Parch,Fare,male,2,3,Q,S,FamilySize,Singleton,SmallFamily,LargeFamily
0,892,"Kelly, Mr. James",34.5,0,0,7.8292,1,0,1,1,0,1.0,1,0,0
1,893,"Wilkes, Mrs. James (Ellen Needs)",47.0,1,0,7.0,0,0,1,0,1,1.0,1,0,0
2,894,"Myles, Mr. Thomas Francis",62.0,0,0,9.6875,1,1,0,1,0,0.0,0,0,0
3,895,"Wirz, Mr. Albert",27.0,0,0,8.6625,1,0,1,0,1,1.0,1,0,0
4,896,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",22.0,1,1,12.2875,0,0,1,0,1,1.0,1,0,0


In [7]:
#Making Column for Minors
train_data["Minor"] = train_data["Age"].map(lambda x : 1 if x < 16 else 0)
test_data["Minor"] = test_data["Age"].map(lambda x : 1 if x < 16 else 0)

In [8]:
#Map each Title to a common Title (so there are less features to deal with)
Title_Dictionary = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir": "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "the Countess": "Royalty",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr": "Mr",
    "Mrs": "Mrs",
    "Miss": "Miss",
    "Master": "Master",
    "Lady": "Royalty"
}

def get_titles(df):
    #Takes out the title
    df["Title"]= df["Name"].map(lambda name: name.split(',')[1].split('.')[0].strip())
    #Maps the title to the value in the dictionary
    df["Title"] = df.Title.map(Title_Dictionary)
    return df


def set_titles(df):
    df = get_titles(df)
    df.drop("Name", axis=1, inplace=True)
    
    df_title_dummy = pd.get_dummies(df["Title"], prefix="Title")
    df = pd.concat([df, df_title_dummy], axis=1)
    df.drop("Title", axis=1, inplace=True)
    
    return df

train_data = set_titles(train_data)
test_data = set_titles(test_data)

train_data.head()
test_data.head()

Unnamed: 0,PassengerId,Age,SibSp,Parch,Fare,male,2,3,Q,S,FamilySize,Singleton,SmallFamily,LargeFamily,Minor,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer
0,892,34.5,0,0,7.8292,1,0,1,1,0,1.0,1,0,0,0,0,0,1,0,0
1,893,47.0,1,0,7.0,0,0,1,0,1,1.0,1,0,0,0,0,0,0,1,0
2,894,62.0,0,0,9.6875,1,1,0,1,0,0.0,0,0,0,0,0,0,1,0,0
3,895,27.0,0,0,8.6625,1,0,1,0,1,1.0,1,0,0,0,0,0,1,0,0
4,896,22.0,1,1,12.2875,0,0,1,0,1,1.0,1,0,0,0,0,0,0,1,0


In [9]:
from sklearn.ensemble import RandomForestClassifier

y = train_data["Survived"]

features = ["Age", "SibSp", "Parch", "Fare", "male", 2, 3, "Q", "S", "FamilySize", "Minor", "Title_Master", "Title_Miss", "Title_Mr", 
            "Title_Mrs", "Title_Officer", "Singleton", "SmallFamily", "LargeFamily"]
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X,y)
predictions = model.predict(X_test)

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
