In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# Build the CSV locations relative to the current working directory
train_data_path = '/'.join([os.getcwd(), 'train.csv'])
test_data_path = '/'.join([os.getcwd(), 'test.csv'])

# Read both datasets into DataFrames
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

# Work on an independent (deep) copy so the raw test frame stays untouched
test_data_copy = test_data.copy()

# Preview the first rows
test_data_copy.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [3]:
# Remove, in place, the columns that are not kept as model inputs
for frame in (train_data, test_data_copy):
    for column in ('PassengerId', 'Name', 'Ticket', 'Fare', 'Cabin'):
        del frame[column]

In [4]:
# Show the first five rows of what is left after dropping columns
test_data_copy.head(n=5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
0,3,male,34.5,0,0,Q
1,3,female,47.0,1,0,S
2,2,male,62.0,0,0,Q
3,3,male,27.0,0,0,S
4,3,female,22.0,1,1,S


In [5]:
# Fill missing values in place with fixed defaults: 24 for Age and 'S' for
# Embarked (presumably the column modes -- they are hard-coded, not computed)
for frame in (train_data, test_data_copy):
    frame.fillna({'Age': 24, 'Embarked': 'S'}, inplace=True)

# Bucket a numeric age into a named age group
def age_category(age):
    """Return the age-group label for ``age``.

    Labels and their ranges:
        'Kid'   -- 0 < age <= 16
        'Youth' -- 16 < age <= 30
        'Adult' -- 30 < age <= 60
        'Old'   -- age > 60

    Returns None when ``age`` is not greater than 0. That includes NaN,
    because every comparison against NaN is False.
    """
    # Anything that is not strictly positive falls outside every bucket
    if not age > 0:
        return None
    if age <= 16:
        return 'Kid'
    if age <= 30:
        return 'Youth'
    if age <= 60:
        return 'Adult'
    return 'Old'
    
# Swap the numeric ages in both datasets for their age-group labels
for frame in (train_data, test_data_copy):
    frame['Age'] = frame['Age'].apply(age_category)

# Preview the transformed test data
test_data_copy.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
0,3,male,Adult,0,0,Q
1,3,female,Adult,1,0,S
2,2,male,Old,0,0,Q
3,3,male,Youth,0,0,S
4,3,female,Youth,1,1,S


In [6]:
# Integer codes for each categorical value
sex = {'male': 0, 'female': 1}
age = {'Kid': 0, 'Youth': 1, 'Adult': 2, 'Old': 3}
embarked = {'S': 0, 'C': 1, 'Q': 2}

# Data encoding.
# An explicit element-wise lookup is used instead of Series.replace:
# replacing strings with ints via replace() relies on pandas silently
# downcasting the object column to int, which is deprecated (FutureWarning
# in pandas 2.2). Without that downcast the columns would stay `object`
# and the later np.array(train_data) would become an object array.
# `codes.get(value, value)` keeps replace()'s pass-through behaviour for
# values that are not in the mapping, so re-running this cell on already
# encoded data is a no-op.
encodings = {'Sex': sex, 'Age': age, 'Embarked': embarked}

for frame in (train_data, test_data_copy):
    for column, codes in encodings.items():
        # Bind `codes` as a default argument so each lambda uses its own mapping
        frame[column] = frame[column].map(
            lambda value, codes=codes: codes.get(value, value)
        )

In [7]:
# Show the first five rows of the encoded training data
train_data.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,0,1,1,0,0
1,1,1,1,2,1,0,1
2,1,3,1,1,0,0,0
3,1,1,1,2,1,0,0
4,0,3,0,2,0,0,0


In [8]:
# Convert the dataframes to numpy arrays
training_data = np.array(train_data)
test_data_copy = np.array(test_data_copy)

# Split the training data into X (features) and Y (target).
# The target is selected by column name rather than by position, so the
# split does not silently break if the column order ever changes, and the
# target keeps its own dtype instead of the common dtype of the whole frame.
x_features = train_data.drop(columns='Survived').to_numpy()
y_features = train_data['Survived'].to_numpy()

# Split the data into train and test sets (70% / 30%).
# A fixed random_state makes the split -- and the scores computed from it --
# reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x_features, y_features, train_size=0.7, random_state=42
)

In [9]:
# Create and fit the model.
# random_state is fixed because the tree builder permutes features at each
# split, so without it equal-quality splits can be resolved differently
# from run to run, giving different trees and scores.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(x_train, y_train)

# Mean accuracy on the training split and on the held-out split
train_score = clf.score(x_train, y_train)
test_score = clf.score(x_test, y_test)
print(train_score, test_score)

0.8651685393258427 0.8134328358208955


In [10]:
# Predict on the prepared test features
submitting_pred = clf.predict(test_data_copy)

# Create the submission file of the predictions.
# PassengerId comes from the untouched `test_data` frame, because the
# column was deleted from the working copy. The table is built once and
# written with DataFrame.to_csv instead of a row-by-row csv.writer loop;
# this also avoids the blank lines csv.writer produces on Windows when the
# file is opened without newline=''.
submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'].to_numpy(),
    # Cast so the labels are written as 0/1 even if the predictions are floats
    'Survived': submitting_pred.astype(int),
})
submission.to_csv('gender_submission.csv', index=False)