**Import Modules**

In [1]:
# Import necessary modules

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import accuracy_score

**Reading Dataset**

In [2]:
# Import the titanic train dataset as a Pandas dataframe

df = pd.read_csv("titanic_train.csv")
df.head()

Unnamed: 0,passenger_id,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,survived
0,1216,3,"Smyth, Miss. Julia",female,,0,0,335432,7.7333,,Q,13.0,,,1
1,699,3,"Cacic, Mr. Luka",male,38.0,0,0,315089,8.6625,,S,,,Croatia,0
2,1267,3,"Van Impe, Mrs. Jean Baptiste (Rosalie Paula Go...",female,30.0,1,1,345773,24.15,,S,,,,0
3,449,2,"Hocking, Mrs. Elizabeth (Eliza Needs)",female,54.0,1,3,29105,23.0,,S,4.0,,"Cornwall / Akron, OH",1
4,576,2,"Veal, Mr. James",male,40.0,0,0,28221,13.0,,S,,,"Barre, Co Washington, VT",0


In [3]:
# Since the model takes the input as the person's age, gender and fare: discard the rest of the attributes

df.drop(['passenger_id','pclass','name','sibsp','parch','ticket','cabin','embarked','boat','body','home.dest'],axis='columns',inplace=True)
df

Unnamed: 0,sex,age,fare,survived
0,female,,7.7333,1
1,male,38.0,8.6625,0
2,female,30.0,24.1500,0
3,female,54.0,23.0000,1
4,male,40.0,13.0000,0
...,...,...,...,...
845,male,55.0,50.0000,0
846,male,58.0,29.7000,0
847,female,24.0,26.0000,1
848,female,3.0,13.7750,0


**Data Preprocessing for Training Dataset**

> Since the training dataset has string values and empty values, we need to format it before using it to build our model



In [4]:
# For the input dataframe, convert the string values to integer.
# (could also be done using LabelEncoder)

inputs = df.drop('survived', axis='columns')
inputs.sex = inputs.sex.map({'male': 1, 'female': 2})
inputs

Unnamed: 0,sex,age,fare
0,2,,7.7333
1,1,38.0,8.6625
2,2,30.0,24.1500
3,2,54.0,23.0000
4,1,40.0,13.0000
...,...,...,...
845,1,55.0,50.0000
846,1,58.0,29.7000
847,2,24.0,26.0000
848,2,3.0,13.7750


In [5]:
# Check whether there are null values present

inputs.isna().sum()

sex       0
age     174
fare      1
dtype: int64

In [6]:
# Since there are null values are present for age and fare, 
# fill the empty values with the mean of the existing data.

inputs.age = inputs.age.fillna(inputs.age.mean())
inputs.fare = inputs.fare.fillna(inputs.fare.mean())
inputs

Unnamed: 0,sex,age,fare
0,2,29.519847,7.7333
1,1,38.000000,8.6625
2,2,30.000000,24.1500
3,2,54.000000,23.0000
4,1,40.000000,13.0000
...,...,...,...
845,1,55.000000,50.0000
846,1,58.000000,29.7000
847,2,24.000000,26.0000
848,2,3.000000,13.7750


In [7]:
# Create the target dataframe

target = df.survived
target

0      1
1      0
2      0
3      1
4      0
      ..
845    0
846    0
847    1
848    0
849    0
Name: survived, Length: 850, dtype: int64

**Building the Model**

In [8]:
# Split the dataset into training dataset and test data for building the model

X_train, X_test, y_train, y_test = train_test_split(inputs.values,target,test_size=0.2)

In [9]:
#Create the model using the Decision Tree Classifier

model = tree.DecisionTreeClassifier()

In [10]:
# To display the settings of the classifier function

from sklearn import set_config
set_config(print_changed_only=False)

In [11]:
# Fit the dataset onto the model

model.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       random_state=None, splitter='best')

**Prediction and Accuracy**

In [12]:
# Predict a sample data entry

model.predict([[2,30,24.15]])

array([0])

In [13]:
# Display the score of the model

model.score(X_test, y_test)

0.8117647058823529

**Testing of given Test Data**

> Since the test dataset has string values and empty values, we need to format it before using it to test our model

In [14]:
# Import the test data as a Pandas dataframe

df_test = pd.read_csv("titanic_test.csv")
df_test

Unnamed: 0,passenger_id,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,295,1,"Thayer, Mr. John Borland Jr",male,17.0,0,2,17421,110.8833,C70,C,B,,"Haverford, PA"
1,1150,3,"Risien, Mr. Samuel Beard",male,,0,0,364498,14.5000,,S,,,
2,89,1,"Davidson, Mr. Thornton",male,31.0,1,0,F.C. 12750,52.0000,B71,S,,,"Montreal, PQ"
3,1063,3,"Nirva, Mr. Iisakki Antino Aijo",male,41.0,0,0,SOTON/O2 3101272,7.1250,,S,,,"Finland Sudbury, ON"
4,1020,3,"Minkoff, Mr. Lazar",male,21.0,0,0,349211,7.8958,,S,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
454,1194,3,"Sdycoff, Mr. Todor",male,,0,0,349222,7.8958,,S,,,
455,403,2,"Eitemiller, Mr. George Floyd",male,23.0,0,0,29751,13.0000,,S,,,"England / Detroit, MI"
456,108,1,"Fleming, Miss. Margaret",female,,0,0,17421,110.8833,,C,4,,
457,510,2,"Mudd, Mr. Thomas Charles",male,16.0,0,0,S.O./P.P. 3,10.5000,,S,,,"Halesworth, England"


In [15]:
# Create a separate dataframe for the given Passenger IDs

p_id = df_test['passenger_id']
p_id

0       295
1      1150
2        89
3      1063
4      1020
       ... 
454    1194
455     403
456     108
457     510
458    1265
Name: passenger_id, Length: 459, dtype: int64

In [16]:
# Drop the unnecessary attributes from the Test Dataframe; keep age, gender, fare

df_test.drop(['passenger_id','pclass','name','sibsp','parch','ticket','cabin','embarked','boat','body','home.dest'],axis='columns',inplace=True)
df_test

Unnamed: 0,sex,age,fare
0,male,17.0,110.8833
1,male,,14.5000
2,male,31.0,52.0000
3,male,41.0,7.1250
4,male,21.0,7.8958
...,...,...,...
454,male,,7.8958
455,male,23.0,13.0000
456,female,,110.8833
457,male,16.0,10.5000


**Data Preprocessing for Test Dataset**

In [17]:
# Convert the string values to integers

df_test.sex = df_test.sex.map({'male': 1, 'female': 2})
df_test

Unnamed: 0,sex,age,fare
0,1,17.0,110.8833
1,1,,14.5000
2,1,31.0,52.0000
3,1,41.0,7.1250
4,1,21.0,7.8958
...,...,...,...
454,1,,7.8958
455,1,23.0,13.0000
456,2,,110.8833
457,1,16.0,10.5000


In [18]:
# Check for null values present

df_test.isna().sum()

sex      0
age     89
fare     0
dtype: int64

In [19]:
# Since there are null values are present for age,
# fill the empty values with the mean of the existing data.

df_test.age = df_test.age.fillna(df_test.age.mean())
df_test

Unnamed: 0,sex,age,fare
0,1,17.000000,110.8833
1,1,30.541216,14.5000
2,1,31.000000,52.0000
3,1,41.000000,7.1250
4,1,21.000000,7.8958
...,...,...,...
454,1,30.541216,7.8958
455,1,23.000000,13.0000
456,2,30.541216,110.8833
457,1,16.000000,10.5000


**Prediction using Test Data**

In [20]:
# Run the model on the given test data and see the predicted output

pred = model.predict(df_test.values)
pred

array([0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,

In [21]:
# Store the predicted result as a dataframe

df_result = pd.DataFrame(pred, columns = ['survived_status'])
df_result

Unnamed: 0,survived_status
0,0
1,1
2,0
3,0
4,0
...,...
454,1
455,0
456,1
457,0


In [22]:
# Create a separate column to store the final 
# predicted result against the respective Passenger IDs

df_result['passenger_id'] = p_id
df_result

Unnamed: 0,survived_status,passenger_id
0,0,295
1,1,1150
2,0,89
3,0,1063
4,0,1020
...,...,...
454,1,1194
455,0,403
456,1,108
457,0,510
