In [1]:
#importing relevant libraries
import pandas as pd


In [2]:
#loading the data
data=pd.read_csv('titanic.csv')
data

Unnamed: 0,PassengerId,Name,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,"Braund, Mr. Owen Harris",3,male,22.0,1,0,A/5 21171,7.2500,,S,0
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,"Heikkinen, Miss. Laina",3,female,26.0,0,0,STON/O2. 3101282,7.9250,,S,1
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,female,35.0,1,0,113803,53.1000,C123,S,1
4,5,"Allen, Mr. William Henry",3,male,35.0,0,0,373450,8.0500,,S,0
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,"Montvila, Rev. Juozas",2,male,27.0,0,0,211536,13.0000,,S,0
887,888,"Graham, Miss. Margaret Edith",1,female,19.0,0,0,112053,30.0000,B42,S,1
888,889,"Johnston, Miss. Catherine Helen ""Carrie""",3,female,,1,2,W./C. 6607,23.4500,,S,0
889,890,"Behr, Mr. Karl Howell",1,male,26.0,0,0,111369,30.0000,C148,C,1


In [3]:
#preprocessing

In [4]:
#removing irrelevant columns
data.drop(['PassengerId','Name','SibSp','Parch','Ticket','Cabin','Embarked'],axis='columns',inplace=True)
data.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Survived
0,3,male,22.0,7.25,0
1,1,female,38.0,71.2833,1
2,3,female,26.0,7.925,1
3,1,female,35.0,53.1,1
4,3,male,35.0,8.05,0


In [13]:
#defining the inputs/targets
inputs=data.drop('Survived',axis='columns')
targets=data.Survived

In [14]:
#assigning dummies to the sex feature classes
dummies = pd.get_dummies(inputs.Sex)
inputs = pd.concat([inputs,dummies],axis='columns')
inputs.head()

Unnamed: 0,Pclass,Sex,Age,Fare,female,male
0,3,male,22.0,7.25,0,1
1,1,female,38.0,71.2833,1,0
2,3,female,26.0,7.925,1,0
3,1,female,35.0,53.1,1,0
4,3,male,35.0,8.05,0,1


In [15]:
#According the the variable trap theory, just one of the two
# collumns (male or female) is enough to represent them both

inputs.drop(['Sex','female'],axis='columns',inplace=True)
inputs.head()

Unnamed: 0,Pclass,Age,Fare,male
0,3,22.0,7.25,1
1,1,38.0,71.2833,0
2,3,26.0,7.925,0
3,1,35.0,53.1,0
4,3,35.0,8.05,1


In [18]:
inputs.columns[inputs.isna().any()]

Index([], dtype='object')

In [16]:
inputs.Age[:10]

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
5     NaN
6    54.0
7     2.0
8    27.0
9    14.0
Name: Age, dtype: float64

In [17]:
#Filling the null values in 'age' with its mean
inputs.Age = inputs.Age.fillna(inputs.Age.mean())
inputs.head()

Unnamed: 0,Pclass,Age,Fare,male
0,3,22.0,7.25,1
1,1,38.0,71.2833,0
2,3,26.0,7.925,0
3,1,35.0,53.1,0
4,3,35.0,8.05,1


In [20]:
#train/test splitting
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(inputs,targets,test_size=0.3)

In [21]:
#Naive bayes
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(X_train,y_train)
model.score(X_test,y_test)

0.7798507462686567

In [22]:
X_test[0:10]

Unnamed: 0,Pclass,Age,Fare,male
430,1,28.0,26.55,1
784,3,25.0,7.05,1
815,1,29.699118,0.0,1
179,3,36.0,0.0,1
407,2,3.0,18.75,1
675,3,18.0,7.775,1
150,2,51.0,12.525,1
408,3,21.0,7.775,1
417,2,18.0,13.0,0
633,1,29.699118,0.0,1


In [23]:
y_test[0:10]

430    1
784    0
815    0
179    0
407    1
675    0
150    0
408    0
417    1
633    0
Name: Survived, dtype: int64

In [24]:
#Calculating the score using cross validation
from sklearn.model_selection import cross_val_score
cross_val_score(GaussianNB(),X_train, y_train, cv=5)

array([0.744     , 0.832     , 0.776     , 0.7983871 , 0.74193548])