Link to dataset: https://www.kaggle.com/jamesleslie/titanic-cleaned-data

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the two Titanic splits from CSV files located in the working directory.
TRAIN_CSV = "titanic_train.csv"
TEST_CSV = "titanic_test.csv"

train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

Target column: Survived

In [3]:
# Preview the first five rows of the training split.
train.head(n=5)

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Title,Family_Size
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171,Mr,1
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599,Mrs,1
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282,Miss,0
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803,Mrs,1
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450,Mr,0


In [4]:
# (rows, columns) of the training split as loaded.
train.shape

(891, 14)

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric training columns.
train.describe()

Unnamed: 0,Age,Fare,Parch,PassengerId,Pclass,SibSp,Survived,Family_Size
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,29.445196,32.204208,0.381594,446.0,2.308642,0.523008,0.383838,0.904602
std,13.244896,49.693429,0.806057,257.353842,0.836071,1.102743,0.486592,1.613459
min,0.42,0.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,22.0,7.9104,0.0,223.5,2.0,0.0,0.0,0.0
50%,30.0,14.4542,0.0,446.0,3.0,0.0,0.0,0.0
75%,35.5,31.0,0.0,668.5,3.0,1.0,1.0,1.0
max,80.0,512.3292,6.0,891.0,3.0,8.0,1.0,10.0


In [6]:
# Number of missing values in each column of the training split.
train.isna().sum()

Age              0
Cabin          687
Embarked         0
Fare             0
Name             0
Parch            0
PassengerId      0
Pclass           0
Sex              0
SibSp            0
Survived         0
Ticket           0
Title            0
Family_Size      0
dtype: int64

In [7]:
# Preview the first five rows of the test split.
test.head(n=5)

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Title,Family_Size
0,34.5,,Q,7.8292,"Kelly, Mr. James",0,892,3,male,0,,330911,Mr,0
1,47.0,,S,7.0,"Wilkes, Mrs. James (Ellen Needs)",0,893,3,female,1,,363272,Mrs,1
2,62.0,,Q,9.6875,"Myles, Mr. Thomas Francis",0,894,2,male,0,,240276,Mr,0
3,27.0,,S,8.6625,"Wirz, Mr. Albert",0,895,3,male,0,,315154,Mr,0
4,22.0,,S,12.2875,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,896,3,female,1,,3101298,Mrs,2


In [8]:
# (rows, columns) of the test split as loaded.
test.shape

(418, 14)

In [9]:
# Summary statistics of the numeric test columns; Survived is entirely missing in this split,
# so its count is 0 and its other statistics are NaN.
test.describe()

Unnamed: 0,Age,Fare,Parch,PassengerId,Pclass,SibSp,Survived,Family_Size
count,418.0,418.0,418.0,418.0,418.0,418.0,0.0,418.0
mean,29.812201,35.561214,0.392344,1100.5,2.26555,0.447368,,0.839713
std,13.006431,55.856789,0.981429,120.810458,0.841838,0.89676,,1.519072
min,0.17,0.0,0.0,892.0,1.0,0.0,,0.0
25%,22.0,7.8958,0.0,996.25,1.0,0.0,,0.0
50%,30.0,14.4542,0.0,1100.5,3.0,0.0,,0.0
75%,35.875,31.471875,0.0,1204.75,3.0,1.0,,1.0
max,76.0,512.3292,9.0,1309.0,3.0,8.0,,10.0


In [10]:
# Number of missing values in each column of the test split.
test.isna().sum()

Age              0
Cabin          327
Embarked         0
Fare             0
Name             0
Parch            0
PassengerId      0
Pclass           0
Sex              0
SibSp            0
Survived       418
Ticket           0
Title            0
Family_Size      0
dtype: int64

The `Cabin` column is mostly missing (687 of 891 values in the train set and 327 of 418 in the test set), so we drop it from both the train and the test set.

We also observe that the `Name` column won't give any useful information, and neither will `PassengerId`, so we drop both of them as well.

In [11]:
# Remove the Cabin, Name and PassengerId columns from the training split (mutates `train`).
train.drop(columns=["Cabin", "Name", "PassengerId"], inplace=True)

In [12]:
# Confirm the column count after the drop: three fewer columns than when loaded.
train.shape

(891, 11)

In [13]:
# Remove the same three columns from the test split so both frames keep matching schemas.
test.drop(columns=["Cabin", "Name", "PassengerId"], inplace=True)

In [14]:
# Confirm the column count after the drop: three fewer columns than when loaded.
test.shape

(418, 11)

In [15]:
# Preview the training split after the column drop.
train.head(n=5)

Unnamed: 0,Age,Embarked,Fare,Parch,Pclass,Sex,SibSp,Survived,Ticket,Title,Family_Size
0,22.0,S,7.25,0,3,male,1,0.0,A/5 21171,Mr,1
1,38.0,C,71.2833,0,1,female,1,1.0,PC 17599,Mrs,1
2,26.0,S,7.925,0,3,female,0,1.0,STON/O2. 3101282,Miss,0
3,35.0,S,53.1,0,1,female,1,1.0,113803,Mrs,1
4,35.0,S,8.05,0,3,male,0,0.0,373450,Mr,0


In [16]:
# Count the distinct ticket values in the training split.
# `nunique` must be *called*: without the parentheses the cell only displays the
# bound-method repr instead of a number. Re-run the cell to refresh its output.
train['Ticket'].nunique()

<bound method IndexOpsMixin.nunique of 0             A/5 21171
1              PC 17599
2      STON/O2. 3101282
3                113803
4                373450
             ...       
886              211536
887              112053
888          W./C. 6607
889              111369
890              370376
Name: Ticket, Length: 891, dtype: object>

`Ticket` doesn't give us useful information, so we drop it from both the train and the test set.

In [17]:
# Drop the Ticket column from both splits in place.
for frame in (train, test):
    frame.drop(columns=["Ticket"], inplace=True)

In [18]:
# Preview the training split once Ticket has been removed.
train.head(n=5)

Unnamed: 0,Age,Embarked,Fare,Parch,Pclass,Sex,SibSp,Survived,Title,Family_Size
0,22.0,S,7.25,0,3,male,1,0.0,Mr,1
1,38.0,C,71.2833,0,1,female,1,1.0,Mrs,1
2,26.0,S,7.925,0,3,female,0,1.0,Miss,0
3,35.0,S,53.1,0,1,female,1,1.0,Mrs,1
4,35.0,S,8.05,0,3,male,0,0.0,Mr,0


In [19]:
# Stack the train rows on top of the test rows into one combined frame.
# `ignore_index` is not passed, so each split keeps its own original row labels.
data = pd.concat(objs=[train, test], axis=0)

In [24]:
# (rows, columns) of the combined frame: train rows plus test rows.
data.shape

(1309, 10)

In [25]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the categorical columns of both splits.
#
# Each encoder is fitted on the combined frame `data` (train + test), so both
# splits share one label -> integer mapping and `test` cannot contain a label
# the encoder has not seen.
#
# One fresh encoder is used per column and kept in `encoders`; re-fitting a
# single shared encoder would discard the Embarked and Sex mappings, making
# those columns impossible to decode later with `inverse_transform`.
#
# NOTE: this cell is meant to run once per fresh load. Re-running it on frames
# that are already encoded fails, because the integer codes are not among the
# string labels the encoders are fitted on.
encoders = {}
for column in ("Embarked", "Sex", "Title"):
    encoder = LabelEncoder().fit(data[column])
    train[column] = encoder.transform(train[column])
    test[column] = encoder.transform(test[column])
    encoders[column] = encoder

# After the loop `encoder` refers to the fitted Title encoder.
train.head()

Unnamed: 0,Age,Embarked,Fare,Parch,Pclass,Sex,SibSp,Survived,Title,Family_Size
0,22.0,2,7.25,0,3,1,1,0.0,3,1
1,38.0,0,71.2833,0,1,0,1,1.0,4,1
2,26.0,2,7.925,0,3,0,0,1.0,2,0
3,35.0,2,53.1,0,1,0,1,1.0,4,1
4,35.0,2,8.05,0,3,1,0,0.0,3,0


In [26]:
# Preview the test split after label encoding.
test.head(n=5)

Unnamed: 0,Age,Embarked,Fare,Parch,Pclass,Sex,SibSp,Survived,Title,Family_Size
0,34.5,1,7.8292,0,3,1,0,,3,0
1,47.0,2,7.0,0,3,0,1,,4,1
2,62.0,1,9.6875,0,2,1,0,,3,0
3,27.0,2,8.6625,0,3,1,0,,3,0
4,22.0,2,12.2875,1,3,0,1,,4,2


In [29]:
# Print the number of distinct non-null values for every column of the combined frame.
col = list(data.columns)
for column_name in col:
    print(column_name, ": ", data[column_name].nunique())

Age :  99
Embarked :  3
Fare :  281
Parch :  8
Pclass :  3
Sex :  2
SibSp :  7
Survived :  2
Title :  6
Family_Size :  9


In [27]:
# Print the number of distinct non-null values for every column of the training split.
col = list(train.columns)
for column_name in col:
    print(column_name, ": ", train[column_name].nunique())

Age :  89
Embarked :  3
Fare :  248
Parch :  7
Pclass :  3
Sex :  2
SibSp :  7
Survived :  2
Title :  6
Family_Size :  9


In [28]:
# Print the number of distinct non-null values for every column of the test split.
col = list(test.columns)
for column_name in col:
    print(column_name, ": ", test[column_name].nunique())

Age :  81
Embarked :  3
Fare :  169
Parch :  8
Pclass :  3
Sex :  2
SibSp :  7
Survived :  0
Title :  6
Family_Size :  9
