In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [57]:
data = pd.read_csv("../data/tested.csv")

data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [58]:
data.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
413,1305,0,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.05,,S
414,1306,1,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9,C105,C
415,1307,0,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.25,,S
416,1308,0,3,"Ware, Mr. Frederick",male,,0,0,359309,8.05,,S
417,1309,0,3,"Peter, Master. Michael J",male,,1,1,2668,22.3583,,C


In [59]:
data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [60]:
data.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [61]:
len(data)

418

In [62]:
data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [63]:
# We'll fill the missing values in the cabin with the mean age value.

mean_age=int(data['Age'].mean())
#using this we can replace null values with the mean of age column

data['Age'] = data['Age'].fillna(mean_age)
data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [64]:
# We'll fill the missing values in the cabin with the mode cabin value.

print(data['Cabin'].mode())

data['Cabin'].fillna(data['Cabin'].mode()[0])
data.isnull().sum()

0    B57 B59 B63 B66
Name: Cabin, dtype: object


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [65]:
# Checking for duplicate rows


if len(data) > len (data.drop_duplicates()):
    print("Duplicate Rows :")
else:
    print ("There are no duplicate rows")

There are no duplicate rows


In [66]:
# Removing Duplicates

data = data.drop_duplicates()
len (data)

418

In [67]:
# Checking Data Types

data.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

The Pclass (Passenger Class), PassengerID and Survived are better more appropriately String data types since they represent categorical information.

In [68]:
# Converting Data Types

data["PassengerId"] = data["PassengerId"].astype(str)
# data["Survived"] = data["Survived"].astype(str)
data["Pclass"] = data["Pclass"].astype(str)

data.dtypes

PassengerId     object
Survived         int64
Pclass          object
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [69]:
# Method for Summary Statistics

data.describe()

Unnamed: 0,Survived,Age,SibSp,Parch,Fare
count,418.0,418.0,418.0,418.0,417.0
mean,0.363636,30.216507,0.447368,0.392344,35.627188
std,0.481622,12.635016,0.89676,0.981429,55.907576
min,0.0,0.17,0.0,0.0,0.0
25%,0.0,23.0,0.0,0.0,7.8958
50%,0.0,30.0,0.0,0.0,14.4542
75%,1.0,35.75,1.0,0.0,31.5
max,1.0,76.0,8.0,9.0,512.3292


In [71]:
# Grouping by Pclass and Survived (we can use the .sum() method to add all the Survived by passenger class since survived is "1"and not survived is "0"

data.groupby(["Pclass"])["Survived"].sum()

Pclass
1    50
2    30
3    72
Name: Survived, dtype: int64

In [73]:
# Average age of Survived vs Not Survived

data.groupby(["Survived"])["Age"].mean()

Survived
0    30.210188
1    30.227566
Name: Age, dtype: float64

In [76]:
# Gender distribution amongs passengers

data.groupby(["Sex"])["PassengerId"].count()

Sex
female    152
male      266
Name: PassengerId, dtype: int64

In [83]:
# Survival by Gender and Class

data.groupby(["Pclass", "Sex"])["Survived"].sum()

Pclass  Sex   
1       female    50
        male       0
2       female    30
        male       0
3       female    72
        male       0
Name: Survived, dtype: int64

In [85]:
# Create Age Groups

bins = [0, 18, 40, 60, np.inf]
names = ['<18', '18-40', '40-60', '60+']

data['AgeGroup'] = pd.cut(data['Age'], bins, labels=names)

In [86]:
data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'AgeGroup'],
      dtype='object')

In [87]:
# Survival by AgeGroup

data.groupby(["AgeGroup"])["Survived"].sum()

  data.groupby(["AgeGroup"])["Survived"].sum()


AgeGroup
<18      24
18-40    98
40-60    26
60+       4
Name: Survived, dtype: int64