In [1]:
# Kaggle Competition
# Titanic: Machine Learning from Disaster


In [294]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [295]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [296]:
print(train.shape)
train.dtypes

# y value is 'Survived'
# Embarked have 3 unique values. C, Q, S

(891, 12)


PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [297]:
print(test.shape)
test.dtypes

# Need to predict Survived with these columns

(418, 11)


PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [298]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [299]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [300]:
train.isnull().sum()
# 177 data is missing in Age column
# 687 data is missing in Cabin column


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [301]:
# Let's Convert Sex into numeric value
train['Sex'] = train['Sex'].map({"male":0, "female":1})
# 0 : Male
# 1 : Female
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S


In [302]:
train['Embarked'].unique()


array(['S', 'C', 'Q', nan], dtype=object)

In [303]:
# Let's drop the row with nan value
train = train.dropna(axis=0, subset=['Embarked'])
train['Embarked'].value_counts()
# S: 0
# C: 1
# Q: 2

# Convert to numeric value
train['Embarked']= train['Embarked'].map({"S":0, "C":1, "Q":2})
train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,1
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,0
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,0
5,6,0,3,"Moran, Mr. James",0,,0,0,330877,8.4583,,2
6,7,0,1,"McCarthy, Mr. Timothy J",0,54.0,0,0,17463,51.8625,E46,0
7,8,0,3,"Palsson, Master. Gosta Leonard",0,2.0,3,1,349909,21.075,,0
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",1,27.0,0,2,347742,11.1333,,0
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",1,14.0,1,0,237736,30.0708,,1


In [304]:
train['Title'] = train['Name'].str.extract('([A-Za-z]+)\.', expand=False)
train = train.drop(['Name'], axis=1)
train['Title'].value_counts()

Mr          517
Miss        181
Mrs         124
Master       40
Dr            7
Rev           6
Col           2
Major         2
Mlle          2
Don           1
Capt          1
Countess      1
Mme           1
Sir           1
Lady          1
Ms            1
Jonkheer      1
Name: Title, dtype: int64

In [305]:
train['Title'].value_counts()
train['Title'] = train['Title'].map({'Mr':0, 'Miss':1, 'Mrs':2})
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,0,22.0,1,0,A/5 21171,7.25,,0,0.0
1,2,1,1,1,38.0,1,0,PC 17599,71.2833,C85,1,2.0
2,3,1,3,1,26.0,0,0,STON/O2. 3101282,7.925,,0,1.0
3,4,1,1,1,35.0,1,0,113803,53.1,C123,0,2.0
4,5,0,3,0,35.0,0,0,373450,8.05,,0,0.0


In [306]:
train['Title'].value_counts()

0.0    517
1.0    181
2.0    124
Name: Title, dtype: int64

In [307]:
train['Title'].isnull().sum()

67

In [308]:
train['Title'].fillna(3.0, inplace=True)


In [309]:
train['Title'].value_counts()
train[train['Title'] ==3.0]


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
7,8,0,3,0,2.00,3,1,349909,21.0750,,0,3.0
16,17,0,3,0,2.00,4,1,382652,29.1250,,2,3.0
30,31,0,1,0,40.00,0,0,PC 17601,27.7208,,1,3.0
50,51,0,3,0,7.00,4,1,3101295,39.6875,,0,3.0
59,60,0,3,0,11.00,5,2,CA 2144,46.9000,,0,3.0
63,64,0,3,0,4.00,3,2,347088,27.9000,,0,3.0
65,66,1,3,0,,1,1,2661,15.2458,,1,3.0
78,79,1,2,0,0.83,0,2,248738,29.0000,,0,3.0
125,126,1,3,0,12.00,1,0,2651,11.2417,,1,3.0
149,150,0,2,0,42.00,0,0,244310,13.0000,,0,3.0


In [310]:
# Let's fill nan value in Age column with the average value of each title
train['Age'].fillna(train.groupby('Title')['Age'].transform('median'), inplace=True)





In [311]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    889 non-null int64
Survived       889 non-null int64
Pclass         889 non-null int64
Sex            889 non-null int64
Age            889 non-null float64
SibSp          889 non-null int64
Parch          889 non-null int64
Ticket         889 non-null object
Fare           889 non-null float64
Cabin          202 non-null object
Embarked       889 non-null int64
Title          889 non-null float64
dtypes: float64(3), int64(7), object(2)
memory usage: 90.3+ KB


In [312]:
# Since Cabin has too many null points, I'll drop the column
# from the data
train = train.drop(['Cabin'], axis=1)
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 11 columns):
PassengerId    889 non-null int64
Survived       889 non-null int64
Pclass         889 non-null int64
Sex            889 non-null int64
Age            889 non-null float64
SibSp          889 non-null int64
Parch          889 non-null int64
Ticket         889 non-null object
Fare           889 non-null float64
Embarked       889 non-null int64
Title          889 non-null float64
dtypes: float64(3), int64(7), object(1)
memory usage: 83.3+ KB


In [313]:
len(train['Ticket'].unique().tolist())
# Length of unique value of ticket is 680. It cannot be classified.
# I'll remove it


680

In [314]:
len(train['Age'].unique())
# There are 88 different ages. Let's divide them into 
# 0: 0~5 Preschool
# 1: 5~12 Gradeschooler
# 2: 12~19 Teen
# 3: 20~29 twenties
# 4: 30~39 thirties
# 5: 40~49 forties
# 6: 50~100 senior
train.loc[train['Age'] <= 5 ,'Age'] = 0
train.loc[(train['Age'] > 5) & (train['Age'] <= 12), 'Age'] = 1
train.loc[(train['Age'] > 12) & (train['Age'] <= 19), 'Age'] = 2
train.loc[(train['Age'] > 19) & (train['Age'] <= 29), 'Age'] = 3
train.loc[(train['Age'] > 29) & (train['Age'] <= 39), 'Age'] = 4
train.loc[(train['Age'] > 39) & (train['Age'] <= 49), 'Age'] = 5
train.loc[train['Age'] > 49, 'Age'] = 6


In [315]:
train['Age'].value_counts()

4.0    302
3.0    256
2.0     95
5.0     89
6.0     73
0.0     44
1.0     30
Name: Age, dtype: int64

In [316]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Title
0,1,0,3,0,3.0,1,0,A/5 21171,7.25,0,0.0
1,2,1,1,1,4.0,1,0,PC 17599,71.2833,1,2.0
2,3,1,3,1,3.0,0,0,STON/O2. 3101282,7.925,0,1.0
3,4,1,1,1,4.0,1,0,113803,53.1,0,2.0
4,5,0,3,0,4.0,0,0,373450,8.05,0,0.0


In [317]:
train['Familymember'] = train['SibSp'] + train['Parch']
train = train.drop(['SibSp', 'Parch'], axis=1)
train.head()


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Ticket,Fare,Embarked,Title,Familymember
0,1,0,3,0,3.0,A/5 21171,7.25,0,0.0,1
1,2,1,1,1,4.0,PC 17599,71.2833,1,2.0,1
2,3,1,3,1,3.0,STON/O2. 3101282,7.925,0,1.0,0
3,4,1,1,1,4.0,113803,53.1,0,2.0,1
4,5,0,3,0,4.0,373450,8.05,0,0.0,0


In [318]:
# Since Pclass tells about the what ticket and fare means,
# I'll drop ticket and fare
train = train.drop(['PassengerId','Ticket', 'Fare'], axis=1)
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Embarked,Title,Familymember
0,0,3,0,3.0,0,0.0,1
1,1,1,1,4.0,1,2.0,1
2,1,3,1,3.0,0,1.0,0
3,1,1,1,4.0,0,2.0,1
4,0,3,0,4.0,0,0.0,0


In [329]:
# We're done with Feature Engineering.
# Let's apply Min-Max Scaling before Machine Learning
# Reason for Min-Max Scaling is ''
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier

X = train[['Pclass', 'Sex', 'Age', 'Embarked', 'Title', 'Familymember']]
y = train['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state =0)


In [330]:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [333]:
# Now, let's create the classifier
# Since, it's classification problem, we have to choose
# algorithms for supervised learning algorithm
# 1. KNN
# 2. SVM
# 3. XGBoost
# I'll choose XGBoost
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
kfold = KFold(n_splits=10, shuffle=True, random_state=0)
cross_val_score(clf, X,y, cv=kfold, n_jobs=1, scoring='accuracy')

ModuleNotFoundError: No module named 'xgboost'