In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
train_df = pd.read_csv('/Users/chungpinhsu/github/datascience/train.csv')
test_df = pd.read_csv('/Users/chungpinhsu/github/datascience/test.csv')
combine = [train_df, test_df]

In [3]:
# preview the data
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
train_df.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [5]:
train_df.info()
print('-'*40)
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
----------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null

In [6]:
train_df.describe()
# Review survived rate using `percentiles=[.61, .62]` knowing our problem description mentions 38% survival rate.
# Review Parch distribution using `percentiles=[.75, .8]`
# SibSp distribution `[.68, .69]`
# Age and Fare `[.1, .2, .3, .4, .5, .6, .7, .8, .9, .99]`

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [7]:
train_df.describe(include=['O'])

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Mitchell, Mr. Henry Michael",male,CA. 2343,C23 C25 C27,S
freq,1,577,7,4,644


In [8]:
print("Before", train_df.shape, test_df.shape, combine[0].shape, combine[1].shape)

train_df = train_df.drop(['Ticket', 'Cabin'], axis=1)
test_df = test_df.drop(['Ticket', 'Cabin'], axis=1)
combine = [train_df, test_df]

"After", train_df.shape, test_df.shape, combine[0].shape, combine[1].shape

Before (891, 12) (418, 11) (891, 12) (418, 11)


('After', (891, 10), (418, 9), (891, 10), (418, 9))

In [9]:
train_df = train_df.drop(['Name', 'PassengerId'], axis=1)
test_df = test_df.drop(['Name'], axis=1)
combine = [train_df, test_df]
train_df.shape, test_df.shape

((891, 8), (418, 8))

In [10]:
for dataset in combine:
    dataset['Sex'] = dataset['Sex'].map( {'female': 1, 'male': 0} ).astype(int)

train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.0,1,0,7.25,S
1,1,1,1,38.0,1,0,71.2833,C
2,1,3,1,26.0,0,0,7.925,S
3,1,1,1,35.0,1,0,53.1,S
4,0,3,0,35.0,0,0,8.05,S


In [11]:
for dataset in combine:
    dataset.replace(r'\s+', np.nan, regex=True)
print(combine[0])
print(combine[1])

     Survived  Pclass  Sex   Age  SibSp  Parch      Fare Embarked
0           0       3    0  22.0      1      0    7.2500        S
1           1       1    1  38.0      1      0   71.2833        C
2           1       3    1  26.0      0      0    7.9250        S
3           1       1    1  35.0      1      0   53.1000        S
4           0       3    0  35.0      0      0    8.0500        S
5           0       3    0   NaN      0      0    8.4583        Q
6           0       1    0  54.0      0      0   51.8625        S
7           0       3    0   2.0      3      1   21.0750        S
8           1       3    1  27.0      0      2   11.1333        S
9           1       2    1  14.0      1      0   30.0708        C
10          1       3    1   4.0      1      1   16.7000        S
11          1       1    1  58.0      0      0   26.5500        S
12          0       3    0  20.0      0      0    8.0500        S
13          0       3    0  39.0      1      5   31.2750        S
14        

In [12]:
for dataset in combine:
    dataset['Age'].fillna(0)
print(combine[0])
print(combine[1])

     Survived  Pclass  Sex   Age  SibSp  Parch      Fare Embarked
0           0       3    0  22.0      1      0    7.2500        S
1           1       1    1  38.0      1      0   71.2833        C
2           1       3    1  26.0      0      0    7.9250        S
3           1       1    1  35.0      1      0   53.1000        S
4           0       3    0  35.0      0      0    8.0500        S
5           0       3    0   NaN      0      0    8.4583        Q
6           0       1    0  54.0      0      0   51.8625        S
7           0       3    0   2.0      3      1   21.0750        S
8           1       3    1  27.0      0      2   11.1333        S
9           1       2    1  14.0      1      0   30.0708        C
10          1       3    1   4.0      1      1   16.7000        S
11          1       1    1  58.0      0      0   26.5500        S
12          0       3    0  20.0      0      0    8.0500        S
13          0       3    0  39.0      1      5   31.2750        S
14        

In [13]:
for dataset in combine:
    dataset['Age'].fillna(0, inplace=True)
print(combine[0])
print(combine[1])

     Survived  Pclass  Sex   Age  SibSp  Parch      Fare Embarked
0           0       3    0  22.0      1      0    7.2500        S
1           1       1    1  38.0      1      0   71.2833        C
2           1       3    1  26.0      0      0    7.9250        S
3           1       1    1  35.0      1      0   53.1000        S
4           0       3    0  35.0      0      0    8.0500        S
5           0       3    0   0.0      0      0    8.4583        Q
6           0       1    0  54.0      0      0   51.8625        S
7           0       3    0   2.0      3      1   21.0750        S
8           1       3    1  27.0      0      2   11.1333        S
9           1       2    1  14.0      1      0   30.0708        C
10          1       3    1   4.0      1      1   16.7000        S
11          1       1    1  58.0      0      0   26.5500        S
12          0       3    0  20.0      0      0    8.0500        S
13          0       3    0  39.0      1      5   31.2750        S
14        

In [14]:
for dataset in combine:
    dataset['Age'] = dataset['Age'].astype(int)
    print(dataset['Age'].dtype)

int64
int64


In [15]:
for dataset in combine:    
    dataset.loc[ 0 < dataset['Age'] <= 10, 'Age'] = 1
    dataset.loc[(dataset['Age'] > 10) & (dataset['Age'] <= 0), 'Age'] = 0
train_df.head()

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [16]:
for dataset in combine:
    for i in dataset['Age']:
        if 0 < i <= 10:
            i = 1
        else:
            i = 0
print(combine[0])
print(combine[1])

     Survived  Pclass  Sex  Age  SibSp  Parch      Fare Embarked
0           0       3    0   22      1      0    7.2500        S
1           1       1    1   38      1      0   71.2833        C
2           1       3    1   26      0      0    7.9250        S
3           1       1    1   35      1      0   53.1000        S
4           0       3    0   35      0      0    8.0500        S
5           0       3    0    0      0      0    8.4583        Q
6           0       1    0   54      0      0   51.8625        S
7           0       3    0    2      3      1   21.0750        S
8           1       3    1   27      0      2   11.1333        S
9           1       2    1   14      1      0   30.0708        C
10          1       3    1    4      1      1   16.7000        S
11          1       1    1   58      0      0   26.5500        S
12          0       3    0   20      0      0    8.0500        S
13          0       3    0   39      1      5   31.2750        S
14          0       3    

In [17]:
for dataset in combine:    
    dataset.loc[(dataset['Age'] <= 10) & (dataset['Age'] > 0), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 10) & (dataset['Age'] <= 0), 'Age'] = 0
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22,1,0,7.25,S
1,1,1,1,38,1,0,71.2833,C
2,1,3,1,26,0,0,7.925,S
3,1,1,1,35,1,0,53.1,S
4,0,3,0,35,0,0,8.05,S


In [18]:
for dataset in combine:    
    dataset.loc[(dataset['Age'] <= 10) & (dataset['Age'] > 0), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 10) | (dataset['Age'] <= 0), 'Age'] = 0
train_df.head()
print(combine[0])
print(combine[1])

     Survived  Pclass  Sex  Age  SibSp  Parch      Fare Embarked
0           0       3    0    0      1      0    7.2500        S
1           1       1    1    0      1      0   71.2833        C
2           1       3    1    0      0      0    7.9250        S
3           1       1    1    0      1      0   53.1000        S
4           0       3    0    0      0      0    8.0500        S
5           0       3    0    0      0      0    8.4583        Q
6           0       1    0    0      0      0   51.8625        S
7           0       3    0    1      3      1   21.0750        S
8           1       3    1    0      0      2   11.1333        S
9           1       2    1    0      1      0   30.0708        C
10          1       3    1    1      1      1   16.7000        S
11          1       1    1    0      0      0   26.5500        S
12          0       3    0    0      0      0    8.0500        S
13          0       3    0    0      1      5   31.2750        S
14          0       3    

In [19]:
for dataset in combine:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize
0,0,3,0,0,1,0,7.25,S,2
1,1,1,1,0,1,0,71.2833,C,2
2,1,3,1,0,0,0,7.925,S,1
3,1,1,1,0,1,0,53.1,S,2
4,0,3,0,0,0,0,8.05,S,1


In [20]:
for dataset in combine:
    dataset['FamilySize'] = dataset['FamilySize'].astype(int)
    print(dataset['FamilySize'].dtype)

int64
int64


In [21]:
for dataset in combine:    
    dataset.loc[(dataset['FamilySize'] >= 2) & (dataset['FamilySize'] <= 4), 'FamilySize'] = 1
    dataset.loc[(dataset['FamilySize'] > 4) | (dataset['FamilySize'] < 2), 'FamilySize'] = 0
train_df.head()
print(combine[0])
print(combine[1])

     Survived  Pclass  Sex  Age  SibSp  Parch      Fare Embarked  FamilySize
0           0       3    0    0      1      0    7.2500        S           0
1           1       1    1    0      1      0   71.2833        C           0
2           1       3    1    0      0      0    7.9250        S           0
3           1       1    1    0      1      0   53.1000        S           0
4           0       3    0    0      0      0    8.0500        S           0
5           0       3    0    0      0      0    8.4583        Q           0
6           0       1    0    0      0      0   51.8625        S           0
7           0       3    0    1      3      1   21.0750        S           0
8           1       3    1    0      0      2   11.1333        S           0
9           1       2    1    0      1      0   30.0708        C           0
10          1       3    1    1      1      1   16.7000        S           0
11          1       1    1    0      0      0   26.5500        S           0

In [23]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize
0,892,3,0,0,0,0,7.8292,Q,0
1,893,3,1,0,1,0,7.0,S,0
2,894,2,0,0,0,0,9.6875,Q,0
3,895,3,0,0,0,0,8.6625,S,0
4,896,3,1,0,1,1,12.2875,S,0


In [24]:
train_df = train_df.drop(['Parch', 'SibSp'], axis=1)
test_df = test_df.drop(['Parch', 'SibSp'], axis=1)
combine = [train_df, test_df]

train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,FamilySize
0,0,3,0,0,7.25,S,0
1,1,1,1,0,71.2833,C,0
2,1,3,1,0,7.925,S,0
3,1,1,1,0,53.1,S,0
4,0,3,0,0,8.05,S,0


In [25]:
for dataset in combine:    
    dataset.loc[(dataset['Embarked'] == 'C'), 'Embarked'] = 1
    dataset.loc[(dataset['Embarked'] == 'S') | (dataset['Embarked'] == 'Q'), 'Embarked'] = 0
train_df.head()
print(combine[0])
print(combine[1])

     Survived  Pclass  Sex  Age      Fare Embarked  FamilySize
0           0       3    0    0    7.2500        0           0
1           1       1    1    0   71.2833        1           0
2           1       3    1    0    7.9250        0           0
3           1       1    1    0   53.1000        0           0
4           0       3    0    0    8.0500        0           0
5           0       3    0    0    8.4583        0           0
6           0       1    0    0   51.8625        0           0
7           0       3    0    1   21.0750        0           0
8           1       3    1    0   11.1333        0           0
9           1       2    1    0   30.0708        1           0
10          1       3    1    1   16.7000        0           0
11          1       1    1    0   26.5500        0           0
12          0       3    0    0    8.0500        0           0
13          0       3    0    0   31.2750        0           0
14          0       3    1    0    7.8542        0     

In [26]:
for dataset in combine:
    dataset['Embarked'].fillna(0, inplace=True)
    dataset['Embarked'] = dataset['Embarked'].astype(int)
    print(dataset['Embarked'].dtype)

int64
int64


In [27]:
for dataset in combine:
    dataset['Fare'].fillna(0, inplace=True)
    dataset['Pclass'].fillna(0, inplace=True)
    dataset['Fare'] = dataset['Fare'].astype(int)
    dataset['Pclass'] = dataset['Pclass'].astype(int)
    print(dataset['Fare'].dtype)
    print(dataset['Pclass'].dtype)

int64
int64
int64
int64


In [28]:
for dataset in combine:    
    dataset.loc[(dataset['Pclass'] != 1), 'Pclass'] = 0
train_df.head()
print(combine[0])
print(combine[1])

     Survived  Pclass  Sex  Age  Fare  Embarked  FamilySize
0           0       0    0    0     7         0           0
1           1       1    1    0    71         1           0
2           1       0    1    0     7         0           0
3           1       1    1    0    53         0           0
4           0       0    0    0     8         0           0
5           0       0    0    0     8         0           0
6           0       1    0    0    51         0           0
7           0       0    0    1    21         0           0
8           1       0    1    0    11         0           0
9           1       0    1    0    30         1           0
10          1       0    1    1    16         0           0
11          1       1    1    0    26         0           0
12          0       0    0    0     8         0           0
13          0       0    0    0    31         0           0
14          0       0    1    0     7         0           0
15          1       0    1    0    16   

In [29]:
for dataset in combine:    
    dataset.loc[(dataset['Fare'] > 51) | (dataset['Fare'] = 51), 'Fare'] = 1
    dataset.loc[dataset['Fare'] < 51, 'Fare'] = 0
train_df.head()
print(combine[0])
print(combine[1])

SyntaxError: invalid syntax (<ipython-input-29-088bd420f05f>, line 2)

In [30]:
for dataset in combine:    
    dataset.loc[dataset['Fare'] >= 51, 'Fare'] = 1
    dataset.loc[dataset['Fare'] < 51, 'Fare'] = 0
train_df.head()
print(combine[0])
print(combine[1])

     Survived  Pclass  Sex  Age  Fare  Embarked  FamilySize
0           0       0    0    0     0         0           0
1           1       1    1    0     0         1           0
2           1       0    1    0     0         0           0
3           1       1    1    0     0         0           0
4           0       0    0    0     0         0           0
5           0       0    0    0     0         0           0
6           0       1    0    0     0         0           0
7           0       0    0    1     0         0           0
8           1       0    1    0     0         0           0
9           1       0    1    0     0         1           0
10          1       0    1    1     0         0           0
11          1       1    1    0     0         0           0
12          0       0    0    0     0         0           0
13          0       0    0    0     0         0           0
14          0       0    1    0     0         0           0
15          1       0    1    0     0   

In [31]:
X_train = train_df.drop("Survived", axis=1)
Y_train = train_df["Survived"]
X_test  = test_df.drop("PassengerId", axis=1).copy()
X_train.shape, Y_train.shape, X_test.shape

((891, 6), (891,), (418, 6))

In [32]:
# Logistic Regression

logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X_train, Y_train) * 100, 2)
acc_log

78.790000000000006

In [33]:
coeff_df = pd.DataFrame(train_df.columns.delete(0))
coeff_df.columns = ['Feature']
coeff_df["Correlation"] = pd.Series(logreg.coef_[0])

coeff_df.sort_values(by='Correlation', ascending=False)

Unnamed: 0,Feature,Correlation
1,Sex,2.48606
0,Pclass,1.418464
2,Age,0.757601
4,Embarked,0.469545
3,Fare,0.0
5,FamilySize,0.0


In [34]:
# Support Vector Machines

svc = SVC()
svc.fit(X_train, Y_train)
Y_pred = svc.predict(X_test)
acc_svc = round(svc.score(X_train, Y_train) * 100, 2)
acc_svc

78.680000000000007

In [35]:
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
acc_knn = round(knn.score(X_train, Y_train) * 100, 2)
acc_knn

70.480000000000004

In [36]:
# Gaussian Naive Bayes

gaussian = GaussianNB()
gaussian.fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)
acc_gaussian = round(gaussian.score(X_train, Y_train) * 100, 2)
acc_gaussian

77.780000000000001

In [37]:
# Perceptron

perceptron = Perceptron()
perceptron.fit(X_train, Y_train)
Y_pred = perceptron.predict(X_test)
acc_perceptron = round(perceptron.score(X_train, Y_train) * 100, 2)
acc_perceptron



78.680000000000007

In [38]:
# Linear SVC

linear_svc = LinearSVC()
linear_svc.fit(X_train, Y_train)
Y_pred = linear_svc.predict(X_test)
acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)
acc_linear_svc

78.790000000000006

In [39]:
# Stochastic Gradient Descent

sgd = SGDClassifier()
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)
acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)
acc_sgd



66.780000000000001

In [40]:
# Decision Tree

decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
acc_decision_tree

79.010000000000005

In [41]:
# Random Forest

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)
random_forest.score(X_train, Y_train)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
acc_random_forest

79.010000000000005

In [42]:
models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression', 
              'Random Forest', 'Naive Bayes', 'Perceptron', 
              'Stochastic Gradient Decent', 'Linear SVC', 
              'Decision Tree'],
    'Score': [acc_svc, acc_knn, acc_log, 
              acc_random_forest, acc_gaussian, acc_perceptron, 
              acc_sgd, acc_linear_svc, acc_decision_tree]})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
3,Random Forest,79.01
8,Decision Tree,79.01
2,Logistic Regression,78.79
7,Linear SVC,78.79
0,Support Vector Machines,78.68
5,Perceptron,78.68
4,Naive Bayes,77.78
1,KNN,70.48
6,Stochastic Gradient Decent,66.78


In [43]:
submission = pd.DataFrame({
        "PassengerId": test_df["PassengerId"],
        "Survived": Y_pred
    })
# submission.to_csv('/Users/chungpinhsu/github/datascience/submission.csv', index=False)

In [44]:
submissiontest = pd.DataFrame({
        "PassengerId": test_df["PassengerId"],
        "Survived": Y_pred
    })
# submission.to_csv('/Users/chungpinhsu/github/datascience/submissiontest.csv', index=False)