In [554]:
# data analysis and wrangling
import keras
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [555]:
train_df = pd.read_csv("./train.csv")
test_df = pd.read_csv("./test.csv")
combine = [train_df, test_df]

In [556]:
train_df['AgeBand'] = pd.cut(train_df['Age'], 5)
train_df[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean().sort_values(by='AgeBand', ascending=True)

Unnamed: 0,AgeBand,Survived
0,"(0.34, 16.336]",0.55
1,"(16.336, 32.252]",0.369942
2,"(32.252, 48.168]",0.404255
3,"(48.168, 64.084]",0.434783
4,"(64.084, 80.0]",0.090909


In [557]:
train_df['AgeBand'] = pd.cut(train_df['Age'], 10)
train_df[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean().sort_values(by='AgeBand', ascending=True)

Unnamed: 0,AgeBand,Survived
0,"(0.34, 8.378]",0.666667
1,"(8.378, 16.336]",0.413043
2,"(16.336, 24.294]",0.355932
3,"(24.294, 32.252]",0.384615
4,"(32.252, 40.21]",0.440678
5,"(40.21, 48.168]",0.342857
6,"(48.168, 56.126]",0.466667
7,"(56.126, 64.084]",0.375
8,"(64.084, 72.042]",0.0
9,"(72.042, 80.0]",0.5


In [558]:
train_df['Age'].fillna(train_df['Age'].dropna().median(), inplace=True)
test_df['Age'].fillna(test_df['Age'].dropna().median(), inplace=True)

for dataset in combine:
    dataset.loc[dataset['Age'] <= 8, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 8) & (dataset['Age'] <= 16), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 24), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 24) & (dataset['Age'] <= 32), 'Age'] = 3
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 40), 'Age'] = 4
    dataset.loc[(dataset['Age'] > 40) & (dataset['Age'] <= 48), 'Age'] = 5
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 56), 'Age'] = 6
    dataset.loc[(dataset['Age'] > 56) & (dataset['Age'] <= 64), 'Age'] = 7
    dataset.loc[(dataset['Age'] > 64) & (dataset['Age'] <= 72), 'Age'] = 8
    dataset.loc[(dataset['Age'] > 72) & (dataset['Age'] <= 80), 'Age'] = 9
    dataset.loc[dataset['Age'] > 80, 'Age'] = 10
    
    dataset['Age'] = dataset['Age'].astype(int)

In [559]:
train_df = train_df.drop(['AgeBand'], axis=1)

In [560]:
combine = [train_df, test_df]
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,2,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,4,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,3,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,4,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,4,0,0,373450,8.05,,S


In [561]:
for dataset in combine:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    
train_df[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,FamilySize,Survived
3,4,0.724138
2,3,0.578431
1,2,0.552795
6,7,0.333333
0,1,0.303538
4,5,0.2
5,6,0.136364
7,8,0.0
8,11,0.0


In [562]:
train_df = train_df.drop(['Parch', 'SibSp'], axis=1)
test_df = test_df.drop(['Parch', 'SibSp'], axis=1)
combine = [train_df, test_df]
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Ticket,Fare,Cabin,Embarked,FamilySize
0,1,0,3,"Braund, Mr. Owen Harris",male,2,A/5 21171,7.25,,S,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,4,PC 17599,71.2833,C85,C,2
2,3,1,3,"Heikkinen, Miss. Laina",female,3,STON/O2. 3101282,7.925,,S,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,4,113803,53.1,C123,S,2
4,5,0,3,"Allen, Mr. William Henry",male,4,373450,8.05,,S,1


In [563]:
train_df = train_df.drop(['Cabin', 'Name'], axis=1)
test_df = test_df.drop(['Cabin', 'Name'], axis=1)
combine = [train_df, test_df]

train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Ticket,Fare,Embarked,FamilySize
0,1,0,3,male,2,A/5 21171,7.25,S,2
1,2,1,1,female,4,PC 17599,71.2833,C,2
2,3,1,3,female,3,STON/O2. 3101282,7.925,S,1
3,4,1,1,female,4,113803,53.1,S,2
4,5,0,3,male,4,373450,8.05,S,1


In [564]:

for dataset in combine:
    dataset.loc[dataset['Sex'] == 'male', 'Sex'] = 0
    dataset.loc[dataset['Sex'] == 'female', 'Sex'] = 1
    
combine = [train_df, test_df]
train_df.head()
test_df.head()

# train_df = train_df.drop(['Sex'], axis=1)
# test_df = test_df.drop(['Sex'], axis=1)
combine = [train_df, test_df]

In [565]:
train_df['Embarked'].fillna('S', inplace=True)
test_df['Embarked'].fillna('S', inplace=True)

test_df

Unnamed: 0,PassengerId,Pclass,Sex,Age,Ticket,Fare,Embarked,FamilySize
0,892,3,0,4,330911,7.8292,Q,1
1,893,3,1,5,363272,7.0000,S,2
2,894,2,0,7,240276,9.6875,Q,1
3,895,3,0,3,315154,8.6625,S,1
4,896,3,1,2,3101298,12.2875,S,3
5,897,3,0,1,7538,9.2250,S,1
6,898,3,1,3,330972,7.6292,Q,1
7,899,2,0,3,248738,29.0000,S,3
8,900,3,1,2,2657,7.2292,C,1
9,901,3,0,2,A/4 48871,24.1500,S,3


In [566]:

for dataset in combine:
    dataset.loc[dataset['Embarked'] == 'Q', 'Embarked'] = 0
    dataset.loc[dataset['Embarked'] == 'S', 'Embarked'] = 1
    dataset.loc[dataset['Embarked'] == 'C', 'Embarked'] = 2
    
combine = [train_df, test_df]
train_df.head()
test_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,Ticket,Fare,Embarked,FamilySize
0,892,3,0,4,330911,7.8292,0,1
1,893,3,1,5,363272,7.0,1,2
2,894,2,0,7,240276,9.6875,0,1
3,895,3,0,3,315154,8.6625,1,1
4,896,3,1,2,3101298,12.2875,1,3


In [567]:
train_df['FareBand'] = pd.qcut(train_df['Fare'], 20)
train_df[['FareBand', 'Survived']].groupby(['FareBand'], as_index=False).mean().sort_values(by='FareBand', ascending=True)

Unnamed: 0,FareBand,Survived
0,"(-0.001, 7.225]",0.109091
1,"(7.225, 7.55]",0.189189
2,"(7.55, 7.75]",0.333333
3,"(7.75, 7.854]",0.25641
4,"(7.854, 7.91]",0.113636
5,"(7.91, 8.05]",0.225806
6,"(8.05, 9.0]",0.142857
7,"(9.0, 10.5]",0.28
8,"(10.5, 13.0]",0.461538
9,"(13.0, 14.454]",0.315789


In [568]:
train_df['Fare'].fillna(train_df['Fare'].dropna().median(), inplace=True)
test_df['Fare'].fillna(test_df['Fare'].dropna().median(), inplace=True)

for dataset in combine:
    dataset.loc[dataset['Fare'] <= 10, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 10) & (dataset['Fare'] <= 20), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 20) & (dataset['Fare'] <= 30), 'Fare'] = 2
    dataset.loc[(dataset['Fare'] > 30) & (dataset['Fare'] <= 40), 'Fare'] = 3
    dataset.loc[(dataset['Fare'] > 40) & (dataset['Fare'] <= 50), 'Fare'] = 4
    dataset.loc[(dataset['Fare'] > 50) & (dataset['Fare'] <= 70), 'Fare'] = 5
    dataset.loc[(dataset['Fare'] > 70) & (dataset['Fare'] <= 90), 'Fare'] = 6
    dataset.loc[(dataset['Fare'] > 90) & (dataset['Fare'] <= 100), 'Fare'] = 7
    dataset.loc[dataset['Fare'] > 100, 'Fare'] = 8
    dataset['Fare'] = dataset['Fare'].astype(int)
    
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Ticket,Fare,Embarked,FamilySize,FareBand
0,1,0,3,0,2,A/5 21171,0,1,2,"(7.225, 7.55]"
1,2,1,1,1,4,PC 17599,6,2,2,"(56.496, 77.958]"
2,3,1,3,1,3,STON/O2. 3101282,0,1,1,"(7.91, 8.05]"
3,4,1,1,1,4,113803,5,1,2,"(39.688, 56.496]"
4,5,0,3,0,4,373450,0,1,1,"(7.91, 8.05]"


In [569]:
train_df = train_df.drop(['FareBand'], axis=1)

combine = [train_df, test_df]

In [570]:
for dataset in combine:
    dataset.loc[dataset['PassengerId'] <= 50, 'PassengerIndex'] = 0
    dataset.loc[(dataset['PassengerId'] > 50) & (dataset['PassengerId'] <= 100), 'PassengerIndex'] = 1
    dataset.loc[(dataset['PassengerId'] > 100) & (dataset['PassengerId'] <= 150), 'PassengerIndex'] = 2
    dataset.loc[(dataset['PassengerId'] > 150) & (dataset['PassengerId'] <= 200), 'PassengerIndex'] = 3
    dataset.loc[(dataset['PassengerId'] > 200) & (dataset['PassengerId'] <= 300), 'PassengerIndex'] = 4
    dataset.loc[(dataset['PassengerId'] > 300) & (dataset['PassengerId'] <= 400), 'PassengerIndex'] = 5
    dataset.loc[(dataset['PassengerId'] > 400) & (dataset['PassengerId'] <= 500), 'PassengerIndex'] = 6
    dataset.loc[(dataset['PassengerId'] > 500) & (dataset['PassengerId'] <= 600), 'PassengerIndex'] = 7
    dataset.loc[(dataset['PassengerId'] > 600) & (dataset['PassengerId'] <= 700), 'PassengerIndex'] = 8
    dataset.loc[(dataset['PassengerId'] > 700) & (dataset['PassengerId'] <= 800), 'PassengerIndex'] = 9
    dataset.loc[(dataset['PassengerId'] > 800) & (dataset['PassengerId'] <= 900), 'PassengerIndex'] = 10
    dataset.loc[dataset['PassengerId'] > 900, 'PassengerIndex'] = 11
    dataset['PassengerIndex'] = dataset['PassengerIndex'].astype(int)

combine = [train_df, test_df]
train_df

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Ticket,Fare,Embarked,FamilySize,PassengerIndex
0,1,0,3,0,2,A/5 21171,0,1,2,0
1,2,1,1,1,4,PC 17599,6,2,2,0
2,3,1,3,1,3,STON/O2. 3101282,0,1,1,0
3,4,1,1,1,4,113803,5,1,2,0
4,5,0,3,0,4,373450,0,1,1,0
5,6,0,3,0,3,330877,0,0,1,0
6,7,0,1,0,6,17463,5,1,1,0
7,8,0,3,0,0,349909,2,1,5,0
8,9,1,3,1,3,347742,1,1,3,0
9,10,1,2,1,1,237736,3,2,2,0


In [571]:
train_ticket = train_df['Ticket'].str.split(' ', n=1, expand=True)
test_ticket = test_df['Ticket'].str.split(' ', n=1, expand=True)
for i in range(0, len(train_ticket)):
    if not train_ticket[1][i]:
        train_ticket[1][i] = train_ticket[0][i]
print(train_ticket)    
for i in range(0, len(test_ticket)):
    if not test_ticket[1][i]:
        test_ticket[1][i] = test_ticket[0][i]
print(test_ticket)    

              0        1
0           A/5    21171
1            PC    17599
2      STON/O2.  3101282
3        113803   113803
4        373450   373450
5        330877   330877
6         17463    17463
7        349909   349909
8        347742   347742
9        237736   237736
10           PP     9549
11       113783   113783
12         A/5.     2151
13       347082   347082
14       350406   350406
15       248706   248706
16       382652   382652
17       244373   244373
18       345763   345763
19         2649     2649
20       239865   239865
21       248698   248698
22       330923   330923
23       113788   113788
24       349909   349909
25       347077   347077
26         2631     2631
27        19950    19950
28       330959   330959
29       349216   349216
..          ...      ...
861       28134    28134
862       17466    17466
863         CA.     2343
864      233866   233866
865      236852   236852
866    SC/PARIS     2149
867          PC    17590
868      345777   345777


In [575]:
train_df['Ticket'] = train_ticket[1]
test_df['Ticket'] = test_ticket[1]
combine = [train_df, test_df]

In [591]:
# for dataset in combine:
#     print(dataset['Ticket'])
#     dataset['Ticket'] = dataset['Ticket'].astype('int64')
    
train_df['Ticket'].fillna(int(round(train_df['Ticket'].dropna().median())), inplace=True)
test_df['Ticket'].fillna(int(round(test_df['Ticket'].dropna().median())), inplace=True)


0        21171
1        17599
2      3101282
3       113803
4       373450
5       330877
6        17463
7       349909
8       347742
9       237736
10        9549
11      113783
12        2151
13      347082
14      350406
15      248706
16      382652
17      244373
18      345763
19        2649
20      239865
21      248698
22      330923
23      113788
24      349909
25      347077
26        2631
27       19950
28      330959
29      349216
        ...   
861      28134
862      17466
863       2343
864     233866
865     236852
866       2149
867      17590
868     345777
869     347742
870     349248
871      11751
872        695
873     345765
874       3381
875       2667
876       7534
877     349212
878     349217
879      11767
880     230433
881     349257
882       7552
883      34068
884     392076
885     382652
886     211536
887     112053
888       6607
889     111369
890     370376
Name: Ticket, Length: 891, dtype: object


ValueError: invalid literal for int() with base 10: '2. 3101294'

In [545]:
print(train_df['Ticket'])

0        21171
1        17599
2      3101282
3       113803
4       373450
5       330877
6        17463
7       349909
8       347742
9       237736
10        9549
11      113783
12        2151
13      347082
14      350406
15      248706
16      382652
17      244373
18      345763
19        2649
20      239865
21      248698
22      330923
23      113788
24      349909
25      347077
26        2631
27       19950
28      330959
29      349216
        ...   
861      28134
862      17466
863       2343
864     233866
865     236852
866       2149
867      17590
868     345777
869     347742
870     349248
871      11751
872        695
873     345765
874       3381
875       2667
876       7534
877     349212
878     349217
879      11767
880     230433
881     349257
882       7552
883      34068
884     392076
885     382652
886     211536
887     112053
888       6607
889     111369
890     370376
Name: Ticket, Length: 891, dtype: object


In [548]:
Y_train = train_df['Survived']
X_train = train_df.drop(['Survived', 'PassengerId', 'PassengerIndex'], axis=1)
X_test = test_df.drop(['PassengerId', 'PassengerIndex'], axis=1)
X_train.shape, Y_train.shape, X_test.shape
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Pclass        891 non-null int64
Sex           891 non-null int64
Age           891 non-null int64
Ticket        891 non-null object
Fare          891 non-null int64
Embarked      891 non-null int64
FamilySize    891 non-null int64
dtypes: int64(6), object(1)
memory usage: 48.8+ KB


In [547]:
# Support Vector Machines

svc = SVC()
svc.fit(X_train, Y_train)
Y_pred = svc.predict(X_test)
acc_svc = round(svc.score(X_train, Y_train) * 100, 2)
acc_svc

ValueError: could not convert string to float: '2. 3101288'

In [None]:
result = list()
for i, preds in enumerate(Y_pred):
    result.append([Y_test[i], preds])
    
sub = pd.DataFrame(result, columns=['PassengerId', 'Survived'])

In [None]:
sub.to_csv('sample_submission.csv', index=False)