In [104]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder

from sklearn.impute import SimpleImputer

from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split, GridSearchCV
import warnings
warnings.filterwarnings("ignore")

In [105]:
train_df = pd.read_csv('../csvs/train.csv')
test_df = pd.read_csv('../csvs/test.csv')

In [106]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [107]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [108]:
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [109]:
# We can also describe categorical data
train_df.describe(include=['O'])

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Braund, Mr. Owen Harris",male,1601,G6,S
freq,1,577,7,4,644


In [110]:
train_df.groupby(train_df["Pclass"], as_index=False)["Survived"].mean()

Unnamed: 0,Pclass,Survived
0,1,0.62963
1,2,0.472826
2,3,0.242363


In [111]:
train_df.groupby(train_df["Sex"], as_index=False)["Survived"].mean()

Unnamed: 0,Sex,Survived
0,female,0.742038
1,male,0.188908


In [112]:
train_df.groupby(train_df["SibSp"], as_index=False)["Survived"].mean()

Unnamed: 0,SibSp,Survived
0,0,0.345395
1,1,0.535885
2,2,0.464286
3,3,0.25
4,4,0.166667
5,5,0.0
6,8,0.0


In [113]:
train_df.groupby(train_df["Parch"], as_index=False)["Survived"].mean()

Unnamed: 0,Parch,Survived
0,0,0.343658
1,1,0.550847
2,2,0.5
3,3,0.6
4,4,0.0
5,5,0.2
6,6,0.0


In [114]:
train_df["FamilySize"] = train_df["SibSp"] + train_df["Parch"] + 1
test_df["FamilySize"] = test_df["SibSp"] + test_df["Parch"] + 1
train_df.groupby(train_df["FamilySize"], as_index=False)["Survived"].mean()


Unnamed: 0,FamilySize,Survived
0,1,0.303538
1,2,0.552795
2,3,0.578431
3,4,0.724138
4,5,0.2
5,6,0.136364
6,7,0.333333
7,8,0.0
8,11,0.0


In [115]:
family_map = {1: "Alone", 2: "Small", 3: "Small", 4: "Small", 5: "Medium", 6: "Medium", 7: "Large", 8: "Large", 9: "Large", 10: "Large"}
train_df["FamilyGroupSize"] = train_df["FamilySize"].map(family_map)
test_df["FamilyGroupSize"] = test_df["FamilySize"].map(family_map)
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,FamilyGroupSize
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,2,Small
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,2,Small
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1,Alone
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,2,Small
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1,Alone


In [116]:
train_df.groupby(train_df["Embarked"], as_index=False)["Survived"].mean()

Unnamed: 0,Embarked,Survived
0,C,0.553571
1,Q,0.38961
2,S,0.336957


In [117]:
age_bins = [0, 10, 20, 30, 40, 50, 60, 70, 80]
age_cut = pd.cut(train_df["Age"], bins=age_bins)
train_df.groupby(age_cut, as_index=False)["Survived"].mean()

Unnamed: 0,Survived
0,0.59375
1,0.382609
2,0.365217
3,0.445161
4,0.383721
5,0.404762
6,0.235294
7,0.2


In [118]:
fare_bins = [0, 100, 200, 300, 400, 500, 600]
fare_cut = pd.cut(train_df["Fare"], bins=fare_bins)
train_df.groupby(fare_cut, as_index=False)['Survived'].mean()

Unnamed: 0,Survived
0,0.36695
1,0.757576
2,0.647059
3,
4,
5,1.0


In [119]:
train_df["Name"]

0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
                             ...                        
886                                Montvila, Rev. Juozas
887                         Graham, Miss. Margaret Edith
888             Johnston, Miss. Catherine Helen "Carrie"
889                                Behr, Mr. Karl Howell
890                                  Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object

In [120]:
train_df["Title"] = train_df["Name"].str.split(",", expand=True)[1].str.split(".", expand=True)[0].apply(lambda x : x.strip())
test_df["Title"] = test_df["Name"].str.split(",", expand=True)[1].str.split(".", expand=True)[0].apply(lambda x : x.strip())

In [121]:
train_df.groupby(train_df["Title"])["Survived"].agg(["count", "mean"])

Unnamed: 0_level_0,count,mean
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,1,0.0
Col,2,0.5
Don,1,0.0
Dr,7,0.428571
Jonkheer,1,0.0
Lady,1,1.0
Major,2,0.5
Master,40,0.575
Miss,182,0.697802
Mlle,2,1.0


- military - Capt, Col, Major
- noble - Jonkheer, the Countess, Don, Lady, Sir
- unmarried Female - Mlle, Ms, Mme

In [122]:
title_category = {"Capt": "military", "Col": "military", "Major": "military", "Jonkheer": "noble", "the Countess": "noble", "Don": "noble", "Lady": "noble", "Sir": "noble", "Mlle": "unmarried Female", "Ms": "unmarried Female", "Mme": "unmarried Female"}
train_df["Title"] = train_df["Title"].replace(title_category)
test_df["Title"] = test_df["Title"].replace(title_category)

In [123]:
train_df.groupby(train_df["Title"])["Survived"].agg(["count", 'mean'])

Unnamed: 0_level_0,count,mean
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Dr,7,0.428571
Master,40,0.575
Miss,182,0.697802
Mr,517,0.156673
Mrs,125,0.792
Rev,6,0.0
military,5,0.4
noble,5,0.6
unmarried Female,4,1.0


In [124]:
train_df["Ticket"]

0             A/5 21171
1              PC 17599
2      STON/O2. 3101282
3                113803
4                373450
             ...       
886              211536
887              112053
888          W./C. 6607
889              111369
890              370376
Name: Ticket, Length: 891, dtype: object

In [125]:
train_df["TicketNumber"] = train_df["Ticket"].apply(lambda x: x.split()[-1])
test_df["TicketNumber"] = test_df["Ticket"].apply(lambda x: x.split()[-1])
train_df['TicketNumber']

0        21171
1        17599
2      3101282
3       113803
4       373450
        ...   
886     211536
887     112053
888       6607
889     111369
890     370376
Name: TicketNumber, Length: 891, dtype: object

In [126]:
train_df.groupby(["TicketNumber"], as_index=False)["Survived"].agg(['count', 'mean']).sort_values('count', ascending=False)


Unnamed: 0,TicketNumber,count,mean
196,2343,7,0.000000
94,1601,7,0.714286
464,347082,7,0.000000
468,347088,6,0.000000
358,3101295,6,0.000000
...,...,...,...
55,1166,1,0.000000
311,28424,1,0.000000
312,28425,1,0.000000
313,28551,1,1.000000


In [127]:
train_df["TicketNumberCounts"] = train_df.groupby(['TicketNumber'])["TicketNumber"].transform('count')
test_df["TicketNumberCounts"] = test_df.groupby(['TicketNumber'])["TicketNumber"].transform('count')

In [128]:
train_df.groupby(["TicketNumberCounts"], as_index=True)['Survived'].agg(['count', 'mean']).sort_values('count', ascending=False)

Unnamed: 0_level_0,count,mean
TicketNumberCounts,Unnamed: 1_level_1,Unnamed: 2_level_1
1,544,0.295956
2,188,0.569149
3,66,0.712121
4,44,0.5
7,21,0.238095
6,18,0.0
5,10,0.0


In [129]:
train_df["Ticket"]

0             A/5 21171
1              PC 17599
2      STON/O2. 3101282
3                113803
4                373450
             ...       
886              211536
887              112053
888          W./C. 6607
889              111369
890              370376
Name: Ticket, Length: 891, dtype: object

In [130]:
train_df["TicketPrefix"] = np.where(train_df["Ticket"].str.split(' ', expand=True, n=1)[1].notna(), train_df["Ticket"].str.split(' ', expand=True, n=1)[0].apply(lambda x: x.strip()), "Blank")
test_df["TicketPrefix"] = np.where(test_df["Ticket"].str.split(' ', expand=True, n=1)[1].notna(), test_df["Ticket"].str.split(' ', expand=True, n=1)[0].apply(lambda x: x.strip()), "Blank")

In [131]:
train_df["TicketPrefix"].value_counts()

TicketPrefix
Blank         665
PC             60
C.A.           27
STON/O         12
A/5            10
W./C.           9
CA.             8
SOTON/O.Q.      8
A/5.            7
SOTON/OQ        7
STON/O2.        6
CA              6
C               5
S.O.C.          5
SC/PARIS        5
F.C.C.          5
SC/Paris        4
A/4.            3
PP              3
A/4             3
S.O./P.P.       3
SC/AH           3
A./5.           2
P/PP            2
A.5.            2
WE/P            2
SOTON/O2        2
S.C./PARIS      2
S.C./A.4.       1
Fa              1
S.O.P.          1
SO/C            1
S.P.            1
A4.             1
W.E.P.          1
A/S             1
SC              1
SW/PP           1
SCO/W           1
W/C             1
S.W./PP         1
F.C.            1
C.A./SOTON      1
Name: count, dtype: int64

In [132]:
train_df["TicketLocation"] = train_df["TicketPrefix"].replace({
    'SOTON/O.Q': 'SOTON/OQ',
    'C.A': 'CA',
    'CA.': 'CA',
    'S.C./PARIS': 'SC/Paris',
    'S.C./PARIS': 'SC/Paris',
    'A/4': 'A/4',
    'A/5': 'A/5',
    'A/5.': 'A/5',
    'A./5': 'A/5',
    'W./C.': 'W/C'
})
test_df["TicketLocation"] = test_df["TicketPrefix"].replace({
    'SOTON/O.Q': 'SOTON/OQ',
    'C.A': 'CA',
    'CA.': 'CA',
    'S.C./PARIS': 'SC/Paris',
    'S.C./PARIS': 'SC/Paris',
    'A/4': 'A/4',
    'A/5': 'A/5',
    'A/5.': 'A/5',
    'A./5': 'A/5',
    'W./C.': 'W/C'
})

In [133]:
train_df.groupby(train_df["TicketLocation"])["Survived"].agg(['count', 'mean']).count()

count    39
mean     39
dtype: int64

In [138]:
train_df["Cabin"]

0         U
1       C85
2         U
3      C123
4         U
       ... 
886       U
887     B42
888       U
889    C148
890       U
Name: Cabin, Length: 891, dtype: object

In [134]:
train_df["Cabin"].isnull().sum()

np.int64(687)

In [135]:
train_df["Cabin"].fillna("U", inplace=True)

In [149]:
split_cabin = train_df["Cabin"].str.split(' ', expand=True)[0]
split_cabin_test = test_df["Cabin"].str.split(' ', expand=True)[0]
split_cabin.str[0].value_counts()

0
U    687
C     59
B     47
D     33
E     32
A     15
F     13
G      4
T      1
Name: count, dtype: int64

In [150]:
train_df["Cabin"] = split_cabin.str[0]
test_df["Cabin"] = split_cabin_test.str[0]
train_df["Cabin"]

0      U
1      C
2      U
3      C
4      U
      ..
886    U
887    B
888    U
889    C
890    U
Name: Cabin, Length: 891, dtype: object

In [151]:
train_df["Cabin"].value_counts()

Cabin
U    687
C     59
B     47
D     33
E     32
A     15
F     13
G      4
T      1
Name: count, dtype: int64

In [152]:
train_df.groupby(train_df["Cabin"], as_index=True)["Survived"].agg(['count', 'mean'])

Unnamed: 0_level_0,count,mean
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1
A,15,0.466667
B,47,0.744681
C,59,0.59322
D,33,0.757576
E,32,0.75
F,13,0.615385
G,4,0.5
T,1,0.0
U,687,0.299854


In [154]:
train_df["Cabin_Assigned"] = train_df["Cabin"].apply(lambda x: 0 if x in ['U'] else 1)
test_df["Cabin_Assigned"] = train_df["Cabin"].apply(lambda x: 0 if x in ['U'] else 1)
train_df["Cabin_Assigned"]

0      0
1      1
2      0
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Cabin_Assigned, Length: 891, dtype: int64

In [155]:
train_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,FamilyGroupSize,Title,TicketNumber,TicketNumberCounts,TicketPrefix,TicketLocation,Cabin_Assigned
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,U,S,2,Small,Mr,21171,1,A/5,A/5,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,C,2,Small,Mrs,17599,1,PC,PC,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,U,S,1,Alone,Miss,3101282,1,STON/O2.,STON/O2.,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C,S,2,Small,Mrs,113803,2,Blank,Blank,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,U,S,1,Alone,Mr,373450,1,Blank,Blank,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,U,S,1,Alone,Rev,211536,1,Blank,Blank,0
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B,S,1,Alone,Miss,112053,1,Blank,Blank,1
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,U,S,4,Small,Miss,6607,2,W./C.,W/C,0
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C,C,1,Alone,Mr,111369,1,Blank,Blank,1
