In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

dataset = pd.read_csv("train_test_data/train.csv")
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
dataset["Survived"].value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [3]:
dataset["Pclass"].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [4]:
dataset["Embarked"].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [5]:
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
dataset.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [7]:
dataset.corr()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,0.036847,-0.057527,-0.001652,0.012658
Survived,-0.005007,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.035144,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,0.036847,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.057527,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0


### So, what was noticed from the initial analysis of this dataset:
1. The most significantly correlated to the "Survived" column are "Pclass" and "Fare" columns;
2. There are some columns, that are categorical and not numerical. So, I guess, we will have to transform them in a way, they will become numerical, or unused (like, "Name") in case a column does not make any sense (entirely);
3. We can apply data engineering to some columns to get more data from them.

# The columns are:
1. "Cabin" - can be separated into number and prefix letter;
2. "Ticket" - can be separated into prefix and number as well;
3. "Name" - we will separate it by the name, surname and title.

In [8]:
dataset_original = dataset

In [9]:
dataset = dataset.drop(["PassengerId"], axis=1)

In [10]:
dataset.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [11]:
surnames = dataset["Name"].to_list()
surnames = [surname.split(",")[0] for surname in surnames]
surnames[:20]

['Braund',
 'Cumings',
 'Heikkinen',
 'Futrelle',
 'Allen',
 'Moran',
 'McCarthy',
 'Palsson',
 'Johnson',
 'Nasser',
 'Sandstrom',
 'Bonnell',
 'Saundercock',
 'Andersson',
 'Vestrom',
 'Hewlett',
 'Rice',
 'Williams',
 'Vander Planke',
 'Masselmani']

In [12]:
dataset["Surname"] = surnames

In [13]:
dataset["Name"] = [name.split(",")[1] for name in dataset["Name"].to_list()]

In [14]:
dataset.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Surname
0,0,3,Mr. Owen Harris,male,22.0,1,0,A/5 21171,7.25,,S,Braund
1,1,1,Mrs. John Bradley (Florence Briggs Thayer),female,38.0,1,0,PC 17599,71.2833,C85,C,Cumings
2,1,3,Miss. Laina,female,26.0,0,0,STON/O2. 3101282,7.925,,S,Heikkinen
3,1,1,Mrs. Jacques Heath (Lily May Peel),female,35.0,1,0,113803,53.1,C123,S,Futrelle
4,0,3,Mr. William Henry,male,35.0,0,0,373450,8.05,,S,Allen


In [15]:
dataset["Title"] = [name.split(".")[0] for name in dataset["Name"].to_list()]
dataset["Name"] = [name.split(".")[1] for name in dataset["Name"].to_list()]
dataset.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Surname,Title
0,0,3,Owen Harris,male,22.0,1,0,A/5 21171,7.25,,S,Braund,Mr
1,1,1,John Bradley (Florence Briggs Thayer),female,38.0,1,0,PC 17599,71.2833,C85,C,Cumings,Mrs
2,1,3,Laina,female,26.0,0,0,STON/O2. 3101282,7.925,,S,Heikkinen,Miss
3,1,1,Jacques Heath (Lily May Peel),female,35.0,1,0,113803,53.1,C123,S,Futrelle,Mrs
4,0,3,William Henry,male,35.0,0,0,373450,8.05,,S,Allen,Mr


In [16]:
dataset["Title"].value_counts()

 Mr              517
 Miss            182
 Mrs             125
 Master           40
 Dr                7
 Rev               6
 Major             2
 Mlle              2
 Col               2
 Sir               1
 Jonkheer          1
 Capt              1
 the Countess      1
 Ms                1
 Don               1
 Lady              1
 Mme               1
Name: Title, dtype: int64

In [17]:
tickets = []
prefixes = []

for ticket in dataset["Ticket"].to_list():
    if len(ticket.split()) == 2:
        tickets.append(int(ticket.split()[1]))
        prefixes.append(ticket.split()[0])
    elif len(ticket.split()) == 3:
        prefixes.append(ticket.split()[0] + ticket.split()[1])
        tickets.append(int(ticket.split()[2]))
    else:
        try:
            tickets.append(int(ticket))
            prefixes.append(None)
        except ValueError:
            prefixes.append(ticket)
            tickets.append(None)

In [18]:
tickets

[21171,
 17599,
 3101282,
 113803,
 373450,
 330877,
 17463,
 349909,
 347742,
 237736,
 9549,
 113783,
 2151,
 347082,
 350406,
 248706,
 382652,
 244373,
 345763,
 2649,
 239865,
 248698,
 330923,
 113788,
 349909,
 347077,
 2631,
 19950,
 330959,
 349216,
 17601,
 17569,
 335677,
 24579,
 17604,
 113789,
 2677,
 2152,
 345764,
 2651,
 7546,
 11668,
 349253,
 2123,
 330958,
 23567,
 370371,
 14311,
 2662,
 349237,
 3101295,
 39886,
 17572,
 2926,
 113509,
 19947,
 31026,
 2697,
 34651,
 2144,
 2669,
 113572,
 36973,
 347088,
 17605,
 2661,
 29395,
 3464,
 3101281,
 315151,
 33111,
 2144,
 14879,
 2680,
 1601,
 348123,
 349208,
 374746,
 248738,
 364516,
 345767,
 345779,
 330932,
 113059,
 14885,
 3101278,
 6608,
 392086,
 19950,
 343275,
 343276,
 347466,
 5734,
 2315,
 364500,
 374910,
 17754,
 17759,
 231919,
 244367,
 349245,
 349215,
 35281,
 7540,
 3101276,
 349207,
 343120,
 312991,
 349249,
 371110,
 110465,
 2665,
 324669,
 4136,
 2627,
 3101294,
 370369,
 11668,
 17558,
 34

In [19]:
len(prefixes)

891

In [20]:
dataset["Prefix"] = prefixes
dataset.head(20)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Surname,Title,Prefix
0,0,3,Owen Harris,male,22.0,1,0,A/5 21171,7.25,,S,Braund,Mr,A/5
1,1,1,John Bradley (Florence Briggs Thayer),female,38.0,1,0,PC 17599,71.2833,C85,C,Cumings,Mrs,PC
2,1,3,Laina,female,26.0,0,0,STON/O2. 3101282,7.925,,S,Heikkinen,Miss,STON/O2.
3,1,1,Jacques Heath (Lily May Peel),female,35.0,1,0,113803,53.1,C123,S,Futrelle,Mrs,
4,0,3,William Henry,male,35.0,0,0,373450,8.05,,S,Allen,Mr,
5,0,3,James,male,,0,0,330877,8.4583,,Q,Moran,Mr,
6,0,1,Timothy J,male,54.0,0,0,17463,51.8625,E46,S,McCarthy,Mr,
7,0,3,Gosta Leonard,male,2.0,3,1,349909,21.075,,S,Palsson,Master,
8,1,3,Oscar W (Elisabeth Vilhelmina Berg),female,27.0,0,2,347742,11.1333,,S,Johnson,Mrs,
9,1,2,Nicholas (Adele Achem),female,14.0,1,0,237736,30.0708,,C,Nasser,Mrs,


In [21]:
dataset["Ticket"] = tickets

In [22]:
dataset.head(20)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Surname,Title,Prefix
0,0,3,Owen Harris,male,22.0,1,0,21171.0,7.25,,S,Braund,Mr,A/5
1,1,1,John Bradley (Florence Briggs Thayer),female,38.0,1,0,17599.0,71.2833,C85,C,Cumings,Mrs,PC
2,1,3,Laina,female,26.0,0,0,3101282.0,7.925,,S,Heikkinen,Miss,STON/O2.
3,1,1,Jacques Heath (Lily May Peel),female,35.0,1,0,113803.0,53.1,C123,S,Futrelle,Mrs,
4,0,3,William Henry,male,35.0,0,0,373450.0,8.05,,S,Allen,Mr,
5,0,3,James,male,,0,0,330877.0,8.4583,,Q,Moran,Mr,
6,0,1,Timothy J,male,54.0,0,0,17463.0,51.8625,E46,S,McCarthy,Mr,
7,0,3,Gosta Leonard,male,2.0,3,1,349909.0,21.075,,S,Palsson,Master,
8,1,3,Oscar W (Elisabeth Vilhelmina Berg),female,27.0,0,2,347742.0,11.1333,,S,Johnson,Mrs,
9,1,2,Nicholas (Adele Achem),female,14.0,1,0,237736.0,30.0708,,C,Nasser,Mrs,


In [23]:
dataset.corr()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Ticket,Fare
Survived,1.0,-0.338481,-0.077221,-0.035322,0.081629,-0.0964,0.257307
Pclass,-0.338481,1.0,-0.369226,0.083081,0.018443,0.286279,-0.5495
Age,-0.077221,-0.369226,1.0,-0.308247,-0.189119,-0.107166,0.096067
SibSp,-0.035322,0.083081,-0.308247,1.0,0.414838,0.046018,0.159651
Parch,0.081629,0.018443,-0.189119,0.414838,1.0,-0.033529,0.216225
Ticket,-0.0964,0.286279,-0.107166,0.046018,-0.033529,1.0,-0.156916
Fare,0.257307,-0.5495,0.096067,0.159651,0.216225,-0.156916,1.0


In [24]:
dataset.isna().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        4
Fare          0
Cabin       687
Embarked      2
Surname       0
Title         0
Prefix      661
dtype: int64

### Looking on this statistics, we can figure out, that such features as "Prefix" and "Cabin" most likely will be useless in our model, as they are not available in more than a half of records. So we can delete a few features, that are useless for the model training. Such as:
* Name;
* Cabin;
* Prefix.

In [25]:
dataset = dataset.drop(["Name", "Cabin", "Prefix", "Surname"], axis=1)
dataset.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Title
0,0,3,male,22.0,1,0,21171.0,7.25,S,Mr
1,1,1,female,38.0,1,0,17599.0,71.2833,C,Mrs
2,1,3,female,26.0,0,0,3101282.0,7.925,S,Miss
3,1,1,female,35.0,1,0,113803.0,53.1,S,Mrs
4,0,3,male,35.0,0,0,373450.0,8.05,S,Mr
5,0,3,male,,0,0,330877.0,8.4583,Q,Mr
6,0,1,male,54.0,0,0,17463.0,51.8625,S,Mr
7,0,3,male,2.0,3,1,349909.0,21.075,S,Master
8,1,3,female,27.0,0,2,347742.0,11.1333,S,Mrs
9,1,2,female,14.0,1,0,237736.0,30.0708,C,Mrs


## Let's transform columns of a dataset, that are categorical, using OneHotEncoder

In [26]:
dataset.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        4
Fare          0
Embarked      2
Title         0
dtype: int64

In [27]:
dataset["Embarked"].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [28]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

embarked_imputer = SimpleImputer(strategy="constant", fill_value="S")
age_imputer = SimpleImputer(strategy="median")

embarked_features = ["Embarked"]
age_features = ["Age"]

imputer = ColumnTransformer([
    ("embarked_imputer", embarked_imputer, embarked_features),
    ("age_imputer", age_imputer, age_features),
])

filled_dataset = imputer.fit_transform(dataset)
filled_dataset

array([['S', 22.0],
       ['C', 38.0],
       ['S', 26.0],
       ...,
       ['S', 28.0],
       ['C', 26.0],
       ['Q', 32.0]], dtype=object)

In [29]:
dataset["Embarked"] = filled_dataset[:, 0]
dataset["Age"] = filled_dataset[:, 1]
dataset.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Title
0,0,3,male,22,1,0,21171.0,7.25,S,Mr
1,1,1,female,38,1,0,17599.0,71.2833,C,Mrs
2,1,3,female,26,0,0,3101282.0,7.925,S,Miss
3,1,1,female,35,1,0,113803.0,53.1,S,Mrs
4,0,3,male,35,0,0,373450.0,8.05,S,Mr
5,0,3,male,28,0,0,330877.0,8.4583,Q,Mr
6,0,1,male,54,0,0,17463.0,51.8625,S,Mr
7,0,3,male,2,3,1,349909.0,21.075,S,Master
8,1,3,female,27,0,2,347742.0,11.1333,S,Mrs
9,1,2,female,14,1,0,237736.0,30.0708,C,Mrs


In [30]:
dataset.dropna(inplace=True)

In [31]:
X = dataset.drop(["Survived"], axis=1)
y = dataset["Survived"]

In [32]:
X.isna().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Embarked    0
Title       0
dtype: int64

### Transformaing

dataset = dataset.drop(["PassengerId"], axis=1)

surnames = dataset["Name"].to_list()
surnames = [surname.split(",")[0] for surname in surnames]
surnames[:20]

dataset["Surname"] = surnames
dataset["Name"] = [name.split(",")[1] for name in dataset["Name"].to_list()]
dataset["Title"] = [name.split(".")[0] for name in dataset["Name"].to_list()]
dataset["Name"] = [name.split(".")[1] for name in dataset["Name"].to_list()]

tickets = []
prefixes = []

for ticket in dataset["Ticket"].to_list():
    if len(ticket.split()) == 2:
        tickets.append(int(ticket.split()[1]))
        prefixes.append(ticket.split()[0])
    elif len(ticket.split()) == 3:
        prefixes.append(ticket.split()[0] + ticket.split()[1])
        tickets.append(int(ticket.split()[2]))
    else:
        try:
            tickets.append(int(ticket))
            prefixes.append(None)
        except ValueError:
            prefixes.append(ticket)
            tickets.append(None)
            
dataset["Prefix"] = prefixes
dataset["Ticket"] = tickets
dataset = dataset.drop(["Name", "Cabin", "Prefix", "Surname"], axis=1)

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

embarked_imputer = SimpleImputer(strategy="constant", fill_value="S")
age_imputer = SimpleImputer(strategy="median")

embarked_features = ["Embarked"]
age_features = ["Age"]

imputer = ColumnTransformer([
    ("embarked_imputer", embarked_imputer, embarked_features),
    ("age_imputer", age_imputer, age_features),
])

filled_X = imputer.fit_transform(X)
filled_X

X["Embarked"] = filled_X[:, 0]
X["Age"] = filled_X[:, 1]

X.dropna(inplace=True)

from sklearn.preprocessing import OneHotEncoder

categorical_features = ["Sex", "Embarked", "Title"]

onehot = OneHotEncoder()
transformer = ColumnTransformer([("onehot",
                                  onehot,
                                  categorical_features)],
                                  remainder="passthrough")
dataset_transformed = transformer.fit_transform(X)

In [33]:
from sklearn.preprocessing import OneHotEncoder

categorical_features = ["Sex", "Embarked", "Title"]

onehot = OneHotEncoder()
transformer = ColumnTransformer([("onehot",
                                  onehot,
                                  categorical_features)],
                                  remainder="passthrough")
dataset_transformed = transformer.fit_transform(X)

In [34]:
X = dataset_transformed

In [35]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)