In [50]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [51]:
# PassengerId: Unique identifier for each passenger.
# Survived: Survival status of the passenger (0 = Not Survived, 1 = Survived).
# Pclass: Passenger class (1 = First class, 2 = Second class, 3 = Third class).
# Sex: Gender of the passenger.
# Age: Age of the passenger.
# SibSp: Number of siblings/spouses aboard the Titanic.
# Parch: Number of parents/children aboard the Titanic.
# Fare: Fare paid by the passenger.
# Embarked: Port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton).

In [52]:
x_y_train = pd.read_csv("titanic/train.csv", index_col="PassengerId")
x_y_train.head(2)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [53]:
x_y_train.keys(), len(x_y_train.keys())

(Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
        'Fare', 'Cabin', 'Embarked'],
       dtype='object'),
 11)

In [54]:
x_test = pd.read_csv("titanic/test.csv", index_col="PassengerId")
x_test

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...
1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [55]:
y_test = pd.read_csv("titanic/gender_submission.csv", index_col="PassengerId")
y_test

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,1
894,0
895,0
896,1
...,...
1305,0
1306,1
1307,0
1308,0


In [56]:
def drop_columns(df, *columns):
    for col in columns:
        df = df.drop(col, axis=1)
    
    return df

In [57]:
x_y_train.head(2)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [58]:
drop_columns(x_y_train, "Name", "Ticket", "Cabin")

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,male,22.0,1,0,7.2500,S
2,1,1,female,38.0,1,0,71.2833,C
3,1,3,female,26.0,0,0,7.9250,S
4,1,1,female,35.0,1,0,53.1000,S
5,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
887,0,2,male,27.0,0,0,13.0000,S
888,1,1,female,19.0,0,0,30.0000,S
889,0,3,female,,1,2,23.4500,S
890,1,1,male,26.0,0,0,30.0000,C


In [59]:
x_test = x_test.pipe(drop_columns, "Name", "Ticket", "Cabin")
x_y_train = x_y_train.pipe(drop_columns, "Name", "Ticket", "Cabin")

In [60]:
from sklearn.preprocessing import LabelEncoder

In [61]:
def label_encode(df, *columns):
    for col in columns:
        df[col] = LabelEncoder().fit_transform(df[col])
    return df

In [62]:
x_test = x_test.pipe(label_encode, "Sex", "Embarked")
x_y_train = x_y_train.pipe(label_encode, "Sex", "Embarked")

In [64]:
x_test.head()

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
892,3,1,34.5,0,0,7.8292,1
893,3,0,47.0,1,0,7.0,2
894,2,1,62.0,0,0,9.6875,1
895,3,1,27.0,0,0,8.6625,2
896,3,0,22.0,1,1,12.2875,2


In [65]:
x_y_train.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,1,22.0,1,0,7.25,2
2,1,1,0,38.0,1,0,71.2833,0
3,1,3,0,26.0,0,0,7.925,2
4,1,1,0,35.0,1,0,53.1,2
5,0,3,1,35.0,0,0,8.05,2


In [66]:
x_y_train.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,0.647587,29.699118,0.523008,0.381594,32.204208,1.538721
std,0.486592,0.836071,0.47799,14.526497,1.102743,0.806057,49.693429,0.794231
min,0.0,1.0,0.0,0.42,0.0,0.0,0.0,0.0
25%,0.0,2.0,0.0,20.125,0.0,0.0,7.9104,1.0
50%,0.0,3.0,1.0,28.0,0.0,0.0,14.4542,2.0
75%,1.0,3.0,1.0,38.0,1.0,0.0,31.0,2.0
max,1.0,3.0,1.0,80.0,8.0,6.0,512.3292,3.0


In [67]:
x_y_train.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      0
dtype: int64

In [71]:
x_y_train.shape

(891, 8)

In [68]:
x_test.isnull().sum()

Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64

In [72]:
x_test.shape

(418, 7)

In [74]:
x_y_train[x_y_train["Age"].notnull()]

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,1,22.0,1,0,7.2500,2
2,1,1,0,38.0,1,0,71.2833,0
3,1,3,0,26.0,0,0,7.9250,2
4,1,1,0,35.0,1,0,53.1000,2
5,0,3,1,35.0,0,0,8.0500,2
...,...,...,...,...,...,...,...,...
886,0,3,0,39.0,0,5,29.1250,1
887,0,2,1,27.0,0,0,13.0000,2
888,1,1,0,19.0,0,0,30.0000,2
890,1,1,1,26.0,0,0,30.0000,0


In [75]:
from sklearn.tree import DecisionTreeClassifier

In [76]:
tree_model = DecisionTreeClassifier()
tree_model

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [77]:
y =  x_y_train[["Survived"]]
x = x_y_train.drop(columns="Survived", axis = 1)

tree_model.fit(x, y)


0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [79]:
tree_model.score(x_test, y_test)

0.7607655502392344

In [80]:
from sklearn.ensemble import RandomForestClassifier

In [81]:
rfc_model = RandomForestClassifier()

In [83]:
print(rfc_model.__doc__)


A random forest classifier.

A random forest is a meta estimator that fits a number of decision tree
classifiers on various sub-samples of the dataset and uses averaging to
improve the predictive accuracy and control over-fitting.
Trees in the forest use the best split strategy, i.e. equivalent to passing
`splitter="best"` to the underlying :class:`~sklearn.tree.DecisionTreeClassifier`.
The sub-sample size is controlled with the `max_samples` parameter if
`bootstrap=True` (default), otherwise the whole dataset is used to build
each tree.

For a comparison between tree-based ensemble models see the example
:ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py`.

This estimator has native support for missing values (NaNs). During training,
the tree grower learns at each split point whether samples with missing values
should go to the left or right child, based on the potential gain. When predicting,
samples with missing values are assigned to the left or rig

In [84]:
rfc_model.fit(x, y)


  return fit_method(estimator, *args, **kwargs)


0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [85]:
rfc_model.score(x_test, y_test)

0.8229665071770335

In [86]:
tree_model.score(x_test, y_test)

0.7607655502392344

In [88]:
from sklearn.neighbors import KNeighborsClassifier

In [90]:
not_null_x_y_train = x_y_train.dropna() 
not_null_x_y_train

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,1,22.0,1,0,7.2500,2
2,1,1,0,38.0,1,0,71.2833,0
3,1,3,0,26.0,0,0,7.9250,2
4,1,1,0,35.0,1,0,53.1000,2
5,0,3,1,35.0,0,0,8.0500,2
...,...,...,...,...,...,...,...,...
886,0,3,0,39.0,0,5,29.1250,1
887,0,2,1,27.0,0,0,13.0000,2
888,1,1,0,19.0,0,0,30.0000,2
890,1,1,1,26.0,0,0,30.0000,0


In [96]:
x_y_test =  pd.concat([x_test, y_test], axis=1)
x_y_test = x_y_test.dropna()
x_y_test.head(2)


Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
892,3,1,34.5,0,0,7.8292,1,0
893,3,0,47.0,1,0,7.0,2,1


In [97]:
y_test =  x_y_test["Survived"]
x_test = x_y_test.drop("Survived", axis=1)

In [98]:
knn_model = KNeighborsClassifier()

y =  not_null_x_y_train[["Survived"]]
x = not_null_x_y_train.drop(columns="Survived", axis = 1)

knn_model.fit(x, y)

knn_model.score(x_test, y_test)

  return self._fit(X, y)


0.6193353474320241

In [87]:
from sklearn.ensemble import VotingClassifier

In [99]:
voting_model = VotingClassifier(
    [
     ('knn', KNeighborsClassifier()),
     ('tree', DecisionTreeClassifier())]
)

In [100]:
voting_model.fit(x, y)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


0,1,2
,estimators,"[('knn', ...), ('tree', ...)]"
,voting,'hard'
,weights,
,n_jobs,
,flatten_transform,True
,verbose,False

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [101]:
voting_model.score(x_test, y_test)

0.7129909365558912