In [157]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [2]:
data = pd.read_csv('../datasets/titanic/train.csv', delimiter=',')

In [3]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
data.columns.values

array(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype=object)

In [5]:
data['Pclass'].unique()

array([3, 1, 2], dtype=int64)

In [6]:
data['Sex'].unique()

array(['male', 'female'], dtype=object)

In [7]:
data['Age'].unique()

array([22.  , 38.  , 26.  , 35.  ,   nan, 54.  ,  2.  , 27.  , 14.  ,
        4.  , 58.  , 20.  , 39.  , 55.  , 31.  , 34.  , 15.  , 28.  ,
        8.  , 19.  , 40.  , 66.  , 42.  , 21.  , 18.  ,  3.  ,  7.  ,
       49.  , 29.  , 65.  , 28.5 ,  5.  , 11.  , 45.  , 17.  , 32.  ,
       16.  , 25.  ,  0.83, 30.  , 33.  , 23.  , 24.  , 46.  , 59.  ,
       71.  , 37.  , 47.  , 14.5 , 70.5 , 32.5 , 12.  ,  9.  , 36.5 ,
       51.  , 55.5 , 40.5 , 44.  ,  1.  , 61.  , 56.  , 50.  , 36.  ,
       45.5 , 20.5 , 62.  , 41.  , 52.  , 63.  , 23.5 ,  0.92, 43.  ,
       60.  , 10.  , 64.  , 13.  , 48.  ,  0.75, 53.  , 57.  , 80.  ,
       70.  , 24.5 ,  6.  ,  0.67, 30.5 ,  0.42, 34.5 , 74.  ])

In [8]:
data['SibSp'].unique()

array([1, 0, 3, 4, 2, 5, 8], dtype=int64)

In [9]:
data['Parch'].unique()

array([0, 1, 2, 5, 3, 4, 6], dtype=int64)

In [10]:
data['Ticket'].head()

0           A/5 21171
1            PC 17599
2    STON/O2. 3101282
3              113803
4              373450
Name: Ticket, dtype: object

In [11]:
data['Fare'].head()

0     7.2500
1    71.2833
2     7.9250
3    53.1000
4     8.0500
Name: Fare, dtype: float64

In [12]:
data['Cabin'].unique()

array([nan, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62 C64',

In [13]:
data['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [14]:
data.columns.values

array(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype=object)

In [15]:
data.shape

(891, 12)

In [16]:
sum(data['Survived'].isna().astype(int))

0

In [17]:
data_clean = data.dropna()

In [18]:
data_clean.shape

(183, 12)

In [250]:
data_num = data[['Pclass','Age','SibSp','Parch','Fare']]
data_num.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
0,3,22.0,1,0,7.25
1,1,38.0,1,0,71.2833
2,3,26.0,0,0,7.925
3,1,35.0,1,0,53.1
4,3,35.0,0,0,8.05


In [251]:
data_txt_cat = data[['Sex','Embarked']]
data_txt_cat.head()

Unnamed: 0,Sex,Embarked
0,male,S
1,female,C
2,female,S
3,female,S
4,male,S


In [252]:
imputer_txt = SimpleImputer(strategy='most_frequent')

In [253]:
imputer_txt.fit(data_txt_cat)

In [254]:
data_txt_cat_imp = imputer_txt.transform(data_txt_cat)
data_txt_cat_imp

array([['male', 'S'],
       ['female', 'C'],
       ['female', 'S'],
       ...,
       ['female', 'S'],
       ['male', 'C'],
       ['male', 'Q']], dtype=object)

In [255]:
one_hot_encoded = OneHotEncoder()
data_hot_encoded = one_hot_encoded.fit_transform(data_txt_cat_imp)

In [256]:
data_hot_encoded = data_hot_encoded.toarray()

In [257]:
data_hot_encoded

array([[0., 1., 0., 0., 1.],
       [1., 0., 1., 0., 0.],
       [1., 0., 0., 0., 1.],
       ...,
       [1., 0., 0., 0., 1.],
       [0., 1., 1., 0., 0.],
       [0., 1., 0., 1., 0.]])

In [258]:
data_hot_encoded.shape

(891, 5)

In [259]:
imputer_num = SimpleImputer(strategy='median')

In [260]:
imputer_num.fit(data_num)

In [261]:
data_num_imp = imputer_num.transform(data_num)

In [262]:
data_num_imp

array([[ 3.    , 22.    ,  1.    ,  0.    ,  7.25  ],
       [ 1.    , 38.    ,  1.    ,  0.    , 71.2833],
       [ 3.    , 26.    ,  0.    ,  0.    ,  7.925 ],
       ...,
       [ 3.    , 28.    ,  1.    ,  2.    , 23.45  ],
       [ 1.    , 26.    ,  0.    ,  0.    , 30.    ],
       [ 3.    , 32.    ,  0.    ,  0.    ,  7.75  ]])

In [263]:
scaler = StandardScaler()

In [264]:
age = data_num_imp[:,1]
fare = data_num_imp[:,-1]
age_fare = np.c_[age,fare]
age_fare

array([[22.    ,  7.25  ],
       [38.    , 71.2833],
       [26.    ,  7.925 ],
       ...,
       [28.    , 23.45  ],
       [26.    , 30.    ],
       [32.    ,  7.75  ]])

In [265]:
scaler.fit(age_fare)

In [278]:
age_fare_std = scaler.transform(age_fare)
age_fare_std

array([[-0.56573646, -0.50244517],
       [ 0.66386103,  0.78684529],
       [-0.25833709, -0.48885426],
       ...,
       [-0.1046374 , -0.17626324],
       [-0.25833709, -0.04438104],
       [ 0.20276197, -0.49237783]])

In [267]:
data_std = np.c_[data_num_imp[:,0], age_fare_std[:,1], data_num_imp[:,2:-1], age_fare_std[:,-1]]
data_std

array([[ 3.        , -0.50244517,  1.        ,  0.        , -0.50244517],
       [ 1.        ,  0.78684529,  1.        ,  0.        ,  0.78684529],
       [ 3.        , -0.48885426,  0.        ,  0.        , -0.48885426],
       ...,
       [ 3.        , -0.17626324,  1.        ,  2.        , -0.17626324],
       [ 1.        , -0.04438104,  0.        ,  0.        , -0.04438104],
       [ 3.        , -0.49237783,  0.        ,  0.        , -0.49237783]])

In [268]:
data_final = np.c_[data_std, data_hot_encoded, data['Survived'].to_numpy()]
data_final

array([[ 3.        , -0.50244517,  1.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 1.        ,  0.78684529,  1.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 3.        , -0.48885426,  0.        , ...,  0.        ,
         1.        ,  1.        ],
       ...,
       [ 3.        , -0.17626324,  1.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 1.        , -0.04438104,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 3.        , -0.49237783,  0.        , ...,  1.        ,
         0.        ,  0.        ]])

In [269]:
data_train, data_test = train_test_split(data_final, random_state=42, test_size=0.2)

In [302]:
X_train, y_train = data_train[:,:-1], data_train[:,-1]
X_test, y_test = data_test[:,:-1], data_test[:,-1]

In [271]:
print(X_train[0], y_train[0])
print(X_test[0], y_test[0])

[ 1.         -0.07458307  0.          0.         -0.07458307  0.
  1.          0.          0.          1.        ] 0.0
[ 3.         -0.34145224  1.          1.         -0.34145224  0.
  1.          1.          0.          0.        ] 1.0


In [304]:
X_train.shape
X_test.shape

(179, 10)

##### Random Forest

In [305]:
forest_clf = RandomForestClassifier(random_state=42)

In [306]:
cv_forest = cross_val_score(forest_clf, X_train, y_train, cv=6, scoring='accuracy')
print(cv_forest)
print(np.mean(cv_forest))

[0.76470588 0.78151261 0.83193277 0.76470588 0.79661017 0.8220339 ]
0.7935835351089588


In [307]:
param_grid = [{'n_estimators': [5,9,10,11], 'max_features': [2,3,4,5],
              'bootstrap': [True, False]}]

grid_forest = GridSearchCV(forest_clf, param_grid, cv=5, scoring='accuracy')

In [308]:
grid_forest.fit(X_train, y_train)

In [309]:
grid_forest.best_params_

{'bootstrap': True, 'max_features': 2, 'n_estimators': 10}

In [310]:
grid_forest.best_score_

0.8118290160543682

In [311]:
forest_clf_best = grid_forest.best_estimator_

In [312]:
cross_val_score(forest_clf_best, X_train, y_train, cv=10).mean()

0.8034233176838811

In [313]:
y_forest_pred = forest_clf_best.predict(X_test)

In [314]:
accuracy_score(y_test, y_forest_pred)

0.7988826815642458

##### KNeighbors

In [315]:
kn_clf = KNeighborsClassifier()

In [316]:
cv_kn = cross_val_score(kn_clf, X_train, y_train, cv=6, scoring='accuracy')
print(cv_kn)
print(np.mean(cv_kn))

[0.7394958  0.75630252 0.82352941 0.80672269 0.77118644 0.81355932]
0.7851326971466553


In [324]:
param_grid = [{'weights':['uniform', 'distance'], 'n_neighbors':[9,10,11],
               'algorithm':['auto','ball_tree','kd_tree','brute'], 'p':[1,2]}]
grid_kn = GridSearchCV(kn_clf, param_grid, cv=5, scoring='accuracy')

In [325]:
grid_kn.fit(X_train, y_train)

In [326]:
grid_kn.best_params_

{'algorithm': 'brute', 'n_neighbors': 9, 'p': 1, 'weights': 'uniform'}

In [327]:
grid_kn.best_score_

0.7977740569289866

In [328]:
kn_clf_best = grid_kn.best_estimator_

In [329]:
y_kn_pred = kn_clf_best.predict(X_test)

In [330]:
accuracy_score(y_test, y_kn_pred)

0.7988826815642458