In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
train_data = pd.read_csv('../input/titanic/train.csv')

In [3]:
x_train = pd.read_csv('../input/titanic/train.csv')

In [4]:
x_test = pd.read_csv('../input/titanic/test.csv')

In [5]:
y_train = np.array(x_train['Survived'].copy())

In [6]:
id_test = np.array(x_test['PassengerId'].copy())

In [7]:
x_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [8]:
labels = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

In [9]:
x_train = x_train[labels]

In [10]:
x_test = x_test[labels]

In [11]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Sex       891 non-null    object 
 2   Age       714 non-null    float64
 3   SibSp     891 non-null    int64  
 4   Parch     891 non-null    int64  
 5   Fare      891 non-null    float64
 6   Embarked  889 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 48.9+ KB


In [12]:
x_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Sex       418 non-null    object 
 2   Age       332 non-null    float64
 3   SibSp     418 non-null    int64  
 4   Parch     418 non-null    int64  
 5   Fare      417 non-null    float64
 6   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 23.0+ KB


In [13]:
x_train.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
count,891.0,714.0,891.0,891.0,891.0
mean,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.42,0.0,0.0,0.0
25%,2.0,20.125,0.0,0.0,7.9104
50%,3.0,28.0,0.0,0.0,14.4542
75%,3.0,38.0,1.0,0.0,31.0
max,3.0,80.0,8.0,6.0,512.3292


In [14]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [15]:
num_pipeline = Pipeline([
    ('num_imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler())
])

In [16]:
from sklearn.preprocessing import OneHotEncoder

In [17]:
cat_pipeline = Pipeline([
    ('cat_imputer', SimpleImputer(strategy="most_frequent")),
    ('encode', OneHotEncoder())
])

In [18]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PolynomialFeatures

In [19]:
num_labels = ["Pclass", "Age", "SibSp", "Parch", "Fare"]


In [20]:
cat_labels = ["Sex", "Embarked"]

In [21]:
full_pipeline = ColumnTransformer([
    ("num_trans", num_pipeline, num_labels),
    ("cat_trans", cat_pipeline, cat_labels)
])

In [22]:
x_train_prepared = full_pipeline.fit_transform(x_train)

In [23]:
def model_fit_and_print_acc(model):
    model.fit(x_train_prepared, y_train)
    pred = model.predict(x_train_prepared) 
    print("Train accuracy: ", accuracy_score(pred, y_train))
    cv_scores = cross_val_score(model, x_train_prepared, y_train, cv=3, scoring="accuracy")
    print("Validation accuracy: ", cv_scores.mean())

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [25]:
log_reg = LogisticRegression()
model_fit_and_print_acc(log_reg)

Train accuracy:  0.7991021324354658
Validation accuracy:  0.7867564534231201


In [26]:
poly_log_reg = Pipeline([
    ("poly_features", PolynomialFeatures(degree=3, include_bias=False)),
    ("log_reg", LogisticRegression(penalty='elasticnet', l1_ratio=0.7, max_iter=100000, solver='saga')),
])

In [27]:
model_fit_and_print_acc(poly_log_reg)

Train accuracy:  0.8338945005611672
Validation accuracy:  0.8148148148148149


In [28]:
from sklearn.ensemble import RandomForestClassifier

In [29]:
forest_clf = RandomForestClassifier()
model_fit_and_print_acc(forest_clf)

Train accuracy:  0.9797979797979798
Validation accuracy:  0.7934904601571269


In [30]:
from sklearn.model_selection import GridSearchCV

In [31]:
param_grid = [
    {'n_estimators':[150, 200, 250], 'max_features':[5, 10], 'max_depth':[5, 10, 20], 
     'min_samples_leaf':[5, 10, 20], 'min_samples_split':[5, 10, 20]}
]

In [32]:
forest_clf = RandomForestClassifier()


In [33]:
grid_search = GridSearchCV(forest_clf, param_grid, cv=3, scoring='accuracy', return_train_score=True)

In [34]:
grid_search.fit(x_train_prepared, y_train)

GridSearchCV(cv=3, estimator=RandomForestClassifier(),
             param_grid=[{'max_depth': [5, 10, 20], 'max_features': [5, 10],
                          'min_samples_leaf': [5, 10, 20],
                          'min_samples_split': [5, 10, 20],
                          'n_estimators': [150, 200, 250]}],
             return_train_score=True, scoring='accuracy')

In [35]:
best_search_rf = grid_search.best_estimator_

In [36]:
print(best_search_rf)

RandomForestClassifier(max_depth=20, max_features=10, min_samples_leaf=5,
                       min_samples_split=5, n_estimators=150)


In [37]:
grid_search.best_params_

{'max_depth': 20,
 'max_features': 10,
 'min_samples_leaf': 5,
 'min_samples_split': 5,
 'n_estimators': 150}

In [38]:
model_fit_and_print_acc(best_search_rf)

Train accuracy:  0.8866442199775533
Validation accuracy:  0.8294051627384961


In [39]:
# KNN
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(20)
model_fit_and_print_acc(knn)

Train accuracy:  0.8249158249158249
Validation accuracy:  0.8159371492704827


In [40]:
from sklearn.ensemble import VotingClassifier

In [41]:
voting = VotingClassifier(
    estimators=[('poly_logres', poly_log_reg), ('grid_randomforest', best_search_rf), 
                ('knn', knn)],
    voting='hard'
)

In [42]:
voting.fit(x_train_prepared, y_train)

VotingClassifier(estimators=[('poly_logres',
                              Pipeline(steps=[('poly_features',
                                               PolynomialFeatures(degree=3,
                                                                  include_bias=False)),
                                              ('log_reg',
                                               LogisticRegression(l1_ratio=0.7,
                                                                  max_iter=100000,
                                                                  penalty='elasticnet',
                                                                  solver='saga'))])),
                             ('grid_randomforest',
                              RandomForestClassifier(max_depth=20,
                                                     max_features=10,
                                                     min_samples_leaf=5,
                                                     min_samples_split=

In [43]:
pred = voting.predict(x_train_prepared) 

In [44]:
print(accuracy_score(pred, y_train))

0.8630751964085297


In [45]:
cv_scores = cross_val_score(voting, x_train_prepared, y_train, cv=3, scoring="accuracy")

In [46]:
cv_scores.mean()

0.830527497194164

In [47]:
x_test_prepared = full_pipeline.transform(x_test)

In [48]:
voting.fit(x_train_prepared, y_train)

VotingClassifier(estimators=[('poly_logres',
                              Pipeline(steps=[('poly_features',
                                               PolynomialFeatures(degree=3,
                                                                  include_bias=False)),
                                              ('log_reg',
                                               LogisticRegression(l1_ratio=0.7,
                                                                  max_iter=100000,
                                                                  penalty='elasticnet',
                                                                  solver='saga'))])),
                             ('grid_randomforest',
                              RandomForestClassifier(max_depth=20,
                                                     max_features=10,
                                                     min_samples_leaf=5,
                                                     min_samples_split=

In [49]:
test_pred = voting.predict(x_test_prepared)

In [50]:
df = pd.DataFrame({
    'PassengerId':id_test,
    'Survived':test_pred
})

In [51]:
df.to_csv("results.csv", index=False)