In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import os

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
# Location of the competition files; defined once so the listing and the
# two reads below cannot drift apart.
INPUT_DIR = "../input"

# List the input files with the standard library rather than shelling out to
# `ls`, which does not exist on non-POSIX systems. Sorted for a stable listing.
print("\n".join(sorted(os.listdir(INPUT_DIR))))

# Any results you write to the current directory are saved as output.

# `train` carries the target column; `holdout` is the unlabeled set to predict.
train = pd.read_csv(os.path.join(INPUT_DIR, "train.csv"))
holdout = pd.read_csv(os.path.join(INPUT_DIR, "test.csv"))

gender_submission.csv
test.csv
train.csv



**Dataset Overview**

In [2]:
# Show the first five rows of the training frame as the cell's output.
train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


**Preprocessing the data**

In [3]:
def preprocess(df, fare_fill=None):
    """Fill missing values and derive binned / categorical feature columns.

    The work is done on a copy, so the frame passed in is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "Fare", "Embarked", "Age" and "Cabin".
    fare_fill : float, optional
        Replacement for missing "Fare" values. When omitted, the mean of the
        notebook-level ``train["Fare"]`` is used (the original behaviour), so
        the holdout set is filled with a statistic computed on training data.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with missing values filled, the new columns
        "Age_categories", "Fare_categories" and "Cabin_type" added, and the
        "Cabin" column removed.
    """
    # Copy first: assigning columns on the caller's frame would silently
    # change it and make re-running a notebook cell behave differently.
    df = df.copy()

    # Missing data
    if fare_fill is None:
        fare_fill = train["Fare"].mean()
    df["Fare"] = df["Fare"].fillna(fare_fill)
    df["Embarked"] = df["Embarked"].fillna("S")

    # Age: missing ages become -0.5 so they land in the (-1, 0] "Missing" bin.
    df["Age"] = df["Age"].fillna(-0.5)
    age_cut_points = [-1, 0, 5, 12, 18, 35, 60, 100]
    age_labels = ["Missing", "Infant", "Child", "Teenager",
                  "Young Adult", "Adult", "Senior"]
    df["Age_categories"] = pd.cut(df["Age"], age_cut_points, labels=age_labels)

    # Fare: right-closed bins, e.g. a fare of exactly 12 falls in "0-12".
    fare_cut_points = [-1, 12, 50, 100, 1000]
    fare_labels = ["0-12", "12-50", "50-100", "100+"]
    df["Fare_categories"] = pd.cut(df["Fare"], fare_cut_points, labels=fare_labels)

    # Cabin: keep only the first character of the cabin string; rows without
    # a cabin are labelled "Unknown".
    df["Cabin_type"] = df["Cabin"].str[0].fillna("Unknown")
    df = df.drop("Cabin", axis=1)

    return df

def create_dummies(df, column_name):
    """Return ``df`` with one-hot indicator columns for ``column_name`` appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column_name``.
    column_name : str
        Column to encode; it also serves as the prefix of the new columns
        (``<column_name>_<value>``).

    Returns
    -------
    pandas.DataFrame
        A new frame: all original columns followed by the indicator columns.
        The source column itself is kept.
    """
    # Pin an integer dtype. pandas >= 2.0 produces bool indicators by default,
    # and a numeric-only filter such as select_dtypes([np.number]) would then
    # drop every dummy column without warning.
    dummies = pd.get_dummies(df[column_name], prefix=column_name, dtype=np.uint8)
    return pd.concat([df, dummies], axis=1)

In [4]:
def preprocess2(df):
    """Run ``preprocess`` on ``df`` and one-hot encode its categorical columns.

    Indicator columns are appended for "Age_categories", "Fare_categories",
    "Cabin_type" and "Sex", in that order, and the resulting frame is returned.
    """
    encoded = preprocess(df)

    categorical_columns = ("Age_categories", "Fare_categories", "Cabin_type", "Sex")
    for name in categorical_columns:
        encoded = create_dummies(encoded, name)

    return encoded

train = preprocess2(train)
holdout = preprocess2(holdout)

# The two frames are one-hot encoded independently, so a category value that
# occurs only in `train` yields an indicator column that `holdout` lacks, and
# selecting the model's feature columns from `holdout` would raise KeyError.
# Add any such indicator column to `holdout` as all zeros ("value not present").
dummy_prefixes = ("Age_categories_", "Fare_categories_", "Cabin_type_", "Sex_")
for column in train.columns:
    if column.startswith(dummy_prefixes) and column not in holdout.columns:
        holdout[column] = 0

**Selecting the Best-Performing Features**

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV

# Keep the numeric columns only, then discard any column that still
# contains a missing value.
train = train.select_dtypes(include=[np.number]).dropna(axis="columns")

# Target is "Survived"; features are everything else except the identifier.
all_y = train["Survived"]
all_X = train.drop(columns=["PassengerId", "Survived"])

# Recursive feature elimination with 10-fold cross-validation, driven by a
# seeded random forest.
clf = RandomForestClassifier(random_state=1)
selector = RFECV(estimator=clf, cv=10)
selector.fit(all_X, all_y)

# `support_` is the boolean mask of columns the selector retained.
best_columns = all_X.columns[selector.support_].tolist()
print(best_columns)

['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Age_categories_Missing', 'Age_categories_Infant', 'Age_categories_Child', 'Age_categories_Teenager', 'Age_categories_Young Adult', 'Age_categories_Adult', 'Fare_categories_0-12', 'Fare_categories_12-50', 'Fare_categories_50-100', 'Cabin_type_B', 'Cabin_type_E', 'Cabin_type_Unknown', 'Sex_female', 'Sex_male']


**Selecting and Tuning Different Algorithms**

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Train on the feature subset chosen in the previous step.
all_y = train["Survived"]
all_X = train[best_columns]

# One entry per candidate algorithm: a display name, an unfitted estimator,
# and the hyperparameter grid to search exhaustively.
models = [
    dict(
        name="LogisticRegression",
        estimator=LogisticRegression(),
        hyperparameters=dict(
            solver=["newton-cg", "lbfgs", "liblinear"],
        ),
    ),
    dict(
        name="KNeighborsClassifier",
        estimator=KNeighborsClassifier(),
        hyperparameters=dict(
            n_neighbors=range(1, 20, 2),
            weights=["distance", "uniform"],
            algorithm=["ball_tree", "kd_tree", "brute"],
            p=[1, 2],
        ),
    ),
    dict(
        name="RandomForestClassifier",
        estimator=RandomForestClassifier(random_state=1),
        hyperparameters=dict(
            n_estimators=[4, 6, 9],
            criterion=["entropy", "gini"],
            max_depth=[2, 5, 10],
            max_features=["log2", "sqrt"],
            min_samples_leaf=[1, 5, 8],
            min_samples_split=[2, 3, 5],
        ),
    ),
]

# Grid-search every candidate with 10-fold cross-validation and store the
# winning parameters, score and refitted estimator back on its entry.
for model in models:
    print(model["name"])
    print("-" * len(model["name"]))

    grid = GridSearchCV(
        estimator=model["estimator"],
        param_grid=model["hyperparameters"],
        cv=10,
    )
    grid.fit(all_X, all_y)
    model.update(
        best_params=grid.best_params_,
        best_score=grid.best_score_,
        best_model=grid.best_estimator_,
    )

    print("Best Score: {}".format(model["best_score"]))
    print("Best Params: {}".format(model["best_params"]))

LogisticRegression
------------------
Best Score: 0.8013468013468014
Best Params: {'solver': 'newton-cg'}
KNeighborsClassifier
--------------------
Best Score: 0.7654320987654321
Best Params: {'algorithm': 'kd_tree', 'n_neighbors': 11, 'p': 1, 'weights': 'distance'}
RandomForestClassifier
----------------------
Best Score: 0.8316498316498316
Best Params: {'criterion': 'gini', 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 9}


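The RandomForest model achieves the highest cross-validated score of the three (about 0.83, versus 0.80 for LogisticRegression and 0.77 for KNeighborsClassifier), so it is the one used for the submission.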

**Submission**

In [7]:
# Pick the tuned estimator with the highest cross-validated score instead of
# relying on its position in `models`, so the choice stays correct if the
# candidate list or the scores change. On ties the earliest entry wins.
best_model = max(models, key=lambda entry: entry["best_score"])["best_model"]

# Predict on the same feature columns the model was trained on.
holdout_data = holdout[best_columns]
predictions = best_model.predict(holdout_data)

# Write the two-column submission file: passenger id and predicted label.
holdout_ids = holdout["PassengerId"]
submission_df = {"PassengerId": holdout_ids,
                 "Survived": predictions}
submission = pd.DataFrame(submission_df)
submission.to_csv('submission.csv', index=False)
