In [1]:
import numpy as np
import pandas as pd

## Read the CSV

In [2]:
# read the training CSV into a DataFrame and show it as the cell output
df = pd.read_csv(filepath_or_buffer="resources/train.csv")
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


## Process features

In [None]:
# per-column count of missing values
df.isna().sum()

In [None]:
# keep only the rows whose 'Embarked' value is present
df = df.dropna(axis="index", subset=["Embarked"])

In [None]:
# basic feature engineering of columns Name, Cabin, and addition of Family Size, Age Class, and Fare per Person courtesy of:
# https://triangleinequality.wordpress.com/2013/09/08/basic-feature-engineering-with-the-titanic-data/
# starting with titles: function that searches for substrings
def substrings_in_string(big_string, substrings):
    """Return the first entry of `substrings` that occurs inside `big_string`.

    Parameters
    ----------
    big_string : str
        Text to search. Non-string inputs (for example the float NaN that
        pandas uses for a missing cell) are treated as "no match".
    substrings : iterable of str
        Candidates, tested in the order given; the first hit wins, so longer
        candidates must be listed before their own prefixes ('Mrs' before 'Mr').

    Returns
    -------
    str or float
        The matching candidate, or np.nan when nothing matches.
    """
    # a missing value has no text to search; report "no match" without printing
    if not isinstance(big_string, str):
        return np.nan
    for substring in substrings:
        if substring in big_string:
            return substring
    # surface unmatched strings so gaps in the candidate list are easy to spot
    print(big_string)
    return np.nan

# titles to look for; the helper returns the first hit, so 'Mrs' is listed before 'Mr'
title_list = [
    'Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev', 'Dr', 'Ms', 'Mlle',
    'Col', 'Capt', 'Mme', 'Countess', 'Don', 'Jonkheer',
]

# pull the first matching title out of each passenger's name
df['Title'] = df['Name'].map(lambda name: substrings_in_string(name, title_list))

# replacing all titles with mr, mrs, miss, master
def replace_titles(x):
    """Collapse a passenger's title into 'Mr', 'Mrs', 'Miss' or 'Master'.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide 'Title'; 'Sex' is read only when the title is 'Dr'.

    Returns
    -------
    The normalised title. Titles that are not listed below (including
    'Mr', 'Mrs', 'Miss', 'Master' themselves) are returned unchanged.
    """
    title = x['Title']
    if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
        return 'Mr'
    elif title in ['Countess', 'Mme']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title == 'Dr':
        # 'Dr' says nothing about gender, so fall back on the Sex column.
        # The data stores lowercase 'male'/'female'; compare case-insensitively
        # so male doctors are no longer all labelled 'Mrs'.
        if str(x['Sex']).lower() == 'male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title
# overwrite each row's raw title with the value replace_titles returns for that row
df['Title'] = df.apply(replace_titles, axis='columns')

In [None]:
# turning 'Cabin' number into Deck
# Rows with no recorded cabin are labelled 'Unknown' first, so they match the
# last entry of cabin_list instead of reaching the substring search as NaN.
cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'Unknown']
df['Deck'] = df['Cabin'].fillna('Unknown').map(lambda x: substrings_in_string(x, cabin_list))

In [None]:
# fill missing ages with the median of the ages that are present
age_median = df['Age'].median()
df['Age'] = df['Age'].fillna(age_median)

In [None]:
# Family_Size: sum of the SibSp and Parch columns
df['Family_Size'] = df['SibSp'].add(df['Parch'])

# interaction term: Age multiplied by Pclass
df['Age*Class'] = df['Age'].mul(df['Pclass'])

# fare per person: Fare divided by (Family_Size + 1)
df['Fare_Per_Person'] = df['Fare'].div(df['Family_Size'].add(1))

In [None]:
# re-count missing values per column after the feature engineering above
df.isna().sum()

In [None]:
# create binary encoded data from 'Sex' column with dummy encoding
df = pd.get_dummies(df, columns=["Sex"])

# drop redundant data: Sex_male is the complement of Sex_female, so one column is enough
# (the closing bracket of the label list was misplaced, which made this a SyntaxError)
df = df.drop(["Sex_male"], axis=1)

## Select features

In [None]:
# Set features. This will also be used as your x values.
# Drop the target, the identifier/raw-text columns, and 'Ticket' (free text that
# would explode into a huge number of one-hot columns if it were encoded).
selected_features = df.drop(["PassengerId", "Survived", "Name", "Cabin", "Ticket"], axis=1)

# One-hot encode the text columns that remain (e.g. Embarked, Title, Deck) so
# the matrix is fully numeric; the scaler and the models cannot take strings.
selected_features = pd.get_dummies(selected_features, dtype=int)
feature_names = selected_features.columns

## Create a Train Test Split

In [None]:
# X is the feature matrix, y is the 'Survived' column
data, target = selected_features, df["Survived"]
print(data.shape, target.shape)

In [None]:
# hold out a test set; random_state pins the shuffle
from sklearn.model_selection import train_test_split

split_parts = train_test_split(data, target, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# display the training feature matrix as the cell output
X_train

## Pre-processing

In [None]:
# create a random forest classifier to automatically calculate feature importance
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train_scaled, y_train)
rf.score(X_test_scaled, y_test)

In [None]:
# sort the features by their importance
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

In [None]:
print(X_train_scaled.shape, X_test_scaled.shape)

In [None]:
# use tree-based estimator coupled with SelectFromModel meta-transformer...
# ...to compute impurity-based feature importances and discard irrelevant features 
from sklearn.feature_selection import SelectFromModel
sfmodel = SelectFromModel(rf, prefit=True)
X_train_scaled = sfmodel.transform(X_train_scaled)
X_test_scaled = sfmodel.transform(X_test_scaled)
print(X_train_scaled.shape, X_test_scaled.shape)

In [None]:
sfmodel.get_support()

In [None]:
# print the names of the most important features
for feature_list_index in sfmodel.get_support(indices=True):
    print(feature_names[feature_list_index])

In [None]:
# Scale your data
from sklearn.preprocessing import StandardScaler
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Train the model

In [None]:
# create a Logistic Regression Model (no arguments are passed, so the library
# defaults apply); the bare name on the last line displays the estimator
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

In [None]:
# train the classifier on the training matrix and its labels
classifier.fit(X=X_train_scaled, y=y_train)

In [None]:
# validate the model: print classifier.score() for the training split and for
# the held-out test split so the two values can be compared side by side
print(f"Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")

In [None]:
# line up the classifier's test-set predictions against the true labels
test_predictions = classifier.predict(X_test_scaled)
pred_df = pd.DataFrame({"Predictions": test_predictions, "Actual": y_test})
pred_df

## Hyperparameter Tuning

In [None]:
# Create the GridSearchCV model/estimator along with a parameter object containing the values to adjust.
# Every key must be a constructor argument of the wrapped estimator. `classifier`
# is a LogisticRegression, so the SVC-only keys (kernel, degree, gamma) are gone:
# GridSearchCV rejects parameters the estimator does not have.
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.01, 0.1, 1, 5, 10, 50],        # inverse regularisation strength
              'tol': [1e-5, 1e-4, 1e-3],             # stopping tolerance of the solver
              'solver': ['lbfgs', 'liblinear']}      # both work with the default penalty
grid = GridSearchCV(classifier, param_grid, verbose=3)

In [None]:
# run the search: the estimator is fitted once per parameter combination
# and cross-validation fold
grid.fit(X=X_train_scaled, y=y_train)

In [None]:
# best parameter combination on the first line, its score on the second
print(grid.best_params_, grid.best_score_, sep="\n")

In [None]:
# predict on the test matrix with the tuned model
predictions = grid.predict(X=X_test_scaled)

In [None]:
# tuned-model predictions next to the true test labels
grid_df = pd.DataFrame(dict(Predictions=predictions, Actual=y_test))
grid_df

In [None]:
# classification report for the tuned model's test-set predictions
#   precision = TP / (TP + FP)
#   recall    = TP / (TP + FN)
#   f1        = 2 * (precision * recall) / (precision + recall)   -> 1 is best, 0 is worst
from sklearn.metrics import classification_report
report_text = classification_report(y_true=y_test, y_pred=predictions)
print(report_text)

## Test the model

In [None]:
# TODO: this cell is unfinished. The assignments had no right-hand side, which
# is a SyntaxError, so they are parked as comments until real values exist.
# NOTE(review): assigning to X_train / y_train / X_test here would overwrite the
# splits used above; consider distinct names when filling this in.
# X_train = ...
# y_train = ...
#
# X_test = ...