In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Read training and test data
def read_data(data_dir='../input'):
    """Load the training and test CSV files.

    Args:
        data_dir: Directory that contains ``train.csv`` and ``test.csv``.
            Defaults to ``'../input'``, the location previously hard-coded
            in this function.

    Returns:
        A ``(train_orig, test_orig)`` tuple of DataFrames, exactly as parsed
        by ``pd.read_csv``. The shape of each frame is printed as it loads.
    """
    train_path = os.path.join(data_dir, 'train.csv')
    train_orig = pd.read_csv(train_path)
    print('Train Data Orig Shape:', train_orig.shape)

    test_path = os.path.join(data_dir, 'test.csv')
    test_orig = pd.read_csv(test_path)
    print('Test Data Orig Shape:', test_orig.shape)

    return train_orig, test_orig

# Load both CSV files once; later cells read train_orig / test_orig.
train_orig, test_orig = read_data()

In [None]:
# Prepare Dev Set
# Could also use : sklearn.model_selection.train_test_split(*arrays, **options)
def split_data(data, split_ratio=0.9, random_state=None):
    """Shuffle ``data`` and split it into a training part and a dev part.

    Args:
        data: DataFrame to split. Its index labels must be unique, otherwise
            the disjointness assertion below can fail.
        split_ratio: Fraction of rows that go to the training part; the row
            count is truncated with ``int()``.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            shuffle (and therefore the split) can be reproduced. ``None``
            (the default) keeps the previous unseeded behaviour.

    Returns:
        A ``(train_data, dev_data)`` tuple of DataFrames. The shape of each
        part is printed.
    """
    # frac=1 returns every row in random order, i.e. a full shuffle.
    shuffled = data.sample(frac=1, random_state=random_state)

    n_train = int(shuffled.shape[0] * split_ratio)
    train_data = shuffled.iloc[:n_train, :]
    print('Train Data Shape:', train_data.shape)

    # Everything after the training rows becomes the dev set.
    dev_data = shuffled.iloc[train_data.shape[0]:, :]
    print('Dev Data Shape:', dev_data.shape)

    # Sanity check: no row label may appear in both parts.
    assert len(dev_data.index.intersection(train_data.index)) == 0
    return train_data, dev_data
    
# Hold out half of the labelled rows as a dev set.
train_data, dev_data = split_data(train_orig, split_ratio=0.5)

In [None]:
def preprocess(data, norms=None, test=False):
    """Turn a raw passenger frame into the numeric feature frame used for training.

    Args:
        data: DataFrame with the columns ``Pclass``, ``Sex``, ``Age``,
            ``SibSp``, ``Parch``, ``Fare`` and ``Embarked`` (plus ``Survived``
            unless ``test`` is true). The input frame is not modified.
        norms: Optional ``(ageMean, ageStd, fareMean, fareStd)`` tuple. Any
            entry that is ``None`` (or the whole tuple being ``None``) is
            computed from ``data``. Pass the tuple returned for the training
            split so dev/test data are scaled with the same statistics.
        test: When true, the ``Survived`` column is not included in the output.

    Returns:
        ``(features, norms)`` where ``features`` holds the transformed columns
        and ``norms`` is the statistics tuple that was actually used.
    """
    ageMean = ageStd = fareMean = fareStd = None
    if norms is not None:
        ageMean, ageStd, fareMean, fareStd = norms

    # Work on a copy so the caller's frame stays untouched.
    temp = data.copy()

    # Shift the class label by -2 (centres it if classes are 1..3 - TODO confirm).
    temp['Pclass'] -= 2

    # Category codes 0/1 are mapped to -1/+1.
    # NOTE(review): cat.codes are derived from the values present in *this*
    # frame, so the mapping is only stable across splits when every split
    # contains the same set of values.
    temp['Sex'] = (temp['Sex'].astype('category').cat.codes * 2) - 1

    # Statistics are taken before imputation so filled values do not shrink the std.
    if ageMean is None:
        ageMean = temp['Age'].mean()
    if ageStd is None:
        ageStd = temp['Age'].std()
    temp.loc[temp['Age'].isnull(), 'Age'] = ageMean
    temp['Age'] = (temp['Age'] - ageMean) / ageStd

    # Fix: SibSp is scaled from its own column. It was previously computed
    # from 'Parch', which discarded the SibSp feature entirely.
    # The divisors 4 and 3 are fixed scaling constants (presumably about half
    # of the expected maximum count - confirm against the data).
    temp['SibSp'] = (temp['SibSp'] / 4) - 1
    temp['Parch'] = (temp['Parch'] / 3) - 1

    if fareMean is None:
        fareMean = temp['Fare'].mean()
    if fareStd is None:
        fareStd = temp['Fare'].std()
    temp.loc[temp['Fare'].isnull(), 'Fare'] = fareMean
    temp['Fare'] = (temp['Fare'] - fareMean) / fareStd

    # Missing ports are filled with 'S', then category codes are shifted by -1.
    temp.loc[temp['Embarked'].isnull(), 'Embarked'] = 'S'
    temp['Embarked'] = temp['Embarked'].astype('category').cat.codes - 1

    cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
    if not test:
        cols.append('Survived')
    temp = temp[cols]

    norms = (ageMean, ageStd, fareMean, fareStd)
    return temp, norms

# Normalisation statistics come from the training split only and are then
# reused for the dev and test frames.
# Note: train_data / dev_data are rebound to their preprocessed versions, so
# this cell must not be re-run without re-running the split cell first.
train_data, norms = preprocess(train_data)
dev_data, _ = preprocess(dev_data, norms)
test_data, _ = preprocess(test_orig, norms, test=True)

In [None]:
# Plot train data
%pylab inline
pylab.rcParams['figure.figsize'] = (20, 6)
if True:
    x = 'Fare'
    y = 'Age'
    survived_data = train_data[train_data['Survived'] == 1]
    plt.scatter(survived_data[x],survived_data[y], color='green', marker='o')
    not_survived_data = train_data[train_data['Survived'] == 0]
    plt.scatter(not_survived_data[x],not_survived_data[y], color='red', marker='x')

In [None]:
def prepare_data(data):
    """Split a preprocessed frame into a feature matrix and a label column.

    Args:
        data: DataFrame that contains a ``Survived`` column; every other
            column is treated as a feature. The frame is not modified.

    Returns:
        ``(X, Y)`` NumPy arrays: ``X`` has one row per example and one column
        per feature, ``Y`` has shape ``(m, 1)``. Both shapes are printed.
    """
    # `.values` replaces `DataFrame.as_matrix()`, which was removed in
    # pandas 1.0; `.values` is available in old and new pandas alike.
    X = data.drop(columns=['Survived']).values
    Y = data['Survived'].values.reshape(-1, 1)
    print('Shape X: {}, Y: {}'.format(X.shape, Y.shape))
    return X, Y
# Build the NumPy matrices consumed by the hand-written optimiser.
X, Y = prepare_data(train_data)
X_dev, Y_dev = prepare_data(dev_data)
# `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
X_test = test_data.values
print('Shape X test: {}'.format(X_test.shape))

# m = number of training examples, n = number of features.
m, n = X.shape
m_dev = X_dev.shape[0]
m_test = X_test.shape[0]
print('m:{}, n:{}, m_dev:{}, m_test:{}'.format(m, n, m_dev, m_test))

In [None]:
def init_weights(n):
    """Return zero-initialised parameters for ``n`` features.

    Returns:
        ``(W, b)`` where ``W`` is an ``(n, 1)`` array of zeros and ``b`` is
        the integer ``0``.
    """
    weights = np.zeros(shape=(n, 1))
    bias = 0
    return weights, bias
# Disabled smoke call, kept for manual debugging.
if False:
    W, b = init_weights(n)

In [None]:
def sigmoid(z):
    """Logistic function ``1 / (1 + e^(-z))``, applied element-wise by NumPy."""
    decay = np.exp(-1 * z)
    return 1 / (1 + decay)
# print('Sigmoid -10', sigmoid(-10))
# print('Sigmoid 10', sigmoid(10))
# print('Sigmoid 0', sigmoid(0))
# print('Sigmoid arr', sigmoid(np.array([-10, 0, 10])))

In [None]:
def compute_cost(X, Y, W, b):
    """Mean binary cross-entropy of the logistic model ``sigmoid(X.W + b)``.

    Args:
        X: ``(m, n)`` feature matrix.
        Y: ``(m, 1)`` array of 0/1 labels.
        W: ``(n, 1)`` weight column.
        b: Scalar bias.

    Returns:
        The cost summed over all examples and divided by ``m``.
    """
    m = X.shape[0]
    Z = np.dot(X, W) + b
    # Evaluate the loss from the logits instead of from sigmoid(Z):
    #   -[y*log(s(z)) + (1-y)*log(1-s(z))]  ==  log(1 + e^z) - y*z
    # The naive form yields 0 * log(0) = NaN once the sigmoid saturates to
    # exactly 0 or 1; logaddexp(0, z) = log(e^0 + e^z) stays finite.
    per_example = np.logaddexp(0, Z) - Y * Z
    return per_example.sum() / m
# Disabled self-check with a hand-computed expected value.
if False:
    debug_cost = compute_cost(np.array([1,2,3,4]).reshape(2,2), np.array([1, 0]).reshape(2,1), np.array([0, 1]).reshape(2,1), 1)
    print('Debug cost (Expected 2.5276513500314395):', debug_cost)
# if True:
#     debug_cost1 = compute_cost(np.array([1,2,3,4]).reshape(2,2), np.array([1, 0]).reshape(2,1), np.array([0 + 1.e-7, 1]).reshape(2,1), 1)
#     debug_cost2 = compute_cost(np.array([1,2,3,4]).reshape(2,2), np.array([1, 0]).reshape(2,1), np.array([0 - 1.e-7, 1]).reshape(2,1), 1)
#     print('grad numer: ', (debug_cost1 - debug_cost2) / (2.e-7))
#     print('Debug cost', debug_cost1, debug_cost2)

In [None]:
def compute_grads(X, Y, W, b):
    """Analytic gradients of the logistic cost with respect to ``W`` and ``b``.

    Returns:
        ``(dW, db)``: ``dW`` is ``X.T . (A - Y) / m`` and ``db`` is the sum of
        ``A - Y`` divided by ``m``, where ``A = sigmoid(X.W + b)``.
    """
    n_examples = X.shape[0]
    activations = sigmoid(np.dot(X, W) + b)
    grad_W = np.dot(X.T, activations - Y) / n_examples
    grad_b = (activations - Y).sum() / n_examples
    return grad_W, grad_b
# Disabled smoke call, kept for manual debugging.
if False:
    debug_grads = compute_grads(np.array([1,2,3,4]).reshape(2,2), np.array([1, 0]).reshape(2,1), np.array([0, 1]).reshape(2,1), 1)
    print('Debug grads', debug_grads)

In [None]:
def check_gradients(X, Y, W, b, dW, db, epsilon=1.e-7):
    """Compare analytic gradients with central-difference estimates.

    The numerical gradient is obtained by perturbing each weight, and then the
    bias, by ``+/- epsilon`` around the current parameters ``(W, b)`` and
    differencing ``compute_cost``.

    Fix: the perturbation was previously applied to ``dW`` (the gradient) and
    ``W`` was never used, so the estimate was taken at the wrong point.

    Args:
        X, Y: Data passed straight through to ``compute_cost``.
        W: Current weights, shape ``(n, 1)``; integer arrays are accepted.
        b: Current scalar bias.
        dW, db: Analytic gradients to verify (``dW`` has the shape of ``W``).
        epsilon: Half-width of the finite-difference step.

    Returns:
        ``sum((g - g_num)^2) / (sum(g^2) + sum(g_num^2))`` over the stacked
        weight and bias gradients; ``0.0`` when both gradients are all zero.
        Values close to 0 mean the gradients agree.
    """
    # Float copy: an integer array cannot take `+= epsilon`, and the caller's
    # weights must not be modified.
    W = np.array(W, dtype=float)

    dW_numer = np.zeros(dW.shape)
    for i in range(W.size):
        W_right = W.copy()
        W_right.flat[i] += epsilon
        cost_right = compute_cost(X, Y, W_right, b)

        W_left = W.copy()
        W_left.flat[i] -= epsilon
        cost_left = compute_cost(X, Y, W_left, b)

        dW_numer.flat[i] = (cost_right - cost_left) / (2 * epsilon)

    cost_right = compute_cost(X, Y, W, b + epsilon)
    cost_left = compute_cost(X, Y, W, b - epsilon)
    db_numer = (cost_right - cost_left) / (2 * epsilon)

    # Stack weight and bias gradients into one column each for comparison.
    orig_params = np.concatenate((dW, np.array(db).reshape(1, 1)), axis=0)
    numer_params = np.concatenate((dW_numer, np.array(db_numer).reshape(1, 1)), axis=0)

    denominator = np.square(orig_params).sum() + np.square(numer_params).sum()
    if denominator == 0:
        # Both gradients are exactly zero: they agree, and 0/0 is avoided.
        return 0.0
    return np.square(orig_params - numer_params).sum() / denominator

# Disabled self-check, kept for manual debugging.
if False:
    _X, _Y, _W, _b = np.array([1,2,3,4]).reshape(2,2), np.array([1, 0]).reshape(2,1), np.array([0, 1]).reshape(2,1), 1
    _dW, _db = compute_grads(_X, _Y, _W, _b)
    l2_norm = check_gradients(_X, _Y, _W, _b, _dW, _db)
    print('Check Gradients Score', l2_norm)

In [None]:
def update_params(W, b, dW, db, lrate):
    """Take one gradient-descent step and return the new ``(W, b)``.

    Each parameter is moved against its gradient, scaled by ``lrate``.
    """
    W_next = W - (dW * lrate)
    b_next = b - (db * lrate)
    return W_next, b_next
# Disabled smoke call, kept for manual debugging.
if False:
    _W, _b = update_params(np.array([3,4]).reshape(2, 1), 5, np.array([0.1, 0.2]).reshape(2, 1), 2, 2)
    print('Updated params:', _W, _b)

In [None]:
def optimize(X, Y, lrate, niters, X_dev, Y_dev, print_costs=True, check_grads=False):
    """Fit the logistic model with ``niters`` steps of batch gradient descent.

    Args:
        X, Y: Training features and labels.
        lrate: Learning rate passed to ``update_params``.
        niters: Number of gradient steps.
        X_dev, Y_dev: Held-out data; only used to record the dev cost.
        print_costs: Print the training cost every 10th iteration.
        check_grads: Print the ``check_gradients`` score every 10th iteration.

    Returns:
        ``(W, b, costs, costs_dev)``; the two lists hold the cost measured
        *before* each update, one entry per iteration.
    """
    _, n_features = X.shape
    W, b = init_weights(n_features)
    costs, costs_dev = [], []
    for step in range(niters):
        is_report_step = step % 10 == 0

        train_cost = compute_cost(X, Y, W, b)
        costs.append(train_cost)
        costs_dev.append(compute_cost(X_dev, Y_dev, W, b))
        if print_costs and is_report_step:
            print('Cost after {} iterations is {}'.format(step, train_cost))

        dW, db = compute_grads(X, Y, W, b)
        if check_grads and is_report_step:
            print('Gradient check score at iteration {} is {}'.format(step, check_gradients(X, Y, W, b, dW, db)))

        W, b = update_params(W, b, dW, db, lrate)
    return W, b, costs, costs_dev
# Hyper-parameters for the gradient-descent run.
lrate = 0.05
niters = 400
W, b, costs, costs_dev = optimize(
    X, Y, lrate, niters, X_dev, Y_dev,
    check_grads=False,
)

In [None]:
# Training cost at the parameters returned by optimize().
final_train_cost = compute_cost(X, Y, W, b)
print(final_train_cost)

In [None]:
%pylab inline
pylab.rcParams['figure.figsize'] = (20, 6)
def learning_curve(costs, costs_dev, niters):
    """Plot training and dev cost against the iteration number.

    Args:
        costs: Training cost per iteration (length ``niters``).
        costs_dev: Dev cost per iteration (length ``niters``).
        niters: Number of iterations; defines the x axis ``0 .. niters-1``.
    """
    x = np.arange(niters)
    plt.plot(x, costs, color='blue', label='Training cost')
    plt.plot(x, costs_dev, color='green', label='Dev cost')
    # Label the figure so the two curves can be told apart without the code.
    plt.xlabel('Iteration')
    plt.ylabel('Cost')
    plt.title('Learning curve')
    plt.legend()
    plt.show()
# Draw the curves recorded by the optimisation run above.
learning_curve(costs, costs_dev, niters)

In [None]:
def predict(X, W, b):
    """Return hard 0/1 predictions for each row of ``X``.

    A row is labelled 0 when ``sigmoid(X.W + b) <= 0.5`` and 1 otherwise.
    """
    probabilities = sigmoid(np.dot(X, W) + b)
    at_most_half = probabilities <= 0.5
    return np.where(at_most_half, 0, 1)
# Hard predictions on the training and dev matrices.
predictions = predict(X, W, b)
predictions_dev = predict(X_dev, W, b)

In [None]:
def compute_accuracy(Y, predictions):
    """Percentage of entries where ``predictions`` equals ``Y``.

    Both arguments are expected to be 0/1 arrays of the same shape, so the
    summed absolute difference counts the misclassified rows.
    """
    n_wrong = np.abs(Y - predictions).sum()
    error_rate = n_wrong / Y.shape[0]
    return (1 - error_rate) * 100
# Report accuracy on both splits.
accuracy = compute_accuracy(Y, predictions)
accuracy_dev = compute_accuracy(Y_dev, predictions_dev)
for split_name, split_accuracy in (('Training', accuracy), ('Dev', accuracy_dev)):
    print('{} Accuracy: {}%'.format(split_name, split_accuracy))

In [None]:
# Predict on the test matrix and write the submission CSV.
data_dir = './'
submission_path = os.path.join(data_dir, 'gender_submission_dev_eq_test.csv')
predictions_test = predict(X_test, W, b)
# Assign the flat prediction array directly: assigning a pd.Series would be
# aligned on index labels and silently produce NaN for any row of test_orig
# whose label is not in 0..m-1.
submission_data = test_orig[['PassengerId']].assign(Survived=predictions_test.reshape(-1))
submission_data.to_csv(submission_path, index=False)
submission_data.columns

In [None]:
# Read back the submission written by the previous cell as a sanity check.
# The earlier file name ('gender_submission_full_training.csv') is never
# written by this notebook, so the read failed on a fresh Restart & Run All.
data_dir = './'
submission_path = os.path.join(data_dir, 'gender_submission_dev_eq_test.csv')
sub_data = pd.read_csv(submission_path)
sub_data

In [None]:
# Using 3rd party lib
from sklearn.linear_model import LogisticRegression
# Fit scikit-learn's LogisticRegression on the same training matrices;
# fit() returns the estimator itself, so the calls can be chained.
model = LogisticRegression().fit(X, Y.reshape(-1))

In [None]:
# Score of the scikit-learn model on the training data (labels flattened to 1-D).
model.score(X, Y.ravel())