In [None]:
# Imports
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn import ensemble
from sklearn import svm
import time
import random
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
import math
import matplotlib.ticker as plticker
import matplotlib.patches as mpatches
import matplotlib.lines as mlines
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import metrics 
from sklearn.ensemble import AdaBoostClassifier
import matplotlib.cm as cm
from matplotlib.colors import Normalize
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn.decomposition import FastICA
from sklearn.random_projection import GaussianRandomProjection
from sklearn.decomposition import FactorAnalysis



In [None]:
# Outcome naming shared by the notebook: `pos_outcome_var` is the label column
# name in the data, `neg_outcome_var` is the key used for the opposite outcome
# when tallying cluster membership.
pos_outcome_var, neg_outcome_var = "Survived", "Died"

In [None]:
def drop_columns(df, columns_to_drop):
    """Remove the named columns from ``df`` in place.

    Columns are removed one at a time in the order given, so a missing name
    raises ``KeyError`` only after the names listed before it are already gone.
    Nothing is returned; the caller's DataFrame object is modified directly.
    """
    for column_name in columns_to_drop:
        df.drop(columns=column_name, inplace=True)
# Read in data
big_df = pd.read_csv("../input/titanic-cleaned-data/train_clean.csv")
big_df.info()
columns_to_drop = ["Cabin", "Name", "Ticket", "Parch", "Embarked", "Title", "PassengerId"]  # TODO include reasoning for dropping these
drop_columns(big_df, columns_to_drop)
is_male = {"male": 1, "female": 0}
# Assign the encoded column back explicitly. The previous chained
# `big_df["Sex"].replace(..., inplace=True)` mutates a temporary Series and is
# not guaranteed to write through to big_df (it does not under pandas
# Copy-on-Write). `map` also yields a clean integer column; any value other
# than "male"/"female" becomes NaN and will surface loudly downstream.
big_df["Sex"] = big_df["Sex"].map(is_male)
# Label vector and feature matrix used by every clustering / reduction cell below.
all_y = big_df[pos_outcome_var]
all_x = big_df.drop(pos_outcome_var, axis=1)


In [None]:
def compute_k_means(x_data, start=1, end=50):
    """Fit K-Means for every cluster count in ``[start, end]`` and collect diagnostics.

    Parameters
    ----------
    x_data : array-like accepted by ``KMeans.fit``.
    start, end : inclusive range of ``n_clusters`` values to try.

    Returns
    -------
    tuple ``(ssds, binary_labels, binary_k_means, n_iter)``
        ssds            -- ``inertia_`` of each fitted model, in order of k.
        binary_labels   -- ``labels_`` of the k == 2 model (``[]`` if k == 2 was not fitted).
        binary_k_means  -- the fitted k == 2 model (``None`` if k == 2 was not fitted).
        n_iter          -- wall-clock seconds spent in each ``fit`` call
                           (despite the name, these are timings, not iteration counts).
    """
    ssds = []
    binary_labels = []
    n_iter = []
    binary_k_means = None
    for k in range(start, end + 1):
        # Dedicated timer names: the original reused `start`/`end`, clobbering
        # the range parameters with timestamps.
        fit_started = time.time()
        k_means = KMeans(n_clusters=k, random_state=0).fit(x_data)
        fit_finished = time.time()
        n_iter.append(fit_finished - fit_started)
        ssds.append(k_means.inertia_)
        if k == 2:
            # Keep the two-cluster model for the survived/died comparison.
            binary_labels = k_means.labels_
            binary_k_means = k_means
    return (ssds, binary_labels, binary_k_means, n_iter)

def plot_cluster_accuracy(data, title="Titanic SSD over K for KMC", print_default=True,
                          x_label="Number of Clusters (k)", y_label="Sum of Squared Distances"):
    """Plot a per-k series (x axis numbered from 1) and show the figure.

    When ``print_default`` is true, an orange reference curve ``data[0] / k``
    is drawn on the same axes.
    """
    plt.rcParams["figure.figsize"] = (6,4)
    cluster_counts = range(1, len(data) + 1)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)
    plt.plot(cluster_counts, data, '-o')
    if print_default:
        # Reference decay: the first value divided by the cluster count.
        reference_curve = [data[0] / float(k) for k in cluster_counts]
        plt.plot(cluster_counts, reference_curve, color="orange")
    plt.show()

def k_means_helper(x_reduced, big_df, outcome_var="Survived"):
    """Run the K-Means sweep on a 2-component reduced dataset and label its rows.

    Parameters
    ----------
    x_reduced : 2-D array with exactly two columns (named ``component1`` /
        ``component2`` in the result).
    big_df : DataFrame supplying the ``outcome_var`` column, row-aligned with
        ``x_reduced``.
    outcome_var : name of the outcome column copied from ``big_df``.

    Returns
    -------
    dict with keys
        ``reduced_with_labels`` -- components plus ``ClusterId`` and the outcome column,
        ``cluster_counts``      -- per-cluster outcome tallies,
        ``binary_labels``       -- labels of the k == 2 model,
        ``ssds``                -- inertia per k,
        ``n_iter``              -- fit time in seconds per k,
        ``binary_k_means``      -- the fitted k == 2 model,
        ``cluster_samples``     -- per-cluster row subsets returned by
                                   ``match_survived_to_clusters``.
    """
    (ssds, binary_labels, binary_k_means, n_iter) = \
    compute_k_means(x_reduced)
    reduced_with_labels = pd.DataFrame(data=x_reduced, columns=["component1", "component2"])
    reduced_with_labels.insert(2,outcome_var, big_df[outcome_var], True)
    cluster_counts, cluster_col, cluster_samples = match_survived_to_clusters(reduced_with_labels, binary_k_means)
    reduced_with_labels.insert(2,"ClusterId", cluster_col, True)
    # The original also returned "log_likelihoods" and "binary_weights", names
    # that are never defined in this function: that is a NameError on a clean
    # kernel, or stale EM results leaking in from notebook globals. They are
    # EM-only quantities, so they are dropped; the K-Means labels computed
    # above are exposed instead.
    return {
        "reduced_with_labels": reduced_with_labels,
        "cluster_counts": cluster_counts,
        "binary_labels": binary_labels,
        "ssds": ssds,
        "n_iter": n_iter,
        "binary_k_means": binary_k_means,
        "cluster_samples": cluster_samples
    }

In [None]:
def compute_EM(x_data, start=1, end=50):
    """Fit a GaussianMixture for every component count in ``[start, end]``.

    Returns ``(log_likelihoods, binary_weights, n_iter, binary_EM)``:
    the ``lower_bound_`` of each fit, the ``weights_`` of the 2-component model
    (``[]`` if it was not fitted), the wall-clock seconds of each ``fit`` call
    (timings, despite the name), and the fitted 2-component model (or ``None``).
    """
    log_likelihoods, n_iter = [], []
    binary_weights, binary_EM = [], None
    for n_components in range(start, end + 1):
        fit_began = time.time()
        mixture = GaussianMixture(n_components=n_components, random_state=22).fit(x_data)
        fit_ended = time.time()
        n_iter.append(fit_ended - fit_began)
        log_likelihoods.append(mixture.lower_bound_)
        #TODO iterations
        if n_components != 2:
            continue
        # Remember the two-component model for the survived/died comparison.
        binary_EM, binary_weights = mixture, mixture.weights_
    return (log_likelihoods, binary_weights, n_iter, binary_EM)

def get_cluster_samples(cluster_rows, sample_size=30):
    """Split the global ``big_df`` by cluster and print a summary of a sample of each.

    Parameters
    ----------
    cluster_rows : sequence of collections of ``big_df`` index labels, one per cluster.
    sample_size : maximum number of rows sampled per cluster for the printed summary.

    Returns
    -------
    list of DataFrames, one per entry of ``cluster_rows``, holding the ``big_df``
    rows whose index is in that entry.
    """
    cluster_dfs = [big_df[big_df.index.isin(rows)] for rows in cluster_rows]
    for cluster_idx, cluster_df in enumerate(cluster_dfs):
        # Label each summary with its own cluster (the second heading used to
        # repeat "cluster_0").
        print("cluster_{0}".format(cluster_idx))
        if cluster_df.empty:
            continue
        # Cap the sample at the cluster size: sampling more rows than exist
        # without replacement raises ValueError.
        sample = cluster_df.sample(n=min(sample_size, len(cluster_df)), random_state=1)
        # A bare `.describe()` inside a function is discarded; print it.
        print(sample.describe())
    return cluster_dfs
    
def match_survived_to_clusters(all_data, EM):
    """Assign every row to one of two clusters and tally outcomes per cluster.

    Parameters
    ----------
    all_data : DataFrame containing the ``pos_outcome_var`` column; all other
        columns are passed to the model as features.
    EM : fitted two-cluster model exposing ``predict`` (K-Means or Gaussian mixture).

    Returns
    -------
    tuple ``(cluster_counts, cluster_col, cluster_analysis)``
        cluster_counts   -- ``[{pos: n, neg: n}, {pos: n, neg: n}]`` for clusters 0 and 1.
        cluster_col      -- float array of cluster ids, positionally aligned with ``all_data``.
        cluster_analysis -- two DataFrames: the rows of the global ``big_df`` whose
                            index labels fall in cluster 0 and cluster 1.

    Raises
    ------
    IndexError if the model predicts a cluster id outside ``{0, 1}``.
    """
    features = all_data.drop(columns=pos_outcome_var)
    if len(features):
        # One batched predict instead of one call per row.
        labels = np.asarray(EM.predict(features.values))
    else:
        labels = np.zeros(0, dtype=int)
    if labels.size and (labels.min() < 0 or labels.max() > 1):
        raise IndexError("match_survived_to_clusters expects a two-cluster model")

    # Same truthiness test the per-row version applied to the outcome value.
    survived = all_data[pos_outcome_var].astype(bool).to_numpy()

    cluster_counts = []
    cluster_rows = []
    for cluster_idx in (0, 1):
        in_cluster = labels == cluster_idx
        cluster_counts.append({
            pos_outcome_var: int((in_cluster & survived).sum()),
            neg_outcome_var: int((in_cluster & ~survived).sum()),
        })
        # Index *labels* of member rows; the membership mask itself is
        # positional, so a non-default index no longer breaks the bookkeeping.
        cluster_rows.append(all_data.index[in_cluster])
    print("len cluster rows 0: ", len(cluster_rows[0]))
    print("len cluster rows 1: ", len(cluster_rows[1]))

    cluster_col = labels.astype(float)
    cluster_analysis = [big_df[big_df.index.isin(rows)] for rows in cluster_rows]

    return (cluster_counts, cluster_col, cluster_analysis)

def EM_helper(x_reduced, big_df, outcome_var="Survived"):
    """Run the EM sweep on a 2-component reduced dataset and label its rows.

    ``x_reduced`` must have exactly two columns; ``big_df`` supplies the
    ``outcome_var`` column. Returns a dict with the labelled reduced frame,
    per-cluster outcome counts, the sweep's log-likelihoods and fit timings,
    the 2-component model with its weights, and the per-cluster row subsets.
    """
    log_likelihoods, binary_weights, n_iter, binary_EM = compute_EM(x_reduced)

    labelled = pd.DataFrame(data=x_reduced, columns=["component1", "component2"])
    labelled.insert(2, outcome_var, big_df[outcome_var], True)
    counts, cluster_ids, samples = match_survived_to_clusters(labelled, binary_EM)
    # ClusterId goes in ahead of the outcome column, which stays last.
    labelled.insert(2, "ClusterId", cluster_ids, True)

    summary = {}
    summary["reduced_with_labels"] = labelled
    summary["cluster_counts"] = counts
    summary["log_likelihoods"] = log_likelihoods
    summary["binary_weights"] = binary_weights
    summary["n_iter"] = n_iter
    summary["binary_EM"] = binary_EM
    summary["cluster_samples"] = samples
    return summary
    

    
    

In [None]:
# K-Means sweep on the unreduced feature matrix, then outcome tallies for the
# two-cluster model.
ssds, binary_labels, binary_k_means, n_iter = compute_k_means(all_x)
cluster_counts_k_means, cluster_col_k_means, cluster_analysis_k_means = \
    match_survived_to_clusters(big_df, binary_k_means)

# `n_iter` holds per-k fit times in seconds.
plot_cluster_accuracy(
    n_iter,
    title="Titanic KMC Time Training",
    print_default=False,
    y_label="Seconds",
)
plot_cluster_accuracy(ssds, title="Titanic KMC Sum of Squared Distances")

In [None]:
# Summary statistics of the big_df rows placed in cluster 0 by the two-cluster K-Means model fitted on all_x.
cluster_analysis_k_means[0].describe()


In [None]:
# Summary statistics of the big_df rows placed in cluster 1 by the two-cluster K-Means model fitted on all_x.
cluster_analysis_k_means[1].describe()

In [None]:
# EM (Gaussian mixture) sweep on the unreduced feature matrix.
log_likelihoods, binary_weights, n_iter, binary_EM = compute_EM(all_x)
print("log_likelihoods EM: ", log_likelihoods)
print("binary_weights EM: ", binary_weights)

plot_cluster_accuracy(
    log_likelihoods,
    title="Titanic EM Log Likelihoods",
    print_default=False,
    y_label="log_likelihoods",
)
# `n_iter` holds per-k fit times in seconds.
plot_cluster_accuracy(
    n_iter,
    title="Titanic EM Training Time",
    print_default=False,
    y_label="Seconds",
)

# Outcome tallies and per-cluster rows for the two-component mixture.
cluster_counts_EM, cluster_col, cluster_analysis_EM = match_survived_to_clusters(big_df, binary_EM)


In [None]:
# Summary statistics of the big_df rows placed in cluster 0 by the two-component Gaussian mixture fitted on all_x.
cluster_analysis_EM[0].describe()

In [None]:
# Summary statistics of the big_df rows placed in cluster 1 by the two-component Gaussian mixture fitted on all_x.
cluster_analysis_EM[1].describe()

In [None]:
# Six-component PCA, used only to inspect how much variance each component captures.
# NOTE(review): all_x is passed unscaled although StandardScaler is imported;
# PCA is variance-driven, so confirm that skipping standardisation is intended.
pca_first = PCA(n_components=6).fit(all_x)
variance_ratios = pca_first.explained_variance_ratio_

plt.xlabel("component number")
plt.ylabel("captured variance ratio")
plt.title("Titanic PCA component explanatory power")
plt.bar(np.arange(1, 7), variance_ratios)
print(variance_ratios)

# Two-component PCA: the reducer reused by the cells that follow.
pca = PCA(n_components=2).fit(all_x)
print(pca.explained_variance_ratio_)
#For PCA, what is the distribution of eigenvalues? 


In [None]:
# Project the features onto the two PCA components and cluster the projection.
x_reduced_pca = pca.fit_transform(all_x)

pca_k_means_info = k_means_helper(x_reduced=x_reduced_pca, big_df=big_df)
plot_cluster_accuracy(pca_k_means_info["ssds"], title="Titanic PCA k means")
plot_cluster_accuracy(
    pca_k_means_info["n_iter"],
    title="Titanic PCA KMC Time Training",
    print_default=False,
    y_label="Seconds",
)

pca_EM_info = EM_helper(x_reduced=x_reduced_pca, big_df=big_df)
print(pca_EM_info["cluster_counts"])
plot_cluster_accuracy(
    pca_EM_info["n_iter"],
    title="PCA Titanic EM Time Training",
    print_default=False,
    y_label="Seconds",
)


In [None]:
# Summary statistics of the big_df rows in cluster 0 of the two-cluster K-Means model fitted on the PCA projection.
pca_k_means_info["cluster_samples"][0].describe()


In [None]:
# Summary statistics of the big_df rows in cluster 1 of the two-cluster K-Means model fitted on the PCA projection.
pca_k_means_info["cluster_samples"][1].describe()

In [None]:
# Summary statistics of the big_df rows in cluster 0 of the two-component Gaussian mixture fitted on the PCA projection.
pca_EM_info["cluster_samples"][0].describe()


In [None]:
# Summary statistics of the big_df rows in cluster 1 of the two-component Gaussian mixture fitted on the PCA projection.
pca_EM_info["cluster_samples"][1].describe()

In [None]:
#how kurtotic are the distributions? 
#Do the projection axes for ICA seem to capture anything "meaningful"?
#did you get the same clusters as before? Different clusters? Why? Why not?
# FastICA draws its initial unmixing matrix at random when none is supplied;
# pin the seed so re-running the notebook reproduces the same components.
ica = FastICA(n_components=2, random_state=0)
x_reduced_ica = ica.fit_transform(all_x)

ica_k_means_info = k_means_helper(x_reduced_ica, big_df)
plot_cluster_accuracy(ica_k_means_info["ssds"], "Titanic ICA k means")
plot_cluster_accuracy(ica_k_means_info["n_iter"], title="Titanic ICA KMC Time Training", print_default=False, y_label= "Seconds")


ica_EM_info = EM_helper(x_reduced_ica, big_df)
print(ica_EM_info["cluster_counts"])

plot_cluster_accuracy(ica_EM_info["n_iter"], title="ICA Titanic EM Time Training", print_default=False, y_label= "Seconds")

# Print only the compact EM diagnostics: the full info dict embeds whole
# DataFrames and the fitted model, which floods the cell output.
print("log_likelihoods EM: ", ica_EM_info["log_likelihoods"])
print("binary_weights EM: ", ica_EM_info["binary_weights"])

In [None]:
# Summary statistics of the big_df rows in cluster 0 of the two-cluster K-Means model fitted on the ICA projection.
ica_k_means_info["cluster_samples"][0].describe()

In [None]:
# Summary statistics of the big_df rows in cluster 1 of the two-cluster K-Means model fitted on the ICA projection.
ica_k_means_info["cluster_samples"][1].describe()

In [None]:
# Summary statistics of the big_df rows in cluster 0 of the two-component Gaussian mixture fitted on the ICA projection.
ica_EM_info["cluster_samples"][0].describe()


In [None]:
# Summary statistics of the big_df rows in cluster 1 of the two-component Gaussian mixture fitted on the ICA projection.
ica_EM_info["cluster_samples"][1].describe()


In [None]:
#Assuming you only generate k projections (i.e., you do dimensionality reduction), how well is the data reconstructed by the randomized projections? 
#Do the clusters change every time?
# NOTE(review): no random_state is passed, so the projection differs between
# runs; presumably deliberate given the question above, but confirm.
r_p = GaussianRandomProjection(n_components=2)
x_reduced_r_p = r_p.fit_transform(all_x)

r_p_k_means_info = k_means_helper(x_reduced=x_reduced_r_p, big_df=big_df)
plot_cluster_accuracy(r_p_k_means_info["ssds"], title="Titanic RP k means")
plot_cluster_accuracy(
    r_p_k_means_info["n_iter"],
    title="Titanic RP KMC Time Training",
    print_default=False,
    y_label="Seconds",
)

r_p_EM_info = EM_helper(x_reduced=x_reduced_r_p, big_df=big_df)


In [None]:
# Summary statistics of the big_df rows in cluster 0 of the two-cluster K-Means model fitted on the random projection.
r_p_k_means_info["cluster_samples"][0].describe()

In [None]:
# Summary statistics of the big_df rows in cluster 1 of the two-cluster K-Means model fitted on the random projection.
r_p_k_means_info["cluster_samples"][1].describe()

In [None]:
# Summary statistics of the big_df rows in cluster 0 of the two-component Gaussian mixture fitted on the random projection.
r_p_EM_info["cluster_samples"][0].describe()

In [None]:
# Summary statistics of the big_df rows in cluster 1 of the two-component Gaussian mixture fitted on the random projection.
r_p_EM_info["cluster_samples"][1].describe()

In [None]:
# Reduce the features to two latent factors (seeded) and cluster the result.
f_a = FactorAnalysis(n_components=2, random_state=3)
x_reduced_f_a = f_a.fit_transform(all_x)

f_a_k_means_info = k_means_helper(x_reduced=x_reduced_f_a, big_df=big_df)
plot_cluster_accuracy(f_a_k_means_info["ssds"], title="Titanic Factor Analysis k means")
plot_cluster_accuracy(
    f_a_k_means_info["n_iter"],
    title="Titanic FA KMC Time Training",
    print_default=False,
    y_label="Seconds",
)

f_a_EM_info = EM_helper(x_reduced=x_reduced_f_a, big_df=big_df)

In [None]:
# Summary statistics of the big_df rows in cluster 0 of the two-cluster K-Means model fitted on the factor-analysis projection.
f_a_k_means_info["cluster_samples"][0].describe()

In [None]:
# Summary statistics of the big_df rows in cluster 1 of the two-cluster K-Means model fitted on the factor-analysis projection.
f_a_k_means_info["cluster_samples"][1].describe()

In [None]:
# Summary statistics of the big_df rows in cluster 0 of the two-component Gaussian mixture fitted on the factor-analysis projection.
f_a_EM_info["cluster_samples"][0].describe()

In [None]:
# Summary statistics of the big_df rows in cluster 1 of the two-component Gaussian mixture fitted on the factor-analysis projection.
f_a_EM_info["cluster_samples"][1].describe()

In [None]:
# util functions

def eval_for_conclusion(model_id, clf, test_x, test_y):
    """Print a classification report, confusion matrix and headline metrics for ``clf``.

    Predictions are made once on ``test_x`` and compared with ``test_y``.
    Returns a dict with keys ``model``, ``recall``, ``accuracy`` and ``precision``.
    """
    predicted = clf.predict(test_x)
    print(classification_report(test_y, predicted))
    print(confusion_matrix(test_y, predicted))

    accuracy = metrics.accuracy_score(test_y, predicted)
    precision = metrics.precision_score(test_y, predicted)
    recall = metrics.recall_score(test_y, predicted)

    print("Final {0} model accuracy:".format(model_id), accuracy)
    print("Final {0} model precision:".format(model_id), precision)
    print("Final {0} model recall:".format(model_id), recall)

    summary = {"model": model_id}
    summary["recall"] = recall
    summary["accuracy"] = accuracy
    summary["precision"] = precision
    return summary

 
def split_test_train(train_size, all_data):
    """Randomly split ``all_data`` into train and test parts.

    Each row independently goes to the training set with probability
    ``train_size`` (one uniform draw per row), so the split sizes are only
    approximately proportional. Returns ``(train_x, train_y, test_x, test_y)``
    where the ``y`` parts are the ``pos_outcome_var`` column and the ``x`` parts
    are every other column.
    """
    in_train = np.random.rand(len(all_data)) < train_size

    def _features_and_labels(frame):
        return frame.drop(pos_outcome_var, axis=1), frame[pos_outcome_var]

    train_x, train_y = _features_and_labels(all_data[in_train])
    test_x, test_y = _features_and_labels(all_data[~in_train])
    return (train_x, train_y, test_x, test_y)

def cross_validate(all_data, model, label=None, cv=5):
    """Score ``model`` with k-fold cross-validation on ``all_data``.

    Parameters
    ----------
    all_data : DataFrame containing the ``pos_outcome_var`` column; the other
        columns are used as features.
    model : estimator passed to ``cross_val_score``.
    label : optional tag stored next to the score (for example the
        hyper-parameter value being evaluated).
    cv : number of folds.

    Returns
    -------
    list holding one ``(label, mean_score)`` tuple.
    """
    all_y = all_data[pos_outcome_var]
    all_x = all_data.drop(pos_outcome_var, axis=1)
    # Perform k-fold cross validation
    scores = cross_val_score(estimator=model, X=all_x, y=all_y, cv=cv, n_jobs=4)
    # The original appended `(i, scores.mean())`, but `i` is never defined in
    # this function: NameError, or a stale notebook global. The tag is now an
    # explicit argument.
    return [(label, scores.mean())]
    
def train_and_test(all_data, model):
    """Fit and score ``model`` on nine random splits of ``all_data`` (10% .. 90% train).

    Parameters
    ----------
    all_data : DataFrame containing the ``pos_outcome_var`` column, or a bare
        feature matrix. A bare matrix is wrapped in a DataFrame with columns
        ``component1..componentN`` and the labels are taken from the global
        ``big_df`` (assumes the rows are in the same order as ``big_df``).
    model : estimator exposing ``fit`` and ``score``; it is refit on every split.

    Returns
    -------
    tuple ``(test_scores, train_scores, times)``: accuracy percentages rounded
    to two decimals, and the wall-clock seconds of each ``fit`` call.
    """
    if not isinstance(all_data, pd.DataFrame):
        all_data = pd.DataFrame(np.asarray(all_data))
        all_data.columns = ["component{0}".format(c + 1) for c in range(all_data.shape[1])]
    if pos_outcome_var not in all_data.columns:
        # Positional attach: a length mismatch with big_df raises instead of
        # silently misaligning labels.
        all_data = all_data.assign(**{pos_outcome_var: big_df[pos_outcome_var].to_numpy()})

    test_scores = []
    train_scores = []
    times = []
    for i in range(1,10):
        # Split the data that was passed in. The original split the global
        # big_df here, so every caller evaluated the same unreduced features
        # whatever it supplied.
        (train_x, train_y, test_x, test_y) = split_test_train(0.1 * i, all_data)
        fit_started = time.time()
        #TODO iterations
        model.fit(train_x, train_y)
        times.append(time.time() - fit_started)
        test_scores.append(round(model.score(test_x, test_y) * 100, 2))
        train_scores.append(round(model.score(train_x, train_y) * 100, 2))
    return (test_scores, train_scores, times)

def plot_data(x_vars, x_label, all_y_vars, y_var_labels, y_label, title, y_bounds=None):
    """Plot one or more series against a shared x axis and show the figure.

    Parameters
    ----------
    x_vars : x values shared by every series.
    x_label, y_label, title : axis labels and figure title.
    all_y_vars : sequence of y series, one line each.
    y_var_labels : legend label for each series, in the same order.
    y_bounds : optional ``(low, high)`` passed to ``plt.ylim``.
    """
    plt.rcParams["figure.figsize"] = (4,3)
    colors = ['red','orange','black','green','blue','violet']
    for i, y_var in enumerate(all_y_vars):
        # Colours cycle when there are more series than palette entries.
        plt.plot(x_vars, y_var, 'o-', color=colors[i % len(colors)], label=y_var_labels[i])
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)
    # Identity test: `!= None` is evaluated element-wise on array-like bounds
    # and then fails in the `if`.
    if y_bounds is not None:
        plt.ylim(y_bounds)
    plt.legend()
    plt.show()

def evaluate_model(all_data, model, model_id):
    """Run the train/test sweep for ``model``, print the results and plot them.

    Two figures are produced via ``plot_data``: accuracy (test and train) and
    fit time, each against the percentage of data used for training
    (10 .. 90). Returns the ``(test_scores, train_scores, times)`` from
    ``train_and_test``.
    """
    test_scores, train_scores, times = train_and_test(all_data, model)

    print("{0} train timings (seconds): {1}".format(model_id, times))
    print("{0} test set scores: {1} ".format(model_id, test_scores))
    print("{0} train set scores: {1}".format(model_id, train_scores))

    split_axis_label = "Percentage of data in training set"

    accuracy_title = "{0} Accuracy Over Train/Test Split".format(model_id)
    plot_data(list(range(10, 100, 10)), split_axis_label, [test_scores, train_scores],
              ["test_scores", "train_scores"], "Accuracy", accuracy_title, (50,105))

    timing_title = "{0} Time Spent Training Over Train/Test Split".format(model_id)
    plot_data(list(range(10, 100, 10)), split_axis_label, [times],
              ["times"], "Train time in Seconds", timing_title)

    return (test_scores, train_scores, times)

def plot_grid_search(grid_results, plotting_func, title, x_label, y_label, grid_size, model_handles):
    """Plot every parameter combination of a fitted grid search.

    ``plotting_func(mean, params, plt, ax)`` is called once per combination in
    ``grid_results.cv_results_`` with its mean test score and parameter dict.
    ``grid_size`` sets the figure size; a legend is added only when
    ``model_handles`` is truthy.
    """
    cv_results = grid_results.cv_results_
    mean_scores = cv_results['mean_test_score']
    score_stds = cv_results['std_test_score']
    param_sets = cv_results['params']

    plt.rcParams["figure.figsize"] = grid_size
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)
    ax = plt.subplot()

    # The std is iterated alongside but not plotted.
    for mean_score, _score_std, param_set in zip(mean_scores, score_stds, param_sets):
        plotting_func(mean_score, param_set, plt, ax)

    if model_handles:
        plt.legend(handles=model_handles)
    plt.show()


#def grid_search(model, params, x_train, y_train, x_test, y_test):
    

#TODO come up with graphing function that takes in two arrays of test and train and plots them

In [None]:
# MLP with default hyper-parameters apart from the iteration cap.
# Earlier settings (now disabled): alpha=0.01, hidden_layer_sizes=(6, 3), random_state=1.
# Original note: (6, 3) worked well; other layer sizes performed poorly.
neural_net_classifier = MLPClassifier(max_iter=10000)
evaluate_model(
    all_data=x_reduced_pca,
    model=neural_net_classifier,
    model_id="Titanic NN PCA",
)

In [None]:
# MLP with default hyper-parameters apart from the iteration cap.
# Earlier settings (now disabled): alpha=0.01, hidden_layer_sizes=(6, 3), random_state=1.
# Original note: (6, 3) worked well; other layer sizes performed poorly.
neural_net_classifier = MLPClassifier(max_iter=10000)
evaluate_model(
    all_data=x_reduced_ica,
    model=neural_net_classifier,
    model_id="Titanic NN ICA",
)

In [None]:
# MLP with default hyper-parameters apart from the iteration cap.
# Earlier settings (now disabled): alpha=0.01, hidden_layer_sizes=(6, 3), random_state=1.
# Original note: (6, 3) worked well; other layer sizes performed poorly.
neural_net_classifier = MLPClassifier(max_iter=10000)
evaluate_model(
    all_data=x_reduced_r_p,
    model=neural_net_classifier,
    model_id="Titanic NN Randomized Projections",
)

In [None]:
# MLP with default hyper-parameters apart from the iteration cap.
# Earlier settings (now disabled): alpha=0.01, hidden_layer_sizes=(6, 3), random_state=1.
# Original note: (6, 3) worked well; other layer sizes performed poorly.
neural_net_classifier = MLPClassifier(max_iter=10000)
evaluate_model(
    all_data=x_reduced_f_a,
    model=neural_net_classifier,
    model_id="Titanic NN Factor Analysis",
)

In [None]:
# MLP with default hyper-parameters apart from the iteration cap.
# Earlier settings (now disabled): alpha=0.01, hidden_layer_sizes=(6, 3), random_state=1.
# Original note: (6, 3) worked well; other layer sizes performed poorly.
# Input: PCA components together with the EM ClusterId column.
neural_net_classifier = MLPClassifier(max_iter=10000)
evaluate_model(
    all_data=pca_EM_info["reduced_with_labels"],
    model=neural_net_classifier,
    model_id="Titanic NN PCA with labels",
)

In [None]:
# MLP with default hyper-parameters apart from the iteration cap.
# Earlier settings (now disabled): alpha=0.01, hidden_layer_sizes=(6, 3), random_state=1.
# Original note: (6, 3) worked well; other layer sizes performed poorly.
# Input: ICA components together with the EM ClusterId column.
neural_net_classifier = MLPClassifier(max_iter=10000)
evaluate_model(
    all_data=ica_EM_info["reduced_with_labels"],
    model=neural_net_classifier,
    model_id="Titanic NN ICA with labels",
)

In [None]:
# MLP with default hyper-parameters apart from the iteration cap.
# Earlier settings (now disabled): alpha=0.01, hidden_layer_sizes=(6, 3), random_state=1.
# Original note: (6, 3) worked well; other layer sizes performed poorly.
# Input: random-projection components together with the EM ClusterId column.
neural_net_classifier = MLPClassifier(max_iter=10000)
evaluate_model(
    all_data=r_p_EM_info["reduced_with_labels"],
    model=neural_net_classifier,
    model_id="Titanic NN Randomized Projections with labels",
)

In [None]:
# MLP with default hyper-parameters apart from the iteration cap.
# Earlier settings (now disabled): alpha=0.01, hidden_layer_sizes=(6, 3), random_state=1.
# Original note: (6, 3) worked well; other layer sizes performed poorly.
# Input: factor-analysis components together with the EM ClusterId column.
neural_net_classifier = MLPClassifier(max_iter=10000)
evaluate_model(
    all_data=f_a_EM_info["reduced_with_labels"],
    model=neural_net_classifier,
    model_id="Titanic NN Factor Analysis with labels",
)