In [1]:
import sys
import json
import numpy as np
import pandas as pd
from pandas import DataFrame

In [2]:
# Load the training records; the context manager closes the file handle
# as soon as parsing finishes instead of leaving it open.
with open('../data/train.json') as train_file:
    train_data = json.load(train_file)

In [9]:
# Load the test records; the context manager closes the file handle
# as soon as parsing finishes instead of leaving it open.
with open('../data/test.json') as test_file:
    test_data = json.load(test_file)

## Create CSV for `test` data

In [11]:
def create_X_y_test(data):
    """Flatten unlabelled records into one row per sample.

    Args:
        data: iterable of dicts providing the keys ``'id'``, ``'inc_angle'``,
            ``'band_1'`` and ``'band_2'`` (the two bands being iterables).

    Returns:
        ``np.array`` built from rows laid out as
        ``[id, inc_angle, *band_1, *band_2]``. No label column is added.
        NOTE(review): ids and numeric values share one array, so NumPy
        picks a single common dtype for everything - confirm that is intended.
    """
    rows = [
        [record['id'], record['inc_angle'], *record['band_1'], *record['band_2']]
        for record in data
    ]
    return np.array(rows)

def csv_test():
    """Write the flattened test set to ``test_data.csv``.

    Reads the module-level ``test_data`` list, flattens it with
    ``create_X_y_test`` and labels the columns ``id``, ``inc_angle``,
    ``b1_0..`` and ``b2_0..``. The band column counts are taken from the
    first record. The CSV is written without the DataFrame index.
    """
    table = create_X_y_test(test_data)
    first_record = test_data[0]
    header = ['id', 'inc_angle']
    header += ['b1_{}'.format(i) for i in range(len(first_record['band_1']))]
    header += ['b2_{}'.format(i) for i in range(len(first_record['band_2']))]

    frame = DataFrame(data=table, columns=header)
    frame.to_csv("test_data.csv", index=False)
    
# Run the export: writes test_data.csv relative to the current working directory.
csv_test()

### Create CSV for `train` data with `inc_angle` feature removed

In [3]:
def create_X_y(data):
    """Flatten labelled records into one row per sample.

    Args:
        data: iterable of dicts providing ``'id'``, ``'inc_angle'``,
            ``'band_1'``, ``'band_2'`` and ``'is_iceberg'``.

    Returns:
        ``np.array`` whose rows are
        ``[id, inc_angle, *band_1, *band_2, is_iceberg]``.
    """
    def _flatten(record):
        # Column order must match the header built by the CSV writers.
        row = [record['id'], record['inc_angle']]
        row += record['band_1']
        row += record['band_2']
        row.append(record['is_iceberg'])
        return row

    return np.array(list(map(_flatten, data)))

def csv_train_inc_feature_removed():
    """Write the training set without ``inc_angle`` to CSV.

    Flattens the module-level ``train_data`` with ``create_X_y``, drops the
    ``inc_angle`` column and saves the remainder (``id``, ``b1_*``, ``b2_*``,
    ``is_iceberg``) to ``data_processed_angle_removed.csv`` without the
    DataFrame index. Band column counts come from the first record.
    """
    table = create_X_y(train_data)
    first_record = train_data[0]
    header = (
        ['id', 'inc_angle']
        + ['b1_{}'.format(i) for i in range(len(first_record['band_1']))]
        + ['b2_{}'.format(i) for i in range(len(first_record['band_2']))]
        + ['is_iceberg']
    )

    frame = DataFrame(data=table, columns=header)
    frame = frame.drop('inc_angle', axis=1)
    frame.to_csv("data_processed_angle_removed.csv", index=False)
    
# Run the export: writes data_processed_angle_removed.csv relative to the current working directory.
csv_train_inc_feature_removed()

### Create CSV for `train` data removing any row with missing `inc_angle`

In [5]:
def create_X_y_ignore_missing_inc_rows(data):
    """Flatten labelled records, skipping those with a missing angle.

    A record is skipped when its ``'inc_angle'`` equals the string ``'na'``.

    Args:
        data: iterable of dicts providing ``'id'``, ``'inc_angle'``,
            ``'band_1'``, ``'band_2'`` and ``'is_iceberg'``.

    Returns:
        ``np.array`` whose rows are
        ``[id, inc_angle, *band_1, *band_2, is_iceberg]``.
    """
    kept = (record for record in data if not record['inc_angle'] == 'na')
    rows = [
        [rec['id'], rec['inc_angle'], *rec['band_1'], *rec['band_2'], rec['is_iceberg']]
        for rec in kept
    ]
    return np.array(rows)

def csv_train_inc_angle_missing_rows_removed():
    """Write the training set, minus rows lacking ``inc_angle``, to CSV.

    Flattens the module-level ``train_data`` with
    ``create_X_y_ignore_missing_inc_rows`` and saves the columns ``id``,
    ``inc_angle``, ``b1_*``, ``b2_*``, ``is_iceberg`` to
    ``data_processed_rows_eliminated.csv`` without the DataFrame index.
    Band column counts come from the first record of ``train_data``.
    """
    table = create_X_y_ignore_missing_inc_rows(train_data)
    n_band_1 = len(train_data[0]['band_1'])
    n_band_2 = len(train_data[0]['band_2'])

    header = ['id', 'inc_angle']
    header.extend('b1_' + str(i) for i in range(n_band_1))
    header.extend('b2_' + str(i) for i in range(n_band_2))
    header.append('is_iceberg')

    DataFrame(data=table, columns=header).to_csv(
        "data_processed_rows_eliminated.csv", index=False
    )
    
# Run the export: writes data_processed_rows_eliminated.csv relative to the current working directory.
csv_train_inc_angle_missing_rows_removed()

### Create CSV for `train` data by merging `band_1` and `band_2` and removing rows with missing `inc_angle`

In [8]:
import math
def create_X_y_merge_bands_ignore_missing_inc_rows(data):
    """Flatten labelled records with the two bands merged into one.

    Each merged value is ``log(exp(b1) + exp(b2))`` for the paired elements
    of ``band_1`` and ``band_2``. Records whose ``'inc_angle'`` equals the
    string ``'na'`` are skipped.

    The merge uses ``np.logaddexp``, which evaluates the same expression
    without forming the intermediate exponentials. The naive
    ``math.log(math.exp(a) + math.exp(b))`` raises ``OverflowError`` for
    large inputs and a math domain error when both exponentials underflow
    to zero.

    Args:
        data: iterable of dicts providing ``'id'``, ``'inc_angle'``,
            ``'band_1'``, ``'band_2'`` and ``'is_iceberg'``.

    Returns:
        ``np.array`` whose rows are ``[id, inc_angle, *merged, is_iceberg]``.
    """
    X = []
    for line in data:
        if line['inc_angle'] == 'na':
            continue
        band_1 = np.asarray(line['band_1'], dtype=float)
        band_2 = np.asarray(line['band_2'], dtype=float)
        # Pair elements up to the shorter band, as zip() would.
        n_paired = min(len(band_1), len(band_2))
        # .tolist() yields plain Python floats so the row holds the same
        # element types as the other create_X_y* helpers produce.
        band = np.logaddexp(band_1[:n_paired], band_2[:n_paired]).tolist()
        sub_list = [line['id']]
        sub_list.append(line['inc_angle'])
        sub_list.extend(band)
        sub_list.append(line['is_iceberg'])
        X.append(sub_list)
    return np.array(X)

def csv_train_bands_merged_inc_angle_missing_rows_removed():
    """Write the merged-band training set to CSV.

    Flattens the module-level ``train_data`` with
    ``create_X_y_merge_bands_ignore_missing_inc_rows`` and saves the columns
    ``id``, ``inc_angle``, ``b0..``, ``is_iceberg`` to
    ``data_processed_bands_combined.csv`` without the DataFrame index.
    The number of ``b*`` columns is the length of the first record's
    ``band_1``.
    """
    table = create_X_y_merge_bands_ignore_missing_inc_rows(train_data)
    n_pixels = len(train_data[0]['band_1'])
    merged_cols = ['b{}'.format(i) for i in range(n_pixels)]
    header = ['id', 'inc_angle'] + merged_cols + ['is_iceberg']

    frame = DataFrame(data=table, columns=header)
    frame.to_csv("data_processed_bands_combined.csv", index=False)
    
# Run the export: writes data_processed_bands_combined.csv relative to the current working directory.
csv_train_bands_merged_inc_angle_missing_rows_removed()

## Create PCA-reduced CSV for `train` data

In [17]:
from sklearn.decomposition import PCA

def read_train_pca_csv(infile):
    """Parse a labelled training CSV from an open text stream.

    The first line is consumed as a header and discarded. Every following
    line is split on commas into: id (column 0, kept as a string), angle
    (column 1, float), features (columns 2 up to but excluding the last,
    floats) and label (last column, int).

    Args:
        infile: open text file object positioned at the header line.

    Returns:
        Tuple ``(ids, angles, features, labels)`` of ``np.array`` objects.
    """
    infile.readline()  # header row is not needed

    parsed = []
    for raw in infile:
        fields = raw.rstrip().split(',')
        parsed.append((
            fields[0],
            float(fields[1]),
            [float(value) for value in fields[2:-1]],
            int(fields[-1]),
        ))

    ids = np.array([row[0] for row in parsed])
    angles = np.array([row[1] for row in parsed])
    features = np.array([row[2] for row in parsed])
    labels = np.array([row[3] for row in parsed])
    return ids, angles, features, labels

def read_test(infile):
    """Parse an unlabelled test CSV from an open text stream.

    The first line is consumed as a header and discarded. Every following
    line is split on commas into: id (column 0, kept as a string), angle
    (column 1, float) and features (all remaining columns, floats).

    Args:
        infile: open text file object positioned at the header line.

    Returns:
        Tuple ``(ids, angles, features)`` of ``np.array`` objects.
    """
    infile.readline()  # header row is not needed

    ids, angles, features = [], [], []
    for fields in (raw.rstrip().split(',') for raw in infile):
        sample_id, angle_text, pixel_texts = fields[0], fields[1], fields[2:]
        ids.append(sample_id)
        angles.append(float(angle_text))
        features.append(list(map(float, pixel_texts)))
    return np.array(ids), np.array(angles), np.array(features)

def fit_pca(infile,outfile,num_basis):
    """Fit PCA on a labelled training CSV and save the projected data.

    The input is parsed with ``read_train_pca_csv`` (id, angle, features,
    label). PCA is fitted on the feature columns only; the id, angle and
    label columns are carried through unchanged.

    Args:
        infile: path of the CSV to read (header row, id first, label last).
        outfile: output path prefix; the file written is
            ``<outfile>_<num_basis>.csv``.
        num_basis: passed to ``PCA`` as its first argument (the number of
            components to keep).

    Returns:
        None. The result is written to disk with columns ``id``,
        ``inc_angle``, ``b0..``, ``is_iceberg`` and no DataFrame index.
    """
    # Read-only access is enough; the context manager closes the handle
    # (the file used to be opened 'r+' and was never closed).
    with open(infile, 'r') as f:
        train_id, train_angle, X_train, y_train = read_train_pca_csv(f)

    # Turn the 1-D vectors into column vectors so they can be hstacked
    # next to the 2-D feature matrix.
    train_id = train_id.reshape((train_id.shape[0],1))
    train_angle = train_angle.reshape((train_angle.shape[0],1))
    y_train = y_train.reshape((y_train.shape[0],1))

    # Reduce the feature space to num_basis principal components.
    pca = PCA(num_basis)
    pca.fit(X_train)
    X_train = pca.transform(X_train)

    band = ['b' + str(i) for i in range(X_train.shape[1])]

    # Dataset with angle (angle + PCA components).
    data_w_angle = np.hstack((train_id, train_angle, X_train, y_train))
    column_names = ['id'] + ['inc_angle'] + band + ['is_iceberg']
    df_w = pd.DataFrame(data_w_angle, columns=column_names)
    filename = outfile + '_' + str(num_basis) + ".csv"
    df_w.to_csv(filename, index=False)

    # TODO: projecting the test set (read_test + pca.transform with this
    # fitted model) and writing a variant without the angle column are not
    # implemented here.

# Reduce data_processed_rows_eliminated.csv to 50 PCA components; writes pca_data_processed_50.csv.
fit_pca('data_processed_rows_eliminated.csv','pca_data_processed',50)

# MODELS — k-NN, NN, SVM, RF

In [24]:
import sklearn.svm
import sklearn.ensemble
import sklearn.neighbors
import sklearn.neural_network
import sklearn.model_selection
import sklearn.metrics

def support_vector_machine(X_train, y_train, X_test, C=None, kernel=None, degree=None, gamma=None):
    """Fit an ``sklearn.svm.SVC`` and predict labels for ``X_test``.

    Any tuning argument left as ``None`` falls back to the value below.

    Args:
        X_train: training features.
        y_train: training labels.
        X_test: features to predict.
        C: ``C`` argument of ``SVC``; ``None`` -> ``1.0``.
        kernel: kernel name; ``None`` -> ``"rbf"``.
        degree: ``degree`` argument of ``SVC``; ``None`` -> ``3``.
        gamma: ``gamma`` argument of ``SVC``; ``None`` -> ``'auto'``.

    Returns:
        Whatever ``SVC.predict(X_test)`` returns.
    """
    # Identity checks: `== None` is element-wise (or ambiguous) for NumPy values.
    if C is None:
        C = 1.0
    if kernel is None:
        kernel = "rbf"
    if degree is None:
        degree = 3
    if gamma is None:
        gamma = 'auto'
    clf = sklearn.svm.SVC(C=C, kernel=kernel, degree=degree, gamma=gamma)
    clf.fit(X_train, y_train)
    return clf.predict(X_test)

def random_forest(X_train, y_train, X_test, n=None, criterion=None, minss=None):
    """Fit a ``RandomForestClassifier`` and predict labels for ``X_test``.

    Any tuning argument left as ``None`` falls back to the value below.

    Args:
        X_train: training features.
        y_train: training labels.
        X_test: features to predict.
        n: ``n_estimators``; ``None`` -> ``10``.
        criterion: split criterion; ``None`` -> ``"gini"``.
        minss: ``min_samples_split``; ``None`` -> ``2``.

    Returns:
        Whatever ``RandomForestClassifier.predict(X_test)`` returns.
    """
    # Identity checks: callers pass NumPy integers here, and `== None`
    # on NumPy values is an element-wise comparison rather than a None test.
    if n is None:
        n = 10
    if criterion is None:
        criterion = "gini"
    if minss is None:
        minss = 2
    clf = sklearn.ensemble.RandomForestClassifier(n_estimators=n, criterion=criterion, min_samples_split=minss)
    clf.fit(X_train, y_train)
    return clf.predict(X_test)

def k_nearest_neighbor(X_train, y_train, X_test, n=None, weights=None):
    """Fit a ``KNeighborsClassifier`` and predict labels for ``X_test``.

    Any tuning argument left as ``None`` falls back to the value below.

    Args:
        X_train: training features.
        y_train: training labels.
        X_test: features to predict.
        n: ``n_neighbors``; ``None`` -> ``5``.
        weights: neighbour weighting; ``None`` -> ``"uniform"``.

    Returns:
        Whatever ``KNeighborsClassifier.predict(X_test)`` returns.
    """
    # Identity checks: `== None` is element-wise (or ambiguous) for NumPy values.
    if n is None:
        n = 5
    if weights is None:
        weights = "uniform"
    neigh = sklearn.neighbors.KNeighborsClassifier(n_neighbors=n, weights=weights)
    neigh.fit(X_train, y_train)
    return neigh.predict(X_test)
    
def neural_network(X_train, y_train, X_test, hls=None, activation=None, solver=None, alpha=None):
    """Fit an ``MLPClassifier`` and predict labels for ``X_test``.

    Any tuning argument left as ``None`` falls back to the value below.

    Args:
        X_train: training features.
        y_train: training labels.
        X_test: features to predict.
        hls: ``hidden_layer_sizes``; ``None`` -> ``(100,)``.
        activation: activation name; ``None`` -> ``"relu"``.
        solver: solver name; ``None`` -> ``"adam"``.
        alpha: ``alpha`` argument of ``MLPClassifier``; ``None`` -> ``0.0001``.

    Returns:
        Whatever ``MLPClassifier.predict(X_test)`` returns.
    """
    # Identity checks: with `== None`, a NumPy array passed as `hls` is
    # compared element-wise and the `if` raises "truth value ... is ambiguous".
    if hls is None:
        hls = (100,)
    if activation is None:
        activation = "relu"
    if solver is None:
        solver = "adam"
    if alpha is None:
        alpha = 0.0001
    clf = sklearn.neural_network.MLPClassifier(hidden_layer_sizes=hls, activation=activation, solver=solver, alpha=alpha)
    clf.fit(X_train, y_train)
    return clf.predict(X_test)

## K-Fold CV

In [25]:
def prepare_train_test_using_kfold(k, X, y, random_state=None):
    """Split ``X`` and ``y`` into ``k`` shuffled train/test folds.

    Args:
        k: number of folds (``n_splits`` of ``KFold``).
        X: feature array; must support indexing with an index array.
        y: label array; must support indexing with an index array.
        random_state: optional seed forwarded to ``KFold`` so the shuffled
            folds can be reproduced. ``None`` (the default) leaves the
            shuffle unseeded, which is the previous behaviour.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)`` of lists with one
        entry per fold, in the order ``KFold.split`` yields them.
    """
    kf = sklearn.model_selection.KFold(n_splits=k, shuffle=True, random_state=random_state)
    # Echo the splitter configuration into the cell output.
    print(kf)
    X_train, y_train, X_test, y_test = [], [], [], []
    for train_index, test_index in kf.split(X):
        X_train.append(X[train_index])
        y_train.append(y[train_index])
        X_test.append(X[test_index])
        y_test.append(y[test_index])
    return X_train, y_train, X_test, y_test

## k-NN

In [26]:
def read_knn_csv(infile):
    """Parse a labelled CSV into a feature matrix and a label vector.

    The first line is consumed as a header and discarded. For every
    following line, column 0 (the id) is ignored, the last column is the
    integer label and everything in between is converted to float.

    Args:
        infile: open text file object positioned at the header line.

    Returns:
        Tuple ``(X, y)`` of ``np.array`` objects.
    """
    infile.readline()  # header row is not needed

    features, labels = [], []
    for raw in infile:
        fields = raw.rstrip().split(',')
        features.append(list(map(float, fields[1:-1])))
        labels.append(int(fields[-1]))
    return np.array(features), np.array(labels)

def perform_knn(infile, k_fold):
    """Grid-search k-NN with K-fold cross-validation and print accuracies.

    For each weighting in ("uniform", "distance") and each odd neighbour
    count from 1 to 17, the classifier is trained and scored on every fold
    and the mean accuracy is printed as one line.

    Args:
        infile: path of a CSV readable by ``read_knn_csv`` (header row,
            id first, label last).
        k_fold: number of cross-validation folds.

    Returns:
        None; results are printed.
    """
    # Read-only access is enough; the context manager closes the handle
    # (the file used to be opened 'r+' and was never closed).
    with open(infile, 'r') as f:
        X, y = read_knn_csv(f)

    # Get trainsets and testsets using K-Fold
    X_train, y_train, X_test, y_test = prepare_train_test_using_kfold(k_fold, X, y)

    max_n = 19  # exclusive upper bound: the largest n tried is 17
    weights_tuple = ("uniform", "distance")

    print('-'*60)
    print("k nearest algorithm")

    for weights in weights_tuple:
        for n in range(1, max_n, 2):
            accuracy_list = np.zeros(k_fold)
            for i in range(k_fold):
                pred = k_nearest_neighbor(X_train[i], y_train[i], X_test[i], n, weights)
                accuracy_list[i] = sklearn.metrics.accuracy_score(y_test[i], pred)
            print("{0:.3f},n={1},weights={2},kNN".format(accuracy_list.mean(),n,weights))

In [27]:
# 5-fold k-NN sweep on the CSV that has no inc_angle column.
perform_knn("data_processed_angle_removed.csv", 5)

KFold(n_splits=5, random_state=None, shuffle=True)
------------------------------------------------------------
k nearest algorithm
0.733,n=1,weights=uniform,kNN
0.747,n=3,weights=uniform,kNN
0.746,n=5,weights=uniform,kNN
0.745,n=7,weights=uniform,kNN
0.739,n=9,weights=uniform,kNN
0.735,n=11,weights=uniform,kNN
0.728,n=13,weights=uniform,kNN
0.724,n=15,weights=uniform,kNN
0.724,n=17,weights=uniform,kNN
0.733,n=1,weights=distance,kNN
0.747,n=3,weights=distance,kNN
0.746,n=5,weights=distance,kNN
0.745,n=7,weights=distance,kNN
0.739,n=9,weights=distance,kNN
0.735,n=11,weights=distance,kNN
0.728,n=13,weights=distance,kNN
0.724,n=15,weights=distance,kNN
0.724,n=17,weights=distance,kNN


In [28]:
# 5-fold k-NN sweep on the CSV without 'na'-angle rows; read_knn_csv keeps
# every column between the id and the label, so inc_angle is used as a feature here.
perform_knn("data_processed_rows_eliminated.csv", 5)

KFold(n_splits=5, random_state=None, shuffle=True)
------------------------------------------------------------
k nearest algorithm
0.749,n=1,weights=uniform,kNN
0.758,n=3,weights=uniform,kNN
0.751,n=5,weights=uniform,kNN
0.745,n=7,weights=uniform,kNN
0.750,n=9,weights=uniform,kNN
0.742,n=11,weights=uniform,kNN
0.731,n=13,weights=uniform,kNN
0.727,n=15,weights=uniform,kNN
0.727,n=17,weights=uniform,kNN
0.749,n=1,weights=distance,kNN
0.758,n=3,weights=distance,kNN
0.751,n=5,weights=distance,kNN
0.745,n=7,weights=distance,kNN
0.750,n=9,weights=distance,kNN
0.742,n=11,weights=distance,kNN
0.731,n=13,weights=distance,kNN
0.727,n=15,weights=distance,kNN
0.727,n=17,weights=distance,kNN


In [29]:
# 5-fold k-NN sweep on the merged-band CSV; read_knn_csv keeps every column
# between the id and the label, so inc_angle is used as a feature here.
perform_knn("data_processed_bands_combined.csv", 5)

KFold(n_splits=5, random_state=None, shuffle=True)
------------------------------------------------------------
k nearest algorithm
0.701,n=1,weights=uniform,kNN
0.707,n=3,weights=uniform,kNN
0.697,n=5,weights=uniform,kNN
0.703,n=7,weights=uniform,kNN
0.684,n=9,weights=uniform,kNN
0.689,n=11,weights=uniform,kNN
0.686,n=13,weights=uniform,kNN
0.684,n=15,weights=uniform,kNN
0.683,n=17,weights=uniform,kNN
0.701,n=1,weights=distance,kNN
0.707,n=3,weights=distance,kNN
0.697,n=5,weights=distance,kNN
0.703,n=7,weights=distance,kNN
0.684,n=9,weights=distance,kNN
0.689,n=11,weights=distance,kNN
0.686,n=13,weights=distance,kNN
0.684,n=15,weights=distance,kNN
0.683,n=17,weights=distance,kNN


## Random Forest

In [32]:
def read_rf_csv(infile):
    """Parse a labelled CSV into a feature matrix and a label vector.

    The first line is consumed as a header and discarded. For every
    following line, column 0 (the id) is ignored, the last column is the
    integer label and everything in between is converted to float.

    Args:
        infile: open text file object positioned at the header line.

    Returns:
        Tuple ``(X, y)`` of ``np.array`` objects.
    """
    infile.readline()  # header row is not needed

    samples = []
    for raw in infile:
        cells = raw.rstrip().split(',')
        samples.append(([float(cell) for cell in cells[1:-1]], int(cells[-1])))

    feature_rows = [features for features, _ in samples]
    label_column = [label for _, label in samples]
    return np.array(feature_rows), np.array(label_column)

def perform_random_forest(infile, k_fold):
    """Grid-search a random forest with K-fold CV and print accuracies.

    For each criterion in ("gini", "entropy"), each tree count in
    4, 8, ..., 4096 and each ``min_samples_split`` in 2, 4, ..., 32, the
    classifier is trained and scored on every fold and the mean accuracy is
    printed as one line.

    Args:
        infile: path of a CSV readable by ``read_rf_csv`` (header row,
            id first, label last).
        k_fold: number of cross-validation folds.

    Returns:
        None; results are printed.
    """
    # Read-only access is enough; the context manager closes the handle
    # (the file used to be opened 'r+' and was never closed).
    with open(infile, 'r') as f:
        X, y = read_rf_csv(f)

    # Get trainsets and testsets using K-Fold
    X_train, y_train, X_test, y_test = prepare_train_test_using_kfold(k_fold, X, y)

    # number of trees: 4, 8, ... , 4096
    # minss: 2, 4, ... , 32
    criterion_tuple = ("gini", "entropy")
    n_list = np.power(2, np.arange(2, 13))
    minss_list = np.power(2, np.arange(1, 6))

    # Random Forest
    print('-'*60)
    print("\nRandom Forest")
    for criterion in criterion_tuple:
        for n in n_list:
            for minss in minss_list:
                accuracy_list = np.zeros(k_fold)
                for i in range(k_fold):
                    pred = random_forest(X_train[i], y_train[i], X_test[i], n, criterion, minss)
                    accuracy_list[i] = sklearn.metrics.accuracy_score(y_test[i], pred)
                print("{0:.3f},criterion={1},n={2},minss={3},RandomForest".format(accuracy_list.mean(),criterion,n,minss))

In [33]:
# 5-fold random-forest sweep on the CSV that has no inc_angle column.
# The grid goes up to 4096 trees per forest, so a full run is slow.
perform_random_forest('data_processed_angle_removed.csv', 5)

KFold(n_splits=5, random_state=None, shuffle=True)
------------------------------------------------------------

Random Forest
0.652,criterion=gini,n=4,minss=2,RandomForest
0.628,criterion=gini,n=4,minss=4,RandomForest
0.671,criterion=gini,n=4,minss=8,RandomForest
0.650,criterion=gini,n=4,minss=16,RandomForest
0.668,criterion=gini,n=4,minss=32,RandomForest
0.683,criterion=gini,n=8,minss=2,RandomForest
0.680,criterion=gini,n=8,minss=4,RandomForest
0.700,criterion=gini,n=8,minss=8,RandomForest
0.676,criterion=gini,n=8,minss=16,RandomForest
0.695,criterion=gini,n=8,minss=32,RandomForest
0.713,criterion=gini,n=16,minss=2,RandomForest
0.709,criterion=gini,n=16,minss=4,RandomForest
0.708,criterion=gini,n=16,minss=8,RandomForest
0.724,criterion=gini,n=16,minss=16,RandomForest
0.718,criterion=gini,n=16,minss=32,RandomForest
0.721,criterion=gini,n=32,minss=2,RandomForest
0.733,criterion=gini,n=32,minss=4,RandomForest
0.724,criterion=gini,n=32,minss=8,RandomForest
0.721,criterion=gini,n=32,minss

KeyboardInterrupt: 

0.652,
0.628,
0.671,
0.650,
0.668,
0.683,
0.680,
0.700,
0.676,
0.695,
0.713,
0.709,
0.708,
0.724,
0.718,
0.721,
0.733,
0.724,
0.721,
0.725,
0.738,
0.744,
0.731,
0.734,criterion=gini,n=64,minss=16,RandomForest
0.718,criterion=gini,n=64,minss=32,RandomForest
0.743,criterion=gini,n=128,minss=2,RandomForest
0.734,criterion=gini,n=128,minss=4,RandomForest
0.747,criterion=gini,n=128,minss=8,RandomForest
0.732,criterion=gini,n=128,minss=16,RandomForest
0.740,criterion=gini,n=128,minss=32,RandomForest
0.737,criterion=gini,n=256,minss=2,RandomForest
0.740,criterion=gini,n=256,minss=4,RandomForest
0.743,criterion=gini,n=256,minss=8,RandomForest
0.735,criterion=gini,n=256,minss=16,RandomForest
0.738,criterion=gini,n=256,minss=32,RandomForest
0.743,criterion=gini,n=512,minss=2,RandomForest
0.746,criterion=gini,n=512,minss=4,RandomForest
0.749,criterion=gini,n=512,minss=8,RandomForest
0.743,criterion=gini,n=512,minss=16,RandomForest
0.739,criterion=gini,n=512,minss=32,RandomForest
0.747,criterion=gini,n=1024,minss=2,RandomForest
0.749,criterion=gini,n=1024,minss=4,RandomForest
0.749,criterion=gini,n=1024,minss=8,RandomForest
0.744,criterion=gini,n=1024,minss=16,RandomForest
0.738,criterion=gini,n=1024,minss=32,RandomFore