In [1]:
#
# SAMPLE RANDOMLY AND SPLIT TRAINING DATA
#
import random
import csv

def split_file(fname, N):
    """Randomly distribute the booking rows of a CSV file over N smaller CSV files.

    The output files are named ``<fname>.<i>.csv`` for ``i`` in ``1..N``.  Each
    one receives the header row of ``fname``; every input row whose
    ``is_booking`` column equals ``"1"`` is then written to one output file
    picked with ``random.choice``.  All other rows are dropped.

    Parameters:
        fname: path of the CSV file to split (must have a header row).
        N: number of output files to create (at least 1).

    Raises:
        ValueError: if ``N`` is smaller than 1 or ``fname`` has no header row.
        KeyError: if the header has no ``is_booking`` column.
    """
    if N < 1:
        raise ValueError("N must be at least 1, got %r" % (N,))

    # Keep the raw file handles: csv.writer objects cannot be closed themselves.
    handles = []
    try:
        with open(fname) as f:
            reader = csv.reader(f)
            header = next(reader, None)
            if header is None:
                raise ValueError("%s is empty: no header row" % fname)
            index = dict((key, position) for position, key in enumerate(header))
            booking_col = index["is_booking"]

            # Open the outputs only once the input is known to be usable, and
            # one at a time so a failure part-way still closes the earlier ones.
            for i in range(1, N + 1):
                handles.append(open(fname + ".%d.csv" % i, "w"))
            writers = [csv.writer(handle) for handle in handles]

            for writer in writers:
                writer.writerow(header)
            for row in reader:
                # Blank or truncated lines have no is_booking field; skip them.
                if len(row) > booking_col and row[booking_col] == "1":
                    random.choice(writers).writerow(row)
    finally:
        for handle in handles:
            handle.close()

In [2]:
import csv
import ml_metrics as metrics
import pickle
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.grid_search import GridSearchCV
from datetime import datetime

# Helper functions
DATE_FORMAT = "%Y-%m-%d"

def get_trip_duration(checkin, checkout):
    """Return the number of days between two DATE_FORMAT date strings.

    The result is ``checkout - checkin`` in whole days, so it is negative when
    checkout precedes checkin.  If either string is empty, 0 is returned
    without parsing anything.
    """
    # Guard clause: a missing date on either side means "no duration".
    if len(checkin) == 0 or len(checkout) == 0:
        return 0
    start = datetime.strptime(checkin, DATE_FORMAT)
    end = datetime.strptime(checkout, DATE_FORMAT)
    return (end - start).days
    
# a scoring function
def score(actual, predicted, k):
    """Score ranked predictions by delegating to ml_metrics' ``mapk``.

    ``actual``, ``predicted`` and ``k`` are passed through unchanged, and the
    value returned by ``metrics.mapk`` is handed back to the caller.
    """
    result = metrics.mapk(actual, predicted, k)
    return result

# TODO: Merge the subsampling with the read/parse step
def subsample(X, y, value_list, max_per_label=5000):
    """Down-sample (X, y) so that no label occurs more than ``max_per_label`` times.

    Rows are scanned in order and kept until their label has reached the cap;
    scanning stops early once every label in ``value_list`` is full.

    Parameters:
        X: sequence of feature rows.
        y: sequence of labels, parallel to ``X``.
        value_list: iterable of the labels to keep.  It is only read, never
            modified.  Rows whose label is not listed are skipped.
        max_per_label: maximum number of rows kept per label (default 5000).

    Returns:
        A ``(Xsub, ysub)`` pair of lists holding the kept rows and their labels.
    """
    counts = dict((label, 0) for label in value_list)
    # Number of labels that can still accept rows; nothing can if the cap is <= 0.
    open_labels = len(counts) if max_per_label > 0 else 0

    Xsub, ysub = [], []
    for row, label in zip(X, y):
        if open_labels == 0:
            break  # every requested label is already at its cap
        seen = counts.get(label)
        if seen is None or seen >= max_per_label:
            continue  # label not requested, or already full
        Xsub.append(row)
        ysub.append(label)
        counts[label] = seen + 1
        if seen + 1 == max_per_label:
            open_labels -= 1
    return Xsub, ysub

In [5]:
#
# read in and parse the data
#
# X collects one integer feature row per booking, y the matching hotel_cluster label.
X, y = [], []
with open("./data/train.csv") as f:
    reader = csv.reader(f)
    header = next(reader)

    # column name -> position of that column in every CSV row
    index = {column: position for position, column in enumerate(header)}

    # Turn each booking record into a feature row; the categorical columns
    # come first in the row.
    for row in reader:
        if row[index["is_booking"]] != "1":
            continue  # only rows flagged as bookings are used
        y.append(int(row[index["hotel_cluster"]]))
        xrow = [int(row[index[column]])
                for column in ("srch_destination_id",
                               "hotel_market",
                               "hotel_country",
                               "channel",
                               "is_package")]
        # party size = adults + children
        xrow.append(int(row[index["srch_adults_cnt"]]) + int(row[index["srch_children_cnt"]]))
        xrow.append(int(row[index["srch_rm_cnt"]]))
        X.append(xrow)
    NUMBER_CATEGORICAL = 7

# Report the dimensions of the parsed feature matrix.
print("Size of raw data: %d x %d" % (len(X),len(X[0])))

# downsample: subsample() caps the number of rows kept for each label
X, y = subsample(X, y, set(y))
print("Size of downsampled data: %d x %d" % (len(X),len(X[0])))

# one-hot encoder breaks categorical variables into binary features for each value
# NOTE(review): only columns 0-5 are encoded here, while NUMBER_CATEGORICAL is 7
# and is not used -- confirm which columns are meant to be categorical.
encoder = OneHotEncoder(categorical_features=list(range(6)))
X_onehot = encoder.fit_transform(X)
print("Size of encoded data: %d x %d" % X_onehot.shape)

# Split the data into train and test.
X_train, X_test, y_train, y_test = train_test_split(X_onehot, y, test_size=0.25)
print("Size of training data: %d x %d" % X_train.shape)
# Report the test split itself (this line previously printed X_train.shape again).
print("Size of test data: %d x %d" % X_test.shape)

Size of raw data: 3000693 x 7
Size of downsampled data: 495148 x 7
Size of encoded data: 495148 x 23554


In [15]:
# Fit the SGD model.
# SGD has a tunable parameter called "alpha" which affects how the model performs,
# so we use cross-validation to find the best value of "alpha" (and of the penalty).
#
alphas = [10.0 ** -exponent for exponent in range(1, 7)]
parameters = dict(penalty=("l1", "l2"), alpha=alphas)
sgd_model = SGDClassifier(loss="log", penalty="l1", n_jobs=-1)

# Grid search over every (penalty, alpha) combination with 4-fold cross-validation.
best_sgd = GridSearchCV(sgd_model, parameters, cv=4)
best_sgd.fit(X_train, y_train)

# Print the cross-validation summary of each parameter combination.
for report in best_sgd.grid_scores_:
    print(report)

# Test performance on the test data for k = 1..MAX_K.
#
# predict_proba() orders its columns like the fitted estimator's classes_, so
# columns are ranked by position and then mapped to labels through classes_.
# Indexing a probability row directly by label value is only correct when the
# labels happen to be exactly 0..n-1.
MAX_K = 5
y_values = sorted(set(y))  # kept for any later cell that reads it
class_labels = best_sgd.best_estimator_.classes_

# Predict once and keep only the MAX_K most probable labels per test row;
# the stable sort breaks ties in ascending class order.
top_predictions = [
    [class_labels[col]
     for col in sorted(range(len(class_labels)), key=lambda col: row[col], reverse=True)[:MAX_K]]
    for row in best_sgd.predict_proba(X_test)
]
actual = [[label] for label in y_test]

for k in range(1, MAX_K + 1):
    y_predicted = [ranked[:k] for ranked in top_predictions]
    print("SCORE SGD MODEL with k=%d : %s" % (k, score(actual, y_predicted, k)))

mean: 0.01009, std: 0.00010, params: {'alpha': 0.1, 'penalty': 'l1'}
mean: 0.08999, std: 0.00086, params: {'alpha': 0.1, 'penalty': 'l2'}
mean: 0.01913, std: 0.00203, params: {'alpha': 0.01, 'penalty': 'l1'}
mean: 0.10038, std: 0.00078, params: {'alpha': 0.01, 'penalty': 'l2'}
mean: 0.07934, std: 0.00155, params: {'alpha': 0.001, 'penalty': 'l1'}
mean: 0.12760, std: 0.00109, params: {'alpha': 0.001, 'penalty': 'l2'}
mean: 0.13835, std: 0.00113, params: {'alpha': 0.0001, 'penalty': 'l1'}
mean: 0.16857, std: 0.00015, params: {'alpha': 0.0001, 'penalty': 'l2'}
mean: 0.18379, std: 0.00101, params: {'alpha': 1e-05, 'penalty': 'l1'}
mean: 0.18425, std: 0.00171, params: {'alpha': 1e-05, 'penalty': 'l2'}
mean: 0.16026, std: 0.00454, params: {'alpha': 1e-06, 'penalty': 'l1'}
mean: 0.14691, std: 0.00561, params: {'alpha': 1e-06, 'penalty': 'l2'}
SCORE SGD MODEL with k=1 : 0.187628749384
SCORE SGD MODEL with k=2 : 0.244751064328
SCORE SGD MODEL with k=3 : 0.273803657358
SCORE SGD MODEL with k=4 :