In [41]:
import csv
import ml_metrics as metrics
import pickle
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import OneHotEncoder
from datetime import datetime

DATE_FORMAT = "%Y-%m-%d"

def get_trip_duration(checkin, checkout, date_format=DATE_FORMAT):
    """Return the number of days between a check-in and a check-out date.

    Args:
        checkin: check-in date as a string in ``date_format``; may be empty
            or None when the date is missing.
        checkout: check-out date as a string in ``date_format``; may be empty
            or None when the date is missing.
        date_format: ``strptime`` pattern used to parse both dates.

    Returns:
        ``checkout - checkin`` in days (negative when the check-out precedes
        the check-in), or 0 when either date is missing.

    Raises:
        ValueError: if a non-empty date does not match ``date_format``.
    """
    # Truthiness covers both "" and None; len() would raise TypeError on None.
    if not checkin or not checkout:
        return 0
    start = datetime.strptime(checkin, date_format)
    end = datetime.strptime(checkout, date_format)
    return (end - start).days

In [2]:
# a scoring function
def score(actual, predicted, k):
    """Score predictions by delegating to ``metrics.mapk``.

    ``actual``, ``predicted`` and ``k`` are forwarded unchanged, and whatever
    ``mapk`` returns is handed straight back to the caller.
    """
    map_at_k = metrics.mapk(actual, predicted, k)
    return map_at_k

In [4]:
# read in and parse the data
# NOTE(review): absolute, machine-specific path -- point this at your own copy of the data.
DATA_DIR = "/home/ryan/projects/expedia2016/data"

# Integer columns copied verbatim into each feature row, in this order.  The
# first four are the ones the OneHotEncoder cell addresses by position
# (columns 0-3), so do not reorder them.
INT_FEATURE_COLUMNS = [
    "srch_destination_type_id",
    "site_name",
    "hotel_market",
    "channel",
    "is_mobile",
    "is_package",
    "srch_adults_cnt",
    "srch_children_cnt",
    "srch_rm_cnt",
]

X, y = [], []
with open(DATA_DIR + "/train.csv") as f:
    reader = csv.reader(f)
    header = next(reader)
    index = dict((key, position) for position, key in enumerate(header))

    # Resolve header names to column positions once instead of once per row.
    int_positions = [index[name] for name in INT_FEATURE_COLUMNS]
    label_pos = index["hotel_cluster"]
    checkin_pos, checkout_pos = index["srch_ci"], index["srch_co"]
    cnt_pos = index["cnt"]

    for row in reader:
        # Every row is kept.  A previous `row[-1] != 0` guard compared a str
        # field with an int, so it was always true and never filtered anything.
        y.append(int(row[label_pos]))
        xrow = [int(row[pos]) for pos in int_positions]
        xrow.append(get_trip_duration(row[checkin_pos], row[checkout_pos]))
        xrow.append(int(row[cnt_pos]))
        X.append(xrow)

# Guard the column count so a header-only file reports "0 x 0" instead of raising IndexError.
print("Size of raw data: %d x %d" % (len(X), len(X[0]) if X else 0))

# this stuff is a pain to load in; let's pickle it so we can pull it back more quickly next time!
# HIGHEST_PROTOCOL selects the binary pickle format, which is smaller and faster
# to write/read than the default protocol on older Python versions.
with open(DATA_DIR + "/X.pickle", "wb") as f:
    pickle.dump(X, f, pickle.HIGHEST_PROTOCOL)
with open(DATA_DIR + "/Y.pickle", "wb") as f:
    pickle.dump(y, f, pickle.HIGHEST_PROTOCOL)

Size of raw data: 37670293 x 11


In [30]:
# the data is just too big to do much with.  We need to subsample and try to get equal numbers of each class represented.
# TODO: Merge the subsampling with the read/parse step
def subsample(X, y, value_list, max_per_class=5000):
    """Take the first ``max_per_class`` rows of each requested class, in order.

    Rows are scanned from the start of ``X``; scanning stops early once every
    class in ``value_list`` has reached the cap.

    Args:
        X: sequence of feature rows.
        y: sequence of labels, aligned with ``X`` (``y[i]`` labels ``X[i]``).
        value_list: iterable of the labels to keep.  It is copied, so the
            caller's collection is left unmodified.
        max_per_class: maximum number of rows kept per label.

    Returns:
        ``(Xsub, ysub)``: the selected rows and their labels, in input order.
        Rows whose label is not in ``value_list`` are skipped.
    """
    # Work on a private copy: removing labels from the caller's set would be a
    # surprising side effect.
    unfilled = set(value_list)
    counts = dict((label, 0) for label in unfilled)
    Xsub, ysub = [], []

    for i, row in enumerate(X):
        if not unfilled:
            # Every requested class is full; no need to scan the rest.
            break
        label = y[i]
        if label not in counts:
            # Label was not requested -- skip it rather than raise KeyError.
            continue
        if counts[label] < max_per_class:
            Xsub.append(row)
            ysub.append(label)
            counts[label] += 1
        if counts[label] >= max_per_class:
            unfilled.discard(label)
    return Xsub, ysub

# subsample
# Bind the result to Xsub/ysub -- the names the one-hot encoding and train/test
# split cells read.  Leaving the raw X, y untouched also makes this cell safe
# to re-run.  (If memory is tight, `del X, y` afterwards.)
Xsub, ysub = subsample(X, y, set(y))

In [32]:
# One-hot encoding replaces each categorical column with one binary feature
# per distinct value.  Only feature columns 0-3 are marked as categorical here.
categorical_columns = list(range(4))
encoder = OneHotEncoder(categorical_features=categorical_columns)
X_onehot = encoder.fit_transform(Xsub)

In [33]:
# Report the dimensions of the encoded feature matrix.
encoded_rows, encoded_cols = X_onehot.shape
print("Size of encoded data: %d x %d" % (encoded_rows, encoded_cols))

Size of encoded data: 500000 x 2089


In [35]:
# Split the data into train and test.
# NOTE: test_size=0.75 holds out 75% of the rows, so the models below are fit
# on the remaining 25%.
# A fixed random_state makes the split -- and every score computed from it --
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X_onehot, ysub, test_size=0.75, random_state=42)

In [43]:
# fit SGD model:
# random_state is pinned so the fit (and therefore the reported score) is
# repeatable from run to run.
sgd_model = SGDClassifier(loss="log", penalty="l1", n_jobs=-1, random_state=42)
sgd_model.fit(X_train, y_train)
y_predicted = sgd_model.predict(X_test)

# score() is called with one list per sample, so each true label and each
# predicted label is wrapped in a singleton list; k=1 scores the single guess.
actual_lists = [[a] for a in y_test]
predicted_lists = [[p] for p in y_predicted.tolist()]
print("SCORE SGD MODEL: %s" % score(actual_lists, predicted_lists, 1))

SCORE SGD MODEL: 0.125042666667


In [42]:
# fit the logistic regression model -- careful it might take a while to fit this
# Stored under its own name so it neither overwrites nor masquerades as the
# SGD model fitted in the SGD cell.
logreg_model = LogisticRegression(n_jobs=-1)
logreg_model.fit(X_train, y_train)
y_predicted = logreg_model.predict(X_test)

# score() is called with one list per sample, so each true label and each
# predicted label is wrapped in a singleton list; k=1 scores the single guess.
actual_lists = [[a] for a in y_test]
predicted_lists = [[p] for p in y_predicted.tolist()]
print("SCORE LOGISTIC REGRESSION MODEL: %s" % score(actual_lists, predicted_lists, 1))

SCORE SGD MODEL: 0.166368
