In [10]:
import datetime
import pandas as pd
import numpy as np
from operator import itemgetter
import zipfile
import time

In [11]:
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import random
random.seed(1999)

In [12]:
def feature_map(features, path='xgb.fmap'):
    """Write an XGBoost feature-map file listing the given feature names.

    One line is written per feature, in the order given:
    ``<index>\\t<name>\\tq``.

    Args:
        features: iterable of feature names.
        path: destination file; defaults to 'xgb.fmap' in the current
            working directory. The file is overwritten if it exists.
    """
    # The context manager closes the file even if a write fails part-way.
    with open(path, 'w') as outfile:
        for i, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))

In [13]:
def features_importance(gbm, features):
    """Return (feature name, f-score) pairs ordered from highest to lowest score.

    The feature map file 'xgb.fmap' is (re)written first so that the booster
    can report scores under the supplied feature names.
    """
    feature_map(features)
    scores = gbm.get_fscore(fmap='xgb.fmap')
    ranked = list(scores.items())
    # Stable descending sort on the score component of each pair.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

In [14]:
def intersect(a, b):
    """Return the distinct elements present in both iterables, as a list.

    The order of the returned list is whatever the set iteration yields;
    callers must not rely on it.
    """
    left, right = set(a), set(b)
    return list(left.intersection(right))

In [15]:
def print_features_importance(imp):
    """Print each importance entry as a comment plus an ``output.remove`` line.

    Args:
        imp: sequence of (feature name, score) entries, e.g. the result of
            ``features_importance``.
    """
    for entry in imp:
        name, weight = entry[0], entry[1]
        print("# " + str(weight))
        print('output.remove(\'' + name + '\')')

In [27]:
def run_default_test(train, test, features, target, random_state=0,
                     eta=0.1, max_depth=8, subsample=0.85, colsample_bytree=0.85,
                     num_boost_round=1000, early_stopping_rounds=20, test_size=0.1):
    """Train an XGBoost binary classifier and predict the test set.

    A fraction ``test_size`` of ``train`` is held out for early stopping and
    for the reported validation score.

    Args:
        train: frame holding the feature columns and the ``target`` column.
        test: frame holding the feature columns to predict.
        features: list of column names used as model inputs.
        target: name of the label column in ``train``.
        random_state: seed used for both the hold-out split and XGBoost.
        eta, max_depth, subsample, colsample_bytree: XGBoost parameters;
            the defaults are the values this notebook was originally run with.
        num_boost_round: maximum number of boosting rounds.
        early_stopping_rounds: rounds without improvement on the hold-out
            set before training stops.
        test_size: fraction of ``train`` held out for validation.

    Returns:
        (predictions, score): the test-set predictions as a list, and the
        ROC AUC measured on the hold-out split.
    """
    start_time = time.time()

    print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth, subsample, colsample_bytree))
    params = {
        "objective": "binary:logistic",
        "booster" : "gbtree",
        "eval_metric": "auc",
        "eta": eta,
        "max_depth": max_depth,
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
        "silent": 1,
        "seed": random_state
    }

    X_train, X_valid = train_test_split(train, test_size=test_size, random_state=random_state)
    y_train = X_train[target]
    y_valid = X_valid[target]
    dtrain = xgb.DMatrix(X_train[features], y_train)
    dvalid = xgb.DMatrix(X_valid[features], y_valid)

    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=True)

    print("Validating...")
    # Reuse the validation matrix built above rather than constructing a second copy.
    check = gbm.predict(dvalid, ntree_limit=gbm.best_ntree_limit)
    score = roc_auc_score(y_valid.values, check)
    # NOTE: despite the label, the printed value is the ROC AUC (higher is better).
    print('Check error value: {:.6f}'.format(score))

    imp = features_importance(gbm, features)
    print('Importance array: ', imp)

    print("Predict test set...")
    test_prediction = gbm.predict(xgb.DMatrix(test[features]), ntree_limit=gbm.best_ntree_limit)

    print('Training time: {} minutes'.format(round((time.time() - start_time)/60, 2)))
    return test_prediction.tolist(), score


In [17]:
def create_submission(score, test, prediction):
    """Write the predictions to a timestamped submission CSV.

    The file is created in the current working directory and named
    ``submission_<score>_<YYYY-mm-dd-HH-MM>.csv``. It has the header
    ``id,probability`` followed by one line per entry of ``test['id']``.

    Args:
        score: validation score, embedded in the file name.
        test: object whose ``['id']`` entry is an iterable of row ids.
        prediction: indexable sequence; element ``i`` is written next to
            the ``i``-th id.

    Raises:
        IndexError: if ``prediction`` has fewer elements than there are ids.
    """
    now = datetime.datetime.now()
    sub_file = 'submission_' + str(score) + '_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
    print('Writing submission: ', sub_file)
    # The context manager guarantees the file is flushed and closed even if
    # a row fails to serialise.
    with open(sub_file, 'w') as f:
        f.write('id,probability\n')
        for row, item_id in enumerate(test['id']):
            f.write(str(item_id) + ',' + str(prediction[row]) + '\n')

In [18]:
def get_features(train, test):
    """Return the model feature names: columns shared by ``train`` and ``test``.

    The pair identifiers 'itemID_1' and 'itemID_2' are excluded. The result
    follows the column order of ``train`` so that it is identical on every
    run; going through a set would make the order depend on the
    interpreter's string-hash seed.

    Args:
        train: frame with a ``columns`` attribute.
        test: frame with a ``columns`` attribute.

    Returns:
        List of distinct column names present in both frames.
    """
    excluded = {'itemID_1', 'itemID_2'}
    shared = set(test.columns.values)
    output = []
    for col in train.columns.values:
        # The last clause keeps names unique if a frame has repeated columns.
        if col in shared and col not in excluded and col not in output:
            output.append(col)
    return output

In [19]:
def prep_train():
    """Build the training frame: one row per item pair with per-item features.

    Reads ItemPairs_train.csv, ItemInfo_train.csv, Location.csv and
    Category.csv from the current working directory. For each side of a pair
    (suffix ``_1`` / ``_2``) the item's category, price, location, metro,
    coordinates and text lengths are attached, together with the extra
    columns joined from Category.csv and Location.csv (expected to provide
    ``parentCategoryID`` and ``regionID``). A ``<name>_same`` 0/1 column is
    then added for each compared attribute.

    Missing numeric item values are replaced by -1. The length of a missing
    text field is left as NaN. The 'generationMethod' column is dropped.

    Returns:
        pandas.DataFrame with the pair columns, the suffixed item columns
        and the ``*_same`` indicator columns.
    """
    start_time = time.time()

    types1 = {
        'itemID_1': np.dtype(int),
        'itemID_2': np.dtype(int),
        'isDuplicate': np.dtype(int),
        'generationMethod': np.dtype(int),
    }

    types2 = {
        'itemID': np.dtype(int),
        'categoryID': np.dtype(int),
        'title': np.dtype(str),
        'description': np.dtype(str),
        'images_array': np.dtype(str),
        'attrsJSON': np.dtype(str),
        'price': np.dtype(float),
        'locationID': np.dtype(int),
        'metroID': np.dtype(float),
        'lat': np.dtype(float),
        'lon': np.dtype(float),
    }

    print("Load ItemPairs_train.csv")
    pairs = pd.read_csv("ItemPairs_train.csv", dtype=types1)
    print("Load ItemInfo_train.csv")
    items = pd.read_csv("ItemInfo_train.csv", dtype=types2)
    location = pd.read_csv("Location.csv")
    category = pd.read_csv("Category.csv")

    train = pairs.drop(['generationMethod'], axis=1)

    print('Add text features...')
    # Only the columns that end up in the result are kept; missing numeric
    # values become -1.
    item_features = items[['itemID', 'categoryID', 'price', 'locationID',
                           'metroID', 'lat', 'lon']].fillna(-1)
    # Lengths are taken from the raw text, so a missing text yields NaN.
    for text_col in ('title', 'description', 'attrsJSON'):
        item_features['len_' + text_col] = items[text_col].str.len()

    renamed = ['itemID', 'categoryID', 'parentCategoryID', 'price', 'locationID',
               'regionID', 'metroID', 'lat', 'lon',
               'len_title', 'len_description', 'len_attrsJSON']

    def _item_side(suffix):
        # Item features joined with category/location data, columns suffixed.
        # The joins use only `on=`: current pandas rejects combining `on`
        # with `left_index=True`.
        side = pd.merge(item_features, category, how='left', on='categoryID')
        side = pd.merge(side, location, how='left', on='locationID')
        return side.rename(columns={name: name + suffix for name in renamed})

    print('Merge item 1...')
    train = pd.merge(train, _item_side('_1'), how='left', on='itemID_1')

    print('Merge item 2...')
    train = pd.merge(train, _item_side('_2'), how='left', on='itemID_2')

    # 1 where both items of the pair carry the same value, else 0.
    print('Create same arrays')
    for base in ('price', 'locationID', 'categoryID', 'regionID', 'metroID', 'lat', 'lon'):
        train[base + '_same'] = np.equal(train[base + '_1'], train[base + '_2']).astype(np.int32)

    print('Create train data time: {} seconds'.format(round(time.time() - start_time, 2)))
    return train


In [20]:
def prep_test():
    """Build the test frame: one row per item pair with per-item features.

    Reads ItemPairs_test.csv, ItemInfo_test.csv, Location.csv and
    Category.csv from the current working directory. For each side of a pair
    (suffix ``_1`` / ``_2``) the item's category, price, location, metro,
    coordinates and text lengths are attached, together with the extra
    columns joined from Category.csv and Location.csv (expected to provide
    ``parentCategoryID`` and ``regionID``). A ``<name>_same`` 0/1 column is
    then added for each compared attribute.

    Missing numeric item values are replaced by -1. The length of a missing
    text field is left as NaN.

    Returns:
        pandas.DataFrame with the pair columns (including 'id'), the
        suffixed item columns and the ``*_same`` indicator columns.
    """
    start_time = time.time()

    types1 = {
        'itemID_1': np.dtype(int),
        'itemID_2': np.dtype(int),
        'id': np.dtype(int),
    }

    types2 = {
        'itemID': np.dtype(int),
        'categoryID': np.dtype(int),
        'title': np.dtype(str),
        'description': np.dtype(str),
        'images_array': np.dtype(str),
        'attrsJSON': np.dtype(str),
        'price': np.dtype(float),
        'locationID': np.dtype(int),
        'metroID': np.dtype(float),
        'lat': np.dtype(float),
        'lon': np.dtype(float),
    }

    print("Load ItemPairs_test.csv")
    pairs = pd.read_csv("ItemPairs_test.csv", dtype=types1)
    print("Load ItemInfo_testcsv")
    items = pd.read_csv("ItemInfo_test.csv", dtype=types2)
    location = pd.read_csv("Location.csv")
    category = pd.read_csv("Category.csv")

    print('Add text features...')
    # Only the columns that end up in the result are kept; missing numeric
    # values become -1.
    item_features = items[['itemID', 'categoryID', 'price', 'locationID',
                           'metroID', 'lat', 'lon']].fillna(-1)
    # Lengths are taken from the raw text, so a missing text yields NaN.
    for text_col in ('title', 'description', 'attrsJSON'):
        item_features['len_' + text_col] = items[text_col].str.len()

    renamed = ['itemID', 'categoryID', 'parentCategoryID', 'price', 'locationID',
               'regionID', 'metroID', 'lat', 'lon',
               'len_title', 'len_description', 'len_attrsJSON']

    def _item_side(suffix):
        # Item features joined with category/location data, columns suffixed.
        # The joins use only `on=`: current pandas rejects combining `on`
        # with `left_index=True`.
        side = pd.merge(item_features, category, how='left', on='categoryID')
        side = pd.merge(side, location, how='left', on='locationID')
        return side.rename(columns={name: name + suffix for name in renamed})

    print('Merge item 1...')
    test_pairs = pd.merge(pairs, _item_side('_1'), how='left', on='itemID_1')

    print('Merge item 2...')
    test_pairs = pd.merge(test_pairs, _item_side('_2'), how='left', on='itemID_2')

    # 1 where both items of the pair carry the same value, else 0.
    print('Create same arrays')
    for base in ('price', 'locationID', 'categoryID', 'regionID', 'metroID', 'lat', 'lon'):
        test_pairs[base + '_same'] = np.equal(test_pairs[base + '_1'], test_pairs[base + '_2']).astype(np.int32)

    print('Create test data time: {} seconds'.format(round(time.time() - start_time, 2)))
    return test_pairs


In [28]:
def read_test_train(sample_frac=0.5, random_state=1999):
    """Load the train and test frames and the list of shared feature columns.

    Remaining missing values in both frames are replaced by -1. The training
    frame is then optionally down-sampled to keep the run time manageable.

    Args:
        sample_frac: fraction of training rows to keep. ``None`` or a value
            of 1 or more keeps every row. Defaults to 0.5.
        random_state: seed for the row sample, so that repeated runs select
            the same rows. Python's ``random.seed`` has no effect on pandas
            sampling.

    Returns:
        (train, test, features)
    """
    train = prep_train()
    test = prep_test()
    train = train.fillna(-1)
    test = test.fillna(-1)
    # Get only subset of data
    if sample_frac is not None and sample_frac < 1:
        len_old = len(train.index)
        train = train.sample(frac=sample_frac, random_state=random_state)
        len_new = len(train.index)
        print('Reduce train from {} to {}'.format(len_old, len_new))
    features = get_features(train, test)
    return train, test, features


# Driver: build the data, train the model, report the score, write the submission.
train, test, features = read_test_train()
print('Length of train: ', len(train))
print('Length of test: ', len(test))
print('Features [{}]: {}'.format(len(features), sorted(features)))
# 'isDuplicate' is the label column; `score` is the ROC AUC on the hold-out split.
test_prediction, score = run_default_test(train, test, features, 'isDuplicate')
print('Real score = {}'.format(score))
# Writes 'submission_<score>_<timestamp>.csv' to the working directory.
create_submission(score, test, test_prediction)

Load ItemPairs_train.csv
Load ItemInfo_train.csv
Add text features...
Merge item 1...
Merge item 2...
Create same arrays
Create train data time: 44.44 seconds
Load ItemPairs_test.csv
Load ItemInfo_testcsv
Add text features...
Merge item 1...
Merge item 2...
Create same arrays
Create test data time: 13.41 seconds
Reduce train from 2991396 to 1495698
Length of train:  1495698
Length of test:  1044196
Features [29]: ['categoryID_1', 'categoryID_2', 'categoryID_same', 'lat_1', 'lat_2', 'lat_same', 'len_attrsJSON_1', 'len_attrsJSON_2', 'len_description_1', 'len_description_2', 'len_title_1', 'len_title_2', 'locationID_1', 'locationID_2', 'locationID_same', 'lon_1', 'lon_2', 'lon_same', 'metroID_1', 'metroID_2', 'metroID_same', 'parentCategoryID_1', 'parentCategoryID_2', 'price_1', 'price_2', 'price_same', 'regionID_1', 'regionID_2', 'regionID_same']
XGBoost params. ETA: 0.1, MAX_DEPTH: 8, SUBSAMPLE: 0.85, COLSAMPLE_BY_TREE: 0.85


Will train until eval error hasn't decreased in 20 rounds.
[0]	train-auc:0.774358	eval-auc:0.774320
[1]	train-auc:0.783204	eval-auc:0.783343
[2]	train-auc:0.786949	eval-auc:0.787090
[3]	train-auc:0.790368	eval-auc:0.790229
[4]	train-auc:0.794935	eval-auc:0.794526
[5]	train-auc:0.798645	eval-auc:0.797900
[6]	train-auc:0.800723	eval-auc:0.799819
[7]	train-auc:0.802009	eval-auc:0.800920
[8]	train-auc:0.803558	eval-auc:0.802494
[9]	train-auc:0.804779	eval-auc:0.803700
[10]	train-auc:0.806055	eval-auc:0.804941
[11]	train-auc:0.806886	eval-auc:0.805789
[12]	train-auc:0.808215	eval-auc:0.807150
[13]	train-auc:0.809033	eval-auc:0.807987
[14]	train-auc:0.809883	eval-auc:0.808723
[15]	train-auc:0.810670	eval-auc:0.809439
[16]	train-auc:0.812504	eval-auc:0.811253
[17]	train-auc:0.813194	eval-auc:0.811906
[18]	train-auc:0.814568	eval-auc:0.813269
[19]	train-auc:0.815132	eval-auc:0.813779
[20]	train-auc:0.815955	eval-auc:0.814562
[21]	train-auc:0.816582	eval-auc:0.815110
[22]	train-auc:0.817263	eva

Validating...
Check error value: 0.901239
Importance array:  [('price_2', 15618), ('len_description_2', 15544), ('len_description_1', 15405), ('len_attrsJSON_2', 13966), ('price_1', 13784), ('len_attrsJSON_1', 12936), ('categoryID_1', 11096), ('len_title_1', 9987), ('len_title_2', 9842), ('lat_1', 9766), ('lon_1', 9168), ('lat_2', 8703), ('lon_2', 8585), ('locationID_1', 4755), ('locationID_2', 4278), ('parentCategoryID_1', 3890), ('metroID_1', 3874), ('metroID_2', 3565), ('regionID_1', 2703), ('price_same', 1722), ('categoryID_2', 1366), ('lon_same', 883), ('regionID_2', 826), ('locationID_same', 824), ('parentCategoryID_2', 565), ('metroID_same', 478), ('regionID_same', 319), ('lat_same', 267)]
Predict test set...
Training time: 28.98 minutes
Real score = 0.9012387917375388
Writing submission:  submission_0.901238791738_2016-07-02-17-44.csv


### Image hashing technique

#### Example:
from PIL import Image<br>
import imagehash<br>
hash = imagehash.average_hash(Image.open('test.png'))<br>
print(hash)<br>
d879f8f89b1bbf<br>
otherhash = imagehash.average_hash(Image.open('other.bmp'))<br>
print(otherhash)<br>
ffff3720200ffff<br>
print(hash == otherhash)<br>
False<br>
print(hash - otherhash)<br>
36<br>