In [2]:
import datetime
import pandas as pd
import numpy as np
from operator import itemgetter
import zipfile
import time

In [3]:
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import random
random.seed(1999)

In [4]:
def feature_map(features):
    outfile = open('xgb.fmap', 'w')
    for i, feat in enumerate(features):
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
    outfile.close()

In [5]:
def features_importance(gbm, features):
    feature_map(features)
    importance = gbm.get_fscore(fmap='xgb.fmap')
    importance = sorted(importance.items(), key=itemgetter(1), reverse=True)
    return importance

In [6]:
def intersect(a, b):
    return list(set(a) & set(b))

In [7]:
def print_features_importance(imp):
    for i in range(len(imp)):
        print("# " + str(imp[i][1]))
        print('output.remove(\'' + imp[i][0] + '\')')

In [8]:
def run_default_test(train, test, features, target, random_state=0):
    eta = 0.1
    max_depth = 8
    subsample = 0.85
    colsample_bytree = 0.85
    start_time = time.time()

    print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth, subsample, colsample_bytree))
    params = {
        "objective": "binary:logistic",
        "booster" : "gbtree",
        "eval_metric": "auc",
        "eta": eta,
        "max_depth": max_depth,
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
        "silent": 1,
        "seed": random_state
    }
    num_boost_round = 1000
    early_stopping_rounds = 20
    test_size = 0.1

    X_train, X_valid = train_test_split(train, test_size=test_size, random_state=random_state)
    y_train = X_train[target]
    y_valid = X_valid[target]
    dtrain = xgb.DMatrix(X_train[features], y_train)
    dvalid = xgb.DMatrix(X_valid[features], y_valid)

    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=True)

    print("Validating...")
    check = gbm.predict(xgb.DMatrix(X_valid[features]), ntree_limit=gbm.best_ntree_limit)
    score = roc_auc_score(X_valid[target].values, check)
    print('Check error value: {:.6f}'.format(score))

    imp = features_importance(gbm, features)
    print('Importance array: ', imp)

    print("Predict test set...")
    test_prediction = gbm.predict(xgb.DMatrix(test[features]), ntree_limit=gbm.best_ntree_limit)

    print('Training time: {} minutes'.format(round((time.time() - start_time)/60, 2)))
    return test_prediction.tolist(), score


In [9]:
def create_submission(score, test, prediction):
    # Make Submission
    now = datetime.datetime.now()
    sub_file = 'submission_' + str(score) + '_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
    print('Writing submission: ', sub_file)
    f = open(sub_file, 'w')
    f.write('id,probability\n')
    total = 0
    for id in test['id']:
        str1 = str(id) + ',' + str(prediction[total])
        str1 += '\n'
        total += 1
        f.write(str1)
    f.close()

In [10]:
def get_features(train, test):
    trainval = list(train.columns.values)
    testval = list(test.columns.values)
    output = intersect(trainval, testval)
    output.remove('itemID_1')
    output.remove('itemID_2')
    return output

In [11]:
def prep_train():
    testing = 0
    start_time = time.time()

    types1 = {
        'itemID_1': np.dtype(int),
        'itemID_2': np.dtype(int),
        'isDuplicate': np.dtype(int),
        'generationMethod': np.dtype(int),
    }

    types2 = {
        'itemID': np.dtype(int),
        'categoryID': np.dtype(int),
        'title': np.dtype(str),
        'description': np.dtype(str),
        'images_array': np.dtype(str),
        'attrsJSON': np.dtype(str),
        'price': np.dtype(float),
        'locationID': np.dtype(int),
        'metroID': np.dtype(float),
        'lat': np.dtype(float),
        'lon': np.dtype(float),
    }

    print("Load ItemPairs_train.csv")
    pairs = pd.read_csv("ItemPairs_train.csv", dtype=types1)
    # Add 'id' column for easy merge
    print("Load ItemInfo_train.csv")
    items = pd.read_csv("ItemInfo_train.csv", dtype=types2)
    items.fillna(-1, inplace=True)
    location = pd.read_csv("Location.csv")
    category = pd.read_csv("Category.csv")

    train = pairs
    train = train.drop(['generationMethod'], axis=1)

    print('Add text features...')
    items['len_title'] = items['title'].str.len()
    items['len_description'] = items['description'].str.len()
    items['len_attrsJSON'] = items['attrsJSON'].str.len()

    print('Merge item 1...')
    item1 = items[['itemID', 'categoryID', 'price', 'locationID', 'metroID', 'lat', 'lon', 
    'len_title', 'len_description', 'len_attrsJSON']]
    item1 = pd.merge(item1, category, how='left', on='categoryID', left_index=True)
    item1 = pd.merge(item1, location, how='left', on='locationID', left_index=True)

    item1 = item1.rename(
        columns={
            'itemID': 'itemID_1',
            'categoryID': 'categoryID_1',
            'parentCategoryID': 'parentCategoryID_1',
            'price': 'price_1',
            'locationID': 'locationID_1',
            'regionID': 'regionID_1',
            'metroID': 'metroID_1',
            'lat': 'lat_1',
            'lon': 'lon_1',
            'len_title': 'len_title_1',
			'len_description': 'len_description_1',
			'len_attrsJSON': 'len_attrsJSON_1',
        }
    )

    # Add item 1 data
    train = pd.merge(train, item1, how='left', on='itemID_1', left_index=True)

    print('Merge item 2...')
    item2 = items[['itemID', 'categoryID', 'price', 'locationID', 'metroID', 'lat', 'lon', 
    'len_title', 'len_description', 'len_attrsJSON']]
    item2 = pd.merge(item2, category, how='left', on='categoryID', left_index=True)
    item2 = pd.merge(item2, location, how='left', on='locationID', left_index=True)

    item2 = item2.rename(
        columns={
            'itemID': 'itemID_2',
            'categoryID': 'categoryID_2',
            'parentCategoryID': 'parentCategoryID_2',
            'price': 'price_2',
            'locationID': 'locationID_2',
            'regionID': 'regionID_2',
            'metroID': 'metroID_2',
            'lat': 'lat_2',
            'lon': 'lon_2',
            'len_title': 'len_title_2',
			'len_description': 'len_description_2',
			'len_attrsJSON': 'len_attrsJSON_2'
        }
    )

    # Add item 2 data
    train = pd.merge(train, item2, how='left', on='itemID_2', left_index=True)

    # Create same arrays
    print('Create same arrays')
    train['price_same'] = np.equal(train['price_1'], train['price_2']).astype(np.int32)
    train['locationID_same'] = np.equal(train['locationID_1'], train['locationID_2']).astype(np.int32)
    train['categoryID_same'] = np.equal(train['categoryID_1'], train['categoryID_2']).astype(np.int32)
    train['regionID_same'] = np.equal(train['regionID_1'], train['regionID_2']).astype(np.int32)
    train['metroID_same'] = np.equal(train['metroID_1'], train['metroID_2']).astype(np.int32)
    train['lat_same'] = np.equal(train['lat_1'], train['lat_2']).astype(np.int32)
    train['lon_same'] = np.equal(train['lon_1'], train['lon_2']).astype(np.int32)

    # print(train.describe())
    print('Create train data time: {} seconds'.format(round(time.time() - start_time, 2)))
    return train


In [12]:
def prep_test():
    start_time = time.time()

    types1 = {
        'itemID_1': np.dtype(int),
        'itemID_2': np.dtype(int),
        'id': np.dtype(int),
    }

    types2 = {
        'itemID': np.dtype(int),
        'categoryID': np.dtype(int),
        'title': np.dtype(str),
        'description': np.dtype(str),
        'images_array': np.dtype(str),
        'attrsJSON': np.dtype(str),
        'price': np.dtype(float),
        'locationID': np.dtype(int),
        'metroID': np.dtype(float),
        'lat': np.dtype(float),
        'lon': np.dtype(float),
    }

    print("Load ItemPairs_test.csv")
    pairs = pd.read_csv("ItemPairs_test.csv", dtype=types1)
    print("Load ItemInfo_testcsv")
    items = pd.read_csv("ItemInfo_test.csv", dtype=types2)
    items.fillna(-1, inplace=True)
    location = pd.read_csv("Location.csv")
    category = pd.read_csv("Category.csv")

    train = pairs

    print('Add text features...')
    items['len_title'] = items['title'].str.len()
    items['len_description'] = items['description'].str.len()
    items['len_attrsJSON'] = items['attrsJSON'].str.len()
    
    print('Merge item 1...')
    item1 = items[['itemID', 'categoryID', 'price', 'locationID', 'metroID', 'lat', 'lon', 
    'len_title', 'len_description', 'len_attrsJSON']]
    item1 = pd.merge(item1, category, how='left', on='categoryID', left_index=True)
    item1 = pd.merge(item1, location, how='left', on='locationID', left_index=True)

    item1 = item1.rename(
        columns={
            'itemID': 'itemID_1',
            'categoryID': 'categoryID_1',
            'parentCategoryID': 'parentCategoryID_1',
            'price': 'price_1',
            'locationID': 'locationID_1',
            'regionID': 'regionID_1',
            'metroID': 'metroID_1',
            'lat': 'lat_1',
            'lon': 'lon_1',
            'len_title': 'len_title_1',
			'len_description': 'len_description_1',
			'len_attrsJSON': 'len_attrsJSON_1'
        }
    )

    # Add item 1 data
    train = pd.merge(train, item1, how='left', on='itemID_1', left_index=True)

    print('Merge item 2...')
    item2 = items[['itemID', 'categoryID', 'price', 'locationID', 'metroID', 'lat', 'lon',
    'len_title', 'len_description', 'len_attrsJSON']]
    item2 = pd.merge(item2, category, how='left', on='categoryID', left_index=True)
    item2 = pd.merge(item2, location, how='left', on='locationID', left_index=True)

    item2 = item2.rename(
        columns={
            'itemID': 'itemID_2',
            'categoryID': 'categoryID_2',
            'parentCategoryID': 'parentCategoryID_2',
            'price': 'price_2',
            'locationID': 'locationID_2',
            'regionID': 'regionID_2',
            'metroID': 'metroID_2',
            'lat': 'lat_2',
            'lon': 'lon_2',
            'len_title': 'len_title_2',
			'len_description': 'len_description_2',
			'len_attrsJSON': 'len_attrsJSON_2',
        }
    )

    # Add item 2 data
    train = pd.merge(train, item2, how='left', on='itemID_2', left_index=True)

    # Create same arrays
    print('Create same arrays')
    train['price_same'] = np.equal(train['price_1'], train['price_2']).astype(np.int32)
    train['locationID_same'] = np.equal(train['locationID_1'], train['locationID_2']).astype(np.int32)
    train['categoryID_same'] = np.equal(train['categoryID_1'], train['categoryID_2']).astype(np.int32)
    train['regionID_same'] = np.equal(train['regionID_1'], train['regionID_2']).astype(np.int32)
    train['metroID_same'] = np.equal(train['metroID_1'], train['metroID_2']).astype(np.int32)
    train['lat_same'] = np.equal(train['lat_1'], train['lat_2']).astype(np.int32)
    train['lon_same'] = np.equal(train['lon_1'], train['lon_2']).astype(np.int32)
    
    # print(train.describe())
    print('Create test data time: {} seconds'.format(round(time.time() - start_time, 2)))
    return train


In [12]:
def read_test_train():
    train = prep_train()
    train.to_csv('trainDF.csv', index=False)
    test = prep_test()
    test.to_csv('testDF.csv', index=False)
    train.fillna(-1, inplace=True)
    test.fillna(-1, inplace=True)
    # Get only subset of data
    if 1:
        len_old = len(train.index)
        train = train.sample(frac=0.5)
        len_new = len(train.index)
        print('Reduce train from {} to {}'.format(len_old, len_new))
    features = get_features(train, test)
    return train, test, features


train, test, features = read_test_train()
print('Length of train: ', len(train))
print('Length of test: ', len(test))
print('Features [{}]: {}'.format(len(features), sorted(features)))
test_prediction, score = run_default_test(train, test, features, 'isDuplicate')
print('Real score = {}'.format(score))
create_submission(score, test, test_prediction)

Load ItemPairs_train.csv
Load ItemInfo_train.csv
Add text features...
Merge item 1...
Merge item 2...
Create same arrays
Create train data time: 39.54 seconds
Load ItemPairs_test.csv
Load ItemInfo_testcsv
Add text features...
Merge item 1...
Merge item 2...
Create same arrays
Create test data time: 14.33 seconds
Reduce train from 2991396 to 1495698
Length of train:  1495698
Length of test:  1044196
Features [29]: ['categoryID_1', 'categoryID_2', 'categoryID_same', 'lat_1', 'lat_2', 'lat_same', 'len_attrsJSON_1', 'len_attrsJSON_2', 'len_description_1', 'len_description_2', 'len_title_1', 'len_title_2', 'locationID_1', 'locationID_2', 'locationID_same', 'lon_1', 'lon_2', 'lon_same', 'metroID_1', 'metroID_2', 'metroID_same', 'parentCategoryID_1', 'parentCategoryID_2', 'price_1', 'price_2', 'price_same', 'regionID_1', 'regionID_2', 'regionID_same']
XGBoost params. ETA: 0.1, MAX_DEPTH: 8, SUBSAMPLE: 0.85, COLSAMPLE_BY_TREE: 0.85


Will train until eval error hasn't decreased in 20 rounds.
[0]	train-auc:0.772641	eval-auc:0.771076
[1]	train-auc:0.785503	eval-auc:0.784233
[2]	train-auc:0.792174	eval-auc:0.790701
[3]	train-auc:0.793620	eval-auc:0.792064
[4]	train-auc:0.796062	eval-auc:0.794564
[5]	train-auc:0.798749	eval-auc:0.797281
[6]	train-auc:0.799856	eval-auc:0.798279
[7]	train-auc:0.800874	eval-auc:0.799355
[8]	train-auc:0.803386	eval-auc:0.801866
[9]	train-auc:0.806160	eval-auc:0.804587
[10]	train-auc:0.807124	eval-auc:0.805378
[11]	train-auc:0.808146	eval-auc:0.806378
[12]	train-auc:0.809361	eval-auc:0.807648
[13]	train-auc:0.811512	eval-auc:0.809659
[14]	train-auc:0.812611	eval-auc:0.810673
[15]	train-auc:0.813650	eval-auc:0.811648
[16]	train-auc:0.814751	eval-auc:0.812738
[17]	train-auc:0.815800	eval-auc:0.813862
[18]	train-auc:0.817268	eval-auc:0.815340
[19]	train-auc:0.817910	eval-auc:0.816002
[20]	train-auc:0.819052	eval-auc:0.817183
[21]	train-auc:0.819650	eval-auc:0.817759
[22]	train-auc:0.820443	eva

KeyboardInterrupt: 

### Image hashing technique

#### Example:
from PIL import Image<br>
import imagehash<br>
hash = imagehash.average_hash(Image.open('test.png'))<br>
print(hash)<br>
d879f8f89b1bbf<br>
otherhash = imagehash.average_hash(Image.open('other.bmp'))<br>
print(otherhash)<br>
ffff3720200ffff<br>
print(hash == otherhash)<br>
False<br>
print(hash - otherhash)<br>
36<br>

In [13]:
trainDF=pd.read_csv("trainDF.csv")

In [14]:
trainDF.head()

Unnamed: 0,itemID_1,itemID_2,isDuplicate,categoryID_1,price_1,locationID_1,metroID_1,lat_1,lon_1,len_title_1,...,len_attrsJSON_2,parentCategoryID_2,regionID_2,price_same,locationID_same,categoryID_same,regionID_same,metroID_same,lat_same,lon_same
0,1,4112648,1,81,300000.0,648140,-1.0,64.686946,30.815924,17.0,...,27.0,1,648070,1,1,1,1,1,1,1
1,3,1991275,1,14,300000.0,639040,-1.0,55.678037,37.256548,9.0,...,57.0,1,637680,0,1,1,1,1,1,1
2,4,1223296,0,84,3500.0,640650,-1.0,56.239398,43.460458,14.0,...,25.0,6,640310,1,1,1,1,1,1,1
3,7,1058851,1,84,13500.0,662210,-1.0,55.77717,37.586194,35.0,...,31.0,6,662000,1,1,1,1,1,0,0
4,8,2161930,1,39,500.0,624360,-1.0,55.77717,37.586194,14.0,...,35.0,7,624300,0,1,1,1,1,1,1


In [15]:
print(features)

NameError: name 'features' is not defined

In [16]:
testDF=pd.read_csv("testDF.csv")

In [17]:
testDF.head()

Unnamed: 0,id,itemID_1,itemID_2,categoryID_1,price_1,locationID_1,metroID_1,lat_1,lon_1,len_title_1,...,len_attrsJSON_2,parentCategoryID_2,regionID_2,price_same,locationID_same,categoryID_same,regionID_same,metroID_same,lat_same,lon_same
0,0,5,4670875,115,-1.0,637640,500292.0,55.760211,37.577211,38.0,...,23.0,113,637640,1,1,1,1,1,1,1
1,1,5,787210,115,-1.0,637640,500292.0,55.760211,37.577211,38.0,...,23.0,113,637640,1,1,1,1,1,1,1
2,2,6,1705280,10,-1.0,637160,-1.0,59.645846,33.513035,14.0,...,38.0,1,636370,0,1,1,1,1,1,1
3,3,11,3020777,101,1300.0,650400,302.0,55.817608,49.097646,27.0,...,60.0,6,650130,0,1,1,1,1,1,1
4,4,23,5316130,27,15000.0,637640,500769.0,55.82708,37.437604,48.0,...,89.0,5,637640,1,1,1,1,1,1,1


In [18]:
types1 = {
        'itemID_1': np.dtype(int),
        'itemID_2': np.dtype(int),
        'isDuplicate': np.dtype(int),
        'generationMethod': np.dtype(int),
    }

types2 = {
        'itemID': np.dtype(int),
        'categoryID': np.dtype(int),
        'title': np.dtype(str),
        'description': np.dtype(str),
        'images_array': np.dtype(str),
        'attrsJSON': np.dtype(str),
        'price': np.dtype(float),
        'locationID': np.dtype(int),
        'metroID': np.dtype(float),
        'lat': np.dtype(float),
        'lon': np.dtype(float),
    }

In [19]:
print("Load ItemPairs_train.csv")
pairs = pd.read_csv("ItemPairs_train.csv", dtype=types1)
# Add 'id' column for easy merge
print("Load ItemInfo_train.csv")
items = pd.read_csv("ItemInfo_train.csv", dtype=types2)
items.fillna(-1, inplace=True)
location = pd.read_csv("Location.csv")
category = pd.read_csv("Category.csv")

Load ItemPairs_train.csv
Load ItemInfo_train.csv


In [25]:
pairs.head()

Unnamed: 0,itemID_1,itemID_2,isDuplicate,generationMethod
0,1,4112648,1,1
1,3,1991275,1,1
2,4,1223296,0,1
3,7,1058851,1,1
4,8,2161930,1,1


In [27]:
items.head()

Unnamed: 0,itemID,categoryID,title,description,images_array,attrsJSON,price,locationID,metroID,lat,lon
0,1,81,Продам Камаз 6520,Продам Камаз 6520 20 тонн,"1064094, 5252822, 6645873, 6960145, 9230265","{""Вид техники"":""Грузовики""}",300000.0,648140,-1.0,64.686946,30.815924
1,3,14,Yamaha r6,Весь в тюнинге.,"11919573, 14412228, 3204180, 6646877","{""Вид техники"":""Мотоциклы"", ""Вид мотоцикла"":""С...",300000.0,639040,-1.0,55.678037,37.256548
2,4,84,iPhone 3gs 8gb,"Телефон в хорошем состоянии, трещин и сколов н...","14384831, 6102021","{""Вид телефона"":""iPhone""}",3500.0,640650,-1.0,56.239398,43.460458
3,7,84,Xiaomi Mi4 3гб RAM + 16гб ROM белый,"Отличный подарок на новый год от ""китайской ap...",-1,"{""Вид телефона"":""Другие марки""}",13500.0,662210,-1.0,55.77717,37.586194
4,8,39,Лыжные ботинки,"Лыжные ботинки в хорошем состоянии, 34 размер","13718854, 4787310","{""Вид товара"":""Зимние виды спорта""}",500.0,624360,-1.0,55.77717,37.586194


In [28]:
location.head()

Unnamed: 0,locationID,regionID
0,621551,621550
1,621552,621550
2,621553,621550
3,621554,621550
4,621555,621550


In [29]:
category.head()

Unnamed: 0,categoryID,parentCategoryID
0,9,1
1,10,1
2,11,1
3,14,1
4,19,2


In [24]:
train = pairs
train = train.drop(['generationMethod'], axis=1)

In [26]:
train.head()

Unnamed: 0,itemID_1,itemID_2,isDuplicate
0,1,4112648,1
1,3,1991275,1
2,4,1223296,0
3,7,1058851,1
4,8,2161930,1


In [30]:
print('Add text features...')
items['len_title'] = items['title'].str.len()
items['len_description'] = items['description'].str.len()
items['len_attrsJSON'] = items['attrsJSON'].str.len()

print('Merge item 1...')
item1 = items[['itemID', 'categoryID', 'price', 'locationID', 'metroID', 'lat', 'lon', 
'len_title', 'len_description', 'len_attrsJSON']]
item1 = pd.merge(item1, category, how='left', on='categoryID', left_index=True)
item1 = pd.merge(item1, location, how='left', on='locationID', left_index=True)

Add text features...
Merge item 1...


In [31]:
item1.head()

Unnamed: 0,itemID,categoryID,price,locationID,metroID,lat,lon,len_title,len_description,len_attrsJSON,parentCategoryID,regionID
2123,1,81,300000.0,648140,-1.0,64.686946,30.815924,17.0,25.0,27.0,1,648070
1402,3,14,300000.0,639040,-1.0,55.678037,37.256548,9.0,15.0,57.0,1,637680
1540,4,84,3500.0,640650,-1.0,56.239398,43.460458,14.0,239.0,25.0,6,640310
3386,7,84,13500.0,662210,-1.0,55.77717,37.586194,35.0,801.0,31.0,6,662000
288,8,39,500.0,624360,-1.0,55.77717,37.586194,14.0,45.0,35.0,7,624300


In [32]:
item1 = item1.rename(
        columns={
            'itemID': 'itemID_1',
            'categoryID': 'categoryID_1',
            'parentCategoryID': 'parentCategoryID_1',
            'price': 'price_1',
            'locationID': 'locationID_1',
            'regionID': 'regionID_1',
            'metroID': 'metroID_1',
            'lat': 'lat_1',
            'lon': 'lon_1',
            'len_title': 'len_title_1',
			'len_description': 'len_description_1',
			'len_attrsJSON': 'len_attrsJSON_1',
        }
    )


In [33]:
item1.head()

Unnamed: 0,itemID_1,categoryID_1,price_1,locationID_1,metroID_1,lat_1,lon_1,len_title_1,len_description_1,len_attrsJSON_1,parentCategoryID_1,regionID_1
2123,1,81,300000.0,648140,-1.0,64.686946,30.815924,17.0,25.0,27.0,1,648070
1402,3,14,300000.0,639040,-1.0,55.678037,37.256548,9.0,15.0,57.0,1,637680
1540,4,84,3500.0,640650,-1.0,56.239398,43.460458,14.0,239.0,25.0,6,640310
3386,7,84,13500.0,662210,-1.0,55.77717,37.586194,35.0,801.0,31.0,6,662000
288,8,39,500.0,624360,-1.0,55.77717,37.586194,14.0,45.0,35.0,7,624300


In [34]:
train = pd.merge(train, item1, how='left', on='itemID_1', left_index=True)

In [35]:
train.head()

Unnamed: 0,itemID_1,itemID_2,isDuplicate,categoryID_1,price_1,locationID_1,metroID_1,lat_1,lon_1,len_title_1,len_description_1,len_attrsJSON_1,parentCategoryID_1,regionID_1
2123,1,4112648,1,81,300000.0,648140,-1.0,64.686946,30.815924,17.0,25.0,27.0,1,648070
1402,3,1991275,1,14,300000.0,639040,-1.0,55.678037,37.256548,9.0,15.0,57.0,1,637680
1540,4,1223296,0,84,3500.0,640650,-1.0,56.239398,43.460458,14.0,239.0,25.0,6,640310
3386,7,1058851,1,84,13500.0,662210,-1.0,55.77717,37.586194,35.0,801.0,31.0,6,662000
288,8,2161930,1,39,500.0,624360,-1.0,55.77717,37.586194,14.0,45.0,35.0,7,624300


In [36]:
print('Merge item 2...')
item2 = items[['itemID', 'categoryID', 'price', 'locationID', 'metroID', 'lat', 'lon', 
'len_title', 'len_description', 'len_attrsJSON']]
item2 = pd.merge(item2, category, how='left', on='categoryID', left_index=True)
item2 = pd.merge(item2, location, how='left', on='locationID', left_index=True)
item2 = item2.rename(
        columns={
            'itemID': 'itemID_2',
            'categoryID': 'categoryID_2',
            'parentCategoryID': 'parentCategoryID_2',
            'price': 'price_2',
            'locationID': 'locationID_2',
            'regionID': 'regionID_2',
            'metroID': 'metroID_2',
            'lat': 'lat_2',
            'lon': 'lon_2',
            'len_title': 'len_title_2',
			'len_description': 'len_description_2',
			'len_attrsJSON': 'len_attrsJSON_2'
    }
)

# Add item 2 data
train = pd.merge(train, item2, how='left', on='itemID_2', left_index=True)

Merge item 2...


In [37]:
train.head()

Unnamed: 0,itemID_1,itemID_2,isDuplicate,categoryID_1,price_1,locationID_1,metroID_1,lat_1,lon_1,len_title_1,...,price_2,locationID_2,metroID_2,lat_2,lon_2,len_title_2,len_description_2,len_attrsJSON_2,parentCategoryID_2,regionID_2
2123,1,4112648,1,81,300000.0,648140,-1.0,64.686946,30.815924,17.0,...,300000.0,648140,-1.0,64.686946,30.815924,17.0,25.0,27.0,1,648070
1402,3,1991275,1,14,300000.0,639040,-1.0,55.678037,37.256548,9.0,...,330000.0,639040,-1.0,55.678037,37.256548,9.0,29.0,57.0,1,637680
1540,4,1223296,0,84,3500.0,640650,-1.0,56.239398,43.460458,14.0,...,3500.0,640650,-1.0,56.239398,43.460458,15.0,64.0,25.0,6,640310
3386,7,1058851,1,84,13500.0,662210,-1.0,55.77717,37.586194,35.0,...,13500.0,662210,-1.0,56.135459,47.235484,44.0,843.0,31.0,6,662000
288,8,2161930,1,39,500.0,624360,-1.0,55.77717,37.586194,14.0,...,600.0,624360,-1.0,55.77717,37.586194,15.0,38.0,35.0,7,624300


In [38]:
# Create same arrays
print('Create same arrays')
train['price_same'] = np.equal(train['price_1'], train['price_2']).astype(np.int32)
train['locationID_same'] = np.equal(train['locationID_1'], train['locationID_2']).astype(np.int32)
train['categoryID_same'] = np.equal(train['categoryID_1'], train['categoryID_2']).astype(np.int32)
train['regionID_same'] = np.equal(train['regionID_1'], train['regionID_2']).astype(np.int32)
train['metroID_same'] = np.equal(train['metroID_1'], train['metroID_2']).astype(np.int32)
train['lat_same'] = np.equal(train['lat_1'], train['lat_2']).astype(np.int32)
train['lon_same'] = np.equal(train['lon_1'], train['lon_2']).astype(np.int32)

Create same arrays


In [39]:
train.to_csv('trainDF.csv', index=False)

In [20]:
trainDF=pd.read_csv("trainDF.csv")
trainDF.head()

Unnamed: 0,itemID_1,itemID_2,isDuplicate,categoryID_1,price_1,locationID_1,metroID_1,lat_1,lon_1,len_title_1,...,len_attrsJSON_2,parentCategoryID_2,regionID_2,price_same,locationID_same,categoryID_same,regionID_same,metroID_same,lat_same,lon_same
0,1,4112648,1,81,300000.0,648140,-1.0,64.686946,30.815924,17.0,...,27.0,1,648070,1,1,1,1,1,1,1
1,3,1991275,1,14,300000.0,639040,-1.0,55.678037,37.256548,9.0,...,57.0,1,637680,0,1,1,1,1,1,1
2,4,1223296,0,84,3500.0,640650,-1.0,56.239398,43.460458,14.0,...,25.0,6,640310,1,1,1,1,1,1,1
3,7,1058851,1,84,13500.0,662210,-1.0,55.77717,37.586194,35.0,...,31.0,6,662000,1,1,1,1,1,0,0
4,8,2161930,1,39,500.0,624360,-1.0,55.77717,37.586194,14.0,...,35.0,7,624300,0,1,1,1,1,1,1


In [41]:
test = prep_test()
test.to_csv('testDF.csv', index=False)

Load ItemPairs_test.csv
Load ItemInfo_testcsv
Add text features...
Merge item 1...
Merge item 2...
Create same arrays
Create test data time: 14.76 seconds


In [19]:
testDF=pd.read_csv("testDF.csv")
testDF.head()

Unnamed: 0,id,itemID_1,itemID_2,categoryID_1,price_1,locationID_1,metroID_1,lat_1,lon_1,len_title_1,...,len_attrsJSON_2,parentCategoryID_2,regionID_2,price_same,locationID_same,categoryID_same,regionID_same,metroID_same,lat_same,lon_same
0,0,5,4670875,115,-1.0,637640,500292.0,55.760211,37.577211,38.0,...,23.0,113,637640,1,1,1,1,1,1,1
1,1,5,787210,115,-1.0,637640,500292.0,55.760211,37.577211,38.0,...,23.0,113,637640,1,1,1,1,1,1,1
2,2,6,1705280,10,-1.0,637160,-1.0,59.645846,33.513035,14.0,...,38.0,1,636370,0,1,1,1,1,1,1
3,3,11,3020777,101,1300.0,650400,302.0,55.817608,49.097646,27.0,...,60.0,6,650130,0,1,1,1,1,1,1
4,4,23,5316130,27,15000.0,637640,500769.0,55.82708,37.437604,48.0,...,89.0,5,637640,1,1,1,1,1,1,1


In [46]:
testDF = testDF.drop(['id'], axis=1)

In [47]:
testDF.head()

Unnamed: 0,itemID_1,itemID_2,categoryID_1,price_1,locationID_1,metroID_1,lat_1,lon_1,len_title_1,len_description_1,...,len_attrsJSON_2,parentCategoryID_2,regionID_2,price_same,locationID_same,categoryID_same,regionID_same,metroID_same,lat_same,lon_same
0,5,4670875,115,-1.0,637640,500292.0,55.760211,37.577211,38.0,165.0,...,23.0,113,637640,1,1,1,1,1,1,1
1,5,787210,115,-1.0,637640,500292.0,55.760211,37.577211,38.0,165.0,...,23.0,113,637640,1,1,1,1,1,1,1
2,6,1705280,10,-1.0,637160,-1.0,59.645846,33.513035,14.0,68.0,...,38.0,1,636370,0,1,1,1,1,1,1
3,11,3020777,101,1300.0,650400,302.0,55.817608,49.097646,27.0,128.0,...,60.0,6,650130,0,1,1,1,1,1,1
4,23,5316130,27,15000.0,637640,500769.0,55.82708,37.437604,48.0,2431.0,...,89.0,5,637640,1,1,1,1,1,1,1


In [48]:
trainval = list(trainDF.columns.values)
testval = list(testDF.columns.values)

In [49]:
print(trainval)

['itemID_1', 'itemID_2', 'isDuplicate', 'categoryID_1', 'price_1', 'locationID_1', 'metroID_1', 'lat_1', 'lon_1', 'len_title_1', 'len_description_1', 'len_attrsJSON_1', 'parentCategoryID_1', 'regionID_1', 'categoryID_2', 'price_2', 'locationID_2', 'metroID_2', 'lat_2', 'lon_2', 'len_title_2', 'len_description_2', 'len_attrsJSON_2', 'parentCategoryID_2', 'regionID_2', 'price_same', 'locationID_same', 'categoryID_same', 'regionID_same', 'metroID_same', 'lat_same', 'lon_same']


In [50]:
print(testval)

['itemID_1', 'itemID_2', 'categoryID_1', 'price_1', 'locationID_1', 'metroID_1', 'lat_1', 'lon_1', 'len_title_1', 'len_description_1', 'len_attrsJSON_1', 'parentCategoryID_1', 'regionID_1', 'categoryID_2', 'price_2', 'locationID_2', 'metroID_2', 'lat_2', 'lon_2', 'len_title_2', 'len_description_2', 'len_attrsJSON_2', 'parentCategoryID_2', 'regionID_2', 'price_same', 'locationID_same', 'categoryID_same', 'regionID_same', 'metroID_same', 'lat_same', 'lon_same']


In [51]:
features = intersect(trainval, testval)    

In [52]:
print(features)

['len_description_1', 'parentCategoryID_1', 'categoryID_1', 'metroID_1', 'lon_1', 'itemID_2', 'lat_same', 'price_same', 'len_attrsJSON_2', 'len_title_2', 'locationID_2', 'lat_2', 'len_attrsJSON_1', 'price_1', 'metroID_2', 'metroID_same', 'len_title_1', 'lon_2', 'itemID_1', 'locationID_1', 'regionID_same', 'regionID_2', 'parentCategoryID_2', 'categoryID_2', 'locationID_same', 'lon_same', 'len_description_2', 'regionID_1', 'lat_1', 'price_2', 'categoryID_same']


In [None]:
output.remove('itemID_1')
output.remove('itemID_2')

In [21]:
import pickle

In [22]:
whash = pickle.load( open('whash_haar.pkl', 'rb') )

In [23]:
type(whash)

dict

In [57]:
print(whash)

In [61]:
list(whash.items())

[(2, 8034173447716605444),
 (5, 11459270170091060735),
 (7, 18400019274820157450),
 (8, 704357531391),
 (9, 18337108463122644992),
 (10, 9186353938079423488),
 (12, 1447632467759242235),
 (13, 18230288159769886720),
 (14, 1096414436767112451),
 (15, 2827763845445877955),
 (16, 144877819258855),
 (17, 217018519141629823),
 (18, 18319636374037735544),
 (19, 17852480994431532799),
 (21, 2957538369155829519),
 (23, 18158513709786193919),
 (25, 9277416325663096575),
 (26, 11497552417792240955),
 (27, 281454566634240),
 (28, 1947528676405403647),
 (29, 216237099198511160),
 (30, 3528632086856204287),
 (31, 2738188575420973055),
 (33, 16267425998220551159),
 (34, 4646726337522580540),
 (36, 2323362894822931968),
 (37, 17938084571774390047),
 (38, 1827879201435262),
 (39, 9293387280536042752),
 (40, 18383584401478680960),
 (41, 8477776329719326),
 (43, 4341043554206644224),
 (44, 16411611818044035120),
 (48, 16623645725183),
 (49, 579847455034572799),
 (50, 18413812607413713919),
 (51, 1387109

In [25]:
print(len(whash.items()))

10824199


In [32]:
imageHashDF=pd.DataFrame([whash])

In [35]:
imageHashDF=pd.DataFrame([[key,value] for key,value in whash.items()],columns=["id","HashValue"])

In [36]:
imageHashDF.head()

Unnamed: 0,id,HashValue
0,2,8034173447716605444
1,5,11459270170091060735
2,7,18400019274820157450
3,8,704357531391
4,9,18337108463122644992


In [None]:
1064094

In [40]:
imageHashDF[imageHashDF.id==1064094]

Unnamed: 0,id,HashValue
785701,1064094,1081147450901131263
