### _author_ = https://www.kaggle.com/shubhamp05

In [1]:
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

In [2]:
### Function to merge item_info & item_pairs and drop irrelevant columns

In [2]:
def prepare_train_test(df_info, df_pairs):
    """Attach the item details of both items to every item pair.

    `df_pairs` is inner-joined with `df_info` twice: first on `itemID_1`,
    then on `itemID_2`. Pairs whose first or second item is absent from
    `df_info` are therefore dropped. Columns coming from the first join end
    up with the `_x` suffix and those from the second join with `_y`.

    The duplicated join keys and the image columns (`itemID_x`,
    `images_array_x`, `itemID_y`, `images_array_y`) are removed before the
    frame is returned. Neither input frame is modified.
    """
    with_first_item = df_pairs.merge(
        df_info, how='inner', left_on='itemID_1', right_on='itemID'
    )
    with_both_items = with_first_item.merge(
        df_info, how='inner', left_on='itemID_2', right_on='itemID'
    )
    unused_columns = ['itemID_x', 'images_array_x', 'itemID_y', 'images_array_y']
    return with_both_items.drop(unused_columns, axis=1)

In [30]:
### Reading item_info & item_pairs (.csv) from input location

In [3]:
# Every CSV is read from a directory that carries the same name as the file.
# Train inputs: item descriptions and labelled item pairs.
item_info_train = pd.read_csv(
    filepath_or_buffer='ItemInfo_train.csv/ItemInfo_train.csv'
)
item_pairs_train = pd.read_csv(
    filepath_or_buffer='ItemPairs_train.csv/ItemPairs_train.csv'
)
# Test inputs: same two tables for the pairs to be scored.
item_info_test = pd.read_csv(
    filepath_or_buffer='ItemInfo_test.csv/ItemInfo_test.csv'
)
item_pairs_test = pd.read_csv(
    filepath_or_buffer='ItemPairs_test.csv/ItemPairs_test.csv'
)

In [6]:
## Preparing train & test datasets

In [5]:
# One row per item pair, carrying the details of both items (_x / _y columns).
train = prepare_train_test(df_info=item_info_train, df_pairs=item_pairs_train)
test = prepare_train_test(df_info=item_info_test, df_pairs=item_pairs_test)

In [34]:
### Counting missing values per column in train
train.isnull().sum(axis=0)

itemID_1                  0
itemID_2                  0
isDuplicate               0
generationMethod          0
categoryID_x              0
title_x                   1
description_x            52
attrsJSON_x          105866
price_x              283894
locationID_x              0
metroID_x           1975769
lat_x                     0
lon_x                     0
categoryID_y              0
title_y                   0
description_y            61
attrsJSON_y          105866
price_y              284231
locationID_y              0
metroID_y           1976172
lat_y                     0
lon_y                     0
dtype: int64

In [6]:
## Missing values in train & test
# Sentinel per column: the string '-1' for text columns, the number -1 for
# numeric columns. `title_y` is included as well: it has no gaps in train,
# but if it is missing anywhere in test the later
# `.str.len().astype(int)` feature computation would raise on the NaN.
MISSING_FILL = {
    'title_x': str(-1),
    'title_y': str(-1),
    'description_x': str(-1),
    'description_y': str(-1),
    'attrsJSON_x': str(-1),
    'attrsJSON_y': str(-1),
    'price_x': -1,
    'price_y': -1,
    'metroID_x': -1,
    'metroID_y': -1,
}

# Fill on the DataFrame itself instead of `df.column.fillna(..., inplace=True)`.
# The chained form operates on a Series pulled out of the frame and, with
# pandas Copy-on-Write enabled, leaves the frame unchanged without any error.
for _frame in (train, test):
    _frame.fillna(value=MISSING_FILL, inplace=True)

In [11]:
## Feature extraction

In [7]:
## Feature extraction, applied identically to train and test
def _add_pair_features(frame):
    """Add the pair-comparison feature columns to `frame` in place.

    Three groups of columns are created, in this order:
    1. 0/1 flags telling whether the `_x` and `_y` values are equal,
    2. differences of the string lengths of the text columns,
    3. plain differences of the numeric columns.
    """
    # (new column, first-item column, second-item column)
    equality_flags = (
        ('title_', 'title_x', 'title_y'),
        ('category_', 'categoryID_x', 'categoryID_y'),
        ('description_', 'description_x', 'description_y'),
        ('json_', 'attrsJSON_x', 'attrsJSON_y'),
        ('price', 'price_x', 'price_y'),
        ('location_', 'locationID_x', 'locationID_y'),
        ('metro_', 'metroID_x', 'metroID_y'),
        ('lat_', 'lat_x', 'lat_y'),
        ('lon_', 'lon_x', 'lon_y'),
    )
    for new_column, first, second in equality_flags:
        frame[new_column] = np.equal(frame[first], frame[second]).astype(int)

    length_differences = (
        ('len_title_diff', 'title_x', 'title_y'),
        ('len_desc_diff', 'description_x', 'description_y'),
        ('len_json_diff', 'attrsJSON_x', 'attrsJSON_y'),
    )
    for new_column, first, second in length_differences:
        first_length = frame[first].str.len().astype(int)
        second_length = frame[second].str.len().astype(int)
        frame[new_column] = np.subtract(first_length, second_length)

    # Despite the `len_` prefix, the lat/lon columns hold value differences.
    value_differences = (
        ('price_diff', 'price_x', 'price_y'),
        ('len_lat_diff', 'lat_x', 'lat_y'),
        ('len_lon_diff', 'lon_x', 'lon_y'),
    )
    for new_column, first, second in value_differences:
        frame[new_column] = np.subtract(frame[first], frame[second])


for _frame in (train, test):
    _add_pair_features(_frame)

In [8]:
# Model input columns, grouped by how they are computed.
features = [
    # equality flags
    'title_',
    'category_',
    'description_',
    'json_',
    'price',
    'location_',
    'metro_',
    'lat_',
    'lon_',
    # differences
    'len_title_diff',
    'len_desc_diff',
    'len_json_diff',
    'price_diff',
    'len_lat_diff',
    'len_lon_diff',
]

In [41]:
## Applying Random forest classifier

In [12]:
# Split the target off: pop() returns the column and removes it from train.
y = train.pop('isDuplicate').values

In [43]:
# Random forest candidate; its fit call is commented out in the next cell.
rfc = RandomForestClassifier(
    max_features="log2",
    n_estimators=1000,
)

In [9]:
#rfc.fit(train[features],y)

In [10]:
from sklearn.ensemble import GradientBoostingClassifier

In [11]:
# Gradient boosting model with 1000 boosting stages; other settings default.
gbm = GradientBoostingClassifier(
    n_estimators=1000,
)

In [13]:
# Train on the engineered feature columns only.
gbm.fit(X=train[features], y=y)

GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=1000,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [15]:
# predict_proba returns one column per class; keep the second one.
# NOTE(review): presumably that is the isDuplicate == 1 class — confirm
# against gbm.classes_.
class_probabilities = gbm.predict_proba(test[features])
preds = class_probabilities[:, 1]