In [1]:
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

In [2]:
item_info_train = pd.read_csv('ItemInfo_train.csv/ItemInfo_train.csv')
item_pairs_train = pd.read_csv('ItemPairs_train.csv/ItemPairs_train.csv')
item_info_test = pd.read_csv('ItemInfo_test.csv/ItemInfo_test.csv')
item_pairs_test  = pd.read_csv('ItemPairs_test.csv/ItemPairs_test.csv')
category = pd.read_csv('Category.csv/Category.csv')
location  = pd.read_csv('Location.csv/Location.csv')

In [24]:
def prepare_train_test(df_info, df_pairs):
    merge_1 = pd.merge(left=df_pairs, right=df_info, left_on='itemID_1', right_on='itemID',how='inner')
    merge_2 = pd.merge(left=merge_1, right=df_info,left_on='itemID_2', right_on='itemID',how='inner')
    merge_2.drop(['itemID_x','images_array_x','itemID_y','images_array_y'],axis = 1,inplace=True)
    return(merge_2)

### Preparing train & test

In [25]:
train = prepare_train_test(item_info_train,item_pairs_train)
test = prepare_train_test(item_info_test,item_pairs_test)

### Handling missing values

In [5]:
def missing_values(df):
    df.title_x.dropna(inplace=True)
    df.description_x.fillna(value=pd.Series(test.title_x.values),inplace=True)
    df.description_y.fillna(value=pd.Series(test.title_y.values),inplace=True)
    df.drop(['images_array_x','attrsJSON_x','price_x','images_array_y','attrsJSON_y','price_y'],axis=1,inplace=True)
    return(df) 

In [6]:
def feature_add(df):
    df['title_len_x'] = [len(str(m)) for m in df.title_x.values]
    df['description_len_x'] = [len(str(m)) for m in df.description_x.values]
    df['title_len_y'] = [len(str(m)) for m in df.title_x.values]
    df['description_len_y'] = [len(str(m)) for m in df.description_y.values]
    df.drop(['title_x','title_y','description_x','description_y'],axis=1,inplace=True)
    return(df)

In [7]:
train = missing_values(train)
test = missing_values(test)

In [8]:
train = feature_add(train)
test = feature_add(test)

In [108]:
train.head(2)

Unnamed: 0,itemID_1,itemID_2,generationMethod,categoryID_x,locationID_x,lat_x,lon_x,categoryID_y,locationID_y,lat_y,lon_y,title_len_x,title_len_y,description_len_x,description_len_y
0,1,4112648,1,81,648140,64.686946,30.815924,81,648140,64.686946,30.815924,28,28,7,7
1,3,1991275,1,14,639040,55.678037,37.256548,14,639040,55.678037,37.256548,9,9,7,7


### Training random forest on train dataset

In [41]:
rfc=RandomForestClassifier(n_estimators=1000)

In [10]:
y = train.isDuplicate
train.drop('isDuplicate',axis=1,inplace=True)

In [11]:
rfc.fit(train,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [12]:
rfc.feature_importances_

array([ 0.07861288,  0.0864356 ,  0.18976509,  0.0588853 ,  0.02623861,
        0.04123123,  0.03872553,  0.05885876,  0.02667046,  0.04126645,
        0.03927381,  0.05634325,  0.09657453,  0.05672616,  0.10439235])

In [13]:
rfc.classes_

array([0, 1], dtype=int64)

In [139]:
preds = rfc.predict_proba(test)[:,1]

In [151]:
submit = pd.DataFrame({'id':item_pairs_test.id.values,'probability':preds})

In [152]:
submit.to_csv('submit_28th_02.csv',index=None, header=True)

### Generating some more features

In [14]:
item_info_train = pd.read_csv('ItemInfo_train.csv/ItemInfo_train.csv')
item_pairs_train = pd.read_csv('ItemPairs_train.csv/ItemPairs_train.csv')
item_info_test = pd.read_csv('ItemInfo_test.csv/ItemInfo_test.csv')
item_pairs_test  = pd.read_csv('ItemPairs_test.csv/ItemPairs_test.csv')
category = pd.read_csv('Category.csv/Category.csv')
location  = pd.read_csv('Location.csv/Location.csv')

In [15]:
train = prepare_train_test(item_info_train,item_pairs_train)
test = prepare_train_test(item_info_test,item_pairs_test)

In [17]:
train.drop(['generationMethod','images_array_x', 'attrsJSON_x','images_array_y', 'attrsJSON_y'], axis=1,inplace=True)

In [19]:
test.drop(['id', 'images_array_x','attrsJSON_x', 'images_array_y','attrsJSON_y'],axis=1,inplace=True)

In [23]:
train['latitude_same'] = (train.lat_x == train.lat_y).astype(int)
train['longitude_same'] = (train.lon_x == train.lon_y).astype(int)
train['location_same'] = (train.locationID_x == train.locationID_y).astype(int)
train['price_minus'] = (train.price_x - train.price_y)
train['title_same'] = (train.title_x == train.title_y).astype(int)
train['description_same'] = (train.description_x == train.description_y).astype(int)
test['latitude_same'] = (test.lat_x == test.lat_y).astype(int)
test['longitude_same'] = (test.lon_x == test.lon_y).astype(int)
test['location_same'] = (test.locationID_x == test.locationID_y).astype(int)
test['price_minus'] = (test.price_x - test.price_y)
test['title_same'] = (test.title_x == test.title_y).astype(int)
test['description_same'] = (test.description_x == test.description_y).astype(int)

test.fillna(value=-999,inplace=True)
train.fillna(value=-999,inplace=True)

Unnamed: 0,itemID_1,itemID_2,isDuplicate,categoryID_x,title_x,description_x,price_x,locationID_x,lat_x,lon_x,...,price_y,locationID_y,lat_y,lon_y,latitude_same,longitude_same,location_same,price_minus,title_same,description_same
0,1,4112648,1,81,Продам Камаз 6520,Продам Камаз 6520 20 тонн,300000.0,648140,64.686946,30.815924,...,300000.0,648140,64.686946,30.815924,1,1,1,0.0,1,1
1,3,1991275,1,14,Yamaha r6,Весь в тюнинге.,300000.0,639040,55.678037,37.256548,...,330000.0,639040,55.678037,37.256548,1,1,1,-30000.0,0,0
2,4,1223296,0,84,iPhone 3gs 8gb,"Телефон в хорошем состоянии, трещин и сколов н...",3500.0,640650,56.239398,43.460458,...,3500.0,640650,56.239398,43.460458,1,1,1,0.0,0,0
3,7,1058851,1,84,Xiaomi Mi4 3гб RAM + 16гб ROM белый,"Отличный подарок на новый год от ""китайской ap...",13500.0,662210,55.777170,37.586194,...,13500.0,662210,56.135459,47.235484,0,0,1,0.0,0,0
4,8,2161930,1,39,Лыжные ботинки,"Лыжные ботинки в хорошем состоянии, 34 размер",500.0,624360,55.777170,37.586194,...,600.0,624360,55.777170,37.586194,1,1,1,-100.0,0,0
5,9,694103,1,39,Сноуборд ботинки Nitro Team 10 us,"сноубордические ботинки Nitro Team\nразмер 42,...",7000.0,644200,58.004785,56.237654,...,7000.0,644200,58.004785,56.237654,1,1,1,0.0,0,0
6,12,5637025,0,9,"LADA Priora, 2015",Машина новая пробег реальный. Не битая не краш...,445000.0,631060,44.219841,42.058825,...,450000.0,631060,44.219841,42.058825,1,1,1,-5000.0,1,0
7,401724,5637025,1,9,"LADA Priora, 2015","Машина как с завода .( не битая , не крашеная,...",470000.0,631060,44.219841,42.058825,...,450000.0,631060,44.219841,42.058825,1,1,1,20000.0,1,0
8,5279740,5637025,1,9,"LADA Priora, 2015",Машина в идеальном состояний. Не битая не краш...,455000.0,631060,44.219841,42.058825,...,450000.0,631060,44.219841,42.058825,1,1,1,5000.0,1,0
9,12,5279740,0,9,"LADA Priora, 2015",Машина новая пробег реальный. Не битая не краш...,445000.0,631060,44.219841,42.058825,...,455000.0,631060,44.219841,42.058825,1,1,1,-10000.0,1,0


In [24]:
features = ['latitude_same', 'longitude_same', 'location_same', 'price_minus', 'title_same', 'description_same']

In [42]:
rfc.fit(train[features],train.isDuplicate.values)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [43]:
rfc.feature_importances_

array([ 0.02424023,  0.01915796,  0.01352403,  0.62765799,  0.29355183,
        0.02186796])

In [34]:
rfc.classes_

array([0, 1], dtype=int64)

In [44]:
preds_100=rfc.predict_proba(test[features])[:,1]

In [46]:
submit = pd.DataFrame({'id':item_pairs_test.id.values,'probability':preds_100})

In [47]:
submit.to_csv('submit_28_04.csv',index=False,header=True)

### Applying GBM 

In [48]:
from sklearn.ensemble import GradientBoostingClassifier

In [49]:
gbm = GradientBoostingClassifier(n_estimators=1000)

In [None]:
gbm.fit(train[features], train.isDuplicate.values)

### Creating new feature from json column using key-value comparison

In [2]:
item_info_train = pd.read_csv('ItemInfo_train.csv/ItemInfo_train.csv')
item_pairs_train = pd.read_csv('ItemPairs_train.csv/ItemPairs_train.csv')
item_info_test = pd.read_csv('ItemInfo_test.csv/ItemInfo_test.csv')
item_pairs_test  = pd.read_csv('ItemPairs_test.csv/ItemPairs_test.csv')
category = pd.read_csv('Category.csv/Category.csv')
location  = pd.read_csv('Location.csv/Location.csv')

In [5]:
train = prepare_train_test(item_info_train,item_pairs_train)
test = prepare_train_test(item_info_test,item_pairs_test)