# Entity Resolution Project

@author: ty2326, lz2459, and ql2257

### Import Packages

In [None]:
%%bash
pip install editdistance
pip install pygeocoder


In [None]:
import re
import json
import numpy as np
import pandas as pd
import editdistance
import random
from collections import Counter
from pygeocoder import Geocoder
from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier

### Read files into python

In [None]:
# Directory holding the raw venue dumps, and the file name of each data set.
PATH = "Prakhar/er-assignment/fs/Instabase%20Drive/files/datasets/"
FILES = dict(
    foursquare_test="foursquare_test_hard.json",
    locu_test="locu_test_hard.json",
    foursquare_train="foursquare_train_hard.json",
    locu_train="locu_train_hard.json",
)


def _read_raw(dataset):
    """Load the raw JSON file registered under ``dataset`` in FILES."""
    # `ib` is not defined in this notebook; presumably it is the file API
    # injected by the hosting environment -- TODO confirm.
    return pd.read_json(ib.open(PATH + FILES[dataset]))


foursquare_test = _read_raw("foursquare_test")
locu_test = _read_raw("locu_test")
foursquare_train = _read_raw("foursquare_train")
locu_train = _read_raw("locu_train")

## Preprocessing


After briefly inspecting the training data, we found the issues described in the sections below.

### Address and Zipcode Data Cleaning and Formalization Using Google API

*  The street addresses of the venues are not in a consistent format; for example, some addresses write "West" as "W." while others spell out "West". Therefore, we use the 'Google Maps Geocoding API' to reformat the street addresses in all four data sets, and fill in the postal code from the geocoding result whenever it is missing.

In [None]:
# NOTE(review): the Google API keys below are committed in plain text. They
# should be rotated and loaded from configuration rather than kept in the
# notebook.
my_geocoder = Geocoder(api_key='AIzaSyCh5YB2E2dvUn57l5Xc2zZ0mjnul8VK4uw')
# other google api key: AIzaSyAocsDNe7TnZppttVeMcDtdx3fTBOL0Z-A


def _standardize_addresses(frame, geocoder):
    """Rewrite the street addresses of ``frame`` in the geocoder's format.

    For every row, "<street_address>, NY <postal_code>" is geocoded. The
    street address is replaced by the first comma-separated part of the
    formatted address, cut at the first '#' and stripped of spaces. A postal
    code is only written back when the stored one is the empty string.
    The frame is modified in place; geocoding errors propagate to the caller.
    """
    for i in frame.index:
        postal_code = frame.loc[i, 'postal_code']
        street = frame.loc[i, 'street_address'] + ', NY ' + postal_code
        result = geocoder.geocode(street)
        frame.loc[i, 'street_address'] = (
            result.formatted_address.split(',')[0].split('#')[0].strip(' '))
        if postal_code == '':
            frame.loc[i, 'postal_code'] = result.postal_code


# Same processing order as before: locu train/test, then foursquare train/test.
for venues in (locu_train, locu_test, foursquare_train, foursquare_test):
    _standardize_addresses(venues, my_geocoder)

### Fill Missing Values for Postal Code and Geo Data

*   Some venues do not contain a street address and postal code. For convenience in later analysis, we take the 10 most common postal-code values in each data set, drop the most frequent one, and compute the relative frequency of each of the remaining codes.
*  One pair of longitude and latitude is missing in the training data sets. We decided to give it a fake pair of longitude and latitude, which is about the average of all longitudes and latitudes respectively.

In [None]:
def _postal_code_stats(postal_codes):
    """Summarise the most frequent postal codes of one column.

    Returns a 4-tuple ``(ranked, counts, shares, codes)``:
    ``ranked`` is ``Counter(...).most_common(10)``; the other three lists skip
    its first (most frequent) entry and hold the remaining counts, their
    shares of the remaining total, and the corresponding codes.
    NOTE(review): skipping the first entry presumably removes the "missing"
    placeholder -- confirm it is the most common value in every data set.
    """
    ranked = Counter(postal_codes).most_common(10)
    counts = [count for _, count in ranked][1:]
    total = float(sum(counts))
    shares = [count / total for count in counts]
    codes = [code for code, _ in ranked][1:]
    return ranked, counts, shares, codes


code_counter_fs, code_cn_fs, code_pct_fs, pos_code_fs = _postal_code_stats(
    foursquare_train['postal_code'])

# fillna returns a new Series, so the result has to be assigned back; the
# original calls discarded it and left the gaps unfilled. The latitude
# fill value is positive (New York is at about 40.73 N, 73.96 W).
locu_train['longitude'] = locu_train['longitude'].fillna(-73.9600)
locu_train['latitude'] = locu_train['latitude'].fillna(40.7300)

code_counter_lc, code_cn_lc, code_pct_lc, pos_code_lc = _postal_code_stats(
    locu_train['postal_code'])

code_counter_fs_ts, code_cn_fs_ts, code_pct_fs_ts, pos_code_fs_ts = (
    _postal_code_stats(foursquare_test['postal_code']))

code_counter_lc_ts, code_cn_lc_ts, code_pct_lc_ts, pos_code_lc_ts = (
    _postal_code_stats(locu_test['postal_code']))


### Website Data Cleaning

Then, we normalize all websites by removing "http://", "https://" and "www." and by dropping the trailing ".com" in all four data sets.

In [None]:
def _clean_website(url):
    """Reduce a website URL to its bare site name.

    Removes every "http://", "www." and "https://" occurrence, then drops a
    single trailing ".com/" or ".com" suffix if present.
    """
    for token in ("http://", "www.", "https://"):
        url = url.replace(token, "")
    # str.rstrip(".com") strips any trailing run of the characters '.', 'c',
    # 'o' and 'm' (e.g. "coco.com" -> ""), so the suffix is removed explicitly.
    for suffix in (".com/", ".com"):
        if url.endswith(suffix):
            return url[:-len(suffix)]
    return url


# Clean the website column of all four data sets in place; empty strings
# (no website) are left untouched.
for frame in (foursquare_train, locu_train, foursquare_test, locu_test):
    for i in frame.index:
        website = frame.loc[i, 'website']
        if website != "":
            frame.loc[i, 'website'] = _clean_website(website)

### Phone Number Data Formalization and Filling Missing Values

We then check through all venues in all data sets, and,
*   Besides the street addresses, the phone numbers of the venues are not in the same format either. Therefore, we use a regex to convert them into the same format.
* For those with a phone number in an irregular format, we use the regex above to convert it into the regular form.
* For those without a phone number, we give each of them a fake phone number starting with "212" (the most common prefix in both training data sets) followed by a 7-digit random number.
* For those without a postal code, we give each of them a fake postal code which is drawn, with the probabilities we calculated above, from the most common postal codes of each data set independently.

In [None]:
# Matches phone numbers written as "(212) 555-1234" and captures the three
# digit groups.
re_parser = re.compile(r'\(([^ ]*)\) ([^ ]*)\-([^ ]*)')


def _fill_phone_and_postal(frame, codes, weights, reformat):
    """Fill missing phones / postal codes of ``frame`` in place.

    frame    -- venue data set with 'phone' and 'postal_code' columns
    codes    -- candidate postal codes for rows whose postal code is None
    weights  -- sampling probability of each entry in ``codes``
    reformat -- when true, phones matching ``re_parser`` are rewritten as the
                concatenation of their captured groups

    A phone that is None is replaced by '212' followed by a random integer
    from random.randrange(1000000, 9999999).
    """
    for i in frame.index:
        phone = frame.loc[i, 'phone']
        postal_code = frame.loc[i, 'postal_code']

        if phone is None:
            frame.loc[i, 'phone'] = '212' + str(random.randrange(1000000, 9999999))
        elif reformat:
            phone_parser = re_parser.match(phone)
            if phone_parser is not None:
                frame.loc[i, 'phone'] = ''.join(phone_parser.groups())

        if postal_code is None:
            frame.loc[i, 'postal_code'] = np.random.choice(codes, 1, p=weights)[0]


# Only the foursquare phones are reformatted, as in the original loops.
_fill_phone_and_postal(foursquare_test, pos_code_fs_ts, code_pct_fs_ts, True)
_fill_phone_and_postal(locu_test, pos_code_lc_ts, code_pct_lc_ts, False)
_fill_phone_and_postal(foursquare_train, pos_code_fs, code_pct_fs, True)
_fill_phone_and_postal(locu_train, pos_code_lc, code_pct_lc, False)


### Save and Export Cleaned Datasets

Finally, we export all preprocessed data sets into four csv files "locu_train.csv", "locu_test.csv", "foursquare_train.csv" and "foursquare_test.csv" for later use. (It will take a while to generate all four csv files, you can use the files that we generated to check our later work.)

In [None]:
username = "Qitong"
repo = "entity-resolution"

# Output file name -> cleaned data set, written in the original order.
cleaned_outputs = [
    ('locu_train.csv', locu_train),
    ('locu_test.csv', locu_test),
    ('foursquare_train.csv', foursquare_train),
    ('foursquare_test.csv', foursquare_test),
]

for csv_name, cleaned_frame in cleaned_outputs:
    out_file = ib.open('/{0}/{1}/fs/Instabase%20Drive/final/{2}'.format(username, repo, csv_name))
    try:
        cleaned_frame.to_csv(out_file, encoding='utf-8')
    finally:
        # Close the handle even when writing fails, so it is never leaked.
        out_file.close()









## Feature Creation and Model Input Selection

### Read the Cleaned Datasets

In [None]:
# Directory and file names of the cleaned data sets and the labelled matches.
PATH = "Qitong/entity-resolution/fs/Instabase%20Drive/final/"
FILES = dict(
    foursquare_train="foursquare_train.csv",
    foursquare_test="foursquare_test.csv",
    locu_train="locu_train.csv",
    locu_test="locu_test.csv",
    matches="matches_train_hard.csv",
)


def _read_cleaned(dataset):
    """Load the CSV file registered under ``dataset`` in FILES."""
    return pd.read_csv(ib.open(PATH + FILES[dataset]))


locu_train = _read_cleaned('locu_train')
locu_test = _read_cleaned('locu_test')
foursquare_train = _read_cleaned('foursquare_train')
foursquare_test = _read_cleaned('foursquare_test')

matches = _read_cleaned('matches')

### Get the Matched id Pairs

In order to locate each matched pair in the training data sets more easily, we map each id in the matched pairs to the index of its venue in the corresponding training data set.

In [None]:
# Look up each labelled id in the matching training set's 'id' column, then
# pair the foursquare result with the locu result for every labelled match.
fs_match_positions = pd.match(matches['foursquare_id'], foursquare_train['id'])
lc_match_positions = pd.match(matches['locu_id'], locu_train['id'])
matched_pairs = zip(list(fs_match_positions), list(lc_match_positions))

Since the existing features are hard to compare with each other, we decided to use them to create some new features that make the classification problem simpler. 
Additionally, because of the large amount of data, we do not want our model to learn from every single pair of samples and to test every pair later. We therefore select a relatively small number of pairs to train the model, and we also ignore pairs that are very unlikely to be a match when testing.

### Generate Testing Dataset

We first create the test data frame that we will feed into our model later. 
For each pair of venues from locu_test and foursquare_test, we create the following seven features. They are, 
1. the editdistance between names
2. the editdistance between the strings of phone numbers
3. the editdistance between the strings of street addresses (all letters are in lower case) 
4. the editdistance between the strings of websites if both of them exist (if not, we set the distance to 30)
5. the absolute value of the difference between longitudes
6. the absolute value of the difference between latitudes
7. the absolute value of the difference between postal codes

After creating the seven features each time, we start to decide whether we want to put this pair into our consideration or not.

Here, we keep a pair as a possible match if at least one of the following conditions is satisfied (a pair that satisfies none of them is discarded). They are,
* the editdistance between names is not greater than 10
* the editdistance between phone numbers is not greater than 3
* the absolute value of difference between longitudes is not greater than 0.0001 and that between latitudes is not greater than 0.001
* the editdistance between websites is not greater than 10
* the editdistance between street addresses is not greater than 8

For every pair that is kept, we add the list of its seven features along with the index pair of the two venues into our result_test data frame for the later classification task.

In [None]:
# Build the candidate-pair feature table for the test sets and save it.
# Rows are collected in a plain list and turned into a DataFrame once;
# enlarging the frame one row at a time copies it on every insertion.
test_columns = ['name_dis', 'phone_dis', 'geo_log_dis', 'geo_lat_dis', 'poscode_dis', 'add_dis', 'web_dis', 'index_pair']
test_rows = []

# Each locu row is fetched once instead of once per foursquare venue.
locu_rows = [(j, locu_test.loc[j]) for j in locu_test.index]

for i in foursquare_test.index:
    # Everything that depends only on the foursquare venue is computed here,
    # outside the inner loop.
    cur_fs = foursquare_test.loc[i]
    fs_phone = str(cur_fs['phone'])
    fs_address = str(cur_fs['street_address']).lower()

    for j, cur_lc in locu_rows:
        name_dis = editdistance.eval(cur_fs['name'], cur_lc['name'])
        phone_dis = editdistance.eval(fs_phone, str(cur_lc['phone']))
        add_dis = editdistance.eval(fs_address, str(cur_lc['street_address']).lower())

        # NOTE(review): after the CSV round trip a missing website is
        # presumably read back as NaN, which is truthy, so two missing
        # websites would compare as 'nan' vs 'nan' (distance 0) instead of
        # getting the default 30. Left unchanged because the classifier was
        # fit on features built the same way -- confirm before altering.
        if cur_fs['website'] and cur_lc['website']:
            web_dis = editdistance.eval(str(cur_fs['website']), str(cur_lc['website']))
        else:
            web_dis = 30

        geo_log_dis = abs(cur_lc['longitude'] - cur_fs['longitude'])
        geo_lat_dis = abs(cur_lc['latitude'] - cur_fs['latitude'])

        poscode_dis = abs(int(cur_fs['postal_code']) - int(cur_lc['postal_code']))

        #to be tuned
        # Keep the pair only if at least one feature suggests a possible match.
        if name_dis <= 10 or phone_dis <= 3 or (geo_log_dis <= 0.0001 and geo_lat_dis <= 0.001) or web_dis <= 10 or add_dis <= 8:
            index_pair = (i, j)
            test_rows.append([name_dis, phone_dis, geo_log_dis, geo_lat_dis, poscode_dis, add_dis, web_dis, index_pair])

result_test = pd.DataFrame(test_rows, columns=test_columns)

username = "Qitong"
repo = "entity-resolution"

f2 = ib.open('/{0}/{1}/fs/Instabase%20Drive/final/test1.csv'.format(username,repo))
try:
    result_test.to_csv(f2)
finally:
    # Close the handle even if writing fails.
    f2.close()

### Generate Training Dataset

First, we check if this pair is a match or not by checking the index pair with the "matched_pairs" list we made previously. 
* If it is a match, we add the list of all seven features along with a label "1" into the result_train data frame.
* If it is not a match, but the editdistance of names is not greater than 3 and the editdistance of phone number is not greater than 2, we also add the features and a label "1" into the result_train data frame.

If neither of the above two conditions is satisfied, we then check the following three conditions,
* The editdistance between names is not greater than 7
* The editdistance between phone numbers is not greater than 3
* The absolute value of difference between longitudes is not greater than 0.0001 and that between latitudes is not greater than 0.001

If the pair satisfies any one of the three conditions, we add its seven features along with a label "0" into the result_train data frame, to indicate that even they are similar enough in some features, they are not a match.

### PLEASE DO NOT RUN THE FOLLOWING CODE TO GENERATE ANOTHER TRAINING DATASET!!

### Here we comment out the code below because we tuned the parameters of the RandomForest model on the specific training dataset used in the next step. We hope this is not considered an unreasonable request. If you insist, please contact us.

(In the step of data cleaning, we semi-randomly generated zip and phone numbers to replace NAs in the original dataset)

In [None]:
# result_train = pd.DataFrame(columns=['name_dis', 'phone_dis', 'geo_log_dis', 'geo_lat_dis', 'poscode_dis', 'add_dis', 'web_dis', 'label'])

# for i in foursquare_train.index:
#     for j in locu_train.index:
#         cur_fs = foursquare_train.ix[i]
#         cur_lc = locu_train.ix[j]
#         name_dis = editdistance.eval(cur_fs['name'], cur_lc['name'])
#         phone_dis = editdistance.eval(str(cur_fs['phone']), str(cur_lc['phone']))
#         add_dis = editdistance.eval(str(cur_fs['street_address']).lower(), str(cur_lc['street_address']).lower())

#         if cur_fs['website'] and cur_lc['website']:
#             web_dis = editdistance.eval(str(cur_fs['website']), str(cur_lc['website']))
#         else:
#             web_dis = 30

#         geo_log_dis = abs(cur_lc['longitude'] - cur_fs['longitude'])
#         geo_lat_dis = abs(cur_lc['latitude'] - cur_fs['latitude'])

#         poscode_dis = abs(int(cur_fs['postal_code']) - int(cur_lc['postal_code']))

#         #to be tuned 
#         if (i, j) in matched_pairs or (name_dis<=3 and phone_dis<=2):
#             feature_list = [name_dis, phone_dis, geo_log_dis, geo_lat_dis, poscode_dis, add_dis, web_dis, 1]
#             result_train.loc[len(result_train.index)] = feature_list
#         elif name_dis <= 7 or phone_dis <= 3 or (geo_log_dis <= 0.0001 and geo_lat_dis<= 0.001):
#             feature_list = [name_dis, phone_dis, geo_log_dis, geo_lat_dis, poscode_dis, add_dis, web_dis, 0]
#             result_train.loc[len(result_train.index)] = feature_list

            
# username = "Qitong"
# repo = "entity-resolution"

# f1 = ib.open('/{0}/{1}/fs/Instabase%20Drive/final/result_should_not_be_created.csv'.format(username,repo))
# result_train.to_csv(f1)
# f1.close()


## Run RandomForest Classifier

### Read Final Training and Testing Datasets 

In [None]:
# Directory and file names of the feature tables and the cleaned test sets.
PATH = "Qitong/entity-resolution/fs/Instabase%20Drive/final/"
FILES = dict(
    result="result.csv",
    test="test.csv",
    foursquare_test="foursquare_test.csv",
    locu_test="locu_test.csv",
)


def _read_final(dataset):
    """Load the CSV file registered under ``dataset`` in FILES."""
    return pd.read_csv(ib.open(PATH + FILES[dataset]))


train = _read_final('result')
test = _read_final('test')
locu_test = _read_final('locu_test')
foursquare_test = _read_final('foursquare_test')

We found that there is one venue in the test data that has NA in the longitude difference and latitude difference; therefore we fill them with a fixed pair of values to avoid errors during prediction. Training rows that contain any NA are dropped.

In [None]:
# Replacement values for missing geo differences in the test features.
geo_fill_values = {'geo_log_dis': 0.003, 'geo_lat_dis': 0.02}
test.fillna(geo_fill_values, inplace=True)

# Training rows with any missing value are discarded instead of filled.
train = train.dropna(axis=0, how='any')

### Feature Selection

In [None]:
# Columns 1..7 of the training table are used as model inputs (column 0 is
# skipped); the target is the 'label' column.
feature_positions = list(range(1, 8))
features = train.columns[feature_positions]
y = train['label']

### Run Cross Validation to Find the Best Parameters for the Model Based on THE SPECIFIC TRAINING DATASET!

##### Since it takes a long time, please feel free to skip this part when checking our work

In [None]:
# X_train, X_test, y_train, y_test = cross_validation.train_test_split(
#     train[features], y, test_size=0.4, random_state=0)

# tuned_parameters = [{'n_estimators': [40, 80, 110, 130], 'max_depth': [6, 8, 10, 15],
#                      'max_features': [4,6,7]}]


# scores = ['precision', 'recall']

# for score in scores:
#     print("# Tuning hyper-parameters for %s" % score)
#     print()

#     clf = GridSearchCV(RandomForestClassifier(), tuned_parameters, cv=5,
#                        scoring='%s_weighted' % score)
#     clf.fit(X_train, y_train)

#     print("Best parameters set found on development set:")
#     print()
#     print(clf.best_params_)
#     print()
#     print("Grid scores on development set:")
#     print()
#     for params, mean_score, scores in clf.grid_scores_:
#         print("%0.3f (+/-%0.03f) for %r"
#               % (mean_score, scores.std() * 2, params))

#     print("Detailed classification report:")
#     print()
#     print("The model is trained on the full development set.")
#     print("The scores are computed on the full evaluation set.")
#     print()
#     y_true, y_pred = y_test, clf.predict(X_test)
#     print(classification_report(y_true, y_pred))

### Fitting the Model and Find the Most Important Feature

In [None]:
# Fit the random forest on the selected training features.
forest_params = dict(n_estimators=130, max_depth=7)
clf = RandomForestClassifier(**forest_params)
clf.fit(train[features], y)

# One importance value per entry of `features`, in the same order.
print(clf.feature_importances_)


## Prediction Using Testing Datasets

In order to accomplish our pair-matching task, we first need to find the predicted class probabilities of each test input and, based on these probabilities, decide which pairs are possibly labeled as "1" (i.e. a match) and which are definitely not. Then, among the pairs that could be an actual match, if some venues appear in more than one candidate pair, we need to decide which one to pair them with.

### Getting Predicted Class Probabilities

In [None]:
# Class probabilities for every candidate pair; later cells read column 1
# as the probability that the pair is a match.
test_inputs = test[features]
preds_test_prob = clf.predict_proba(test_inputs)

### Finding Match Pairs

#### Find the Match Pairs with High Predicted Probability

First of all, we initialize the maximum probability to be 0.78, the predicted index to be None, and the index of the first venue in the pair we are checking to be 0. Then, we iterate through all possible pairs to see if the class probability is no less than the maximum probability or not.
* If yes, we then update the maximum probability by the class probability, and update the predicted index by the index  of the second venue in its original data set.
* If not, we pass to the next pair directly.

When the index of the first venue in this pair does not equal to the index we initialized, all (0, x) pairs are checked. At this time, the value saved under "pred_ind" is the index of the venue that is the most possible match with the first venue in the first data set.

Now, we can add this pair of indices into our prediction list, and record both indices as 'paired' by adding them into paired_index lists separately. 
Then, we move on to the next index, reset the predicted index variable and the maximum probability variable to their initial values, and finally, continue our iteration.


In [None]:
# For every foursquare index, keep the locu index with the highest match
# probability, provided that probability is at least 0.78.
# Assumes the rows of `test` that share a foursquare index are contiguous,
# which is how the candidate table is generated.
ind = None          # foursquare index of the group currently being scanned
max_prob = 0.78     # best probability seen in the group (starts at threshold)
pred_ind = None     # locu index achieving max_prob, if any
pred_list = []
paired_id0 = []
paired_id1 = []

for i in xrange(len(preds_test_prob)):
    index_list = test.ix[i]['index_pair'].strip('(').strip(')').split(', ')
    fs_ind = int(index_list[0])

    if fs_ind != ind:
        # A new group starts: store the winner of the previous one. The
        # group id is taken from the row itself rather than incremented, so
        # foursquare venues without any candidate row cannot shift the
        # bookkeeping.
        if pred_ind is not None:
            pred_list.append((ind, pred_ind))
            paired_id0.append(ind)
            paired_id1.append(pred_ind)
        ind = fs_ind
        max_prob = 0.78
        pred_ind = None

    # ">=" means that among equal probabilities the later row wins.
    if preds_test_prob[i][1] >= max_prob:
        max_prob = preds_test_prob[i][1]
        pred_ind = int(index_list[1])

# Store the winner of the last group, which no later row would trigger.
if pred_ind is not None:
    pred_list.append((ind, pred_ind))
    paired_id0.append(ind)
    paired_id1.append(pred_ind)

After the iteration process, we are pretty sure that the pairs stored in "pred_list" are the most possible matches in both data sets, therefore, we can now add their ids into our final match list.

In [None]:
# Translate each (foursquare index, locu index) pair into [locu id, foursquare id].
matches_test = [
    [locu_test.ix[lc_pos]['id'], foursquare_test.ix[fs_pos]['id']]
    for fs_pos, lc_pos in pred_list
]

In order to avoid the situation that one id appears more than once in the final match list, we store all found ids for later comparison.

In [None]:
# Split the [locu id, foursquare id] pairs into one list per data set.
match_lc_ids = []
match_fs_ids = []
for lc_id, fs_id in matches_test:
    match_lc_ids.append(lc_id)
    match_fs_ids.append(fs_id)

#### Find the Match Pairs with Medium Predicted Probability


So far we have found all pairs that our classifier predicts as a match with a relatively high probability. Now, we have to deal with pairs that do not have high predicted class probabilities and find out whether there are any matches among them.

To do this, we first construct a 400x400 matrix of zeroes, and then fill in the predicted probability of every pair whose two indices have both not been paired already.

In [None]:
# Parse the "(fs, lc)" index pair of every candidate row once.
index_pairs = []
for i in xrange(len(preds_test_prob)):
    index_list = test.ix[i]['index_pair'].strip('(').strip(')').split(', ')
    index_pairs.append((int(index_list[0]), int(index_list[1])))

# Size the matrix from the data instead of a hard-coded 400x400, so other
# data set sizes neither overflow it nor waste space. Rows are foursquare
# indices, columns are locu indices.
n_fs = max([len(foursquare_test)] + [fs_ind + 1 for fs_ind, _ in index_pairs])
n_lc = max([len(locu_test)] + [lc_ind + 1 for _, lc_ind in index_pairs])
preds_prob_matrix = np.zeros((n_fs, n_lc))

# Sets give constant-time membership tests for the already paired indices.
already_paired_fs = set(paired_id0)
already_paired_lc = set(paired_id1)

for i, (fs_ind, lc_ind) in enumerate(index_pairs):
    # Only pairs whose two venues are both still unpaired get a probability;
    # every other cell stays 0.
    if fs_ind not in already_paired_fs and lc_ind not in already_paired_lc:
        preds_prob_matrix[fs_ind, lc_ind] = preds_test_prob[i][1]

With the probability matrix, we first iterate through rows (each row represents a venue in "foursquare"), and for each row, we add its index as a key to the dictionary "top_5_row". After finding the probabilities in this row that lie strictly between 0.5 and 0.78, we store their column indices, in order of decreasing probability, in the dictionary under the key we just added. 

In [None]:
# For every row of the probability matrix, list the column indices whose
# probability lies strictly between 0.5 and 0.78, best first.
# The former shortcut `sorted_index[0] == 0` compared the *position* of the
# row maximum with 0, so a row whose best candidate sat in column 0 was
# skipped. It is dropped: rows without a qualifying value simply produce an
# empty list.
top_5_row = {}
for index in xrange(preds_prob_matrix.shape[0]):
    arr = preds_prob_matrix[index, :]
    sorted_index = np.argsort(arr)[::-1]
    top_5_row[index] = [j for j in sorted_index if 0.5 < arr[j] < 0.78]

After processing through all rows of the matrix, we do the same thing for each of its columns. 

In [None]:
# For every column of the probability matrix, list the row indices whose
# probability lies strictly between 0.5 and 0.78, best first.
# The former shortcut `sorted_index[0] == 0` compared the *position* of the
# column maximum with 0, so a column whose best candidate sat in row 0 was
# skipped. It is dropped: columns without a qualifying value simply produce
# an empty list.
top_5_col = {}
for index in xrange(preds_prob_matrix.shape[1]):
    arr = preds_prob_matrix[:, index]
    sorted_index = np.argsort(arr)[::-1]
    top_5_col[index] = [i for i in sorted_index if 0.5 < arr[i] < 0.78]

Now, we have two dictionaries of probabilities for two test data sets respectively.

We then iterate through the "top_5_row" dictionary. For each key/value pair, for each index in the value-list, we look up its corresponding index along with its value-list in the "top_5_col" dictionary, if the index is also in the value-list, we say this pair of indices is a possible match and store their corresponding ids as "match_id". 

If this pair of ids is not in the final match list, to avoid repetition, we check if the second index has already appeared in a possible pair or not. Here, if it is the first time we see this index, we add the id pair into the check dictionary, "matches_sec", under the indices pair key. If it has already appeared before, we compare its predicted class probability with the existing pair's, and update the pair if its probability is higher.

In [None]:
# Collect mutually listed (row i, column j) pairs, at most one pair per
# column j: when a column is claimed twice, the row with the strictly higher
# probability keeps it.
matches_sec = {}
row_holding_col = {}  # column j -> row i of the pair stored in matches_sec

for i, top_vals in top_5_row.items():
    for j in top_vals:
        # The pair must appear in both directions.
        if i not in top_5_col[j]:
            continue

        match_id = [locu_test.ix[j]['id'], foursquare_test.ix[i]['id']]
        if match_id in matches_test:
            continue

        if j not in row_holding_col:
            matches_sec[(i, j)] = match_id
            row_holding_col[j] = i
            continue

        pre_i = row_holding_col[j]
        if preds_prob_matrix[pre_i, j] < preds_prob_matrix[i, j]:
            del matches_sec[(pre_i, j)]
            matches_sec[(i, j)] = match_id
            row_holding_col[j] = i

After doing all processes above, we have a dictionary with pairs of indices as keys and their corresponding pairs of ids as values.

Before finally adding the pair into our final match list, we check again to make sure that both indices in the pair are not in our match list already. After checking, we add those satisfied pairs into our final match list.

In [None]:
# Add the medium-probability matches, best first, while making sure that no
# locu id and no foursquare id ends up in the final list twice. The ids used
# so far are tracked in sets that are updated after every append; checking
# only the unchanged high-confidence lists let one id be added repeatedly.
used_lc_ids = set(match_lc_ids)
used_fs_ids = set(match_fs_ids)

ranked_candidates = sorted(
    matches_sec.items(),
    key=lambda item: preds_prob_matrix[item[0][0], item[0][1]],
    reverse=True)

for _, match in ranked_candidates:
    if match[0] in used_lc_ids or match[1] in used_fs_ids:
        continue
    matches_test.append(match)
    used_lc_ids.add(match[0])
    used_fs_ids.add(match[1])

### Check the Final Missing Pairs

#### Can skip this step

In [None]:
# Last pass: add still-unmatched pairs whose raw features are very close.
match_lc_ids_f = [match[0] for match in matches_test]
match_fs_ids_f = [match[1] for match in matches_test]
# Sets mirror the two lists for fast membership tests; both are updated on
# every append so that an id added in this pass cannot be added again.
seen_lc_ids = set(match_lc_ids_f)
seen_fs_ids = set(match_fs_ids_f)

add_len = 0  # number of pairs added by this pass

for i in xrange(len(test)):
    row = test.ix[i]
    cur_name_dis = row['name_dis']
    cur_ph_dis = row['phone_dis']
    cur_web_dis = row['web_dis']
    cur_geo_dis = row['geo_log_dis'] + row['geo_lat_dis']
    # A website distance of exactly 0 is excluded by the rule below.
    if (cur_name_dis <= 4 and cur_geo_dis <= 0.0005) or cur_ph_dis <= 1 or (cur_web_dis != 0 and cur_web_dis <= 2):
        index_list = row['index_pair'].strip('(').strip(')').split(', ')
        fs_ind = int(index_list[0])
        lc_ind = int(index_list[1])
        add_id_pair = [locu_test.ix[lc_ind]['id'], foursquare_test.ix[fs_ind]['id']]
        if add_id_pair[0] not in seen_lc_ids and add_id_pair[1] not in seen_fs_ids:
            add_len += 1
            matches_test.append(add_id_pair)
            match_lc_ids_f.append(add_id_pair[0])
            match_fs_ids_f.append(add_id_pair[1])
            seen_lc_ids.add(add_id_pair[0])
            seen_fs_ids.add(add_id_pair[1])

### Output Matches into csv File

In [None]:
username = "Qitong"
repo = "entity-resolution"

# Write one "locu_id,foursquare_id" line per final match.
f = ib.open('/{0}/{1}/fs/Instabase%20Drive/final/matches_test.csv'.format(username,repo))
try:
    # NOTE(review): the header has a space after the comma, so a CSV reader
    # would name the second column ' foursquare_id' -- confirm this is the
    # format the consumer of the file expects.
    f.write('locu_id, foursquare_id\n')

    for match in matches_test:
        f.write(','.join(match) + '\n')
finally:
    # Close the handle even if a write fails.
    f.close()