In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import json

matches_df = pd.read_csv('train/matches_train.csv')
foursq_df = pd.read_json('train/foursquare_train.json')
locu_df = pd.read_json('train/locu_train.json')

In [28]:
def crossjoin(df1, df2, **kwargs):
    # Cross join the two org's data to form a dataframe of all possible pairs
    df1['_tmp'] = 1
    df2['_tmp'] = 1
    joined = pd.merge(foursq_df, locu_df, on=['_tmp'], **kwargs).drop('_tmp', axis=1)
    return joined

train_df = crossjoin(foursq_df, locu_df, suffixes=['_foursq', '_locu'])

matches_df.columns = ['id_locu', 'id_foursq']
matches_df['matched'] = 1
train_df = train_df.merge(matches_df, how='left', left_on=['id_locu', 'id_foursq'], right_on=['id_locu', 'id_foursq'])
train_df['matched'].fillna(0, inplace=True)

Unnamed: 0,country_foursq,id_foursq,latitude_foursq,locality_foursq,longitude_foursq,name_foursq,phone_foursq,postal_code_foursq,region_foursq,street_address_foursq,...,latitude_locu,locality_locu,longitude_locu,name_locu,phone_locu,postal_code_locu,region_locu,street_address_locu,website_locu,matched
0,United States,4f328ea619836c91c7e3714a,40.794855,New York,-73.966069,Chen Jin Diao Restaurant,(212) 678-4680,,NY,800 Columbus Ave.,...,40.739822,New York,-73.985144,Chipotle Mexican Grill,2126736904,10010,NY,125 East 23rd St.,http://www.chipotle.com/,
1,United States,4f328ea619836c91c7e3714a,40.794855,New York,-73.966069,Chen Jin Diao Restaurant,(212) 678-4680,,NY,800 Columbus Ave.,...,40.810765,New York,-73.952591,Honey Salon Inc,2126630100,10026,NY,174 Saint Nicholas Ave.,,
2,United States,4f328ea619836c91c7e3714a,40.794855,New York,-73.966069,Chen Jin Diao Restaurant,(212) 678-4680,,NY,800 Columbus Ave.,...,40.743580,New York,-73.986127,Palatte,6464763812,10016,NY,66 Madison Ave.,http://www.palattenyc.com/,
3,United States,4f328ea619836c91c7e3714a,40.794855,New York,-73.966069,Chen Jin Diao Restaurant,(212) 678-4680,,NY,800 Columbus Ave.,...,40.828820,New York,-73.949022,Best Taste Restaurant,2122815691,10031,NY,3609 Broadway,,
4,United States,4f328ea619836c91c7e3714a,40.794855,New York,-73.966069,Chen Jin Diao Restaurant,(212) 678-4680,,NY,800 Columbus Ave.,...,40.749936,New York,-73.983849,Integra Hair System Inc,2125636786,10018,NY,11 W. 36th St. # 3,http://www.integrahair.com/,
5,United States,4f328ea619836c91c7e3714a,40.794855,New York,-73.966069,Chen Jin Diao Restaurant,(212) 678-4680,,NY,800 Columbus Ave.,...,40.788494,New York,-73.943786,Halal JFK Fried Chicken Pizza,2123692514,10029,NY,1998 2nd Ave.,http://halaljfkchickenpizza.com/,
6,United States,4f328ea619836c91c7e3714a,40.794855,New York,-73.966069,Chen Jin Diao Restaurant,(212) 678-4680,,NY,800 Columbus Ave.,...,40.815148,New York,-73.939791,Hong Kong Foid,2129265689,10037,NY,527-09 Lenox Ave.,,
7,United States,4f328ea619836c91c7e3714a,40.794855,New York,-73.966069,Chen Jin Diao Restaurant,(212) 678-4680,,NY,800 Columbus Ave.,...,40.797416,New York,-73.937459,AAA Laundry,6466720202,10035,NY,2272 2nd Ave.,,
8,United States,4f328ea619836c91c7e3714a,40.794855,New York,-73.966069,Chen Jin Diao Restaurant,(212) 678-4680,,NY,800 Columbus Ave.,...,40.722915,New York,-73.998114,Whiskey Friday,,10012,NY,Spring St. & Crosby St.,,
9,United States,4f328ea619836c91c7e3714a,40.794855,New York,-73.966069,Chen Jin Diao Restaurant,(212) 678-4680,,NY,800 Columbus Ave.,...,40.755041,New York,-73.977646,Stauback Co,2127105250,10002,NY,366 Madison Ave.,,


In [85]:
def prec_rec_f1(feature, df, mask):
    '''
    Utility function for evaluating the precision/recall/f1 of a
    feature matching function.
    Args:
        - feature - function that takes the dataframe as an input and
                    returns a numpy array of 1s and 0s binary labels
        - df      - the dataframe to evaluate the feature function on
        - mask    - the indices for positive labels
    Returns:
        - prec    - precision
        - rec     - recall
        - f1      - f1-score
    '''
    preds = feature(df)
    
    pos = preds.loc[mask]
    pos_preds = mask.loc[preds]
    prec = np.mean(pos_preds)
    rec = np.mean(pos)
    f1 = 2 / (1 / prec + 1 / rec)
    return prec, rec, f1

In [172]:
import re

y = train_df['matched'] == 1

stop_words = ['restaurant', 'inc', 'cafe', 'bakery', 'and', 'the', 'of']

def remove_words(s, words):
    return ' '.join([token for token in s.split() if s not in words])

def remove_punc(s):
    return re.sub('[.,&#\'()]', '', s)

def norm_name(name, default=''):
    # Normalize a location's name to get matches
    if not name:
        return default
    name = str(name).lower()
    name = remove_punc(name)
    name = remove_words(name, stop_words)
    return name

def unigrams(name):
    return norm_name(name).split()

def lat_long_match(df):
    # Latitude and longitudes differ in decimal places for foursquare and locu
    # Errors in coordinates and misisng values also reduce recall
    def norm_coord(coord):
        return round(coord, 4)
    return (df['latitude_foursq'].apply(norm_coord) == df['latitude_locu'].apply(norm_coord)) \
         & (df['longitude_foursq'].apply(norm_coord) == df['longitude_locu'].apply(norm_coord))

def phone_num_match(df):
    def norm_foursq_phone(p):
        if not p:
            return ''
        p = str(p).replace('(', '')
        p = p.replace(')', '')
        p = p.replace(' ', '')
        p = p.replace('-', '')
        return p
    
    norm_phone = df['phone_foursq'].apply(norm_foursq_phone)
    return norm_phone == df['phone_locu'].apply(lambda p: str(p))

def address_match(df):
    def norm(a, default='None'):
        if not a:
            return default
        a = str(a).lower()
        a = a.replace('east', 'e')
        a = a.replace('west', 'w')
        a = a.replace('square', 'sq')
        a = a.replace('.', '')
        a = a.replace(',', '')
        return a
    
    return df['street_address_foursq'].apply(lambda a: norm(a, default='None_foursq')) \
        == df['street_address_locu'].apply(lambda a: norm(a, default='None_locu'))

def name_match(df):
    return df['name_foursq'].apply(lambda a: norm_name(a, default='None_foursq')) \
        == df['name_locu'].apply(lambda a: norm_name(a, default='None_locu'))

In [156]:
print(prec_rec_f1(lat_long_match, train_df, mask))
print(prec_rec_f1(phone_num_match, train_df, mask))
print(prec_rec_f1(address_match, train_df, mask))
print(prec_rec_f1(name_match, train_df, mask))

(0.9571428571428572, 0.5583333333333333, 0.7052631578947369)
(0.010081597933272424, 0.8888888888888888, 0.019937073611413975)
(0.9632107023411371, 0.8, 0.8740515933232169)
(0.7857142857142857, 0.7944444444444444, 0.7900552486187845)
(0.956268221574344, 0.9111111111111111, 0.9331436699857751)


In [174]:
def errors(df, y_true, y_pred, cols, type='false_pos'):
    # Utility function for viewing classification errors
    if type == 'false_pos':
        return df.loc[~y_true & y_pred][cols]
    elif type == 'false_neg':
        return df.loc[y_true & ~y_pred][cols]

errors(train_df, y, name_match(train_df), ['name_foursq', 'name_locu'], type='false_neg').head(30)

Unnamed: 0,name_foursq,name_locu
290,Chen Jin Diao Restaurant,Sally's West Indian Cuisine Restaurant
1683,Pearls: Chinese & Szechuan Cuisine,Pearls
18352,Via Brasil,Brasilla Restaurant
20478,Foo Sing Chinese Restaurant,Foo Sing Kitchen
25454,Littletown,Little Town
26254,Mamoun's Falafel,Mamoun's
35892,Dong Hai Grill,Dong Hai Restaurant
58315,71 Irving Place Coffee & Tea Bar,71 Irving Place
59166,Bocca Di Bacco,Bocca Di Bacco (Chelsea)
64917,Big Daddy's,Big Daddy's - Upper East Side


In [171]:
# Play around with approximate matching for names

def unigram_overlap(df):
    def overlap(row):
        grams1 = set(unigrams(row['name_foursq']))
        grams2 = set(unigrams(row['name_locu']))
        return len(grams1.intersection(grams2)) / float(min([len(grams1), len(grams2)]))
    return df.apply(overlap, axis=1)

overlaps = unigram_overlap(train_df)

def overlap_match(df):
    return overlaps > .5
    
def agg_feature(df):
    return lat_long_match(df) | address_match(df)

print(prec_rec_f1(overlap_match, train_df, mask))
print(prec_rec_f1(agg_feature, train_df, mask))

(0.7391304347826086, 0.8972222222222223, 0.8105395232120451)
(0.956268221574344, 0.9111111111111111, 0.9331436699857751)
