In [1]:
#using python 3
import json
import csv
import pandas as pd
import re
from nltk.metrics.distance import jaccard_distance
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Imputer
from sklearn.pipeline import make_pipeline
import jellyfish

"""
This assignment can be done in groups of 3 students. Everyone must submit individually.

Write down the UNIs of your group (if applicable)

Name : Shih Hua Yu
Uni  : sy2734

Member 2: Chang Ding, cd2959

Member 3: Lisa Kim, lk2715
"""
def get_matches(locu_train_path, foursquare_train_path, matches_train_path, locu_test_path, foursquare_test_path):
    
    matching = pd.read_csv(matches_train_path)
    locu_train = pd.read_json(locu_train_path)
    foursquare_train = pd.read_json(foursquare_train_path)
    
    def preprocessing(X1, X2, matching):

        def normalization(X):
            def phonenumber(x):
                if x != None:
                    new_x = re.sub('[^A-Za-z0-9]+', '', x)
                    return new_x
            def restaurantname(x):
                if x != None:
                    new_x = x.lower()
                    new_x = re.sub('[^A-Za-z0-9 ]', '', new_x)
                    new_x = re.sub(' +', ' ', new_x)
                    return new_x
            def streetaddress(x):
                if x != None:
                    new_x = x.lower()
                    new_x = re.sub('[^A-Za-z0-9 ]', '', new_x)
                    return new_x

            X_new = X.copy()
            X_new['name'] = X_new['name'].apply(restaurantname)
            X_new['phone'] = X_new['phone'].apply(phonenumber)
            X_new['street_address'] = X_new['street_address'].apply(streetaddress)

            return X_new

        def computingdistance(X):
            import jellyfish
            X['name_jaccard_dis']=X.apply(lambda x: jaccard_distance(set(x['name_x'].split()),
                                                                                 set(x['name_y'].split())),axis=1)
            X['name_jaro_dis']=X.apply(lambda x: jellyfish.jaro_winkler(x['name_x'],x['name_y']),axis=1)
            X['phone_jaro_dis']=X.apply(lambda x: jellyfish.jaro_winkler(x['phone_x'],x['phone_y']),axis=1)
            X['poscd_jaro_dis']=X.apply(lambda x: jellyfish.jaro_winkler(x['postal_code_x'],x['postal_code_y']),axis=1)
            X['addr_jaro_dis']=X.apply(lambda x: jellyfish.jaro_winkler(x['street_address_x'],x['street_address_y']),axis=1)
            X['loc_jaro_dis']=X.apply(lambda x: jellyfish.jaro_winkler(x['locality_x'],x['locality_y']),axis=1)
            X['long_dis']=X.apply(lambda x: abs(x['longitude_x']-x['longitude_y']),axis=1)
            X['lat_dis']=X.apply(lambda x: abs(x['latitude_x']-x['latitude_y']),axis=1)

            return X

        X1 = normalization(X1)
        X2 = normalization(X2)

        # fill missing values with space or mean of the feature depending on variable type
        X1['longitude'] = X1['longitude'].fillna(value = X1['longitude'].mean())
        X1['latitude'] = X1['latitude'].fillna(value = X1['latitude'].mean())

        X2['longitude'] = X2['longitude'].fillna(value = X2['longitude'].mean())
        X2['latitude'] = X2['latitude'].fillna(value = X2['latitude'].mean())

        X1=X1.drop(columns=['country','website','region'])
        X2=X2.drop(columns=['country','website','region'])

        #cross join locu and foursquare
        X1['key']=1
        X2['key']=1
        cross_train = X1.merge(X2, on='key',how='outer')

        if matching is not None:
            #append target column
            matching['target']=1
            cross_train=cross_train.merge(matching,how='left',left_on=['id_x', 'id_y'],right_on = ['locu_id','foursquare_id'])
            cross_train=cross_train.drop(columns=['locu_id','foursquare_id'])
            cross_train['missing_phone'] = 0
            cross_train.loc[cross_train['phone_x'].isnull() | cross_train['phone_y'].isnull(), 'missing_phone'] = 1
            cross_train['missing_phone'] = cross_train['missing_phone'].astype(int)

            #for obs not in matching csv, give a target value of 0
            cross_train['target']=cross_train['target'].fillna(value=0)
            cross_train=cross_train.fillna(value='')

            cross_train = computingdistance(cross_train)
            cross_train_num = cross_train[['id_x','id_y','name_jaccard_dis','name_jaro_dis','phone_jaro_dis','poscd_jaro_dis',
                                       'addr_jaro_dis', 'loc_jaro_dis', 'missing_phone',
                                       'long_dis', 'lat_dis','target']]
            cross_train_num.target = cross_train_num.target.astype(int)

        if matching is None:
            cross_train['missing_phone'] = 0
            cross_train.loc[cross_train['phone_x'].isnull() | cross_train['phone_y'].isnull(), 'missing_phone'] = 1
            cross_train['missing_phone'] = cross_train['missing_phone'].astype(int)
            cross_train=cross_train.fillna(value='')
            cross_train = computingdistance(cross_train)
            cross_train_num = cross_train[['id_x','id_y','name_jaccard_dis','name_jaro_dis','phone_jaro_dis','poscd_jaro_dis',
                                   'addr_jaro_dis', 'loc_jaro_dis', 'missing_phone',
                                   'long_dis', 'lat_dis']]  
        cross_train_num.loc[cross_train_num['missing_phone'] == 1, 'phone_jaro_dis'] = np.nan

        return cross_train_num
    
    cross_train_num = preprocessing(locu_train, foursquare_train, matching)
    
    #filtering
    cross_train_num = cross_train_num.drop(cross_train_num[(cross_train_num['target']==1)
                        &(cross_train_num['name_jaro_dis']<0.9)
                        &(cross_train_num['name_jaro_dis']*cross_train_num['addr_jaro_dis']<0.4)
                        &(cross_train_num['phone_jaro_dis']!=1)].index)
    #blocking
    def blocking(X):
        df = pd.DataFrame(columns = X.columns)
        for name in np.unique(X['id_x']):
            df_p = X[X['id_x'] == name]
            jaro_threshold = np.percentile(df_p['name_jaro_dis'], 15)
            jaccard_threshold = np.percentile(df_p['name_jaccard_dis'], 85)
            df_p = df_p[(df_p['name_jaro_dis'] > jaro_threshold) & (df_p['name_jaccard_dis'] <= jaccard_threshold)]
            df = pd.concat([df, df_p])
            df['missing_phone'] = df['missing_phone'].astype(int)
            df['target'] = df['target'].astype(int)

        return df


    features = ['name_jaro_dis','phone_jaro_dis','poscd_jaro_dis',
                'addr_jaro_dis', 'loc_jaro_dis', 'lat_dis']
    
    train_blocking = blocking(cross_train_num)
    X_train_blocking = train_blocking[features]
    y_train_blocking = train_blocking["target"]

    #training
    #use best parameters found from gridsearch
    RF = RandomForestClassifier(random_state=0,class_weight='balanced',n_estimators=13,max_depth=12)
    RF_pipeline = make_pipeline(Imputer(), RF)
    RF_pipeline.fit(X_train_blocking, y_train_blocking)
            
    #preprocessing test set
    locu_test = pd.read_json(locu_test_path)
    foursquare_test = pd.read_json(foursquare_test_path)
    matching_test = None
    cross_test = preprocessing(locu_test, foursquare_test, matching_test)
    X_test = cross_test[features]

    #predicting
    y_hat=pd.DataFrame(RF_pipeline.predict(X_test),columns=['target'])

    matches_test = pd.concat([cross_test,y_hat],axis=1)
    matches_test = matches_test[matches_test['target']==1][['id_x','id_y']]
    matches_test.columns=['locu_id','foursquare_id']

    matches_test.to_csv('matches_test.csv',index=False)

In [2]:
get_matches("locu_train.json","foursquare_train.json","matches_train.csv","locu_test.json","foursquare_test.json")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
