Looking at some exploratory data analysis to understand features and labels. 

Crafting some new features and assessing their impact on match rate
- Distance between points
- Similarity between names (multiple distance functions)
- Same country / zip codes, etc
- Whether there are BAD labels?
- Decision tree to see how different features interact to give high match rate


In [None]:

import numpy as np 
import pandas as pd 
pd.options.display.max_columns = 100
import os
from haversine import haversine
import Levenshtein
from fuzzywuzzy import fuzz
from pandas.plotting import scatter_matrix

from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

import folium
from matplotlib import pyplot as plt
from IPython.display import display


In [None]:
# Print the full path of every file found anywhere below the Kaggle input
# directory, so the available data files are visible in the cell output.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))


# Read data

In [None]:
# Load the labelled candidate pairs, report (rows, columns), and preview the
# first rows transposed so that every column of the file reads as one line.
pairs = pd.read_csv('/kaggle/input/foursquare-location-matching/pairs.csv')
print(pairs.shape)
pairs.head().transpose()

# Label distribution

In [None]:
# Relative frequency of each `match` label, drawn as a bar chart.
pairs['match'].value_counts(normalize=True).plot.bar(title='Label distribution')

# 68.8% are Match = True

# Are there missing values in the data?

### Features like name, lat, long, country have no missing values in the pairs data

### Features like phone, url, zip, city, state have a high number of missing values

In [None]:
# Fraction of missing entries per column (mean of the boolean NaN mask),
# drawn as a horizontal bar chart.
pairs.isna().mean().plot(
    kind='barh',
    title='missing value %',
    figsize=(10, 6),
)


In [None]:
# Column order reused by later cells when showing example rows: the two
# sides of each attribute sit next to each other and the label comes last.
cols = [
    'name_1', 'name_2',
    'latitude_1', 'longitude_1',
    'latitude_2', 'longitude_2',
    'address_1', 'address_2',
    'city_1', 'city_2',
    'state_1', 'state_2',
    'zip_1', 'zip_2',
    'country_1', 'country_2',
    'url_1', 'url_2',
    'phone_1', 'phone_2',
    'categories_1', 'categories_2',
    'match',
]


### Let's start by looking at features without missing values

# Create distance feature using lat long

### Not a match example - Although the names are similar, the two points are located some distance apart on the map

In [None]:
# Non-match example: show row `i` of `pairs` and draw both of its places on
# one folium map.
from html import escape  # names are raw dataset text that ends up in popup HTML

i = 4
# Display the very row the markers below are drawn from (label-based lookup,
# the same way the coordinates are read) instead of a hard-coded head/tail.
display(pairs.loc[[i], cols])

# NOTE(review): recent folium releases may no longer bundle the
# "Stamen Toner" tiles; if this call raises, switch to another tileset -
# confirm against the installed folium version.
m = folium.Map(location=[pairs.loc[i, 'latitude_1'], pairs.loc[i, 'longitude_1']],
               zoom_start=15,
               tiles="Stamen Toner")

tooltip = "Click Here For More Info"

# One marker per side of the pair; the popup shows that side's name in bold.
for side in ('1', '2'):
    folium.Marker(
        location=[pairs.loc[i, f'latitude_{side}'],
                  pairs.loc[i, f'longitude_{side}']],
        # `<strong>` is the valid bold tag (the previous `<stong>` is not an
        # HTML element); escaping keeps characters such as `&` or `<` inside
        # a name from breaking the popup markup.
        popup=f"<strong> {escape(str(pairs.loc[i, f'name_{side}']))} </strong>",
        tooltip=tooltip,
    ).add_to(m)

# Last expression of the cell: renders the map in the notebook output.
m

### Match example - Both points are located together

In [None]:
# Match example: show row `i` of `pairs` and draw both of its places on one
# folium map.
from html import escape  # names are raw dataset text that ends up in popup HTML

i = 3
# Display the very row the markers below are drawn from (label-based lookup,
# the same way the coordinates are read).
display(pairs.loc[[i]])

# NOTE(review): recent folium releases may no longer bundle the
# "Stamen Toner" tiles; if this call raises, switch to another tileset -
# confirm against the installed folium version.
m = folium.Map(location=[pairs.loc[i, 'latitude_1'], pairs.loc[i, 'longitude_1']],
               zoom_start=15,
               tiles="Stamen Toner")

tooltip = "Click Here For More Info"

# One marker per side of the pair; the popup shows that side's name in bold.
for side in ('1', '2'):
    folium.Marker(
        location=[pairs.loc[i, f'latitude_{side}'],
                  pairs.loc[i, f'longitude_{side}']],
        # `<strong>` is the valid bold tag (the previous `<stong>` is not an
        # HTML element); escaping keeps characters such as `&` or `<` inside
        # a name from breaking the popup markup.
        popup=f"<strong> {escape(str(pairs.loc[i, f'name_{side}']))} </strong>",
        tooltip=tooltip,
    ).add_to(m)

# Last expression of the cell: renders the map in the notebook output.
m

## Hypothesis: If the distance is small, the chance of a match is high

In [None]:

# Great-circle (haversine) distance in kilometres between the two points of
# every pair, stored as the new `distance` column. The four coordinate
# columns are walked in lockstep, one haversine() call per row.
pairs['distance'] = [
    haversine((lat_a, lon_a), (lat_b, lon_b), unit='km')
    for lat_a, lon_a, lat_b, lon_b in zip(pairs['latitude_1'],
                                          pairs['longitude_1'],
                                          pairs['latitude_2'],
                                          pairs['longitude_2'])
]


### Distance between 90% of the points in paired data is less than 2km!

In [None]:
# Percentile curve of the pair distance: x = percentile (0, 5, ..., 95),
# y = value of `distance` at that percentile.
percentile_grid = list(range(0, 100, 5))
plt.plot(percentile_grid, np.percentile(pairs['distance'], percentile_grid))
plt.xlabel("percentile value (0 to 95)")
plt.ylabel("Distance value")


# Distance between points vs. match rate

- There is a sharp decline in match rate as distance increases, but for pairs more than 1 km apart the match rate increases again

In [None]:
# Split the pairs into 20 equal-frequency distance buckets and chart the
# mean of `match` (i.e. the match rate) inside each bucket.
(
    pairs
    .assign(dist_grp=lambda frame: pd.qcut(frame['distance'], 20))
    .groupby('dist_grp')
    .agg({'match': ['mean']})
    .plot(
        kind='bar',
        title='Distance(km) vs match rate',
        figsize=(15, 6),
        legend=False,
    )
)

### Match rate for distance = 0km is 92%

In [None]:
# Match rate among pairs whose two points have exactly zero distance.
pairs.loc[pairs['distance'].eq(0), 'match'].mean()

### Let's look at the Match = False cases where distance = 0

In [None]:
# Pairs at zero distance that are nevertheless labelled as non-matches,
# transposed so each pair reads as one column.
pairs.loc[pairs['distance'].eq(0) & pairs['match'].eq(False), cols].T

#### Definitely, some bad labels

# Hypothesis: Points with the same / a similar name should have a high chance of being a match

### Clear overlap of text observed for Match = True

In [None]:
# Random sample of 20 name pairs with their label, matches listed first.
display(
    pairs.loc[:, ['name_1', 'name_2', 'match']]
    .sample(n=20)
    .sort_values("match", ascending=False)
)

# Create Distance between names feature


In [None]:
# Name-similarity features for every pair. Despite the `distance_` prefix,
# later cells treat these as similarity scores (the best score is compared
# against 1 for the Levenshtein ratios and 100 for the partial ratio):
#   distance_name          - Levenshtein.ratio on the names as written
#   distance_name_lower    - Levenshtein.ratio on the lower-cased names
#   distance_partial_ratio - fuzz.partial_ratio on the names as written
name_pairs = list(zip(pairs['name_1'], pairs['name_2']))

pairs['distance_name'] = [
    Levenshtein.ratio(left, right) for left, right in name_pairs
]

pairs['distance_name_lower'] = [
    Levenshtein.ratio(left.lower(), right.lower()) for left, right in name_pairs
]

pairs['distance_partial_ratio'] = [
    fuzz.partial_ratio(left, right) for left, right in name_pairs
]


### Comparing the output of different distance functions - outputs are quite different

In [None]:

# Pairwise scatter plots of the three name-similarity features, with a KDE
# of each feature on the diagonal.
similarity_cols = ['distance_name', 'distance_name_lower', 'distance_partial_ratio']
scatter_matrix(
    pairs[similarity_cols],
    diagonal='kde',
    alpha=0.4,
    figsize=(15, 8),
)
plt.show()

In [None]:
# Examples where the similarity functions disagree.
shown_cols = ['name_1', 'name_2', 'distance_name', 'distance_name_lower', 'distance_partial_ratio']

# Names that differ only by letter case: imperfect raw ratio, perfect
# lower-cased ratio.
case_only_diff = pairs['distance_name'].lt(1) & pairs['distance_name_lower'].eq(1)
display(pairs.loc[case_only_diff, shown_cols].head(10))

# Imperfect lower-cased ratio, yet a perfect partial ratio.
partial_only_full = pairs['distance_name_lower'].lt(1) & pairs['distance_partial_ratio'].eq(100)
display(pairs.loc[partial_only_full, shown_cols].head(10))


### Let's look at some examples with different match values

In [None]:
# Ten random pairs per label (non-matches first, then matches), each sample
# sorted by `distance_name` from highest to lowest.
for label in (False, True):
    display(
        pairs.loc[pairs['match'].eq(label),
                  ['name_1', 'name_2', 'match', 'distance_name', 'distance_name_lower']]
        .sample(n=10)
        .sort_values('distance_name', ascending=False)
    )


# Plot Name Similarity vs match rate

* Clear trend where high similarity between names has higher match rate

In [None]:

# Cut `distance_name` into 10 equal-width bins and chart the mean of
# `match` (the match rate) inside each bin.
(
    pairs
    .assign(dist_name_grp=lambda frame: pd.cut(frame['distance_name'], 10))
    .groupby('dist_name_grp')
    .agg({'match': ['mean']})
    .plot(
        kind='bar',
        title='Levenshtein distance between names vs match rate',
        figsize=(10, 6),
        legend=False,
    )
)

In [None]:
# Cut `distance_name_lower` into 10 equal-width bins and chart the mean of
# `match` (the match rate) inside each bin.
(
    pairs
    .assign(dist_name_low_grp=lambda frame: pd.cut(frame['distance_name_lower'], 10))
    .groupby('dist_name_low_grp')
    .agg({'match': ['mean']})
    .plot(
        kind='bar',
        title='Levenshtein distance between names (lowercase) vs match rate',
        figsize=(10, 6),
        legend=False,
    )
)

In [None]:
# Cut `distance_partial_ratio` into 10 equal-width bins and chart the mean
# of `match` (the match rate) inside each bin.
# The title no longer says "(lowercase)": `distance_partial_ratio` is
# computed from the names as written, not from lower-cased names.
(
    pairs
    .assign(dist_name_partial_grp=lambda frame: pd.cut(frame['distance_partial_ratio'], 10))
    .groupby('dist_name_partial_grp')
    .agg({'match': ['mean']})
    .plot(
        kind='bar',
        title='Partial fuzz ratio between names vs match rate',
        figsize=(10, 6),
        legend=False,
    )
)

### Clear trend on high match rate when word overlap is high 


### However, there are also matching pairs with very little word overlap
### In most of these cases, the two names are written in different languages

In [None]:
# Matching pairs whose names barely overlap (partial ratio below 10),
# transposed so each pair reads as one column.
pairs.loc[pairs['distance_partial_ratio'].lt(10) & pairs['match'].eq(True), cols].T

# Hypothesis (Kind of busted) : Same ZIP codes should have high match rate

In [None]:
# `same_zip` flag: 1 = both zip codes equal, 0 = different,
# -1 = at least one side has no zip code.
zip_missing = pairs['zip_1'].isna() | pairs['zip_2'].isna()

pairs['same_zip'] = pairs['zip_1'].eq(pairs['zip_2']).astype('int')
pairs.loc[zip_missing, 'same_zip'] = -1

# How many pairs fall into each of the three groups.
pairs['same_zip'].value_counts()


In [None]:
# Match rate (mean of `match`) for each value of the `same_zip` flag.
(
    pairs
    .groupby('same_zip')
    .agg({'match': ['mean']})
    .plot(
        kind='bar',
        title='Same_zip vs match rate',
        figsize=(10, 6),
    )
)

# 70% match rate only!

In [None]:
# Pairs sharing a zip code that are labelled as non-matches, transposed so
# each pair reads as one column.
pairs.loc[pairs['same_zip'].eq(1) & pairs['match'].eq(False), cols].T

# BAD LABELS? 

# Hypothesis: same address should have high match rate

In [None]:
# `same_address` flag (case-insensitive comparison): 1 = addresses equal,
# 0 = different, -1 = at least one side has no address.
address_missing = pairs['address_1'].isna() | pairs['address_2'].isna()

pairs['same_address'] = (
    pairs['address_1'].str.lower().eq(pairs['address_2'].str.lower()).astype('int')
)
# Addresses can be missing; mark those pairs separately.
pairs.loc[address_missing, 'same_address'] = -1

# How many pairs fall into each of the three groups.
pairs['same_address'].value_counts()


In [None]:
# Match rate (mean of `match`) for each value of the `same_address` flag.
(
    pairs
    .groupby('same_address')
    .agg({'match': ['mean']})
    .plot(
        kind='bar',
        title='Same_address vs match rate',
        figsize=(10, 6),
        legend=False,
    )
)

In [None]:
# Pairs with an identical address (flag value 1) that are labelled as
# non-matches, transposed so each pair reads as one column.
pairs.loc[pairs['same_address'].eq(1) & pairs['match'].eq(False), cols].T

In [None]:
# Again some bad labels

# Hypothesis (Busted): Different countries should have very low match rate

In [None]:
# Boolean flag: True when both sides of the pair carry the same country
# value; then count the pairs on each side of the flag.
pairs['same_country'] = pairs['country_1'].eq(pairs['country_2'])
pairs['same_country'].value_counts()


In [None]:
# Match rate (mean of `match`) for each value of the `same_country` flag.
(
    pairs
    .groupby('same_country')
    .agg({'match': ['mean']})
    .plot(
        kind='bar',
        title='Same_country vs match rate',
        figsize=(10, 6),
    )
)

In [None]:
# Pairs whose two sides list different countries but are labelled as
# matches, transposed so each pair reads as one column.
pairs.loc[pairs['same_country'].eq(False) & pairs['match'].eq(True), cols].T

# Hypothesis (Not very significant correlation): Same urls should have high match rate

In [None]:
# `same_url` flag: 1 = both URLs equal, 0 = different,
# -1 = at least one side has no URL.
url_missing = pairs['url_1'].isna() | pairs['url_2'].isna()

pairs['same_url'] = pairs['url_1'].eq(pairs['url_2']).astype('int')
pairs.loc[url_missing, 'same_url'] = -1

# How many pairs fall into each of the three groups.
pairs['same_url'].value_counts()


In [None]:

# Match rate (mean of `match`) for each value of the `same_url` flag.
(
    pairs
    .groupby('same_url')
    .agg({'match': ['mean']})
    .plot(
        kind='bar',
        title='Same_url vs match rate',
        figsize=(10, 6),
    )
)

In [None]:
# Pairs with an identical URL (flag value 1) that are labelled as
# non-matches, transposed so each pair reads as one column.
pairs.loc[pairs['same_url'].eq(1) & pairs['match'].eq(False), cols].T

# Decision tree to see how features interact

In [None]:
# Shallow tree (depth capped at 4) with a fixed random seed; it is fitted
# and drawn in the following cells.
clf = DecisionTreeClassifier(
    random_state=0,
    max_depth=4,
)


In [None]:
# Feature columns fed to the tree, in the order used for plotting/export.
x_cols = [
    'distance',
    'distance_name',
    'distance_name_lower',
    'distance_partial_ratio',
    'same_country',
    'same_url',
    'same_zip',
    'same_address',
]

# Fit on the first 70% of the rows (positional slice, no shuffling); the
# boolean label is cast to int for the classifier.
n_train = int(len(pairs) * 0.7)
clf.fit(pairs[x_cols].iloc[:n_train],
        pairs['match'].iloc[:n_train].astype('int'))

In [None]:
# Draw the fitted tree on a wide figure: nodes are colour-filled and the
# numbers are printed with one decimal.
plt.figure(figsize=(25, 10))
tree.plot_tree(
    clf,
    feature_names=x_cols,
    filled=True,
    precision=1,
    fontsize=10,
)
plt.show()

In [None]:
# Plain-text dump of the fitted tree's rules, including per-node weights.
tree_rules = tree.export_text(clf, feature_names=x_cols, show_weights=True)
print(tree_rules)

# TBC