# <b><span style='color:#FFFFFF;background-color:#00704A'><font face="Arial Black">★ OVERVIEW ★</font></span></b>

In the training data, the most frequent name is 'Starbucks'.  
However, we can find some variants of it such as 'Starbucks Coffee', 'Старбакс', '星巴克', 'สตาร์บัคส์', and 'スターバックス'.  
In this notebook, I show a way to discover such name variants, for 'Starbucks' as well as for other shops and stores, which can be useful when preprocessing the data.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

# <b><span style='color:#FFFFFF;background-color:#00704A'><font face="Arial Black">★ DATA PROCESSING ★</font></span></b>

In [None]:
## data
# Load the pickled training table, then keep only the rows that have a name
# and the three columns the rest of the notebook works with.
# NOTE(review): read_pickle runs whatever the pickle file contains, so only
# point it at a trusted input file.
data = pd.read_pickle('../input/4sq-2fold-train-data/train.pkl')
has_name = data['name'].notna()
data = data.loc[has_name, ['point_of_interest', 'name', 'country']]
display(data.head(), data.shape)

In [None]:
## frequency of name
# Count how often each name occurs (most frequent first) and store the result
# as a two-column table: 'name' and 'frequency'.
vc = (
    data['name']
    .value_counts()
    .rename_axis('name')
    .reset_index(name='frequency')
)

# Horizontal bar chart of the 12 most frequent names.
plt.figure(figsize=(8, 6))
sns.barplot(data=vc.head(12), x='frequency', y='name')

In [None]:
## sets of names denoting the same point of interest
# Gather the distinct names recorded for every point of interest.
df = (
    data.groupby('point_of_interest')['name']
    .apply(set)
    .reset_index()
)
# Number of distinct names per point of interest.
df['size'] = df['name'].map(len)
# Only points of interest known under at least two names are kept.
df = df[df['size'] > 1]
display(df.head())

In [None]:
## frequency of match
from collections import defaultdict
from itertools import combinations

# One set of distinct names per point of interest that has several names.
sets = df.loc[df['size'] > 1, 'name'].tolist()

# For every ordered pair of names, count the points of interest at which both
# names occur. Each unordered pair is recorded in both directions so that
# either name can later be looked up as 'name1' or 'name2'.
match_freqs = defaultdict(int)
for names in sets:
    for first, second in combinations(names, 2):
        for pair in ((first, second), (second, first)):
            match_freqs[pair] += 1

# One row per ordered pair, most frequently matched pairs first.
df = pd.DataFrame(
    [(name1, name2, count) for (name1, name2), count in match_freqs.items()],
    columns=['name1', 'name2', 'match_frequency'],
)
df = df.sort_values('match_frequency', ascending=False).reset_index(drop=True)

# Attach the overall frequency of each name of the pair
# ('frequency1' for 'name1', 'frequency2' for 'name2').
name1_freq = vc.rename(columns={'name': 'name1', 'frequency': 'frequency1'})
name2_freq = vc.rename(columns={'name': 'name2', 'frequency': 'frequency2'})
df = df.merge(name1_freq, on='name1', how='left')
df = df.merge(name2_freq, on='name2', how='left')
df.head()

In [None]:
# For every variant ('name2') keep a single partner ('name1'): after sorting
# by the partner's overall frequency, the first row per variant is the matched
# name that is most frequent in the data.
df = (
    df.sort_values('frequency1', ascending=False)
    .drop_duplicates('name2', keep='first')
    .reset_index(drop=True)
)
# Keep a pair only when the partner is strictly more frequent than the variant.
df = df.loc[df['frequency1'] > df['frequency2']]
df.head()

# <b><span style='color:#FFFFFF;background-color:#00704A'><font face="Arial Black">★ NAME VARIANTS ★</font></span></b>

In [None]:
# Show, for each of the 30 most frequent names, the variants that map to it.
# Iterating over head(30) rather than the labels 0..29 avoids a KeyError when
# the data contains fewer than 30 distinct names.
for target in vc['name'].head(30):
    print(f"\n■ Name Variants of '{target}'")
    # Only list variants that were matched to the target more than once.
    temp = df.loc[(df['name1'] == target) & (df['match_frequency'] > 1)]
    display(temp)

# <b><span style='color:#FFFFFF;background-color:#00704A'><font face="Arial Black">★ HOW TO USE ★</font></span></b>

In [None]:
from collections import defaultdict

# Map every variant ('name2') to its more frequent counterpart ('name1').
# defaultdict(str) makes a lookup of an unknown name return '' instead of
# raising KeyError.
variant2name = defaultdict(str, df.set_index('name2')['name1'].to_dict())

# Write the mapping inside a context manager so the file is flushed and closed
# even if pickling fails part-way.
with open('variant2name.pkl', 'wb') as pkl_file:
    pickle.dump(variant2name, pkl_file)

In [None]:
# 'variant2name' maps a name to its canonical form: among the names that share
# a point of interest with it, the one that is most frequent overall in the data.
variant2name['スタバ']

In [None]:
# If there is no more frequent alternative for a name, 'variant2name' returns ''.
# Note: 'variant2name' is a defaultdict(str), so looking up a missing key also
# stores that key in the mapping with the value ''.
variant2name['Starbucks']

In [None]:
# Translate every name through 'variant2name'; names without a more frequent
# counterpart come back as ''.
data['mapped_name'] = data['name'].map(variant2name)
# Preview the rows whose name was replaced by a counterpart.
data.loc[data['mapped_name'].ne('')].head(30)

In [None]:
# Rows that received no counterpart (empty string) keep their original name.
unmapped = data['mapped_name'] == ''
data.loc[unmapped, 'mapped_name'] = data.loc[unmapped, 'name']
data.head()