In [None]:
# Extract the pykakasi_deps archive shipped with the utilities dataset into the current directory.
!unzip ../input/foursquare-utils-checkpoints-v4/pykakasi_deps.dontopenthiskaggle -d .

In [None]:
# Install pykakasi, jaconv and deprecated from the local tarballs extracted into ./pykakasi_deps.
# NOTE(review): jaconv and deprecated are presumably runtime dependencies of pykakasi - confirm.
!conda install ./pykakasi_deps/offline_pykakasi.tar.bz2
!conda install ./pykakasi_deps/offline_jaconv.tar.bz2
!conda install ./pykakasi_deps/offline_deprecated.tar.bz2

In [None]:
# Dataset holding the helper files used by this notebook (fold models, thresholds); also put on sys.path.
UTIL_PATH = "../input/foursquare-utils-checkpoints-v4"
# CSV with the places to be matched.
DF_PATH = "../input/foursquare-location-matching/test.csv"
# Sample submission; its "id" column drives the rows of the final submission.
SUB_PATH = "../input/foursquare-location-matching/sample_submission.csv"

In [None]:
from sklearnex import patch_sklearn

patch_sklearn()

from sklearn.neighbors import NearestNeighbors, BallTree
from sklearn.feature_extraction.text import TfidfVectorizer

import pandas as pd
import numpy as np
import cython

import Levenshtein
import difflib
from tqdm.notebook import tqdm

tqdm.pandas()

import gc
import multiprocessing

import pykakasi

import sys
sys.path.insert(0, UTIL_PATH) 

from catboost import CatBoostClassifier

## Extracting Nearest Neighbors

In [None]:
def calc_dists_and_indices(df: pd.DataFrame,
                           N: int,
                           cols: list) -> (np.ndarray, np.ndarray):
    """Find the ``N`` nearest rows of ``df`` for every row of ``df``.

    The neighbour search uses the haversine metric on the coordinate columns
    ``cols`` (given in degrees, latitude first as scikit-learn expects).
    The model is fitted and queried on the same rows, so each row is also a
    candidate neighbour of itself.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the coordinate columns. It is NOT modified.
    N : int
        Number of neighbours to return per row.
    cols : list
        Names of the latitude and longitude columns, in that order.

    Returns
    -------
    (distances, indices)
        Two arrays as returned by ``NearestNeighbors.kneighbors``: haversine
        distances (in radians) and the positional indices of the neighbours.
    """
    # Convert on a separate array so the caller's columns stay in degrees;
    # converting in place would double-convert if the function ran twice.
    coords_rad = np.deg2rad(df.loc[:, cols].to_numpy(dtype=float))

    matcher = NearestNeighbors(n_neighbors=N,
                               metric="haversine",
                               n_jobs=-1)
    matcher.fit(coords_rad)
    distances, indices = matcher.kneighbors(coords_rad)

    return distances, indices

In [None]:
# Load the places to match; reset_index() keeps the original row labels as an "index" column.
df = pd.read_csv(DF_PATH).reset_index()
coo_cols = ["latitude", "longitude"]

# Neighbours retrieved per row: 12 when the file has more than 30000 rows, otherwise 2.
# NOTE(review): presumably the small value is meant for a tiny placeholder test file - confirm.
N = 12 if len(df)>30000 else 2

distances, indices = calc_dists_and_indices(df=df,
                                            N=N,
                                            cols=coo_cols)

## Alphabet Conversion

In [None]:
def convert_japanese_alphabet(df: pd.DataFrame):
    """Transliterate the text columns of rows whose ``country`` is ``"JP"``.

    Hiragana, Katakana and Kanji in the ``name``, ``address``, ``city`` and
    ``state`` columns are converted to the Latin alphabet with pykakasi.
    Conversion is best-effort: a cell that cannot be converted is left as is.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a ``country`` column and the text columns above.
        The matching rows are updated in place.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object.
    """
    kakasi = pykakasi.kakasi()
    kakasi.setMode('H', 'a')  # Convert Hiragana into alphabet
    kakasi.setMode('K', 'a')  # Convert Katakana into alphabet
    kakasi.setMode('J', 'a')  # Convert Kanji into alphabet
    conversion = kakasi.getConverter()

    def convert(row):
        for column in ["name", "address", "city", "state"]:
            try:
                value = row[column]
                # Missing values (NaN) and other non-strings are skipped
                # instead of being sent to the converter only to fail.
                if isinstance(value, str):
                    row[column] = conversion.do(value)
            except Exception:
                # Best-effort: keep the original value. Unlike a bare
                # `except:`, this lets KeyboardInterrupt/SystemExit through,
                # so the cell can still be interrupted.
                pass
        return row

    jp_mask = df["country"] == "JP"
    if jp_mask.any():
        df[jp_mask] = df[jp_mask].progress_apply(convert, axis=1)
    return df

In [None]:
# Transliterate the text columns of the "JP" rows; df is updated in place and returned.
df = convert_japanese_alphabet(df)

## Extracting Essential Similarities

In [None]:
def textcol_tfidf(df: pd.DataFrame,
                  cols: list) -> dict:
    """Vectorise each text column into character-trigram term frequencies.

    One ``TfidfVectorizer`` (character 3-grams inside word boundaries,
    IDF weighting disabled) is fitted per column.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the columns named in ``cols``.
    cols : list
        Names of the text columns to vectorise.

    Returns
    -------
    dict
        Maps each column name to the matrix returned by ``fit_transform``
        for that column (one row per row of ``df``).
    """
    vectors = {}
    for col in cols:
        vectorizer = TfidfVectorizer(ngram_range=(3, 3), analyzer="char_wb", use_idf=False)
        # NOTE(review): astype(str) runs before fillna, so missing values have
        # already become the string "nan" and the f"no{col}" placeholder is
        # never used. Kept unchanged on purpose: the fold models loaded later
        # were presumably trained on features built this way - confirm before
        # reordering the two calls.
        texts = df[col].astype(str).fillna(f"no{col}").values
        vectors[col] = vectorizer.fit_transform(texts)
        print(col, vectors[col].shape)
    return vectors

def cat_tfidf(df: pd.DataFrame):
    """Vectorise the ``categories`` column into word term frequencies.

    Missing categories are replaced by the token ``"nocategory"`` before a
    ``TfidfVectorizer`` with IDF weighting disabled is fitted.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a ``categories`` column.

    Returns
    -------
    The matrix returned by ``TfidfVectorizer.fit_transform`` (one row per row
    of ``df``). Per the scikit-learn documentation this is a SciPy sparse
    matrix, not an ``np.ndarray``, which is why the former return annotation
    was removed.
    """
    vectorizer = TfidfVectorizer(use_idf=False)
    category_texts = df["categories"].fillna("nocategory").values
    V_cat = vectorizer.fit_transform(category_texts)
    print("categories", V_cat.shape)
    return V_cat


In [None]:
def calc_essential_feats(df: pd.DataFrame,
                         indices,
                         distances,
                         textcol_tfidf,
                         cat_tfidf,
                         train_mode=True):
    """Build one candidate row per (place, neighbour) pair with basic features.

    For every neighbour rank (column of ``indices``) each row of ``df`` is
    paired with the row at that neighbour position. Pairs of a row with itself
    are removed afterwards.

    Parameters
    ----------
    df : pd.DataFrame
        Places; must contain ``id``, ``name``, ``categories``, ``phone``,
        ``address`` and ``url`` (plus ``point_of_interest`` in train mode).
    indices, distances
        2-D arrays with one row per row of ``df`` and one column per
        neighbour rank (neighbour positions and their distances).
    textcol_tfidf
        Mapping from ``name``/``address``/``url``/``phone`` to the vector
        matrix of that column.
    cat_tfidf
        Vector matrix of the ``categories`` column.
    train_mode : bool
        When true, adds a ``match`` column telling whether both sides share
        the same ``point_of_interest``.

    Returns
    -------
    (ids, match_ids, candidate_df)
        ``ids`` / ``match_ids`` are the ids of the left / right side of each
        pair. ``candidate_df`` holds ``dist``, ``cat_sim``, ``<col>_sim``,
        ``<col>_null`` (how many of the two sides are missing the value),
        optionally ``match``, and the raw ``name``/``categories``/``phone``/
        ``address`` of both sides suffixed ``_x`` (left) and ``_y`` (right).
    """
    sim_cols = ["name", "address", "url", "phone"]
    null_cols = ["address", "url", "phone", "categories"]
    detail_cols = ['id', 'name', 'categories', 'phone', 'address']

    per_rank = []
    for rank in tqdm(range(indices.shape[1])):
        neighbour = indices[:, rank]

        pairs = df[["id"]].copy()
        pairs["dist"] = distances[:, rank]

        # Row-wise dot product between each row's vector and its neighbour's.
        pairs["cat_sim"] = cat_tfidf.multiply(cat_tfidf[neighbour]).sum(axis=1).A1
        for col in sim_cols:
            vectors = textcol_tfidf[col]
            pairs[f"{col}_sim"] = vectors.multiply(vectors[neighbour]).sum(axis=1).A1

        pairs["match_id"] = df["id"].values[neighbour]

        for col in null_cols:
            missing = df[col].isnull()
            pairs[f"{col}_null"] = missing * 1.0 + missing.values[neighbour]

        if train_mode:
            poi = df["point_of_interest"]
            pairs["match"] = poi == poi.values[neighbour]

        per_rank.append(pairs)

    candidate_df = pd.concat(per_rank)
    # Drop the pairs in which a place is matched with itself.
    candidate_df = candidate_df[candidate_df['id'] != candidate_df['match_id']]

    details = df[detail_cols]

    # Attach the raw text of the left-hand place ...
    candidate_df = candidate_df.merge(details, on='id', how='left')
    candidate_df = candidate_df.rename(columns={col: f"{col}_x" for col in detail_cols})

    # ... and of the right-hand (neighbour) place.
    candidate_df = candidate_df.merge(details, left_on='match_id', right_on='id', how='left')
    right_names = {'match_id': 'id_y'}
    right_names.update({col: f"{col}_y" for col in detail_cols[1:]})
    candidate_df = candidate_df.rename(columns=right_names)

    # The second merge brought in the right frame's own 'id' key; it duplicates id_y.
    candidate_df = candidate_df.drop(columns='id')

    ids = candidate_df['id_x']
    match_ids = candidate_df['id_y']
    candidate_df = candidate_df.drop(columns=['id_x', 'id_y'])

    return ids, match_ids, candidate_df

In [None]:
# Vectorise the free-text columns and the categories column of the places.
text_cols = ["address", "url", "phone", "name"]
V_textcols = textcol_tfidf(df=df,
                            cols=text_cols)

V_cat = cat_tfidf(df=df)

# Build the candidate pairs from the neighbour search results.
# train_mode=False: no "match" label column is produced.
ids, match_ids, candidate_df = calc_essential_feats(df=df,
                                                     indices=indices,
                                                     distances=distances,
                                                     textcol_tfidf=V_textcols,
                                                     cat_tfidf=V_cat,
                                                     train_mode=False)

## Extracting Word Similarity Stats

In [None]:
def reduce_memory(df):
    """Downcast the non-object columns of ``df`` to the smallest fitting dtype.

    Columns whose dtype name starts with ``int`` are cast to the first of
    int8/int16/int32/int64 whose range strictly contains the column's min and
    max. Every other non-object column is cast to float16 or float32 when its
    min and max lie strictly inside that type's range, otherwise to float64.
    Object columns are left untouched.

    Note that the float16 cast reduces precision; the values are not checked
    for round-trip equality.

    The frame is modified in place and also returned.
    """
    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float_targets = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type == object:
            continue

        lowest = df[col].min()
        highest = df[col].max()

        if str(col_type).startswith('int'):
            for target in int_targets:
                limits = np.iinfo(target)
                if limits.min < lowest and highest < limits.max:
                    df[col] = df[col].astype(target)
                    break
            # No integer type fits strictly: the column keeps its dtype.
        else:
            for target in float_targets:
                limits = np.finfo(target)
                if limits.min < lowest and highest < limits.max:
                    df[col] = df[col].astype(target)
                    break
            else:
                df[col] = df[col].astype(np.float64)
    return df

In [None]:
def categorical_similarity(A, B):
    """Overlap score between two comma-separated category strings.

    Both values are converted to ``str`` and split on ``", "`` into sets.
    The score is the size of the intersection divided by the size of the
    smaller set, i.e. the larger of the two coverage ratios.

    Returns ``-1`` when either argument is falsy (e.g. ``""`` or ``None``).
    """
    if not (A and B):
        return -1

    left = set(str(A).split(", "))
    right = set(str(B).split(", "))

    shared = len(left & right)

    return max(shared / len(left), shared / len(right))


@cython.cfunc
def LCS(S: str, T: str):
    """Return the length of the longest common subsequence of ``S`` and ``T``.

    Classic dynamic programme; only the previous and the current row of the
    table are kept, since each cell depends on those two rows alone.
    """
    prev_row: cython.list = [0] * (len(T) + 1)
    for s_char in S:
        cur_row = [0]
        for j, t_char in enumerate(T):
            cur_row.append(max(prev_row[j] + (s_char == t_char),  # extend a match
                               cur_row[j],                        # skip t_char
                               prev_row[j + 1]))                  # skip s_char
        prev_row = cur_row
    return prev_row[len(T)]


def string_operation(s1, s2, op="seq_matcher"):
    """Compare two strings with the similarity/distance measure named by ``op``.

    Parameters
    ----------
    s1, s2
        The strings to compare.
    op : str
        One of:
        ``"seq_matcher"``  - ``difflib.SequenceMatcher(...).ratio()``
        ``"lev_distance"`` - ``Levenshtein.distance``
        ``"jaro_winkler"`` - ``Levenshtein.jaro_winkler``
        ``"lcs"``          - length of the longest common subsequence (``LCS``)

    Returns
    -------
    The value of the chosen measure, or ``-1`` when either string is falsy
    (empty or ``None``).

    Raises
    ------
    ValueError
        If ``op`` is not one of the names above. Previously such a call
        silently returned ``None``, which would surface later as a NaN feature.
    """
    if not (s1 and s2):
        return -1

    if op == "seq_matcher":
        return difflib.SequenceMatcher(None, s1, s2).ratio()
    if op == "lev_distance":
        return Levenshtein.distance(s1, s2)
    if op == "jaro_winkler":
        return Levenshtein.jaro_winkler(s1, s2)
    if op == "lcs":
        return LCS(str(s1), str(s2))

    raise ValueError(f"unknown string operation: {op!r}")

In [None]:
def _add_distance_features(args):
    """Add string-comparison features to one chunk of candidate pairs.

    ``args`` is a ``(key, frame)`` tuple; only the frame is used. For each of
    ``name``, ``categories``, ``phone`` and ``address`` the ``_x``/``_y``
    columns are cast to ``str`` and the following columns are added, in this
    order: ``<c>_seqm``, ``<c>_leven``, ``<c>_jaro``, ``<c>_lcs``,
    ``<c>_len_diff`` (absolute length difference) and ``<c>_nleven``
    (Levenshtein distance divided by the longer length). Finally
    ``category_venn`` is added and the raw ``_x``/``_y`` text columns are
    dropped.

    Returns the frame with the added feature columns.
    """
    _, df = args

    pair_cols = ["name", "categories", "phone", "address"]
    # (feature suffix, string_operation op) in column-creation order.
    measures = (("seqm", "seq_matcher"),
                ("leven", "lev_distance"),
                ("jaro", "jaro_winkler"),
                ("lcs", "lcs"))

    for c in pair_cols:
        left, right = c + "_x", c + "_y"
        df[left] = df[left].astype(str)
        df[right] = df[right].astype(str)

        for suffix, op in measures:
            df[f"{c}_{suffix}"] = df[[left, right]].apply(
                lambda pair: string_operation(pair[left], pair[right], op=op),
                axis=1)

        # String lengths are only needed for the two derived features below,
        # so they are kept as local Series rather than temporary columns.
        len_left = df[left].astype(str).map(len)
        len_right = df[right].astype(str).map(len)
        df[f'{c}_len_diff'] = np.abs(len_left - len_right)
        df[f"{c}_nleven"] = df[f'{c}_leven'] / np.maximum(len_left, len_right)

    df["category_venn"] = df[["categories_x", "categories_y"]].progress_apply(
        lambda row: categorical_similarity(row.categories_x, row.categories_y),
        axis=1)

    df = drop_unnecessary_cols(df, pair_cols)
    gc_clear()

    return df

def add_distance_features(df, n_chunks=256):
    """Compute the string-comparison features of ``df`` in parallel.

    The frame is split into chunks by integer-dividing its index by
    ``len(df) / n_chunks``; each chunk is processed by
    ``_add_distance_features`` in a worker pool with one process per CPU, and
    the results are concatenated in chunk order.

    Parameters
    ----------
    df : pd.DataFrame
        Candidate pairs. It is not modified: the chunk labels are kept in a
        separate array instead of a temporary ``idx_group`` column.
        NOTE(review): the chunking assumes a numeric, ascending index
        (e.g. 0..n-1) - confirm for other callers.
    n_chunks : int
        Divisor used to derive the chunk labels (default 256, the value that
        used to be hard-coded).

    Returns
    -------
    pd.DataFrame
        The concatenated per-chunk results.
    """
    chunk_labels = np.asarray(df.index // (len(df) / n_chunks))
    chunks = df.groupby(chunk_labels)

    processes = multiprocessing.cpu_count()
    with multiprocessing.Pool(processes=processes) as pool:
        results = pool.imap(_add_distance_features, chunks)
        # Consume inside the `with` block: leaving it terminates the pool.
        dfs = list(tqdm(results, total=len(chunks)))

    return pd.concat(dfs)


def drop_unnecessary_cols(df, PAIR_COLS):
    """Remove the ``<name>_x`` / ``<name>_y`` columns for every name in ``PAIR_COLS``.

    The frame is modified in place and also returned.
    """
    for base in PAIR_COLS:
        both_sides = [f"{base}_x", f"{base}_y"]
        df.drop(columns=both_sides, inplace=True)
    return df


def extract_features(df):
    """Add the string-comparison features to ``df`` and shrink its dtypes.

    Runs ``add_distance_features`` followed by ``reduce_memory``, forcing a
    garbage collection after each step, and returns the result with a fresh
    0..n-1 index.
    """
    with_features = add_distance_features(df)
    gc_clear()

    compact = reduce_memory(with_features)
    gc_clear()

    return compact.reset_index(drop=True)


def gc_clear():
    """Run a full garbage collection five times in a row."""
    remaining = 5
    while remaining > 0:
        gc.collect()
        remaining -= 1

In [None]:
# Replace the raw candidate pairs with their feature matrix (text columns are dropped inside).
candidate_df = extract_features(candidate_df)

## CV Prediction

In [None]:
def pred_w_model(foldnum, df_x):
    """Score ``df_x`` with the CatBoost model saved for fold ``foldnum``.

    The model is loaded from ``<UTIL_PATH>/cb_fold_<foldnum>`` and released
    again before returning, so only one fold model is held at a time.

    Returns the second column of ``predict_proba`` (index 1) for every row.
    """
    checkpoint = f"{UTIL_PATH}/cb_fold_{foldnum}"

    classifier = CatBoostClassifier()
    classifier.load_model(checkpoint)
    positive_proba = classifier.predict_proba(df_x)[:, 1]

    del classifier
    gc_clear()
    return positive_proba

# Score the candidate pairs with the fold models 0..6, one model at a time.
preds = [pred_w_model(i, candidate_df) for i in tqdm(range(7))] 
# NOTE(review): presumably one tuned decision threshold per fold model - confirm the array length is 7.
thresholds = np.load(UTIL_PATH+"/fold_threshs.npy")
thresholds

## Voting with Threshold-Tuning

In [None]:
# Majority vote: each fold votes 1 when its probability reaches that fold's threshold;
# a pair is predicted as a match when at least half of the folds vote 1.
candidate_df["pred"] = (
    np.mean(
        [
            (preds[pred_i] >= thresholds[pred_i]).astype(int)
            for pred_i, pred in enumerate(preds)
        ],
        axis=0,
    )
    >= 0.5
).astype(int)
# Re-attach the pair ids. These are Series assignments, so rows are aligned on the index:
# this relies on ids / match_ids and candidate_df sharing the same row labels.
candidate_df['ids']=ids
candidate_df['match_id']=match_ids

## Creating the submission

In [None]:
# Keep only the pairs predicted as matches and collect, per left-hand id, the list of matched ids.
# Ids without any predicted match get no entry in the dict.
pred_match_mask = candidate_df.pred==1
matches_list = dict([(ids, list(groupby_df.match_id.values)) for ids, groupby_df in candidate_df[pred_match_mask].groupby("ids")])

In [None]:
# Start from the sample submission; its "matches" column is blanked and filled in afterwards.
subm_df=pd.read_csv(SUB_PATH)
subm_df["matches"] = np.nan

In [None]:
def set_setter(selected_id):
    """Return the predicted matches of ``selected_id`` as a space-separated string.

    The matches are looked up in the module-level ``matches_list`` dict.
    An id without an entry yields ``""``.

    Only the expected lookup/join failures are handled (``KeyError`` for an
    unknown id, ``TypeError`` for an unhashable id or non-string matches).
    Anything else, such as ``matches_list`` not being defined because an
    earlier cell was skipped, now raises instead of silently producing an
    empty result.
    """
    try:
        return " ".join(matches_list[selected_id])
    except (KeyError, TypeError):
        return ""

# Fill "matches" with the space-separated predicted matches of each id ("" when there are none).
subm_df['matches'] = subm_df['id'].progress_apply(lambda x: set_setter(x))

### Adding the self match

In [None]:
# Prepend each id to its own match list. Ids without matches end up as "<id> " (trailing space).
# Not idempotent: re-running this cell prepends the id again.
subm_df['matches'] = subm_df['id'] + " " + subm_df['matches']

### Post-Processing

In [None]:
def post_process(df):
    """Make the match lists symmetric: if A lists B, B is made to list A.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with an ``id`` column and a ``matches`` column of
        whitespace-separated ids. It is updated in place.

    Returns
    -------
    pd.DataFrame
        The same ``df`` with ``matches`` rewritten as single-space-joined ids.

    A listed match that is not itself a row of ``df`` is skipped (it has no
    match list to extend) instead of raising ``KeyError``.
    """
    id2match = dict(zip(df['id'].values, df['matches'].str.split()))

    for base, match in tqdm(df[['id', 'matches']].values):
        match = match.split()
        if len(match) == 1:
            # Only the self match: nothing to propagate.
            continue

        for m in match:
            partners = id2match.get(m)
            if partners is None:
                continue
            if base not in partners:
                partners.append(base)

    df['matches'] = df['id'].map(id2match).map(' '.join)
    return df

In [None]:
# Make the match lists symmetric (also normalises the whitespace between ids).
subm_df = post_process(subm_df)

## The submission

In [None]:
# Display the final submission frame for a visual check.
subm_df

In [None]:
# Write the submission file to the working directory, without the index column.
subm_df.to_csv("submission.csv", index=False)