In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk /kaggle/input and print the full path of every file found,
# one path per line. The sub-directory list yielded by os.walk is discarded.
# NOTE(review): the loop binds `_` at notebook level, which presumably shadows
# IPython's last-output shortcut `_` for the rest of the session -- confirm.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
# Install a blanket 'ignore' filter so no warning of any category is shown.
# NOTE(review): this also hides deprecation and performance warnings emitted by
# the libraries used later in the notebook, which can mask real problems.
warnings.filterwarnings('ignore')
%load_ext Cython

In [None]:
# Load the pairs table used by every benchmark below. Later cells read its
# name_1/name_2, latitude_1/latitude_2 and longitude_1/longitude_2 columns.
pair_df = pd.read_csv('/kaggle/input/foursquare-location-matching/pairs.csv')

# LCS (Longest Common Subsequence) Implementations

In [None]:
%%cython
# Original LCS implementation
def LCS(str S, str T):
    """Return the length of the longest common subsequence of ``S`` and ``T``.

    Baseline dynamic-programming version: it fills a full
    ``(len(S) + 1) x (len(T) + 1)`` table of Python ints, where cell
    ``[r + 1][c + 1]`` holds the LCS length of ``S[:r + 1]`` and ``T[:c + 1]``.
    Row 0 and column 0 stay at zero (empty-prefix case).
    """
    cdef int row, col
    cdef int n_rows = len(S)
    cdef int n_cols = len(T)
    cdef list table = [[0] * (n_cols + 1) for _ in range(n_rows + 1)]
    for row in range(n_rows):
        for col in range(n_cols):
            table[row + 1][col + 1] = max(
                table[row][col] + (S[row] == T[col]),  # extend the diagonal on a match
                table[row + 1][col],                   # skip T[col]
                table[row][col + 1],                   # skip S[row]
                table[row + 1][col + 1],               # current cell (still 0 here)
            )
    return table[n_rows][n_cols]

In [None]:
%%cython
# Optimized version
import numpy as np
cimport numpy as np

import cython
from libc.stdlib cimport malloc, free


@cython.boundscheck(False)  # turn off bounds-checking for entire function
@cython.wraparound(False)   # turn off negative index wrapping for entire function
def fast_LCS(str S, str T):
    """Return the length of the longest common subsequence of ``S`` and ``T``.

    Memory-optimised dynamic programme: only two rows of the DP table are
    kept (previous and current) and they are swapped after every character
    of the longer string. The shorter string is always used for the row
    width, so memory is ``O(min(len(S), len(T)))``.

    The rows are stored as ``uint32`` so the running count cannot wrap around
    for inputs whose common subsequence is longer than 65535 characters
    (a ``uint16`` row would silently overflow there). Indices use
    ``Py_ssize_t``, the native type for string lengths and memoryview
    indexing.

    Parameters
    ----------
    S, T : str
        The two strings to compare. Argument order does not affect the result.

    Returns
    -------
    int
        LCS length; 0 when either string is empty.
    """
    # Make T the shorter string so the DP rows are as small as possible.
    if len(S) < len(T):
        S, T = T, S

    cdef Py_ssize_t i, j
    cdef Py_ssize_t n_long = len(S)
    cdef Py_ssize_t n_short = len(T)
    cdef np.uint32_t[:] dp_prev, dp_curr

    # Index 0 of each row is the empty-prefix column and is never written.
    dp_prev = np.zeros(n_short + 1, dtype=np.uint32)
    dp_curr = np.zeros(n_short + 1, dtype=np.uint32)

    for i in range(n_long):
        for j in range(n_short):
            dp_curr[j + 1] = max(dp_prev[j] + (1 if S[i] == T[j] else 0), dp_curr[j], dp_prev[j + 1])
        # The row just filled becomes "previous"; the old one is fully
        # overwritten (indices 1..n_short) on the next pass.
        dp_prev, dp_curr = dp_curr, dp_prev
    return dp_prev[n_short]

In [None]:
import numba as nb
# The loop body calls a Cython-compiled function on Python string objects,
# which Numba cannot type in nopython mode. Object mode is therefore requested
# explicitly: newer Numba releases default @jit to nopython and would raise
# instead of falling back silently.
@nb.jit(forceobj=True, cache=True)
def LCS_arr(str_arr_1, str_arr_2):
    """Compute the LCS length for each aligned pair of strings.

    Parameters
    ----------
    str_arr_1, str_arr_2 : numpy.ndarray
        One-dimensional arrays of strings. Element ``i`` of the first array is
        compared with element ``i`` of the second; the first array's ``size``
        determines how many pairs are processed.

    Returns
    -------
    numpy.ndarray
        ``uint16`` array of length ``str_arr_1.size`` holding
        ``fast_LCS(str_arr_1[i], str_arr_2[i])`` at position ``i``.
    """
    out = np.empty(str_arr_1.size, dtype=np.uint16)
    for i in range(str_arr_1.size):
        out[i] = fast_LCS(str_arr_1[i], str_arr_2[i])
    return out

In [None]:
%%time
# Baseline timing: call the original LCS once per row and collect the results
# in a Python list.
# NOTE(review): every iteration also performs two pandas .iloc lookups, so the
# measured time covers that indexing overhead as well as the LCS computation.
LCS_result_original = []
for i in range(pair_df.shape[0]):
    LCS_result_original.append(LCS(pair_df['name_1'].iloc[i], pair_df['name_2'].iloc[i]))

In [None]:
%%time
# Timing for the optimized path: both name columns are passed as arrays via
# .values, so the per-pair loop runs inside LCS_arr rather than through .iloc.
LCS_result_fast = LCS_arr(pair_df['name_1'].values, pair_df['name_2'].values)

# Haversine Implementations

In [None]:
# original implementation
def vectorized_haversine(lats1, lats2, longs1, longs2):
    """Haversine great-circle distance between paired coordinates (NumPy only).

    Every argument is passed through ``np.radians``, so coordinates are
    expected in degrees. Inputs may be scalars or arrays that broadcast
    against each other; the result has the broadcast shape.

    Returns ``6371 * central_angle`` -- 6371 is presumably Earth's mean radius
    in kilometres, which would make the result kilometres.
    """
    earth_radius = 6371

    # Half of each angular difference, in radians.
    half_dlat = np.radians(lats2 - lats1) / 2
    half_dlon = np.radians(longs2 - longs1) / 2
    sin_half_dlat = np.sin(half_dlat)
    sin_half_dlon = np.sin(half_dlon)

    cos_lat1 = np.cos(np.radians(lats1))
    cos_lat2 = np.cos(np.radians(lats2))

    # Haversine term; multiplication order is kept left-to-right on purpose.
    a = sin_half_dlat * sin_half_dlat + cos_lat1 * cos_lat2 * sin_half_dlon * sin_half_dlon
    central_angle = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return earth_radius * central_angle

In [None]:
# fast implementation
import numexpr as ne
from numpy import arcsin, sqrt, sin, cos
def fast_haversine(lats1, lats2, longs1, longs2):
    """Haversine great-circle distance evaluated with numexpr.

    Every argument is passed through ``np.radians``, so coordinates are
    expected in degrees. The arcsin form of the haversine formula is written
    as a single numexpr expression and scaled by ``2 * 6371`` -- 6371 is
    presumably Earth's mean radius in kilometres.

    Returns whatever ``ne.evaluate`` produces for that expression.
    """
    # These locals are looked up by name from the expression string below,
    # so their names must stay in sync with it.
    lat_1 = np.radians(lats1)
    lat_2 = np.radians(lats2)
    lon_1 = np.radians(longs1)
    lon_2 = np.radians(longs2)
    radius_multiplier = 2 * 6371
    expr = """arcsin(sqrt(sin((lat_2 - lat_1) / 2) ** 2
              + (sin((lon_2 - lon_1) / 2) ** 2 * cos(lat_1) * cos(lat_2)))) * radius_multiplier"""
    return ne.evaluate(expr)

In [None]:
%%timeit
# Benchmark the pure-NumPy haversine on the full latitude/longitude columns.
vectorized_haversine(pair_df['latitude_1'].values, pair_df['latitude_2'].values,
                     pair_df['longitude_1'].values, pair_df['longitude_2'].values)

In [None]:
%%timeit
# Benchmark the numexpr-based haversine on the same columns for comparison.
fast_haversine(pair_df['latitude_1'].values,pair_df['latitude_2'].values,
               pair_df['longitude_1'].values,pair_df['longitude_2'].values)