In [1]:
import cudf

# Read the competition test set into a cuDF (GPU) DataFrame.
df = cudf.read_csv(
    "../input/foursquare-location-matching/test.csv",
)

# Report (rows, columns), then preview the first records as the cell output.
print(df.shape)
df.head()

(5, 12)


Unnamed: 0,id,name,latitude,longitude,address,city,state,zip,country,url,phone,categories
0,E_00001118ad0191,Jamu Petani Bagan Serai,5.012169,100.535805,,,,,MY,,,Cafés
1,E_000020eb6fed40,Johnny's Bar,40.434209,-80.56416,497 N 12th St,Weirton,WV,26062.0,US,,,Bars
2,E_00002f98667edf,QIWI,47.215134,39.686088,"Межевая улица, 60",Ростов-на-Дону,,,RU,https://qiwi.com,78003011130.0,ATMs
3,E_001b6bad66eb98,"Gelora Sriwijaya, Jaka Baring Sport City",-3.014675,104.794374,,,,,ID,,,Stadiums
4,E_0283d9f61e569d,Stadion Gelora Sriwijaya,-3.021727,104.788628,Jalan Gubernur Hasan Bastari,Palembang,South Sumatra,11480.0,ID,,,Soccer Stadiums


In [2]:
# Debug mode: when only the 5-row sample file was loaded, tile it ten times
# so the downstream steps have more rows to work with.
if len(df) == 5:
    df = (
        cudf.concat([df for _copy in range(10)])
        .reset_index(drop=True)  # rebuild a clean 0..n-1 index after tiling
    )

df.shape

(50, 12)

In [3]:
from cuml.neighbors import NearestNeighbors

# Columns that hold each place's coordinates.
coo_cols = ["latitude", "longitude"]
coords = df[coo_cols]

# Fit a 5-nearest-neighbour index on the coordinates, then query it with the
# same points so every row gets its 5 closest rows.
# NOTE(review): presumably each row's first neighbour is the row itself — confirm.
matcher = NearestNeighbors(n_neighbors=5)
matcher.fit(coords)

distances, indices = matcher.kneighbors(coords)

In [4]:
from cuml.feature_extraction.text import TfidfVectorizer

# Vectorize place names with TF-IDF; missing names are replaced by the
# placeholder token "noname" before fitting.
names_filled = df["name"].fillna("noname")

tfidf = TfidfVectorizer()
V_name = tfidf.fit_transform(names_filled)

V_name.shape

(50, 14)

In [5]:
# Vectorize the category strings with a fresh TF-IDF model; missing values are
# replaced by the placeholder token "nocategory" before fitting.
categories_filled = df["categories"].fillna("nocategory")

tfidf = TfidfVectorizer()
V_cat = tfidf.fit_transform(categories_filled)

V_cat.shape

(50, 5)

In [6]:
from cuml import ForestInference


# Minimum predicted match probability for a candidate pair to be kept.
THRESHOLD = 0.5

# Feature columns passed to the classifier when scoring candidate pairs.
# NOTE(review): presumably this order must match the one used at training time — confirm.
features = ["dist", "name_sim", "cat_sim"]

# Load the pre-trained XGBoost model (JSON format) through cuML's Forest
# Inference Library. The path is a plain string: the original f-string had no
# placeholders, so the prefix was redundant.
xgb_model = ForestInference.load(
    "../input/fs-xgb-public-models/xgb_fs.json",
    output_class=True,
    model_type="xgboost_json",
)

In [7]:
# Build one candidate-pair frame per neighbour rank, score each pair with the
# classifier, and keep only the pairs predicted above THRESHOLD.

# Host-side array of ids. It is the same for every neighbour rank, so the
# GPU-to-host copy is done once instead of once per loop iteration.
ids_host = df["id"].to_pandas().values

dfs = []

for i in range(indices.shape[1]):
    # Row positions of the i-th nearest neighbour of every place, copied to
    # host memory once and reused for all three lookups below.
    nbr_idx = indices.values[:, i].get()

    tmp_df = df[["id"]].copy()

    tmp_df["dist"] = distances.values[:, i]
    # Row-wise dot product between each place's TF-IDF vector and that of its
    # neighbour (element-wise product summed over the vocabulary axis).
    # NOTE(review): equals cosine similarity only if TF-IDF rows are
    # L2-normalized — confirm the vectorizer's default.
    tmp_df["name_sim"] = V_name.multiply(V_name[nbr_idx]).sum(axis=1).ravel()
    tmp_df["cat_sim"] = V_cat.multiply(V_cat[nbr_idx]).sum(axis=1).ravel()
    tmp_df["match_id"] = ids_host[nbr_idx]

    # Column 1 of predict_proba is used as the match score.
    # NOTE(review): presumably column 1 is the positive ("is a match") class — confirm.
    tmp_df["pred"] = xgb_model.predict_proba(tmp_df[features].to_pandas())[:, 1]

    dfs.append(tmp_df[tmp_df["pred"] > THRESHOLD])

out_df = cudf.concat(dfs)
out_df.shape

(250, 6)

In [8]:
import pandas as pd

# Seed the result with one row per unique id so that every id appears in the
# output even when none of its candidate pairs passed the threshold.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
self_rows = df[["id"]].drop_duplicates().to_pandas()
out_df = pd.concat([self_rows, out_df.to_pandas()])

# The seed rows carry no match_id; make each id match itself.
missing_match = out_df["match_id"].isnull()
out_df.loc[missing_match, "match_id"] = out_df.loc[missing_match, "id"]

# Collect every match per id, then de-duplicate into a space-separated string.
# dict.fromkeys keeps first-seen order, so the output is reproducible across
# runs; iterating a set of strings is not, because its order depends on the
# interpreter's hash seed.
out_df = out_df.groupby("id")["match_id"].apply(list).reset_index()
out_df["matches"] = out_df["match_id"].apply(lambda x: " ".join(dict.fromkeys(x)))

out_df.head()

Unnamed: 0,id,match_id,matches
0,E_00001118ad0191,"[E_00001118ad0191, E_00001118ad0191, E_0000111...",E_00001118ad0191
1,E_000020eb6fed40,"[E_000020eb6fed40, E_000020eb6fed40, E_000020e...",E_000020eb6fed40
2,E_00002f98667edf,"[E_00002f98667edf, E_00002f98667edf, E_00002f9...",E_00002f98667edf
3,E_001b6bad66eb98,"[E_001b6bad66eb98, E_001b6bad66eb98, E_001b6ba...",E_001b6bad66eb98
4,E_0283d9f61e569d,"[E_0283d9f61e569d, E_0283d9f61e569d, E_0283d9f...",E_0283d9f61e569d


In [9]:
# Write the submission file: only the id and matches columns, no index column.
submission = out_df[["id", "matches"]]
submission.to_csv("submission.csv", index=False)