In [None]:
import pandas as pd
import cudf

In [None]:
# Load the competition's training table with cuDF.
train_csv_path = "../input/foursquare-location-matching/train.csv"
train_df = cudf.read_csv(train_csv_path)

# Get all pairs of entries that share a point of interest

In [None]:
# Self-join the (id, point_of_interest) table on point_of_interest so that
# every two entries sharing a POI end up side by side as (id_1, id_2).
poi_lookup = train_df[["id", "point_of_interest"]]
joined = poi_lookup.merge(
    poi_lookup,
    on="point_of_interest",
    suffixes=("_1", "_2"),
)[["id_1", "id_2"]]

# The lookup table is no longer needed once the join is done.
del poi_lookup

# Keep a single orientation of each pair: the strict "<" removes both the
# self-pairs (id_1 == id_2) and the mirrored duplicates (b, a) of (a, b).
ordered = joined["id_1"] < joined["id_2"]
pairs_df = joined[ordered]
del joined, ordered

# Add other fields

In [None]:
# Attach every train.csv attribute to both sides of each pair.
#
# The suffixed tables are built as renamed copies, so `train_df` keeps its
# original column names, and the merge always starts from the two id columns.
# Together this makes the cell safe to re-run: previously `train_df.columns`
# was overwritten in place, so a second run produced names such as "id_2_1"
# and the merge on "id_1" failed.
original_columns = train_df.columns

side_1 = train_df.rename(columns={name: f"{name}_1" for name in original_columns})
side_2 = train_df.rename(columns={name: f"{name}_2" for name in original_columns})

pairs_df = (
    pairs_df[["id_1", "id_2"]]
    .merge(side_1, on="id_1")
    .merge(side_2, on="id_2")
)

# Release the temporary suffixed copies as soon as they have been merged.
del side_1, side_2

# Keep a single point_of_interest column for the pair and record whether both
# sides belong to the same POI.
pairs_df["point_of_interest"] = pairs_df["point_of_interest_1"]
pairs_df["match"] = (pairs_df["point_of_interest_1"] == pairs_df["point_of_interest_2"])

del pairs_df["point_of_interest_1"], pairs_df["point_of_interest_2"]

# Reorder columns and output CSV

In [None]:
# Only the header of pairs.csv is needed to learn the reference column order,
# so parse zero data rows instead of loading the whole file.
pair_columns = list(
    pd.read_csv("../input/foursquare-location-matching/pairs.csv", nrows=0).columns
)
# Also keep the shared POI label after the reference columns.
pair_columns += ["point_of_interest"]

# Select the columns in the reference order.
pairs_df = pairs_df[pair_columns].copy()

# NOTE(review): to_csv is called with its defaults, so the frame index is
# presumably written as an extra leading column — confirm consumers expect it.
pairs_df.to_csv("positive_pairs.csv")

pairs_df.shape

# Sample usage for analysis

In [None]:
# For every attribute except the last original column, report the share of
# pairs whose two sides agree, disagree, or have at least one value missing.
#
# NOTE(review): the scaling by (1 - missing share) presumably relies on cuDF
# comparisons yielding null when either operand is null and on mean() skipping
# those nulls, so the raw means are shares among fully populated pairs —
# confirm against the cuDF version in use.
for col_name in original_columns[:-1]:
    left = pairs_df[f"{col_name}_1"]
    right = pairs_df[f"{col_name}_2"]

    equal_share = (left == right).mean()
    differ_share = (left != right).mean()
    missing_share = (left.isna() | right.isna()).mean()

    # Rescale the two comparison shares by the fraction of pairs with no
    # missing side, then round everything to two decimals for display.
    same_pct = round(equal_share * (1 - missing_share), 2)
    diff_pct = round(differ_share * (1 - missing_share), 2)
    missing_pct = round(missing_share, 2)

    report = f"{col_name}: {same_pct} match, {diff_pct} don't match"
    if missing_pct > 0.0:
        report = report + f", and {missing_pct} missing"
    print(report)