In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

# Apply seaborn's default theme and make every figure 14x7 unless overridden.
sns.set(rc={"figure.figsize": (14, 7)})

# We load all the positive pairs - calculated in a different notebook

In [None]:
# Read the precomputed positive pairs, then discard the CSV's first column.
pos_pairs = pd.read_csv(
    "../input/all-positive-pairs/positive_pairs.csv",
    low_memory=False,
)
pos_pairs = pos_pairs.drop(columns=pos_pairs.columns[0])

# Get Euclidean distance - based on latitude and longitude

In [None]:
# Per-axis coordinate differences between the two members of each pair.
pos_pairs["lat_dist"] = pos_pairs["latitude_1"] - pos_pairs["latitude_2"]
pos_pairs["long_dist"] = pos_pairs["longitude_1"] - pos_pairs["longitude_2"]
# Straight-line distance in raw latitude/longitude units (not a geodesic distance).
pos_pairs["euclid_dist"] = ((pos_pairs["lat_dist"]**2) + (pos_pairs["long_dist"]**2)) ** 0.5

# Show each difference next to BOTH of its inputs: longitude_1 and longitude_2
# (not longitude_1 twice), so long_dist can be checked by eye.
pos_pairs[["latitude_1", "latitude_2", "lat_dist", "longitude_1", "longitude_2", "long_dist", "euclid_dist"]]

# A regular histogram/boxplot shows that the vast majority of pairs are very close
# But the visualization is not that helpful...

In [None]:
# Histogram of the raw distances (80 bins), rendered as its own figure.
pos_pairs["euclid_dist"].plot(kind="hist", bins=80)
plt.show()

# Box plot of the same raw distances, rendered as a second figure.
sns.boxplot(x=pos_pairs["euclid_dist"])
plt.show()

# Plotting on a log scale is much more informative
# (top values are capped at the 99.9th percentile; zeros and the rest of the bottom 0.1% are replaced by a tiny positive value)

In [None]:
# 0.1th / 99.9th percentile cut-offs used to tame both tails before the log.
near_bottom_value, near_top_value = np.percentile(
    pos_pairs["euclid_dist"], [0.1, 99.9]
)

# Work on a derived Series so the column in pos_pairs is left untouched.
# Order matters: the bottom tail is replaced first, the top tail is capped second.
plotting_distances = (
    pos_pairs["euclid_dist"]
    # can't be zero for log
    .mask(lambda dist: dist <= near_bottom_value, 0.000001)
    .mask(lambda dist: dist >= near_top_value, near_top_value)
)

log_plotting_distances = np.log(plotting_distances)

In [None]:
def round_log_tick_labels(labels):
    """Round each tick label to its first non-zero decimal place.

    For every value the number of decimals is increased from 0 until the
    rounded value is no longer zero, so ``0.0042`` becomes ``0.004`` while
    ``20.1`` becomes ``20``. Values that already round to a non-zero whole
    number are converted to ``int`` so they display without a trailing ``.0``.
    A value of exactly zero is returned as ``0``.

    Args:
        labels: Sequence of numbers that supports ``.copy()`` and item
            assignment (e.g. a list). The sequence passed in is not modified.

    Returns:
        A copy of ``labels`` holding the rounded values.
    """
    rounded_labels = labels.copy()
    for idx, value in enumerate(rounded_labels):
        if value == 0:
            # Zero never rounds to a non-zero number, so the decimal search
            # below would loop forever; it needs no rounding anyway.
            rounded_labels[idx] = 0
            continue

        # Find the smallest number of decimals that keeps a non-zero digit.
        round_num = 0
        while round(value, round_num) == 0:
            round_num += 1

        value = round(value, round_num)
        # Whole-number results become ints purely for cleaner tick text.
        rounded_labels[idx] = int(value) if round_num == 0 else value

    return rounded_labels

def format_log_labels(ax):
    """Relabel the x ticks of a natural-log plot with back-transformed values.

    The current x tick positions are exponentiated, rounded with
    ``round_log_tick_labels`` and written back as the tick labels.

    Args:
        ax: Axes object whose x-axis ticks are read and relabelled in place.
    """
    tick_positions = ax.get_xticks()
    original_scale = list(map(np.exp, tick_positions))
    # Pin the tick positions before assigning one label per position.
    ax.set_xticks(tick_positions.tolist())
    ax.set_xticklabels(round_log_tick_labels(original_scale))

In [None]:
# Histogram of the log-distances; x ticks are relabelled with original-scale values.
ax = log_plotting_distances.plot(kind="hist", bins=80, figsize=(14, 7))
format_log_labels(ax)

In [None]:
# Box plot of the same log-distances; x ticks are relabelled with original-scale values.
ax = sns.boxplot(x=log_plotting_distances)
format_log_labels(ax)