In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Let pandas render up to 2000 rows and 100 columns before it truncates displayed frames.
pd.set_option("display.max_rows", 2000)
pd.set_option("display.max_columns", 100)
import matplotlib.pyplot as plt
# Colours of matplotlib's current property cycle.
color_pal = plt.rcParams['axes.prop_cycle'].by_key()['color']
import seaborn as sns
import pickle 

import datetime
import warnings
# Silences every warning for the rest of the session (deprecation notices included).
warnings.filterwarnings("ignore")
from sklearn.model_selection import GroupKFold, KFold

from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px

# NOTE(review): third-party package that this notebook installs with pip in a later cell;
# on a fresh kernel without it, this import fails unless the install runs first.
from haversine import haversine, Unit
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_path in (os.path.join(folder, file_name) for file_name in file_names):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training records and the pre-built candidate pairs (read in this order).
train, pairs = map(
    pd.read_csv,
    (
        '/kaggle/input/foursquare-location-matching/train.csv',
        '/kaggle/input/foursquare-location-matching/pairs.csv',
    ),
)
# The test file is not loaded in this notebook:
# test = pd.read_csv('/kaggle/input/foursquare-location-matching/test.csv')

In [None]:
## To create the DF summary
def summarytable(df):
    """Build a one-row-per-column overview of a DataFrame.

    Prints the frame's shape, then returns a new DataFrame with these columns:

    - ``Name``: column label
    - ``dtypes``: column dtype
    - ``Missing``: number of null entries
    - ``Missing_pct``: fraction of null entries, rounded to 2 decimals
    - ``Uniques``: number of distinct non-null values
    - ``First Value`` / ``Second Value`` / ``Third Value``: the values found in
      the first three rows *by position*; NaN when the frame has fewer rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The summary table, indexed 0..n_columns-1.
    """
    print(f"Dataset Shape: {df.shape}")

    null_mask = df.isnull()
    summary = pd.DataFrame({
        'Name': df.columns.to_numpy(),
        'dtypes': df.dtypes.values,
        'Missing': null_mask.sum().values,
        'Missing_pct': null_mask.mean().round(2).values,
        'Uniques': df.nunique().values,
    })

    # Sample rows are taken positionally (iloc) so the function also works when
    # the index is not 0, 1, 2, ... (e.g. after filtering); label lookups such
    # as df.loc[0] raise KeyError in that case. Rows that do not exist are
    # reported as NaN instead of failing.
    sample_labels = ('First Value', 'Second Value', 'Third Value')
    for position, label in enumerate(sample_labels):
        summary[label] = df.iloc[position].values if position < len(df) else np.nan

    return summary

# Show the per-column summary of each dataset under its own heading.
for heading, frame in (
    ("Displaying train datset ------>", train),
    ("Displaying pairs datset ------>", pairs),
):
    print(heading)
    display(summarytable(frame))

In [None]:
# Share of missing values per train column (a fraction rounded to 2 decimals), smallest first.
train.isna().mean().round(2).sort_values().plot.barh(
    figsize=(10, 6),
    title='Missing Value Percentages',
)

In [None]:
# Missing-value share per column: train on the left panel, pairs on the right.
fig, ax = plt.subplots(1, 2, figsize=(25, 10))
for frame, panel in zip((train, pairs), ax):
    missing_share = frame.isna().mean().round(2).sort_values()
    missing_share.plot.barh(title='Missing Value Percentages', ax=panel)
plt.tight_layout()
plt.show()

In [None]:
# describe() restricted to object-dtype columns, for each dataset in turn.
for heading, frame in (
    ("Displaying train aggragation for categorical variables ------>", train),
    ("Displaying pairs aggragation for categorical variables ------>", pairs),
):
    print(heading)
    display(frame.describe(include='object'))

## Univariate and Bivariate Analysis

In [None]:
# For each of three columns, count its non-null entries per venue name and
# draw the ten highest-count names as a bar chart in its own subplot.
counted_columns = ['id', 'categories', 'point_of_interest']
fig = make_subplots(rows=1, cols=3, subplot_titles=tuple(counted_columns))

for panel_no, column in enumerate(counted_columns, start=1):
    per_name = train.groupby('name', as_index=False).agg({column: 'count'})
    top_names = per_name.sort_values(by=column, ascending=False).nlargest(10, columns=column)
    bars = go.Bar(
        x=top_names['name'],
        y=top_names[column],
        text=top_names[column].values,
        name=column,
    )
    fig.add_trace(bars, row=1, col=panel_no)

fig.update_layout(height=500, width=1400, title_text="Top 10 Frequency Distribution of NAME Variables")
fig.show()

In [None]:
# Ten countries, then ten states, with the most non-null point_of_interest entries.
for region_column in ('country', 'state'):
    display(
        train.groupby([region_column], as_index=False)
        .agg({'point_of_interest': 'count'})
        .sort_values(by='point_of_interest', ascending=False)
        .nlargest(10, columns='point_of_interest')
    )

In [None]:
# Ten cities with the most non-null point_of_interest entries, highest count first.
top_cities = (
    train.groupby('city', as_index=False)
    .agg({'point_of_interest': 'count'})
    .sort_values(by='point_of_interest', ascending=False)
    .nlargest(10, columns='point_of_interest')
)

# Marker size and colour values 100, 90, ..., 10 — the first (highest-count) city gets the largest bubble.
color_size = np.arange(start=100, stop=0, step=-10)

bubble_marker = dict(color=color_size, size=color_size, showscale=True)
fig = go.Figure(
    data=[
        go.Scatter(
            x=top_cities['city'],
            y=top_cities['point_of_interest'],
            mode='markers',
            marker=bubble_marker,
        )
    ]
)
fig.show()

In [None]:
# Side-by-side histograms (30 bins each) of the train latitude and longitude columns.
fig, ax = plt.subplots(1, 2, figsize=(25, 10))
for coordinate, panel in zip(("latitude", "longitude"), ax):
    sns.histplot(train, x=coordinate, bins=30, ax=panel)
    panel.set_title(f"Distribution of {coordinate}")

# Rendered with plt.show(), like the other matplotlib cells in this notebook:
# Figure.show() is meant for GUI backends and only warns on non-GUI ones.
plt.tight_layout()
plt.show()

In [None]:
# One point per training row, longitude on x and latitude on y.
plt.figure(figsize=(15, 9))
sns.scatterplot(x='longitude', y='latitude', data=train).set_title(
    "Scatterplot of latitude and longitude of training observations",
    fontsize=14,
)
plt.tight_layout()
plt.show()

# Pairs Dataset

In [None]:
# Relative frequency of each value in the `match` column, as a one-column frame.
# The column is named explicitly: pandas >= 2.0 calls the result of
# value_counts(normalize=True) "proportion" instead of reusing the source
# column's name, so relying on the implicit name is not portable.
pairs1 = pairs['match'].value_counts(normalize=True).rename('match').to_frame()

# Donut chart (hole=.3) with one slice per distinct `match` value.
fig = go.Figure(data=[go.Pie(labels=pairs1.index, values=pairs1['match'], hole=.3)])
fig.update_layout(title_text="Match column distribution")
fig.show()

In [None]:
!pip install haversine

In [None]:
# Great-circle distance in kilometres between the two coordinates of every pair, in row order.
pairs['distance_in_kms'] = [
    haversine((lat_1, lon_1), (lat_2, lon_2), unit='km')
    for lat_1, lon_1, lat_2, lon_2 in zip(
        pairs['latitude_1'], pairs['longitude_1'], pairs['latitude_2'], pairs['longitude_2']
    )
]

In [None]:
# Summary statistics of the pairwise distances.
pairs.distance_in_kms.describe()

In [None]:
# Distances in ascending order plotted against their rank (0 .. number of pairs - 1).
plt.scatter(
    x=np.arange(len(pairs)),
    y=pairs['distance_in_kms'].sort_values().to_numpy(),
)

## Distance between points vs. match rate

In [None]:
# Mean of `match` within each of 20 quantile bins of the pair distance, as a bar chart.
(
    pairs
    .assign(dist_grp=lambda frame: pd.qcut(frame['distance_in_kms'], 20))
    .groupby('dist_grp')
    .agg({'match': ['mean']})
    .plot.bar(
        title='Distance(km) vs match rate',
        figsize=(15, 6),
        legend=False,
    )
)

In [None]:
# `match` values of the pairs whose two points are exactly 0 km apart.
zero_km_match = pairs.loc[pairs['distance_in_kms'] == 0, 'match']
print(f" Match rate for distance = 0km is {zero_km_match.mean().round(2)*100}%")

🤓

#### MORE TO COME!!!