# 1. Data understanding

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Directory that holds the competition CSV files. Kept in one place so the
# notebook can be pointed at another copy of the data by editing a single line.
DATA_DIR = "/kaggle/input/foursquare-location-matching"

# train_df is the per-record table explored throughout the notebook; pairs_df
# holds record pairs (columns suffixed _1 / _2) together with a `match` flag.
train_df = pd.read_csv(f"{DATA_DIR}/train.csv")
pairs_df = pd.read_csv(f"{DATA_DIR}/pairs.csv")

## 1.1 columns in train_df


In [None]:
# Show the dtype pandas assigned to every column of train_df.
train_df.dtypes

In [None]:
# Preview the first rows of the pairs table.
pairs_df.head()

In [None]:
# Preview the first rows of the training table.
train_df.head()

## 1.2 Checking the number of null entries in each column

In [None]:
# Percentage of missing values in each column of train_df, smallest first.
train_df.isna().sum().mul(100).div(len(train_df)).sort_values()

In [None]:
# Summary statistics of train_df as produced by DataFrame.describe().
train_df.describe()

## 1.5 check for invalid latitude and longitude

In [None]:
# Latitude is valid on the closed interval [-90, 90]: the poles themselves
# (|lat| == 90) are legitimate coordinates, so only values strictly beyond them
# are counted. Series.sum() keeps the count vectorized instead of iterating in
# Python with the builtin sum().
print(f" Invalid latitudes: {(train_df.latitude.abs() > 90).sum()}")

In [None]:
# Longitude is valid on the closed interval [-180, 180]: the antimeridian
# (|lon| == 180) is a legitimate coordinate, so only values strictly beyond it
# are counted. Series.sum() keeps the count vectorized instead of iterating in
# Python with the builtin sum().
print(f" Invalid longitudes: {(train_df.longitude.abs() > 180).sum()}")

## DATA Conclusion:
1. All the places have a latitude, longitude and name.
2. Fill rate of the remaining columns, from most to least complete: country > categories > city > address > state > zip > phone > url.
3. Latitude and longitude seem to be in the valid ranges, from -90 to 90 and -180 to 180 respectively.
4. Almost all columns are of string type, except latitude, longitude and zip (which need to be type-cast).
5. Languages other than English are used in the data.


# 3. Business understanding

## 3.1 Based on lat, long; which part of the world does the records come from?

In [None]:
from shapely.geometry import Point
import geopandas as gpd
from geopandas import GeoDataFrame

# Build the point geometries straight from the coordinate columns.
# points_from_xy is vectorized, unlike creating one shapely Point per row in a
# Python list comprehension.
geometry = gpd.points_from_xy(train_df['longitude'], train_df['latitude'])
# NOTE(review): coordinates are assumed to be WGS84 lon/lat degrees (EPSG:4326),
# which is also the CRS of the basemap below - confirm against the data source.
gdf = GeoDataFrame(train_df, geometry=geometry, crs="EPSG:4326")

# Simple low-resolution world basemap that ships with geopandas.
# NOTE(review): gpd.datasets is deprecated in recent geopandas releases and, to
# my knowledge, removed in 1.0; on such versions the basemap has to be loaded
# from another source (e.g. a local Natural Earth file).
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
ax = world.plot(figsize=(10, 6))
gdf.plot(ax=ax, marker='o', color='red', markersize=15)
ax.set_title("Locations of train_df records");

Interestingly, some of the points fall in the ocean or on the Antarctic continent.

### Calculating distance between pairs

In [None]:
from math import radians, cos, sin, asin, sqrt
def haversine(lon1, lat1, lon2, lat2, radius_km=6371):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees).

    Works element-wise: every coordinate argument may be a scalar, a NumPy
    array or a pandas Series (mixed inputs broadcast the usual NumPy way).

    Parameters
    ----------
    lon1, lat1 : longitude and latitude of the first point(s), in degrees.
    lon2, lat2 : longitude and latitude of the second point(s), in degrees.
    radius_km : sphere radius in kilometres; defaults to 6371, the value the
        distance was originally computed with.

    Returns
    -------
    Distance in kilometres, with the same shape as the broadcast inputs.
    """
    lon1, lat1, lon2, lat2 = (np.radians(v) for v in (lon1, lat1, lon2, lat2))
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can leave `a` marginally above 1 for (nearly)
    # antipodal points, which would make the arcsine undefined; clamp it to the
    # mathematically valid range first.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    km = radius_km * c
    return km

In [None]:
# Great-circle distance in km between the two records of every pair.
# The four coordinate columns are walked in lockstep with zip rather than
# DataFrame.apply(axis=1), which has to build a full row Series for every pair
# before haversine is even called.
pairs_df['distance'] = [
    haversine(lon_1, lat_1, lon_2, lat_2)
    for lon_1, lat_1, lon_2, lat_2 in zip(
        pairs_df['longitude_1'],
        pairs_df['latitude_1'],
        pairs_df['longitude_2'],
        pairs_df['latitude_2'],
    )
]

Distances can be large even when the pair is a match => (lat, long) data on its own is unreliable.

## 3.2 Can the distance be 0 and still no match?
Ans : yes, but only few

In [None]:
# Pairs that are NOT a match: how many of them are less than 1 km apart?

not_match = pairs_df.loc[pairs_df['match'] == False]
not_match['distance'].lt(1).value_counts()

In [None]:
# Count the non-matching pairs whose two records have a computed distance of exactly 0.
print(f'Number of no match pair on same (lat, long) : {not_match["distance"].eq(0).sum()}')

## 3.2 Can there be match if distance > 1 km?
Ans :  Yes

In [None]:
# Pairs that ARE a match: how many of them are more than 1 km apart?
match = pairs_df.loc[pairs_df['match'] == True]
match['distance'].gt(1).value_counts()

In [None]:
# Matched pairs whose two records are more than 1 km apart.
# Both conditions are combined into a single mask: chaining two [] selections
# applies a mask indexed like the full pairs_df to the already-filtered frame,
# which pandas only accepts by reindexing the mask (and warns about).
matched_pairs_1km_dist = pairs_df.loc[
    (pairs_df['match'] == True) & (pairs_df['distance'] > 1)
]

In [None]:
# Show the descriptive attributes of both records side by side for the
# matched pairs that are more than 1 km apart.
matched_pairs_1km_dist.loc[:, [
    'address_1', 'address_2',
    'city_1', 'city_2',
    'state_1', 'state_2',
    'zip_1', 'zip_2',
    'country_1', 'country_2',
    'url_1', 'url_2',
    'phone_1', 'phone_2',
    'categories_1', 'categories_2',
]]

In [None]:
# Count the matching pairs whose computed distance exceeds 1 km.
print(f'Number of matched pair with dist > 1 km : {match["distance"].gt(1).sum()}')

## 3.3 Are there common latitude, longitude values?
Ans : Yes

In [None]:
# Row count per distinct (latitude, longitude) pair, showing the 50 most repeated.
(
    train_df
    .groupby(['latitude', 'longitude'])
    .size()
    .sort_values(ascending=False)
    .iloc[:50]
)

Values for (38.415602  -95.925751) :

In [None]:
# Rows located at (38.415602, -95.925751) after rounding to 6 decimals.
# The two conditions are combined into one mask: chaining two [] selections
# applies a mask indexed like the full train_df to the already-filtered frame,
# which pandas only accepts by reindexing the mask (and warns about).
train_df.loc[
    (np.round(train_df.latitude, 6) == 38.415602)
    & (np.round(train_df.longitude, 6) == -95.925751)
]

In [None]:
# List the column labels currently present on train_df.
train_df.columns

## 3.4 What are the categories involved?

In [None]:
# Split every `categories` value on ',', count how often each resulting token
# occurs, and keep the 20 most frequent tokens.
# NOTE(review): tokens are not stripped - if the raw values separate categories
# with ', ' the same category may be counted under two spellings; verify.
top_20_cat = (
    train_df.categories
    .str.split(',')
    .explode()
    .reset_index()
    .groupby('categories')
    .size()
    .sort_values(ascending=False)
    .iloc[:20]
)

top_20_cat.plot.bar(rot=90)

In [None]:
# Display the counts behind the bar chart of the 20 most frequent category tokens.
top_20_cat

## 3.5 What is data fill rate for top 20 present categories?

In [None]:
def in_top_20_cat(cats, top_cats=None):
    """
    Tell whether a comma-separated category string mentions a top category.

    Parameters
    ----------
    cats : the raw `categories` value. Anything that is not a string (None,
        float NaN for a missing value, ...) is treated as "no categories".
    top_cats : optional container of category names to test against. When
        omitted, the index of the module-level `top_20_cat` is used.

    Returns
    -------
    True if at least one ','-separated token of `cats` is in `top_cats`,
    otherwise False.
    """
    # Missing values show up as None or float NaN; neither can be split.
    if not isinstance(cats, str):
        return False
    if top_cats is None:
        # Membership is tested directly on the index, so no set has to be
        # rebuilt from it on every call.
        top_cats = top_20_cat.index
    return any(token in top_cats for token in cats.split(","))

# Flag every row whose categories include at least one of the top-20 tokens.
train_df['in_top_20_cat'] = train_df['categories'].map(in_top_20_cat)


In [None]:
# Percentage of missing values per column, restricted to the rows flagged as
# belonging to a top-20 category; smallest first.
top_20_cat_df = train_df.loc[train_df['in_top_20_cat']]
top_20_cat_df.isna().sum().mul(100).div(len(top_20_cat_df)).sort_values()

## 3.5 How many countries ?
Ans:  221

In [None]:
# Number of distinct values in the `country` column.
print(f"Total number of contries in the data: {train_df['country'].nunique()}")

## 3.6 Top 20 countries data wise?
Ans:  Long tailed distribution here

In [None]:
# Bar chart of the 20 countries with the most rows in train_df.
(
    train_df
    .groupby("country")
    .size()
    .sort_values(ascending=False)
    .iloc[:20]
    .plot.bar(rot=90)
)

# Business understanding Conclusion:
1. Data is present from across the globe, although some locations fall on water bodies, so latitude and longitude might not be correct for all records.
2. 19587 different data points are within 1 km of each other, showing that a lot of data points are close together.
3. 75978 matching pairs have a distance > 1 km, which again calls into question relying solely on location data.
4. 574 data points share the same location.
5. (38.415602, -95.925751) is the most common location in the dataset, repeating 1437 times.
6. All of the points above discourage me from relying solely on location data for matching.
7. Residential Buildings (Apartments / Condos) and Banks are the top 2 categories.
8. Data is from 221 countries, but it is heavily skewed to countries like US, TR etc.
9. Apart from location data, categories, country and address-like information can be useful for feature engineering.