In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    input_paths = [os.path.join(dirname, filename) for filename in filenames]
    for input_path in input_paths:
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import tqdm

## Acknowledgements

The world-cities database used in this notebook: https://www.kaggle.com/datasets/juanmah/world-cities

The haversine distance function (named `harvesian` in the code below) is adapted from https://www.kaggle.com/competitions/foursquare-location-matching/discussion/319853 and https://www.kaggle.com/competitions/recruit-restaurant-visitor-forecasting/discussion/48655

In [None]:
# Load the competition training table; every later cell reads from this frame.
train_df = pd.read_csv(
    filepath_or_buffer="../input/foursquare-location-matching/train.csv",
)

In [None]:
# Fraction of rows whose "city" field is missing.
city_is_missing = train_df["city"].isna()
city_is_missing.sum() / len(train_df)

In [None]:
# Fraction of missing values in each coordinate column (latitude first, then longitude).
for coordinate_column in ("latitude", "longitude"):
    print(train_df[coordinate_column].isna().sum() / len(train_df))

The code above shows that about 26% of the `city` values are missing, which is about 300k rows.

In this notebook I use the latitude and longitude columns (which have no missing values) together with the world-cities database to calculate and assign the nearest city to each row.

### Helper Functions

In [None]:
def harvesian(x_lats, y_lats, x_longs, y_longs, radius=6371):
    """Pairwise great-circle (haversine) distances between two sets of points.

    Parameters
    ----------
    x_lats, x_longs : array-like of shape (n,)
        Latitudes and longitudes, in degrees, of the query points.
    y_lats, y_longs : array-like of shape (m,)
        Latitudes and longitudes, in degrees, of the reference points.
    radius : float, default 6371
        Radius of the sphere. The result is expressed in the same unit as
        this value (the default is the mean Earth radius in kilometres).

    Returns
    -------
    numpy.ndarray of shape (n, m)
        Entry ``[i, j]`` is the distance between query point ``i`` and
        reference point ``j``.
    """
    # Query coordinates become column vectors so that subtracting / multiplying
    # with the 1-D reference arrays broadcasts to an (n, m) matrix.
    lats1 = np.asarray(x_lats).reshape(-1, 1)
    longs1 = np.asarray(x_longs).reshape(-1, 1)
    lats2 = np.asarray(y_lats)
    longs2 = np.asarray(y_longs)

    dlat = np.radians(lats2 - lats1)
    dlon = np.radians(longs2 - longs1)

    a = (
        np.sin(dlat / 2) ** 2
        + np.cos(np.radians(lats1)) * np.cos(np.radians(lats2)) * np.sin(dlon / 2) ** 2
    )
    # Mathematically 0 <= a <= 1, but rounding can push it marginally outside
    # that range for (near-)antipodal points. sqrt(1 - a) would then be NaN,
    # and a NaN distance is selected by argmin as if it were the minimum.
    a = np.clip(a, 0.0, 1.0)

    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return radius * c

### Finding the Nearest City

In [None]:
# Load the world-cities reference table.
# pandas' default NA markers include the literal string "NA", which is also the
# ISO-3166 alpha-2 code for Namibia. Disable the defaults and treat only empty
# fields as missing so that the "iso2" value "NA" survives as text.
cities_df = pd.read_csv(
    "../input/world-cities/worldcities.csv",
    keep_default_na=False,
    na_values=[""],
)

In [None]:
# Coordinates of every reference city; these are the candidates for "nearest city".
y_lats, y_longs = cities_df["lat"].values, cities_df["lng"].values

# Number of train_df rows processed per batch. Computing the whole frame at once
# would produce a 1138812 x 42905 distance matrix, which is very difficult to fit
# into memory, so set this value based on the available RAM.
BATCH_SIZE = 500

# The search runs too slowly on a Kaggle CPU session, so it is disabled by default
# and the precomputed result is loaded from disk in a later cell instead.
# Set this flag to True to recompute the nearest city for every row.
RUN_NEAREST_CITY_SEARCH = False

if RUN_NEAREST_CITY_SEARCH:
    nearest_city_positions = []

    # Positional (iloc) batches: each row is processed exactly once and the loop
    # does not depend on train_df having a default 0..N-1 index.
    for batch_start in range(0, len(train_df), BATCH_SIZE):
        cut = train_df.iloc[batch_start:batch_start + BATCH_SIZE]
        x_lats, x_longs = cut["latitude"].values, cut["longitude"].values

        # (batch, n_cities) distance matrix; keep the closest city per row.
        res = harvesian(x_lats, y_lats, x_longs, y_longs)
        nearest_city_positions.append(res.argmin(axis=1))

    if nearest_city_positions:
        indexes = np.concatenate(nearest_city_positions)
        # Positional lookup into the city table, then a single column assignment.
        train_df["new_city"] = cities_df["city_ascii"].values[indexes]
        train_df["new_country"] = cities_df["iso2"].values[indexes]

The code above is disabled because it runs too slowly on a Kaggle CPU, so I uploaded the precomputed result as a dataset instead.

If anybody knows ways to speed up this process, please leave a comment

In [None]:
# Precomputed output of the nearest-city search cell, loaded from the uploaded dataset.
new_df = pd.read_csv(
    filepath_or_buffer="../input/code-output/new_train.csv",
)

## Analyse Differences

In [None]:
# Compare how many distinct city names exist before and after the reassignment.
origin_num, new_num = (new_df[column].nunique() for column in ("city", "new_city"))
print(f"Origin No of Unique Cities:{origin_num}, New No of Unique Cities:{new_num}")

In [None]:
# Count the missing values in each of the two derived columns.
derived_columns = ["new_city", "new_country"]
new_df.loc[:, derived_columns].isna().sum()

In [None]:
# Show the rows for which no country code is present after the reassignment.
country_is_missing = new_df["new_country"].isna()
new_df.loc[country_is_missing]

The null values in `new_country` occur because the iso2 code for Namibia is `NA`, which pandas misinterprets as a null value when reading a CSV.

In [None]:
# Fraction of rows where the recorded value equals the calculated one:
# country first, then city. Rows with a missing value never compare equal.
for recorded_column, calculated_column in (("country", "new_country"), ("city", "new_city")):
    values_agree = new_df[recorded_column] == new_df[calculated_column]
    print(int(values_agree.sum()) / len(new_df))

It seems that recorded and calculated country info are nearly the same

Cities however are quite different

In [None]:
# Calculated cities that most often disagree with the recorded city, most frequent first.
city_differs = new_df["city"] != new_df["new_city"]
new_df.loc[city_differs, "new_city"].value_counts()

In [None]:
# 15 most common recorded city names among rows assigned to Bangkok.
# According to Google Maps, Pathum Wan, Bang Rak and Wadhama appear to be districts
# of Bangkok (please correct me if I'm wrong).
assigned_to_bangkok = new_df["new_city"] == 'Bangkok'
new_df.loc[assigned_to_bangkok, "city"].value_counts().head(15)

In [None]:
# 15 most common recorded city names among rows assigned to Jakarta.
# It seems that most of the results are indeed Jakarta.
assigned_to_jakarta = new_df["new_city"] == 'Jakarta'
new_df.loc[assigned_to_jakarta, "city"].value_counts().head(15)

In [None]:
# 15 most common recorded city names among rows assigned to Moscow.
# The original df contains Moscow written in several different languages.
# PS: Please forgive me for not sticking to the official language of each nation;
# I chose English only for the sake of uniformity.
assigned_to_moscow = new_df["new_city"] == 'Moscow'
new_df.loc[assigned_to_moscow, "city"].value_counts().head(15)

**The cells above show that allocating cities based on coordinates not only fills the NaN values in the original dataframe, but also groups together city names that are supposed to be the same.**

There is an old saying in China, "Cast a brick to attract jade", hope my commonplace work can inspire you to reach amazing results! (Don't forget to share your idea with us!) 