# Installing h3 Lib

In [None]:
# Install h3 3.7.4 from the wheel file stored under ../input/h3lib
# (the file name targets CPython 3.7 on manylinux x86_64).
!pip install ../input/h3lib/h3-3.7.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

# Importing libraries

In [None]:
import difflib
import gc
from h3 import h3
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from unidecode import unidecode
from tqdm.notebook import tqdm
import re
import pickle
import gensim.corpora as corpora # Create Dictionary
import gensim
from gensim.utils import simple_preprocess
from pprint import pprint# number of topics


# Register the progress_apply helpers on pandas objects.
tqdm.pandas()
import os

# Print the full path of every file available under the Kaggle input directory.
input_paths = (
    os.path.join(root, fname)
    for root, _, files in os.walk('/kaggle/input')
    for fname in files
)
for path in input_paths:
    print(path)



In [None]:
class tool_box:
    """Helpers for the POI-matching pipeline.

    Bundles tokenization, LDA cluster lookup, H3 indexing and the candidate
    search. ``get_lda_cluster`` reads the notebook-level globals ``id2word``
    and ``lda_model``; ``search_engine`` reads ``training_set_`` unless an
    explicit candidate frame is passed. Those globals must exist before the
    corresponding method is called.
    """

    # Cluster id returned when no LDA topic can be determined.
    DEFAULT_CLUSTER = 2

    def __init__(self, h3_res=9):
        """Create the helper.

        Args:
            h3_res: H3 resolution used by ``geo_to_h3`` (default 9).
        """
        self.H3_res = h3_res

    def tokenization(self, text):
        """Split *text* on single spaces.

        Args:
            text: string to split; may be empty or ``None``.

        Returns:
            List of tokens, or an empty list when *text* is empty/``None``
            (previously this case raised ``UnboundLocalError``).
        """
        if not text:
            return []
        return text.split(' ')

    def get_lda_cluster(self, category):
        """Return the id of the most probable LDA topic for *category*.

        Args:
            category: category text; it is converted with ``str`` and
                tokenized on spaces.

        Returns:
            The topic id with the highest score reported by
            ``lda_model.get_document_topics``, or ``DEFAULT_CLUSTER`` when
            *category* is falsy or the model reports no topic.
        """
        if not category:
            return self.DEFAULT_CLUSTER

        tokens = self.tokenization(str(category))
        bag_of_words = id2word.doc2bow(tokens)
        topics = lda_model.get_document_topics(bag_of_words)

        # Guard against an empty topic list, which would make max() raise.
        if not topics:
            return self.DEFAULT_CLUSTER

        # Each entry is indexed as (topic_id, score); max() keeps the first
        # best entry, matching the previous index-of-max behaviour.
        return max(topics, key=lambda topic: topic[1])[0]

    def geo_to_h3(self, row):
        """Return the H3 cell id for *row* at resolution ``self.H3_res``.

        Args:
            row: object exposing ``latitude`` and ``longitude`` attributes
                (e.g. a DataFrame row).
        """
        return h3.geo_to_h3(lat=row.latitude, lng=row.longitude, resolution=self.H3_res)

    def search_engine(self, id_poi, name, h3_cell, cluster, candidates=None):
        """Find candidate matches for a POI and format them for submission.

        Candidates are restricted to rows with the same ``h3_cell`` and
        ``category_cluster``; among those, up to three names closest to
        *name* are selected with ``difflib.get_close_matches``.

        Args:
            id_poi: id of the query POI (string).
            name: normalized name of the query POI.
            h3_cell: H3 cell id of the query POI.
            cluster: category cluster of the query POI.
            candidates: optional DataFrame with ``id``, ``name``, ``h3_cell``
                and ``category_cluster`` columns. Defaults to the global
                ``training_set_``.

        Returns:
            A space-separated string starting with *id_poi*, followed by the
            ids of the matched rows. *id_poi* is listed only once even when
            it also appears among the candidates.
        """
        if candidates is None:
            candidates = training_set_

        # Build both conditions on the same frame so the boolean mask is
        # aligned with the rows it filters (no pandas reindexing warning).
        same_area_and_cluster = (
            (candidates['h3_cell'] == h3_cell)
            & (candidates['category_cluster'] == cluster)
        )
        pool = candidates[same_area_and_cluster]

        # Missing names cannot be compared by difflib, so leave them out.
        close_names = difflib.get_close_matches(name, pool['name'].dropna().tolist(), n=3)

        matched = pool[pool['name'].isin(close_names)]
        other_ids = [match for match in matched['id'] if match != id_poi]
        return ' '.join([id_poi, *other_ids])


In [None]:
# Shared helper instance; its methods are applied to the test set in later cells.
tool = tool_box()

# Data importing (train and test sets)

In [None]:
# Reference set that tool_box.search_engine filters to find candidate matches.
training_set_ = pd.read_csv('../input/training-set-foursquare/training_set.csv')

# Competition test set; one submission row is produced per row of this frame.
test_set_ = pd.read_csv('../input/foursquare-location-matching/test.csv')

In [None]:
# Display the column labels of the test set.
test_set_.columns


In [None]:
# Preview the test set without the address/contact columns.
# NOTE: the result is not assigned, so test_set_ itself keeps every column.
test_set_.drop(columns=['address', 'city', 'state', 'zip', 'country', 'url', 'phone'])

# Data first check

In [None]:
# Normalize names so accents, punctuation and letter case do not cause
# mismatches. NOTE: astype(str) turns missing values into the string 'nan'.
test_set_['name'] = (
    test_set_['name']
    .astype(str)
    .apply(unidecode)  # transliterate accented characters
    # Raw string: '\w' and '\s' are invalid escapes in a plain string literal.
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.lower()
)

# Normalize categories: only string conversion and lower-casing are applied
# here (no accent or punctuation stripping).
test_set_['categories'] = test_set_['categories'].astype(str).str.lower()

In [None]:
# Run a full garbage collection (equivalent to gc.collect(2)) and report the
# number of unreachable objects it found.
collected = gc.collect()
summary = "%d objects." % collected
print("Garbage collector: collected", summary)

# Feature Engineering

## Given the number of categories available, classifying all of them by hand was not viable. After some research, I found Latent Dirichlet Allocation (LDA), a topic-modelling method that can group words into clusters (see: https://towardsdatascience.com/topic-modelling-in-python-with-nltk-and-gensim-4ef03213cd21).



In [None]:
# Store the space-split tokens of every category string in a new column.
test_set_['category_tokenied'] = test_set_['categories'].apply(tool.tokenization)

In [None]:
# Load the saved gensim dictionary.
id2word = corpora.Dictionary.load('../input/training-set-foursquare/dictionary.gensim')

# Load the pickled corpus; the context manager closes the file handle
# (the previous bare open() call never closed it).
# NOTE: pickle can execute arbitrary code on load - only unpickle trusted files.
with open('../input/training-set-foursquare/corpus.pkl', 'rb') as corpus_file:
    loaded_model = pickle.load(corpus_file)
corpus = loaded_model  # the unpickled object is used as the corpus


In [None]:
# Load the previously trained LDA model from disk.
# NOTE(review): mmap='r' presumably memory-maps the model's arrays read-only - confirm against gensim docs.
lda_model = gensim.models.LdaMulticore.load('../input/training-set-foursquare/model_trained', mmap='r')
# Apply the model to the loaded corpus; doc_lda is not referenced again in the visible cells.
doc_lda = lda_model[corpus]

## Let's check how to get the cluster from this model

## get_document_topics returns a list of tuples giving the similarity score for each cluster. Let's use it on every row by defining a function and then applying it.

In [None]:
# Assign every row the LDA cluster of its category string (with a progress bar).
test_set_['category_cluster'] = test_set_['categories'].progress_apply(tool.get_lda_cluster)


### My approach uses the h3 library created by Uber (see the documentation here: https://github.com/uber/h3) to first cluster POIs into hexagonal cells with the h3.geo_to_h3 method. This lets us look up all the places that fall within a given hex cell.
### New POIs can then be searched by hex cell id, which, combined with other features to resolve any conflicts, can solve this business problem.

In [None]:
# Run a full garbage collection (equivalent to gc.collect(2)) and report the
# number of unreachable objects it found.
collected = gc.collect()
summary = "%d objects." % collected
print("Garbage collector: collected", summary)

## Creating the Hex_cell ids

In [None]:
# Compute the H3 cell id of every row from its latitude/longitude
# (row-wise, with a progress bar; resolution comes from tool.H3_res).
test_set_['h3_cell'] = test_set_.progress_apply(tool.geo_to_h3,axis=1)

In [None]:
# Run a full garbage collection and report the number of unreachable
# objects it found.
collected = gc.collect()
summary = "%d objects." % collected
print("Garbage collector: collected", summary)


# Defining the search engine 

### Here I'm going to use all the features I created to build a search engine. First, I use the hex id to select all the places in a given area; then I filter them by category cluster. Finally, to resolve any remaining conflicts, I use difflib.get_close_matches to get the most similar name strings.

In [None]:
def _match_row(row):
    """Run the search engine for one test-set row and return its match string."""
    return tool.search_engine(
        row['id'],
        row['name'],
        row['h3_cell'],
        row['category_cluster'],
    )


# Build the space-separated match list for every test row.
test_set_['matches'] = test_set_.apply(_match_row, axis=1)

## As we can see, this was enough to find a pretty good match for two of the five rows in the sample.

In [None]:
# Keep only the two columns written to the submission file.
submission_sample = test_set_[['id','matches']]


In [None]:
# Run a full garbage collection (equivalent to gc.collect(2)) and report the
# number of unreachable objects it found.
collected = gc.collect()
summary = "%d objects." % collected
print("Garbage collector: collected", summary)


In [None]:
# Display the submission frame for a visual check.
submission_sample

# Sample Submission

In [None]:
# Write the id/matches pairs to submission.csv without the index column.
submission_sample.to_csv('submission.csv', index=False)