<a href="https://colab.research.google.com/github/rodmar8/bullion-erc20-token/blob/main/semantic_mapping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mapping website description to Retail Taxonomy using Semantic analysis with BERT

**Author:** Marco Kormann Rodrigues<br>
**Date created:** 2021/11/12<br>
**Last modified:** 2021/11/12<br>
**Description:** Natural Language mapping using BERT model 

# Environment setup

In [None]:
pip install sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from google.colab import drive
import pandas as pd


# Configuration and Setup

In [None]:
# Name of the pretrained sentence-embedding checkpoint to load.
MODEL_NAME = 'bert-base-nli-mean-tokens'

# Module-level encoder; semantic_similarity() reads it as a global.
model = SentenceTransformer(MODEL_NAME)

In [None]:
# Mount Google Drive so that the CSV paths under /content/drive resolve.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [37]:
# Collects one dict per taxonomy/website match; the matching loop appends to
# it and it is converted to a DataFrame and exported to CSV at the end.
list_matches: list = []

# Load the Data

In [28]:
# Both input files live in the same Drive folder.
DATA_DIR = '/content/drive/MyDrive/project_data'

# Website descriptions to be classified.
website_csv = DATA_DIR + '/retail_websites_results.csv'
# Taxonomy nodes to match the websites against.
taxonomy_csv = DATA_DIR + '/taxonomy.csv'

Load the taxonomy into a DataFrame and a list of row dictionaries

In [30]:
# Taxonomy as a DataFrame (tdf) and as a list of per-row dicts (tdt).
tdf = pd.read_csv(taxonomy_csv)

# Fail fast with a clear message if a column read by the matching loop is
# missing, instead of hitting a KeyError in the middle of the loop.
missing_taxonomy_cols = {'id', 'reference', 'description'} - set(tdf.columns)
assert not missing_taxonomy_cols, f'taxonomy CSV is missing columns: {sorted(missing_taxonomy_cols)}'

tdt = tdf.to_dict(orient='records')

Load the website descriptions into a DataFrame and a list of row dictionaries

In [31]:
# Website descriptions as a DataFrame (wdf) and as a list of per-row dicts (wdt).
wdf = pd.read_csv(website_csv)

# Fail fast with a clear message if a column read by the matching loop is
# missing, instead of hitting a KeyError in the middle of the loop.
missing_website_cols = {'url', 'description'} - set(wdf.columns)
assert not missing_website_cols, f'website CSV is missing columns: {sorted(missing_website_cols)}'

wdt = wdf.to_dict(orient='records')

# Functions

In [26]:
def semantic_similarity(s1, s2):
  """Return the cosine similarity between the embeddings of two texts.

  Both texts are encoded in a single call to the module-level ``model`` and
  the two resulting vectors are compared with ``cosine_similarity``.

  Args:
    s1: First text.
    s2: Second text.

  Returns:
    The single entry of the 1x1 similarity matrix for the two embeddings.
  """
  first_vec, second_vec = model.encode([s1, s2])
  return cosine_similarity([first_vec], [second_vec])[0][0]

In [36]:
# unit test: score two sample sentences and sanity-check the result
s1 = "Fractal Analytics helps global Fortune 100 companies power every human decision in the enterprise by bringing analytics and AI to the decision."
s2 = "The use of social networks in the context of e-commerce transactions. eCom, eCommerce."
sample_score = semantic_similarity(s1, s2)

# A cosine similarity must lie in [-1, 1]; allow a little floating-point slack.
assert -1.0001 <= sample_score <= 1.0001, sample_score

# Last expression of the cell, so the score is still displayed as output.
sample_score


-0.11022799

# Loop through all nodes in the taxonomy and try to match against partner descriptions

In [None]:
# Minimum cosine similarity for a taxonomy node and a website to count as a match.
SIMILARITY_THRESHOLD = 0.75

# Start from an empty list so that re-running this cell does not append
# duplicates of matches found by a previous run.
list_matches = []

# cosine_similarity rejects empty inputs, so skip the work when either table
# has no rows (the result is then simply "no matches").
if tdt and wdt:
  # Encode every description exactly once (n + m texts) rather than
  # re-encoding both sentences for each of the n * m pairs.
  taxonomy_embeddings = model.encode([row['description'] for row in tdt])
  website_embeddings = model.encode([desc['description'] for desc in wdt])

  # scores[i][j] is the similarity between taxonomy row i and website row j.
  scores = cosine_similarity(taxonomy_embeddings, website_embeddings)

  for row, row_scores in zip(tdt, scores):
    for desc, score in zip(wdt, row_scores):
      if score > SIMILARITY_THRESHOLD:
        match = {
            'id': row['id'],
            'reference': row['reference'],
            'url': desc['url'],
            'website_description': desc['description']
        }
        print(match)
        # A fresh dict is built for every match, so no defensive copy is needed.
        list_matches.append(match)



In [41]:
      # NOTE(review): scratch cell. It reads `row` and `desc` left over from the
      # matching loop and prints that pair without any score check.
      match = dict(
          id=row['id'],
          reference=row['reference'],
          url=desc['url'],
          website_description=desc['description'],
      )
      print(match)

{'id': 'DATA-AIML-DYNP', 'reference': 'Advanced Data Insights:AI/ML:Dynamic Price and Promo', 'url': 'activeviam.com', 'website_description': 'Analyze data in real-time and empower your firm with data-driven decisions. Risk management solutions. Financial regulation compliance.'}


In [None]:
# Fix the column set explicitly so the frame, and therefore the exported CSV
# header, is well-formed even when no match was found.
dfl = pd.DataFrame(list_matches, columns=['id', 'reference', 'url', 'website_description'])

In [None]:
# Write the matches to Drive as UTF-8 CSV without the DataFrame index column.
matches_csv = '/content/drive/MyDrive/project_data/retail_websites_taxonomy_matches.csv'
dfl.to_csv(matches_csv, index=False, encoding='utf-8')