In [112]:
# !pip install spacy transformers
# !pip install sematch
# !pip install sentence_transformers

In [113]:
# !wget "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

In [114]:
text = """
        Contemporary climate change includes both the global warming caused by humans, 
        and its impacts on Earth's weather patterns. There have been previous periods of climate change,
        but the current changes are more rapid than any known events in Earth's history.
        The main cause is the emission of greenhouse gases, mostly carbon dioxide (CO2) and methane. 
        Burning fossil fuels for energy use creates most of these emissions. 
        Agriculture, steelmaking, cement production, and forest loss are additional sources.
        Temperature rise is also affected by climate feedbacks such as the loss of sunlight-reflecting snow cover, 
        and the release of carbon dioxide from drought-stricken forests. 
        Collectively, these amplify global warming.
      """

In [167]:
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

n_gram_range = (1, 3)
stop_words = "english"

# Extract candidate words/phrases
count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([text])
all_candidates = count.get_feature_names()



In [116]:
all_candidates[:10]

['additional',
 'additional sources',
 'additional sources temperature',
 'affected',
 'affected climate',
 'affected climate feedbacks',
 'agriculture',
 'agriculture steelmaking',
 'agriculture steelmaking cement',
 'amplify']

In [117]:
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(text)
noun_phrases = set(chunk.text.strip().lower() for chunk in doc.noun_chunks)

In [118]:
nouns = set()
for token in doc:
    if token.pos_ == "NOUN":
        nouns.add(token.text)

In [119]:
all_nouns = nouns.union(noun_phrases)

In [120]:
candidates = list(filter(lambda candidate: candidate in all_nouns, all_candidates))

In [121]:
candidates[:10]

['additional sources',
 'agriculture',
 'burning fossil fuels',
 'carbon',
 'carbon dioxide',
 'cause',
 'cement',
 'cement production',
 'change',
 'changes']

In [122]:
from transformers import AutoModel, AutoTokenizer

In [123]:
model_name = "distilroberta-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [124]:
candidate_tokens = tokenizer(candidates, padding=True, return_tensors="pt")
candidate_embeddings = model(**candidate_tokens)["pooler_output"]

In [125]:
candidate_embeddings.shape

torch.Size([51, 768])

In [126]:
text_tokens = tokenizer([text], padding=True, return_tensors="pt")
text_embedding = model(**text_tokens)["pooler_output"]

In [127]:
text_embedding.shape

torch.Size([1, 768])

In [128]:
candidate_embeddings = candidate_embeddings.detach().numpy()
text_embedding = text_embedding.detach().numpy()

In [129]:
from sklearn.metrics.pairwise import cosine_similarity

top_k = 5
distances = cosine_similarity(text_embedding, candidate_embeddings)
keywords = [candidates[index] for index in distances.argsort()[0][-top_k:]]

In [130]:
keywords

['climate',
 'burning fossil fuels',
 'climate feedbacks',
 'climate change',
 'contemporary climate change']

In [132]:
ref_list = ["climate", "fossil", "energy"]

In [131]:
# word_vectors = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

In [133]:
# v_ref = word_vectors['contemporary climate change']
# v_orig = word_vectors['climate']
# cosine_similarity([v_ref],[v_orig])[0][0]

In [134]:
# #calculate distance between two sentences using WMD algorithm
# distance = word_vectors.wmdistance('contemporary climate change', 'fossil fuel')

# print ('distance = %.3f' % distance)

In [135]:
# word_vectors.wv.n_similarity('contemporary climate change'.lower().split(), 'fossil fuel'.lower().split())

In [136]:
from sentence_transformers import SentenceTransformer
model_sentence = SentenceTransformer('bert-base-nli-mean-tokens')

In [149]:
keywords_embeddings = model_sentence.encode(keywords)
ref_embeddings = model_sentence.encode(ref_list)

In [180]:
def phrase_match(word_vec_orig, word_vec_reference):
  print(np.count_nonzero(cosine_similarity(word_vec_reference, word_vec_orig) > 0.75))
  if np.count_nonzero(cosine_similarity(word_vec_reference, word_vec_orig) > 0.75) > 1:
    return "Perfect Match"
  else:
    return "No Match"

In [181]:
phrase_match(keywords_embeddings, ref_embeddings)

2


'Perfect Match'