### NLP Feature Engineering

In [1]:
import numpy as np
import pandas as pd

import spacy
from spacy.language import Language

Default spaCy NLP pipeline

In [2]:
# Load spaCy's small English model; `nlp` is the resulting pipeline object
nlp = spacy.load("en_core_web_sm")

# First line: the component names. Second line: the (name, component) pairs.
print(nlp.pipe_names, nlp.pipeline, sep="\n")

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec object at 0x000002A6017126F0>), ('tagger', <spacy.pipeline.tagger.Tagger object at 0x000002A601712090>), ('parser', <spacy.pipeline.dep_parser.DependencyParser object at 0x000002A6017173E0>), ('attribute_ruler', <spacy.pipeline.attributeruler.AttributeRuler object at 0x000002A6019B6A10>), ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer object at 0x000002A601A167D0>), ('ner', <spacy.pipeline.ner.EntityRecognizer object at 0x000002A6017174C0>)]


Define a custom component in the NLP pipeline

In [3]:
# Define a custom component.
# Registering the function under a string name lets the pipeline run it
# automatically every time nlp(...) is called.

@Language.component("custom_component")
def custom_component_function(doc):
    """Print the length of the Doc and hand it on unchanged.

    Args:
        doc: The Doc object passed along the pipeline.

    Returns:
        The same ``doc`` object, so later components can keep processing it.
    """
    # Print the doc's length
    print("Doc length:", len(doc))
    # Return the doc object
    return doc

# Add the component first in the pipeline.
# add_pipe raises a ValueError when a component with this name is already in
# the pipeline, so the guard keeps this cell safe to run more than once.
if "custom_component" not in nlp.pipe_names:
    nlp.add_pipe("custom_component", first=True)

# Print the pipeline component names
print("Pipeline:", nlp.pipe_names)


Pipeline: ['custom_component', 'tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


Customising the position of the custom component in the pipeline

In [4]:
# Alternative ways to position the component with add_pipe.
# They are left commented out because "custom_component" was already added
# to the pipeline in the previous cell.

# # last: If True, add last
# nlp.add_pipe("custom_component", last=True)

# # first: If True, add first
# nlp.add_pipe("custom_component", first=True)

# # before: Add before the named component
# nlp.add_pipe("custom_component", before="ner")

# # after: Add after the named component
# nlp.add_pipe("custom_component", after="tagger")

### Word Embeddings

Semantic Similarity

Using spaCy model

In [5]:
import spacy

# Word vectors require one of the larger models whose name ends in md or lg:
# en_core_web_md or en_core_web_lg

# One-time download: run the next line only the first time
#! python -m spacy download en_core_web_md

# Rebind `nlp` to the medium model, which has pre-trained word embeddings
nlp = spacy.load("en_core_web_md")

# Turn each sentence into a Doc object
doc1 = nlp("I like cats and dogs")
doc2 = nlp("I love all animals")

# Vector representation of each whole sentence
embeddings1 = doc1.vector
embeddings2 = doc2.vector

# Similarity score between the two Docs
similarity = doc1.similarity(doc2)

# Report the score
print("Similarity between the sentences:", similarity)

Similarity between the sentences: 0.8570134262541451


Similarity between sentences

In [6]:
import numpy as np
from sklearn.neighbors import NearestNeighbors

# Candidate sentences to search through
sentences = [
    "The tourism industry is collapsing",
    "The COVID-19 travel shock hit  tourism-dependent economies hard",
    "Poaching and illegal wildlife trafficking trends in Southern Africa",
]

# Text to compare against the candidates
query = "The collapse of tourism and its impact on wildlife"

# One spaCy vector per sentence, gathered straight into a numpy array.
# Relies on `nlp` being the pipeline loaded in an earlier cell.
sentence_embeddings = np.array([nlp(sentence).vector for sentence in sentences])

# Fit a cosine-distance nearest-neighbour model on the sentence vectors
k = 2  # Number of nearest neighbors to find
nn_model = NearestNeighbors(n_neighbors=k, metric='cosine').fit(sentence_embeddings)

# The query vector is reshaped to a single row for compatibility with sklearn
query_embedding = nlp(query).vector.reshape(1, -1)
distances, indices = nn_model.kneighbors(query_embedding)

# Report each neighbour together with its distance to the query
print('spaCy similarity')
print("Query:", query)
print("Nearest neighbors:",k)
for index, distance in zip(indices[0], distances[0]):
    print(sentences[index], "- Distance:", distance)

spaCy similarity
Query: The collapse of tourism and its impact on wildlife
Nearest neighbors: 2
Poaching and illegal wildlife trafficking trends in Southern Africa - Distance: 0.2246145
The tourism industry is collapsing - Distance: 0.3102746


Bag of Words

In [7]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Use steps of a recipe as phrases
corpus = [
   'Preheat the oven',
   'lightly spray the baking dish',
   'combine the sugar, flour, cocoa powder, chocolate chips',
   'Sprinkle the dry mix',
   'Pour the batter',
]

# instantiate the vectorizer
vectorizer = CountVectorizer()

# apply the vectorizer to the corpus (learns the vocabulary and counts tokens)
X = vectorizer.fit_transform(corpus)

# display the document-term matrix as a
# pandas dataframe to show the tokens
vocab = vectorizer.get_feature_names_out()
# toarray() gives a plain ndarray; todense() would return numpy.matrix,
# a type NumPy no longer recommends using.
docterm = pd.DataFrame(X.toarray(), columns=vocab)
print(vocab)
docterm

['baking' 'batter' 'chips' 'chocolate' 'cocoa' 'combine' 'dish' 'dry'
 'flour' 'lightly' 'mix' 'oven' 'pour' 'powder' 'preheat' 'spray'
 'sprinkle' 'sugar' 'the']


Unnamed: 0,baking,batter,chips,chocolate,cocoa,combine,dish,dry,flour,lightly,mix,oven,pour,powder,preheat,spray,sprinkle,sugar,the
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1
1,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1
2,0,0,1,1,1,1,0,0,1,0,0,0,0,1,0,0,0,1,1
3,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,1
4,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1


In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import NearestNeighbors

# Candidate sentences to search through
sentences = [
    "The tourism industry is collapsing",
    "The COVID-19 travel shock hit  tourism-dependent economies hard",
    "Poaching and illegal wildlife trafficking trends in Southern Africa",
]

# Text to compare against the candidates
query = "The collapse of tourism and its impact on wildlife"

# Learn the vocabulary from the sentences and build their count vectors
vectorizer = CountVectorizer()
sentence_vectors = vectorizer.fit_transform(sentences)

# Count the query's tokens against that same fitted vocabulary
query_vector = vectorizer.transform([query])

# Fit a cosine-distance nearest-neighbour model on the count vectors
k = 2  # Number of nearest neighbors to find
nn_model = NearestNeighbors(n_neighbors=k, metric='cosine').fit(sentence_vectors)

# Look up the neighbours of the query
distances, indices = nn_model.kneighbors(query_vector)

# Report each neighbour together with its distance to the query
print('BoW similarity')
print("Query:", query)
print("Nearest neighbors:",k)
for index, distance in zip(indices[0], distances[0]):
    print(sentences[index], "- Distance:", distance)

BoW similarity
Query: The collapse of tourism and its impact on wildlife
Nearest neighbors: 2
The tourism industry is collapsing - Distance: 0.5527864045000421
Poaching and illegal wildlife trafficking trends in Southern Africa - Distance: 0.6666666666666667


TF-IDF

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample documents
new_corpus = [
   "The quick brown fox jumps over the lazy dog",
   "The dog barks at the fox",
   "The fox is quick and the dog is lazy"
]

# Instantiate the BoW vectorizer
bow_vectorizer = CountVectorizer()
# Initialize TfidfVectorizer
vectorizer = TfidfVectorizer()

# Learn the vocabulary and transform the documents into a BoW and TF-IDF matrix
bow_matrix = bow_vectorizer.fit_transform(new_corpus)
tfidf_matrix = vectorizer.fit_transform(new_corpus)

# Get the vocabulary (unique words) learned by each vectorizer
bow_vocabulary = bow_vectorizer.get_feature_names_out()
vocabulary = vectorizer.get_feature_names_out()

# Build the document-term matrices as pandas dataframes to show the tokens.
# toarray() gives a plain ndarray; todense() would return numpy.matrix,
# a type NumPy no longer recommends using.
# `pd` is the pandas alias imported in an earlier cell.
bow_docterm = pd.DataFrame(bow_matrix.toarray(), columns=bow_vocabulary)
docterm = pd.DataFrame(tfidf_matrix.toarray(), columns=vocabulary)

In [10]:
# Display the Bag-of-Words (CountVectorizer) document-term matrix for new_corpus
bow_docterm

Unnamed: 0,and,at,barks,brown,dog,fox,is,jumps,lazy,over,quick,the
0,0,0,0,1,1,1,0,1,1,1,1,2
1,0,1,1,0,1,1,0,0,0,0,0,2
2,1,0,0,0,1,1,2,0,1,0,1,2


In [11]:
# Display the TF-IDF document-term matrix for new_corpus.
# Note: `docterm` held a Bag-of-Words frame earlier in the notebook; when the
# cells run in order it now refers to the TF-IDF frame built in the TF-IDF cell.
docterm

Unnamed: 0,and,at,barks,brown,dog,fox,is,jumps,lazy,over,quick,the
0,0.0,0.0,0.0,0.400008,0.236251,0.236251,0.0,0.400008,0.304216,0.400008,0.304216,0.472502
1,0.0,0.494289,0.494289,0.0,0.291935,0.291935,0.0,0.0,0.0,0.0,0.0,0.58387
2,0.34816,0.0,0.0,0.0,0.205629,0.205629,0.696321,0.0,0.264785,0.0,0.264785,0.411258
