In [50]:
%pip install spacy spacytextblob

Collecting spacytextblob
  Downloading spacytextblob-5.0.0-py3-none-any.whl.metadata (4.8 kB)
Downloading spacytextblob-5.0.0-py3-none-any.whl (4.2 kB)
Installing collected packages: spacytextblob
Successfully installed spacytextblob-5.0.0


In [33]:
import re
import spacy

#
# First we need to get the existing model
#
nlp_example = spacy.load("en_core_web_sm")


In [34]:
#
# Come up with a sentence which would help us show examples for most of the topics.
#
text_input = """
I am Shreyas Jain and am from Karnataka. I was born in the year 1991, currently I'm working as Principle SDET.
This is just a temporary piece of paragraph. Ideally, this will be a huge data.
Since these sentences serves as examples, again and again I am adding more words to this paragraph to identify and use this as an example to help us understand the concept.
NOTE: THIS IS AN INTENTIONAL ATTEMPT TO MAKE THINGS MORE DIFFICULT.
100 Words or more.
"""


In [45]:
#
# Tokenization: Convert a sentence to tokens
#
nlp_tokens = nlp_example(text_input)
tokens = [token.text for token in nlp_tokens]
print("Total number of tokens available : ", len(tokens))


Total number of tokens available :  100


In [46]:
#
# Stop words removal and lowercasing
#
stop_words_removed_n_lower_case = [token.text.lower() for token in nlp_tokens if not token.is_stop]
print("Total number of tokens after removing stop words and making it lowercase:", len(stop_words_removed_n_lower_case))


Total number of tokens after removing stop words and making it lowercase: 52


In [47]:
# Remove duplicate tokens by converting the list to a set and then back to a list
unique_tokens = list(set(stop_words_removed_n_lower_case))
print("Total number of unique tokens:", len(unique_tokens))


Total number of unique tokens: 37


In [48]:
#
# We can remove the special characters if needed.
#
cleaned_unique_tokens = [re.sub(r'[^a-zA-Z0-9]', '', token) for token in unique_tokens]
cleaned_unique_tokens = [token for token in cleaned_unique_tokens if token]
print("Unique tokens after removing special characters:", len(cleaned_unique_tokens))


Unique tokens after removing special characters: 33


In [49]:
#
# Parts of speech tagging
#
for each_token in cleaned_unique_tokens[:5]:
  print(f"Token {each_token}, belongs to POS: {nlp_example(each_token)[0].pos_}")


Token concept, belongs to POS: NOUN
Token karnataka, belongs to POS: PROPN
Token help, belongs to POS: VERB
Token temporary, belongs to POS: ADJ
Token sdet, belongs to POS: NOUN


In [41]:
#
# Check the Named Entitiy Recogniation for the given tokens
#
for each_token in nlp_tokens.ents:
  print(f"Token {each_token}, recognized as : {each_token.label_}")

Token Shreyas Jain, recognized as : PERSON
Token Karnataka, recognized as : GPE
Token the year 1991, recognized as : DATE


In [None]:
#
# Sentiment Analysis for the following statements
#
from spacytextblob.spacytextblob import SpacyTextBlob
nlp_example.add_pipe('spacytextblob')

In [66]:
def analysis_result(value):
  if value > 0:
    return "Positive"
  elif value < 0:
    return "Negative"
  else:
    return "Neutral"

In [67]:
text_input = "These topics are very difficult to understand"
sentiment_result = nlp_example(text_input)._.blob.sentiment_assessments.assessments[0][1]
print(f"Sentiment score for '{text_input}': {sentiment_result} i.e., {analysis_result(sentiment_result)}")


text_input = "These topics are very easy to understand"
sentiment_result = nlp_example(text_input)._.blob.sentiment_assessments.assessments[0][1]
print(f"Sentiment score for '{text_input}': {sentiment_result} i.e., {analysis_result(sentiment_result)}")

Sentiment score for 'These topics are very difficult to understand': -0.65 i.e., Negative
Sentiment score for 'These topics are very easy to understand': 0.5633333333333334 i.e., Positive
