In [2]:
# Download the “en_core_web_sm” model for spaCy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')



[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# Import the spaCy package and load the spaCy model: “en_core_web_sm”.
import spacy                                  # Import spaCy library
nlp = spacy.load("en_core_web_sm")            # load spaCy model

In [4]:
# Import the “English” library.
from spacy.lang.en import English                              # Importing library
nlp = English()                                                # Importing model
nlp = spacy.load("en_core_web_sm")                             # Importing model

In [6]:
# Read the input data file and display first 5 lines.
filename = r"C:\Users\91934\Downloads\Sample.txt"
with open(filename, "r") as file:

  contents = file.read()
# Print first 5 lines

for line in contents.splitlines()[:5]:

  print(line)

The customer service was excellent at the store. The manager has helped me at checkout. My friend has been shopping at this location for years. The staff has always been friendly.


In [7]:
# Convert file contents to text, tokenize with NLP, and print few tokens
text_combined = str(contents)                                  # String
doc = nlp(text_combined)                                       # Create NLP object
print([token for token in doc[:10]])                           # Print first 10 tokens

[The, customer, service, was, excellent, at, the, store, ., The]


In [8]:
# Extract the frequency of word count.
from collections import Counter
freq_counts = Counter()
for token in doc:

  freq_counts[token.orth_] += 1                       # Equivalently, token.text

# Print the most common words from the input text.
most_common_words = freq_counts.most_common(15)             # Most common words in document
print(most_common_words)                                      # To print the frequency count

[('.', 4), ('The', 3), ('at', 3), ('has', 3), ('been', 2), ('customer', 1), ('service', 1), ('was', 1), ('excellent', 1), ('the', 1), ('store', 1), ('manager', 1), ('helped', 1), ('me', 1), ('checkout', 1)]


In [9]:
# Perform tokenization for following sentences:
'''
Alice's book was on the table.
He said, 'Hello there!’
We’ll meet at 5:00 PM.
'''
sentences = [
    "Alice's book was on the table.",
    "He said, 'Hello there!’",
    "We’ll meet at 5:00 PM."
]

for s in sentences:

  doc = nlp(s)
print([token.text for token in doc])

['We', '’ll', 'meet', 'at', '5:00', 'PM', '.']
