In [14]:
!python -m spacy download en_core_web_sm



Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [17]:
import spacy
import pandas as pd

# Load the small English pipeline (tokenizer, POS tagger, parser, lemmatizer, NER).
# NOTE: the "sm" package does not ship static word vectors; use
# en_core_web_md / en_core_web_lg if vector similarity is needed.
nlp = spacy.load("en_core_web_sm")


In [30]:
# Sample movie review, kept as one fragment per source line; the fragments
# already carry their own trailing spaces, so they are joined with no separator.
review_fragments = (
    "Christopher Nolan's new movie was a thrilling experience! It premiered in Los Angeles on July 5, 2024. ",
    "The plot had several twists, and the acting by Cillian Murphy and Emily Blunt was top-notch. ",
    "Produced by Universal Pictures, the film explores themes of time and identity. ",
    "I loved the cinematography by Hoyte van Hoytema, though the ending felt a bit rushed. ",
    "Overall, it's a good watch for fans of mystery thrillers.",
)
text = "".join(review_fragments)

print("Original Text:\n", text)

# Run the loaded spaCy pipeline once; the later cells all read from `doc`.
doc = nlp(text)


Original Text:
 Christopher Nolan's new movie was a thrilling experience! It premiered in Los Angeles on July 5, 2024. The plot had several twists, and the acting by Cillian Murphy and Emily Blunt was top-notch. Produced by Universal Pictures, the film explores themes of time and identity. I loved the cinematography by Hoyte van Hoytema, though the ending felt a bit rushed. Overall, it's a good watch for fans of mystery thrillers.


### a) Tokenization

In [31]:
# Collect the surface text of every token in the parsed document, in order.
tokens = []
for tok in doc:
    tokens.append(tok.text)

print("Tokenized Output:\n", tokens)

Tokenized Output:
 ['Christopher', 'Nolan', "'s", 'new', 'movie', 'was', 'a', 'thrilling', 'experience', '!', 'It', 'premiered', 'in', 'Los', 'Angeles', 'on', 'July', '5', ',', '2024', '.', 'The', 'plot', 'had', 'several', 'twists', ',', 'and', 'the', 'acting', 'by', 'Cillian', 'Murphy', 'and', 'Emily', 'Blunt', 'was', 'top', '-', 'notch', '.', 'Produced', 'by', 'Universal', 'Pictures', ',', 'the', 'film', 'explores', 'themes', 'of', 'time', 'and', 'identity', '.', 'I', 'loved', 'the', 'cinematography', 'by', 'Hoyte', 'van', 'Hoytema', ',', 'though', 'the', 'ending', 'felt', 'a', 'bit', 'rushed', '.', 'Overall', ',', 'it', "'s", 'a', 'good', 'watch', 'for', 'fans', 'of', 'mystery', 'thrillers', '.']


### b) Stop-Word Removal

In [32]:
# Keep only purely alphabetic tokens that are not stop words.
# The is_alpha filter also drops punctuation and numeric tokens.
tokens_no_stopwords = [
    tok.text
    for tok in doc
    if tok.is_alpha and not tok.is_stop
]
print("Without Stopwords:\n", tokens_no_stopwords)


Without Stopwords:
 ['Christopher', 'Nolan', 'new', 'movie', 'thrilling', 'experience', 'premiered', 'Los', 'Angeles', 'July', 'plot', 'twists', 'acting', 'Cillian', 'Murphy', 'Emily', 'Blunt', 'notch', 'Produced', 'Universal', 'Pictures', 'film', 'explores', 'themes', 'time', 'identity', 'loved', 'cinematography', 'Hoyte', 'van', 'Hoytema', 'ending', 'felt', 'bit', 'rushed', 'Overall', 'good', 'watch', 'fans', 'mystery', 'thrillers']


### c) Lemmatization and d) POS Tagging


In [33]:

# Number of leading rows shown in the lemma / POS preview table.
N_PREVIEW_ROWS = 15

# One (original text, lemma, POS tag) record per alphabetic token.
# Non-alphabetic tokens (punctuation, numbers) are skipped by the is_alpha filter.
# The earlier VERB/NOUN branch appended the same tuple in both arms, so every
# alphabetic token is simply recorded with its lemma regardless of POS.
lemma_data = [
    (token.text, token.lemma_, token.pos_)
    for token in doc
    if token.is_alpha
]

# Only the first N_PREVIEW_ROWS records go into the table; lemma_data keeps them all.
df_lemmas = pd.DataFrame(
    lemma_data[:N_PREVIEW_ROWS],
    columns=["Original", "Lemmatized", "POS"],
)
df_lemmas



Unnamed: 0,Original,Lemmatized,POS
0,Christopher,Christopher,PROPN
1,Nolan,Nolan,PROPN
2,new,new,ADJ
3,movie,movie,NOUN
4,was,be,AUX
5,a,a,DET
6,thrilling,thrilling,NOUN
7,experience,experience,NOUN
8,It,it,PRON
9,premiered,premiere,VERB


### e) Entities


In [34]:
# Pair the text of every named-entity span in the document with its label.
entities = []
for span in doc.ents:
    entities.append((span.text, span.label_))

# Two-column table: the entity's text and its entity-type label.
df_entities = pd.DataFrame(entities, columns=["Entity", "Label"])
df_entities


Unnamed: 0,Entity,Label
0,Christopher Nolan's,PERSON
1,Los Angeles,GPE
2,"July 5, 2024",DATE
3,Cillian Murphy,PERSON
4,Emily Blunt,PERSON
5,Universal Pictures,ORG
6,Hoyte van Hoytema,PERSON
