## LAB EXERCISE - 1 - IMDB MOVIE REVIEWS


In [None]:
# Various NLP preprocessing tasks
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import string
import re
import spacy

from bs4 import BeautifulSoup
from wordcloud import WordCloud, STOPWORDS

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import pos_tag
from nltk.wsd import lesk
from nltk.chunk import ne_chunk
from nltk.tree import Tree


# Fetch NLTK data before any tokenizer, tagger or corpus is used below.
# 'all' is NLTK's identifier for its complete data collection.
# NOTE(review): this downloads far more than the calls visible in this
# notebook appear to need; confirm against the remaining cells before
# narrowing it to specific resource ids.
NLTK_DOWNLOAD_TARGET = 'all'
nltk.download(NLTK_DOWNLOAD_TARGET)

In [None]:
# Blank out digits and punctuation: every character that is not an ASCII
# letter or a period is replaced by a single space. Periods are kept
# (presumably so the sentence segmentation step further down still has
# sentence boundaries to work with).
NON_LETTER_OR_PERIOD = re.compile(r"[^a-zA-Z.]")


def _blank_non_letters(text):
    """Return *text* with each non-letter, non-period character turned into a space."""
    return NON_LETTER_OR_PERIOD.sub(" ", text)


df['num_review'] = df['html_review'].apply(_blank_non_letters)

# Show the first review before and after cleaning for a quick visual check.
print("Original Review:\n", df['html_review'].iloc[0])
print("\nCleaned (No punctuation/numbers):\n", df['num_review'].iloc[0])

In [None]:
# Lowercase the cleaned text so that later comparisons are case-insensitive.
# The text is first run through BeautifulSoup's html.parser and reduced to
# its text content, then lowercased.
def _to_plain_lowercase(text):
    """Return the text content BeautifulSoup extracts from *text*, lowercased."""
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text().lower()


df['low_review'] = df['num_review'].apply(_to_plain_lowercase)
print(df['low_review'].iloc[0])

In [None]:
# Sentence segmentation: each lowercased review becomes a list of sentences.
# sent_tokenize takes the text as its first argument, so it can be handed to
# apply() directly.
sentence_lists = df['low_review'].apply(sent_tokenize)
df['sentences'] = sentence_lists
print(df['sentences'].iloc[0])

In [None]:
# Word tokenization: the sentences of a review are glued back together with
# single spaces and the resulting string is split into word tokens.
def _words_from_sentences(sentences):
    """Return the word tokens of *sentences* joined into one string."""
    joined = " ".join(sentences)
    return word_tokenize(joined)


df['tokens'] = df['sentences'].apply(_words_from_sentences)
print(df['tokens'].iloc[0])

In [None]:
# Stop-word and punctuation removal.
stop_words = set(stopwords.words('english'))

# string.punctuation is one long string, so `token in string.punctuation`
# is a substring test: a multi-character token such as "..." is not a
# substring of it and would be kept. A set of the individual characters
# allows a proper per-character check instead.
punctuation_chars = set(string.punctuation)


def _is_punctuation(token):
    """Return True when *token* is made up only of punctuation characters.

    The empty string also counts as punctuation, matching the original
    substring test, which treated "" as contained in string.punctuation.
    """
    return all(ch in punctuation_chars for ch in token)


def _drop_stopwords_and_punctuation(tokens):
    """Return *tokens* without English stop words and punctuation-only tokens."""
    return [
        t for t in tokens
        if t not in stop_words and not _is_punctuation(t)
    ]


df['tokens_no_stopwords'] = df['tokens'].apply(_drop_stopwords_and_punctuation)

print(df['tokens_no_stopwords'].iloc[0])

In [None]:
# Stemming: reduce every remaining token to its Porter stem.
stemmer = PorterStemmer()


def _stem_all(tokens):
    """Return the Porter stem of each token in *tokens*, in the same order."""
    return list(map(stemmer.stem, tokens))


df['stemmed_tokens'] = df['tokens_no_stopwords'].apply(_stem_all)
print(df['stemmed_tokens'].iloc[0])

In [None]:
# Part-of-speech tagging: each review's token list becomes a list of
# (token, tag) pairs.
# NOTE(review): the tagger is given the stop-word-filtered tokens, not the
# full token sequence — confirm that this is intended.
def _tag_tokens(tokens):
    """Return NLTK's part-of-speech tags for *tokens*."""
    return pos_tag(tokens)


df['pos_tags'] = df['tokens_no_stopwords'].apply(_tag_tokens)
print(df['pos_tags'].iloc[0])

In [None]:
# Word Sense Disambiguation
target_word = 'movie'


def _lesk_definition(tokens):
    """Return the definition of the sense Lesk picks for ``target_word``.

    *tokens* is the context (one review's token list). The result is the
    string 'No sense' when the target word does not occur in *tokens* or
    when Lesk yields no sense for it.
    """
    if target_word not in tokens:
        return 'No sense'
    # Run Lesk once and reuse the result; calling it a second time just to
    # read the definition repeats the same lookup for every matching review.
    sense = lesk(tokens, target_word)
    return sense.definition() if sense else 'No sense'


# Apply Lesk WSD across all reviews
df[f'{target_word}_sense'] = df['tokens_no_stopwords'].apply(_lesk_definition)

# Example output
print(df[[f'{target_word}_sense']].head())

In [None]:
# Named Entity Recognition
# Load the spaCy pipeline once; spacy_ner() below reuses it for every call.
nlp = spacy.load('en_core_web_sm')


def spacy_ner(text):
    """Run the loaded spaCy pipeline on *text* and collect its entities.

    Returns a list of (entity text, entity label) tuples in the order in
    which they appear in ``doc.ents``.
    """
    entities = []
    for entity in nlp(text).ents:
        entities.append((entity.text, entity.label_))
    return entities

# Extract entities from the 'html_review' column (not the lowercased text).
source_reviews = df['html_review']
df['named_entities'] = source_reviews.apply(spacy_ner)
first_review_entities = df['named_entities'].iloc[0]
print(first_review_entities)

In [None]:
# WordCloud: build one cloud from all reviews joined into a single string,
# filtering with the wordcloud package's built-in STOPWORDS set.
text = " ".join(df['html_review'])
wordcloud = WordCloud(stopwords=STOPWORDS, background_color="white").generate(text)

# Draw the cloud as an image on a 10x6 figure, without axis ticks or frame.
fig, ax = plt.subplots(figsize=(10, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
plt.show()