In [1]:
import nltk
import spacy
from nltk.corpus import movie_reviews, stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
import string
import re
import random
import pandas as pd

# Download the NLTK resources used in this notebook: the review corpus,
# the Punkt tokenizer data, WordNet (for the lemmatizer) and the stop-word lists.
# Looping keeps the list in one place and avoids echoing the return value of
# the last download call as cell output.
NLTK_PACKAGES = ('movie_reviews', 'punkt', 'punkt_tab', 'wordnet', 'stopwords')

for nltk_package in NLTK_PACKAGES:
    nltk.download(nltk_package)


[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/ali/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to /home/ali/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/ali/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ali/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ali/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
#!wget https://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz
#!mkdir reuters21578
#!tar -xzvf reuters21578.tar.gz -C ./reuters21578
#!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
#!mkdir aclImdb_v1
#!tar -xzvf aclImdb_v1.tar.gz -C ./aclImdb_v1

In [3]:
# Load NLTK's movie_reviews corpus as (token list, sentiment label) pairs
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle with a dedicated, seeded generator so the row order (and therefore
# `df['text'][0]` and any seeded `df.sample(...)` taken later) is the same on
# every "Restart Kernel -> Run All". The global `random` state is not touched.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(documents)

# Convert to DataFrame for easier handling
df = pd.DataFrame(documents, columns=['words', 'sentiment'])

# Combine words back into a single space-separated string
df['text'] = df['words'].map(' '.join)

# Display the first few entries
df.head()


Unnamed: 0,words,sentiment,text
0,"[well, ,, here, ', s, a, distasteful, ,, thoro...",neg,"well , here ' s a distasteful , thoroughly ama..."
1,"[an, attempt, at, florida, film, noir, ,, palm...",neg,"an attempt at florida film noir , palmetto fai..."
2,"[by, trying, to, satisfy, every, kind, of, vie...",neg,"by trying to satisfy every kind of viewer , it..."
3,"[as, a, revolutionary, war, hero, in, the, pat...",pos,"as a revolutionary war hero in the patriot , m..."
4,"["", gordy, "", is, not, a, movie, ,, it, is, a,...",neg,""" gordy "" is not a movie , it is a 90 - minute..."


In [4]:
def tokenize_text(text):
    """Split ``text`` into tokens using NLTK's ``word_tokenize``."""
    return word_tokenize(text)

# Example: show the first document before and after tokenization
sample_text = df.loc[0, 'text']
text_preview = sample_text[:500]
print("Original Text:\n", text_preview, "...\n")
token_preview = tokenize_text(sample_text)[:50]
print("Tokenized Text:\n", token_preview, "...")

Original Text:
 well , here ' s a distasteful , thoroughly amateurish item that , surprisingly , was actually a box - office hit at the time of its release . after just viewing the film for the first time , my primary question is how did anyone with an iq north of 35 enjoy this movie ? it is cheap , idiotic , unfunny , and not nearly as raunchy as i had heard it was . at least some smut would have livened things up a bit . " porky ' s , " tells the story ( if you can call it that ) of four clueless high school  ...

Tokenized Text:
 ['well', ',', 'here', "'", 's', 'a', 'distasteful', ',', 'thoroughly', 'amateurish', 'item', 'that', ',', 'surprisingly', ',', 'was', 'actually', 'a', 'box', '-', 'office', 'hit', 'at', 'the', 'time', 'of', 'its', 'release', '.', 'after', 'just', 'viewing', 'the', 'film', 'for', 'the', 'first', 'time', ',', 'my', 'primary', 'question', 'is', 'how', 'did', 'anyone', 'with', 'an', 'iq', 'north'] ...


### Lemmatization/Stemming
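- Lemmatization reduces words to their base or dictionary form (the *lemma*).
- Stemming reduces words to their root form by stripping suffixes, so the result may not be a valid word (e.g. `thoroughly` → `thoroughli`).
- Note: `WordNetLemmatizer.lemmatize` treats every token as a noun unless a part-of-speech tag is passed, which is why `was` comes out as `wa` in the output below.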

In [5]:
# Module-level NLTK normalizers shared by lemmatize_tokens / stem_tokens
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def lemmatize_tokens(tokens):
    """Return a new list with every token passed through ``lemmatizer.lemmatize``.

    No part-of-speech tag is supplied, so the lemmatizer's default is used.
    """
    return list(map(lemmatizer.lemmatize, tokens))

def stem_tokens(tokens):
    """Return a new list with every token passed through ``stemmer.stem``."""
    stems = []
    for token in tokens:
        stems.append(stemmer.stem(token))
    return stems

# Example: run both normalizers on the tokenized sample and show the first 50 results
tokens = tokenize_text(sample_text)
lemma_preview = lemmatize_tokens(tokens)[:50]
print("Lemmatized Tokens:\n", lemma_preview, "...")
stem_preview = stem_tokens(tokens)[:50]
print("Stemmed Tokens:\n", stem_preview, "...")

Lemmatized Tokens:
 ['well', ',', 'here', "'", 's', 'a', 'distasteful', ',', 'thoroughly', 'amateurish', 'item', 'that', ',', 'surprisingly', ',', 'wa', 'actually', 'a', 'box', '-', 'office', 'hit', 'at', 'the', 'time', 'of', 'it', 'release', '.', 'after', 'just', 'viewing', 'the', 'film', 'for', 'the', 'first', 'time', ',', 'my', 'primary', 'question', 'is', 'how', 'did', 'anyone', 'with', 'an', 'iq', 'north'] ...
Stemmed Tokens:
 ['well', ',', 'here', "'", 's', 'a', 'distast', ',', 'thoroughli', 'amateurish', 'item', 'that', ',', 'surprisingli', ',', 'wa', 'actual', 'a', 'box', '-', 'offic', 'hit', 'at', 'the', 'time', 'of', 'it', 'releas', '.', 'after', 'just', 'view', 'the', 'film', 'for', 'the', 'first', 'time', ',', 'my', 'primari', 'question', 'is', 'how', 'did', 'anyon', 'with', 'an', 'iq', 'north'] ...


### Stop Words Removal
- Stop words are common words that carry minimal meaningful information. Removing them can reduce noise in the data.

In [6]:
# English stop words from NLTK, stored as a set for fast membership tests
stop_words = {word for word in stopwords.words('english')}

def remove_stopwords(tokens):
    """Return the tokens whose lower-cased form is not in ``stop_words``.

    Surviving tokens keep their original casing and order.
    """
    kept = []
    for token in tokens:
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return kept

# Example: first 50 tokens left after filtering the sample
without_stopwords = remove_stopwords(tokens)
print("After Stop Words Removal:\n", without_stopwords[:50], "...")

After Stop Words Removal:
 ['well', ',', "'", 'distasteful', ',', 'thoroughly', 'amateurish', 'item', ',', 'surprisingly', ',', 'actually', 'box', '-', 'office', 'hit', 'time', 'release', '.', 'viewing', 'film', 'first', 'time', ',', 'primary', 'question', 'anyone', 'iq', 'north', '35', 'enjoy', 'movie', '?', 'cheap', ',', 'idiotic', ',', 'unfunny', ',', 'nearly', 'raunchy', 'heard', '.', 'least', 'smut', 'would', 'livened', 'things', 'bit', '.'] ...


### Punctuation Removal
- Removing punctuation helps in cleaning the text and focusing on the actual words.

In [7]:
def remove_punctuation(tokens):
    """Strip every ``string.punctuation`` character from each token.

    Tokens that become empty once their punctuation is removed (for example
    ``","``) are dropped; the remaining tokens keep their original order.
    """
    strip_table = str.maketrans('', '', string.punctuation)
    cleaned = []
    for token in tokens:
        bare = token.translate(strip_table)
        # Skip tokens that consisted solely of punctuation (or were empty)
        if bare:
            cleaned.append(bare)
    return cleaned

# Example: first 50 tokens left after stripping punctuation from the sample
without_punctuation = remove_punctuation(tokens)
print("After Punctuation Removal:\n", without_punctuation[:50], "...")

After Punctuation Removal:
 ['well', 'here', 's', 'a', 'distasteful', 'thoroughly', 'amateurish', 'item', 'that', 'surprisingly', 'was', 'actually', 'a', 'box', 'office', 'hit', 'at', 'the', 'time', 'of', 'its', 'release', 'after', 'just', 'viewing', 'the', 'film', 'for', 'the', 'first', 'time', 'my', 'primary', 'question', 'is', 'how', 'did', 'anyone', 'with', 'an', 'iq', 'north', 'of', '35', 'enjoy', 'this', 'movie', 'it', 'is', 'cheap'] ...


## Pipeline Integration
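We'll combine all the preprocessing steps into a reusable class called `TextPreprocessor`. Its `preprocess` method runs the steps in this order: tokenize → remove punctuation → remove stop words → lemmatize or stem.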

In [8]:
class TextPreprocessor:
    """Reusable text-cleaning pipeline built on NLTK.

    ``preprocess`` runs, in order: tokenization, punctuation stripping,
    stop-word removal and finally the normalization selected by ``method``.

    Parameters
    ----------
    method : {'lemmatize', 'stem', None}, default 'lemmatize'
        Normalization applied as the last step. ``None`` skips it.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported values.
    """

    # Values accepted for ``method``; ``None`` explicitly disables normalization.
    SUPPORTED_METHODS = ('lemmatize', 'stem', None)

    def __init__(self, method='lemmatize'):
        # Fail fast: without this check a misspelt method (e.g. 'lemmatise')
        # is accepted and preprocess() silently skips normalization.
        if method not in self.SUPPORTED_METHODS:
            raise ValueError(
                f"Unsupported method {method!r}; "
                f"expected one of {self.SUPPORTED_METHODS}"
            )
        self.method = method
        self.lemmatizer = WordNetLemmatizer()
        self.stemmer = PorterStemmer()
        self.stop_words = set(stopwords.words('english'))
        # Translation table that deletes every ASCII punctuation character
        self.punctuation_table = str.maketrans('', '', string.punctuation)

    def tokenize(self, text):
        """Split ``text`` into tokens with NLTK's ``word_tokenize``."""
        return word_tokenize(text)

    def lemmatize(self, tokens):
        """Return the tokens passed through the WordNet lemmatizer (no POS tag given)."""
        return [self.lemmatizer.lemmatize(token) for token in tokens]

    def stem(self, tokens):
        """Return the tokens passed through the Porter stemmer."""
        return [self.stemmer.stem(token) for token in tokens]

    def remove_stopwords(self, tokens):
        """Drop tokens whose lower-cased form is in ``self.stop_words``."""
        return [token for token in tokens if token.lower() not in self.stop_words]

    def remove_punctuation(self, tokens):
        """Strip punctuation characters from each token and drop tokens left empty."""
        stripped = [token.translate(self.punctuation_table) for token in tokens]
        return [token for token in stripped if token]

    def preprocess(self, text):
        """Run the full pipeline on ``text`` and return the resulting token list."""
        tokens = self.tokenize(text)
        tokens = self.remove_punctuation(tokens)
        tokens = self.remove_stopwords(tokens)
        if self.method == 'lemmatize':
            tokens = self.lemmatize(tokens)
        elif self.method == 'stem':
            tokens = self.stem(tokens)
        # method is None: tokens are returned without normalization
        return tokens

# Build a lemmatizing pipeline
preprocessor = TextPreprocessor('lemmatize')

# Run it on the first sample and show the first 50 resulting tokens
processed_tokens = preprocessor.preprocess(sample_text)
processed_preview = processed_tokens[:50]
print("Processed Tokens:\n", processed_preview, "...")

Processed Tokens:
 ['well', 'distasteful', 'thoroughly', 'amateurish', 'item', 'surprisingly', 'actually', 'box', 'office', 'hit', 'time', 'release', 'viewing', 'film', 'first', 'time', 'primary', 'question', 'anyone', 'iq', 'north', '35', 'enjoy', 'movie', 'cheap', 'idiotic', 'unfunny', 'nearly', 'raunchy', 'heard', 'least', 'smut', 'would', 'livened', 'thing', 'bit', 'porky', 'tell', 'story', 'call', 'four', 'clueless', 'high', 'school', 'buddy', 'pee', 'wee', 'dan', 'monahan', 'billy'] ...


## Evaluation
We'll compare the original text with the preprocessed text to demonstrate the effectiveness of our pipeline.

In [11]:
# Helper: run a preprocessor on a text and return the result as one string
def preprocess_text(preprocessor, text):
    """Return ``preprocessor.preprocess(text)`` joined into a space-separated string."""
    return ' '.join(preprocessor.preprocess(text))

# Draw 5 reviews; random_state fixes which row positions are picked
samples = df.sample(5, random_state=42).reset_index(drop=True)

# Run every sampled review through the pipeline
samples['processed'] = [preprocess_text(preprocessor, raw) for raw in samples['text']]

# Show each review next to its processed form (first 500 characters of each)
for i, (raw, cleaned) in enumerate(zip(samples['text'], samples['processed'])):
    print(f"--- Sample {i+1} ---\n")
    print("Original Text:\n", raw[:500], "...\n")
    print("Processed Text:\n", cleaned[:500], "...\n")
    print("\n")


--- Sample 1 ---

Original Text:
 i had a chance to see a sneak preview of city slickers ii on campus last night . i went in with the expectation of a film with the similar flavor which made the original such a success : personal growth and insightful humor about life . i came away somewhat disappointed in this regard , getting some of the latter and not much of the former . the basic plot revolves around billy crystal , who plays mitch robbins , turning forty . he ' s now become the station manager of the radio station in which ...

Processed Text:
 chance see sneak preview city slicker ii campus last night went expectation film similar flavor made original success personal growth insightful humor life came away somewhat disappointed regard getting latter much former basic plot revolves around billy crystal play mitch robbins turning forty become station manager radio station worked original given job radio station best friend played daniel stern sympathy stern character work divorce 