# **Perform tokenization, stopword removal, stemming, and lemmatization on a sample dataset. Compare how these preprocessing steps impact the quality of text representation.**

#Importing Libararies



In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

#Loading Data set

In [2]:
df = pd.read_csv("/content/flipkart_product.csv", encoding='latin1', on_bad_lines='skip', engine='python')
data = df.iloc[:5000]   # take 5000 rows for speed
data.head()

Unnamed: 0,ProductName,Price,Rate,Review,Summary
0,Candes 12 L Room/Personal Air Cooler?ÿ?ÿ(White...,"??3,999",5,Super!,Great cooler.. excellent air flow and for this...
1,Candes 12 L Room/Personal Air Cooler?ÿ?ÿ(White...,"??3,999",5,Awesome,Best budget 2 fit cooler. Nice cooling
2,Candes 12 L Room/Personal Air Cooler?ÿ?ÿ(White...,"??3,999",3,Fair,The quality is good but the power of air is de...
3,Candes 12 L Room/Personal Air Cooler?ÿ?ÿ(White...,"??3,999",1,Useless product,Very bad product it's a only a fan
4,Candes 12 L Room/Personal Air Cooler?ÿ?ÿ(White...,"??3,999",3,Fair,Ok ok product


#Data Preprocessing

In [3]:
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    # 1. Tokenization
    tokens = word_tokenize(text.lower())

    # 2. Stopword Removal + keep alphabetic words only
    filtered = [w for w in tokens if w.isalpha() and w not in stop_words]

    # 3. Stemming
    stemmed = [stemmer.stem(w) for w in filtered]

    # 4. Lemmatization
    lemmatized = [lemmatizer.lemmatize(w) for w in filtered]

    return tokens, filtered, stemmed, lemmatized


In [4]:
import nltk
nltk.download('punkt_tab')

data["tokens"] = df["ProductName"].astype(str).apply(lambda x: preprocess(x)[0])
data["no_stopwords"] = df["ProductName"].astype(str).apply(lambda x: preprocess(x)[1])
data["stemmed"] = df["ProductName"].astype(str).apply(lambda x: preprocess(x)[2])
data["lemmatized"] = df["ProductName"].astype(str).apply(lambda x: preprocess(x)[3])

data.head()

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["tokens"] = df["ProductName"].astype(str).apply(lambda x: preprocess(x)[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["no_stopwords"] = df["ProductName"].astype(str).apply(lambda x: preprocess(x)[1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata

Unnamed: 0,ProductName,Price,Rate,Review,Summary,tokens,no_stopwords,stemmed,lemmatized
0,Candes 12 L Room/Personal Air Cooler?ÿ?ÿ(White...,"??3,999",5,Super!,Great cooler.. excellent air flow and for this...,"[candes, 12, l, room/personal, air, cooler, ?,...","[candes, l, air, cooler, ÿ, ÿ, white, black, e...","[cand, l, air, cooler, ÿ, ÿ, white, black, ele...","[candes, l, air, cooler, ÿ, ÿ, white, black, e..."
1,Candes 12 L Room/Personal Air Cooler?ÿ?ÿ(White...,"??3,999",5,Awesome,Best budget 2 fit cooler. Nice cooling,"[candes, 12, l, room/personal, air, cooler, ?,...","[candes, l, air, cooler, ÿ, ÿ, white, black, e...","[cand, l, air, cooler, ÿ, ÿ, white, black, ele...","[candes, l, air, cooler, ÿ, ÿ, white, black, e..."
2,Candes 12 L Room/Personal Air Cooler?ÿ?ÿ(White...,"??3,999",3,Fair,The quality is good but the power of air is de...,"[candes, 12, l, room/personal, air, cooler, ?,...","[candes, l, air, cooler, ÿ, ÿ, white, black, e...","[cand, l, air, cooler, ÿ, ÿ, white, black, ele...","[candes, l, air, cooler, ÿ, ÿ, white, black, e..."
3,Candes 12 L Room/Personal Air Cooler?ÿ?ÿ(White...,"??3,999",1,Useless product,Very bad product it's a only a fan,"[candes, 12, l, room/personal, air, cooler, ?,...","[candes, l, air, cooler, ÿ, ÿ, white, black, e...","[cand, l, air, cooler, ÿ, ÿ, white, black, ele...","[candes, l, air, cooler, ÿ, ÿ, white, black, e..."
4,Candes 12 L Room/Personal Air Cooler?ÿ?ÿ(White...,"??3,999",3,Fair,Ok ok product,"[candes, 12, l, room/personal, air, cooler, ?,...","[candes, l, air, cooler, ÿ, ÿ, white, black, e...","[cand, l, air, cooler, ÿ, ÿ, white, black, ele...","[candes, l, air, cooler, ÿ, ÿ, white, black, e..."


#Comparing Representation Quality

In [5]:
def get_vocab_size(list_of_docs):
    vocab = set()
    for doc in list_of_docs:
        vocab.update(doc)
    return len(vocab)

# Ensure preprocessing columns are present before calculating vocabulary size
# This addresses potential inconsistencies if previous cells were not run or state was lost
data["tokens"] = df["ProductName"].astype(str).apply(lambda x: preprocess(x)[0])
data["no_stopwords"] = df["ProductName"].astype(str).apply(lambda x: preprocess(x)[1])
data["stemmed"] = df["ProductName"].astype(str).apply(lambda x: preprocess(x)[2])
data["lemmatized"] = df["ProductName"].astype(str).apply(lambda x: preprocess(x)[3])

results = {
    "Original Tokens": get_vocab_size(data["tokens"]),
    "After Stopword Removal": get_vocab_size(data["no_stopwords"]),
    "After Stemming": get_vocab_size(data["stemmed"]),
    "After Lemmatization": get_vocab_size(data["lemmatized"])
}

pd.DataFrame(results, index=["Vocabulary Size"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["tokens"] = df["ProductName"].astype(str).apply(lambda x: preprocess(x)[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["no_stopwords"] = df["ProductName"].astype(str).apply(lambda x: preprocess(x)[1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["stemmed"] = df["ProductName"].a

Unnamed: 0,Original Tokens,After Stopword Removal,After Stemming,After Lemmatization
Vocabulary Size,125,95,94,95
