In [1]:
!pip install numpy pandas nltk 



In [2]:
!pip install regex
!pip install contractions



#### Loading Dataset


In [3]:
import os
# Path of the raw review dataset, relative to the notebook's working directory.
file_path = "raw_dataset.csv"
# Sanity check: report whether the file is reachable before trying to load it.
print("File exists:", os.path.exists(file_path))

File exists: True


In [4]:
import pandas as pd
# Load the raw reviews into the working DataFrame used by every later cell.
df_r=pd.read_csv(file_path)

In [5]:
# Preview the first rows to check that the columns were parsed as expected.
df_r.head()

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...


In [6]:
# Column dtypes, non-null counts and memory footprint of the raw frame.
df_r.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40432 entries, 0 to 40431
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   category  40432 non-null  object 
 1   rating    40432 non-null  float64
 2   label     40432 non-null  object 
 3   text_     40432 non-null  object 
dtypes: float64(1), object(3)
memory usage: 1.2+ MB


In [7]:
# Number of missing values per column.
df_r.isnull().sum()

category    0
rating      0
label       0
text_       0
dtype: int64

In [8]:
# How many rows carry each value of the `label` column.
df_r['label'].value_counts()

label
CG    20216
OR    20216
Name: count, dtype: int64

In [9]:
import re

In [10]:
# Column names available before any features are added.
df_r.columns

Index(['category', 'rating', 'label', 'text_'], dtype='object')

#### Pre-preprocessing: features extracted from the raw text


In [11]:
def capital_letter_ratio(text):
    """Return the fraction of alphabetic characters in ``text`` that are uppercase.

    Digits, whitespace and punctuation are ignored. Non-string input, and
    strings that contain no alphabetic characters, yield ``0.0``.
    """
    if not isinstance(text, str):
        return 0.0

    total_letters = 0
    upper_letters = 0
    for ch in text:
        if ch.isalpha():
            total_letters += 1
            if ch.isupper():
                upper_letters += 1

    # Avoid dividing by zero for empty or letter-free strings.
    if total_letters == 0:
        return 0.0
    return upper_letters / total_letters


In [12]:
# Hand-picked samples: mixed case, all caps, punctuation, a single capital,
# the empty string and a short all-caps word.
test_texts = [
    "This product is amazing",
    "THIS PRODUCT IS AMAZING",
    "Amazing Product!!! MUST BUY",
    "bAd",
    "",
    "OKAY"
]
# Print each sample next to its computed ratio for a quick visual check.
for text in test_texts:
    print(f"Text: {text}")
    print("Capital Ratio:", capital_letter_ratio(text))
    print("-" * 40)

Text: This product is amazing
Capital Ratio: 0.05
----------------------------------------
Text: THIS PRODUCT IS AMAZING
Capital Ratio: 1.0
----------------------------------------
Text: Amazing Product!!! MUST BUY
Capital Ratio: 0.42857142857142855
----------------------------------------
Text: bAd
Capital Ratio: 0.3333333333333333
----------------------------------------
Text: 
Capital Ratio: 0.0
----------------------------------------
Text: OKAY
Capital Ratio: 1.0
----------------------------------------


In [13]:
# Compute the capital-letter ratio on the raw `text_` column and store it as a feature.
df_r['capital_ratio'] = df_r['text_'].apply(capital_letter_ratio)
# Show the first ten reviews next to the new feature.
df_r[['text_', 'capital_ratio']].head(10)

Unnamed: 0,text_,capital_ratio
0,"Love this! Well made, sturdy, and very comfor...",0.070175
1,"love it, a great upgrade from the original. I...",0.016393
2,This pillow saved my back. I love the look and...,0.038462
3,"Missing information on how to use it, but it i...",0.032258
4,Very nice set. Good quality. We have had the s...,0.045455
5,I WANTED DIFFERENT FLAVORS BUT THEY ARE NOT.,1.0
6,They are the perfect touch for me and the only...,0.028571
7,These done fit well and look great. I love th...,0.029851
8,"Great big numbers & easy to read, the only thi...",0.032787
9,My son loves this comforter and it is very wel...,0.035088


In [14]:
def punctuation_count(text):
    """Count the punctuation/symbol characters in ``text``.

    A character is counted when it is neither whitespace nor alphanumeric.
    The underscore is matched explicitly: the regex word class treats it as a
    word character, so the negated class alone would never count it.

    Returns 0 for non-string input.
    """
    if not isinstance(text, str):
        return 0

    return len(re.findall(r"[^\w\s]|_", text))

In [15]:
# Count punctuation on the raw `text_` column and store it as a feature.
df_r['punctuation_count'] = df_r['text_'].apply(punctuation_count)
# Show the first ten reviews next to the new feature.
df_r[['text_', 'punctuation_count']].head(10)

Unnamed: 0,text_,punctuation_count
0,"Love this! Well made, sturdy, and very comfor...",5
1,"love it, a great upgrade from the original. I...",3
2,This pillow saved my back. I love the look and...,2
3,"Missing information on how to use it, but it i...",2
4,Very nice set. Good quality. We have had the s...,2
5,I WANTED DIFFERENT FLAVORS BUT THEY ARE NOT.,1
6,They are the perfect touch for me and the only...,1
7,These done fit well and look great. I love th...,1
8,"Great big numbers & easy to read, the only thi...",3
9,My son loves this comforter and it is very wel...,1


In [16]:
def excessive_punctuation_score(text):
    """Count the runs of two or more consecutive ``!``/``?`` characters in ``text``.

    Each run counts once regardless of its length (``"!!!!"`` scores 1).
    Non-string input scores 0.
    """
    if isinstance(text, str):
        return sum(1 for _ in re.finditer(r"[!?]{2,}", text))
    return 0

In [17]:
# Score repeated !/? runs on the raw `text_` column and store the result as a feature.
df_r['excessive_punctuation'] = df_r['text_'].apply(excessive_punctuation_score)
# Show the first ten reviews next to the new feature.
df_r[['text_', 'excessive_punctuation']].head(10)

Unnamed: 0,text_,excessive_punctuation
0,"Love this! Well made, sturdy, and very comfor...",0
1,"love it, a great upgrade from the original. I...",0
2,This pillow saved my back. I love the look and...,0
3,"Missing information on how to use it, but it i...",0
4,Very nice set. Good quality. We have had the s...,0
5,I WANTED DIFFERENT FLAVORS BUT THEY ARE NOT.,0
6,They are the perfect touch for me and the only...,0
7,These done fit well and look great. I love th...,0
8,"Great big numbers & easy to read, the only thi...",0
9,My son loves this comforter and it is very wel...,0


In [18]:
# Side-by-side view of the three raw-text features for the first ten reviews.
df_r[['text_', 'capital_ratio', 'punctuation_count', 'excessive_punctuation']].head(10)

Unnamed: 0,text_,capital_ratio,punctuation_count,excessive_punctuation
0,"Love this! Well made, sturdy, and very comfor...",0.070175,5,0
1,"love it, a great upgrade from the original. I...",0.016393,3,0
2,This pillow saved my back. I love the look and...,0.038462,2,0
3,"Missing information on how to use it, but it i...",0.032258,2,0
4,Very nice set. Good quality. We have had the s...,0.045455,2,0
5,I WANTED DIFFERENT FLAVORS BUT THEY ARE NOT.,1.0,1,0
6,They are the perfect touch for me and the only...,0.028571,1,0
7,These done fit well and look great. I love th...,0.029851,1,0
8,"Great big numbers & easy to read, the only thi...",0.032787,3,0
9,My son loves this comforter and it is very wel...,0.035088,1,0


#### Pre-processing


In [19]:
# contraction expansion
import contractions
def expand_contractions(text_):
    """Expand English contractions in ``text_`` (e.g. "don't" -> "do not").

    Anything that is not a string -- ``None``, ``NaN``, numbers, lists --
    yields ``""``. This mirrors the ``isinstance(..., str)`` guard used by the
    raw-text feature helpers and keeps non-string values away from
    ``contractions.fix``.
    """
    if not isinstance(text_, str):
        return ""
    return contractions.fix(text_)

In [20]:
# Sample sentences with common contractions, plus one without any as a control.
test_sentences = [
    "I don't like this product",
    "It's not what I've expected",
    "You're going to love it",
    "They can't believe it's true",
    "This is fine"
]

# Print each sentence before and after expansion for a quick visual check.
for s in test_sentences:
    print("BEFORE:", s)
    print("AFTER :", expand_contractions(s))
    print("-" * 40)

BEFORE: I don't like this product
AFTER : I do not like this product
----------------------------------------
BEFORE: It's not what I've expected
AFTER : It is not what I have expected
----------------------------------------
BEFORE: You're going to love it
AFTER : You are going to love it
----------------------------------------
BEFORE: They can't believe it's true
AFTER : They cannot believe it is true
----------------------------------------
BEFORE: This is fine
AFTER : This is fine
----------------------------------------


In [21]:
# Clean text: lowercase, then strip URLs, HTML tags, digits, punctuation and extra whitespace.
def clean_text(text_):
    """Normalise a review to lowercase ASCII letters separated by single spaces.

    Steps, in order:
      1. lowercase;
      2. replace URLs (prefixed with ``http://``, ``https://`` or ``www.``)
         with a space;
      3. replace HTML tags with a space, so the words on either side of a tag
         such as ``<br />`` are not glued together;
      4. delete every remaining character that is not ``a-z`` or whitespace
         (this removes digits and punctuation in one pass);
      5. collapse whitespace runs and trim both ends.

    Returns ``""`` for any non-string input (``None``, ``NaN``, numbers).
    """
    if not isinstance(text_, str):
        return ""

    text_ = text_.lower()

    # Anchor on a real URL prefix at a word boundary; matching a bare "www"
    # would also eat the tail of ordinary words such as "awwww".
    text_ = re.sub(r'\b(?:https?://|www\.)\S+', ' ', text_)

    # Only "<" + optional "/" + a letter ... ">" is treated as a tag, so text such
    # as "a < b and c > d" keeps what lies between the angle brackets.
    text_ = re.sub(r'</?[a-z][^<>]*>', ' ', text_)

    # Letters and whitespace only; digits and punctuation are deleted here,
    # so no separate digit-removal step is needed.
    text_ = re.sub(r'[^a-z\s]', '', text_)

    return re.sub(r'\s+', ' ', text_).strip()

In [22]:
# Step 1: expand contractions on the raw text, before punctuation (apostrophes) is stripped.
df_r['expanded_text'] = df_r['text_'].apply(expand_contractions)

In [23]:
# Step 2: clean the contraction-expanded text.
df_r['clean_text'] = df_r['expanded_text'].apply(clean_text)

In [24]:
# Compare the raw, expanded and cleaned text for the first five reviews.
df_r[['text_', 'expanded_text', 'clean_text']].head(5)

Unnamed: 0,text_,expanded_text,clean_text
0,"Love this! Well made, sturdy, and very comfor...","Love this! Well made, sturdy, and very comfor...",love this well made sturdy and very comfortabl...
1,"love it, a great upgrade from the original. I...","love it, a great upgrade from the original. I...",love it a great upgrade from the original i ha...
2,This pillow saved my back. I love the look and...,This pillow saved my back. I love the look and...,this pillow saved my back i love the look and ...
3,"Missing information on how to use it, but it i...","Missing information on how to use it, but it i...",missing information on how to use it but it is...
4,Very nice set. Good quality. We have had the s...,Very nice set. Good quality. We have had the s...,very nice set good quality we have had the set...


In [25]:
# Edge cases for clean_text: punctuation/digits/non-ASCII characters, HTML tags,
# a URL, irregular whitespace, and None (exercises the missing-value guard).
# NOTE(review): the trailing characters of the first sample look like a
# mis-encoded emoji -- confirm the notebook's file encoding.
test_cases = [
    "WOW!!! 10/10 would buy again!!! üòç",
    "<p>Best product ever</p>",
    "Visit http://example.com NOW",
    "   Multiple     spaces   ",
    None
]

# Print each input next to its cleaned output for a quick visual check.
for t in test_cases:
    print("INPUT :", t)
    print("OUTPUT:", clean_text(t))
    print("-" * 30)


INPUT : WOW!!! 10/10 would buy again!!! üòç
OUTPUT: wow would buy again
------------------------------
INPUT : <p>Best product ever</p>
OUTPUT: best product ever
------------------------------
INPUT : Visit http://example.com NOW
OUTPUT: visit now
------------------------------
INPUT :    Multiple     spaces   
OUTPUT: multiple spaces
------------------------------
INPUT : None
OUTPUT: 
------------------------------


In [26]:
# lemmatization
import nltk

# NLTK resources fetched for the lemmatization step.
# NOTE(review): the tokenizer used later is `wordpunct_tokenize`; 'punkt' may not
# be required by it -- confirm before removing.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\prati\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\prati\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\prati\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\prati\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [27]:
import nltk
# Additional tagger resource.
# NOTE(review): presumably the resource name that newer NLTK releases look up
# for `pos_tag` -- confirm against the installed NLTK version.
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\prati\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [28]:
# import lemmatization tools
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet

# Single shared lemmatizer instance, reused by lemmatize_text for every token.
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(tag):
    """Translate a POS tag string into the matching WordNet POS constant.

    Tags starting with ``J`` map to adjective, ``V`` to verb, ``N`` to noun
    and ``R`` to adverb. Any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

def lemmatize_text(text_):
    """Lemmatize every token of ``text_`` using its part-of-speech tag.

    The text is tokenized with ``wordpunct_tokenize``, tagged with ``pos_tag``,
    and each token is lemmatized with the WordNet POS derived from its tag.
    The lemmas are returned joined by single spaces. Missing values and the
    empty string yield ``""``.
    """
    if pd.isna(text_) or text_ == "":
        return ""

    tagged_tokens = pos_tag(wordpunct_tokenize(text_))

    lemmas = []
    for token, tag in tagged_tokens:
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(tag)))

    return " ".join(lemmas)


In [29]:
# Sample phrases containing inflected verbs, plurals and a comparative.
test_sentences = [
    "running faster than others",
    "better products were bought",
    "he was buying expensive items"
]

# Print each phrase before and after lemmatization for a quick visual check.
for s in test_sentences:
    print("BEFORE:", s)
    print("AFTER :", lemmatize_text(s))
    print("-" * 40)


BEFORE: running faster than others
AFTER : run faster than others
----------------------------------------
BEFORE: better products were bought
AFTER : well product be buy
----------------------------------------
BEFORE: he was buying expensive items
AFTER : he be buy expensive item
----------------------------------------


In [30]:
# Step 3: lemmatize the cleaned text; runs the POS tagger once per review.
df_r['lemmatized_text'] = df_r['clean_text'].apply(lemmatize_text)

In [33]:
# Spot-check five random reviews across all processing stages. The fixed
# random_state makes the same rows appear on every run of the notebook.
df_r[['text_', 'expanded_text', 'clean_text', 'lemmatized_text']].sample(5, random_state=42)

Unnamed: 0,text_,expanded_text,clean_text,lemmatized_text
28359,I was given an ARC for a honest review. Two wo...,I was given an ARC for a honest review. Two wo...,i was given an arc for a honest review two wor...,i be give an arc for a honest review two word ...
14389,"this movie was hilarious, my friends and i cou...","this movie was hilarious, my friends and i cou...",this movie was hilarious my friends and i coul...,this movie be hilarious my friend and i could ...
27078,The first book in this series has you wonderin...,The first book in this series has you wonderin...,the first book in this series has you wonderin...,the first book in this series have you wonder ...
27680,Persuasion is book 3 of the series and the fir...,Persuasion is book 3 of the series and the fir...,persuasion is book of the series and the first...,persuasion be book of the series and the first...
28414,"I love this series, and this book was an emoti...","I love this series, and this book was an emoti...",i love this series and this book was an emotio...,i love this series and this book be an emotion...


In [34]:
# Rename the final processed column to `review`. This mutates df_r in place:
# once it has run, cells that select 'lemmatized_text' raise KeyError if re-run.
df_r.rename(columns={'lemmatized_text': 'review'}, inplace=True)

In [35]:
# Confirm the rename and list every column produced so far.
df_r.columns

Index(['category', 'rating', 'label', 'text_', 'capital_ratio',
       'punctuation_count', 'excessive_punctuation', 'expanded_text',
       'clean_text', 'review'],
      dtype='object')

In [37]:
# Number of whitespace-separated tokens in the processed review.
df_r['text_length'] = df_r['review'].str.split().str.len()

#### Preprocessed dataset

In [41]:
# Keep the raw text, both processed variants, the label and the token count.
# NOTE(review): capital_ratio, punctuation_count and excessive_punctuation (as well
# as category and rating) are not exported here -- confirm that is intentional.
pre_df = df_r[['text_', 'clean_text', 'review', 'label', 'text_length']]

# Save as CSV (index=False leaves the row index out of the file)
pre_df.to_csv("preprocessed_dataset.csv", index=False)
print("Preprocessed dataset saved as CSV!")

Preprocessed dataset saved as CSV!
