In [56]:
%pip install numpy pandas nltk spacy regex contractions scikit-learn 
 

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [57]:
import os
import pandas as pd
import re

#### Loading Dataset


In [58]:

file_path = "raw_dataset.csv"
print("File exists:", os.path.exists(file_path))
df_r=pd.read_csv(file_path)


File exists: True


In [59]:
df_r.head()

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...


In [60]:
df_r.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40432 entries, 0 to 40431
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   category  40432 non-null  object 
 1   rating    40432 non-null  float64
 2   label     40432 non-null  object 
 3   text_     40432 non-null  object 
dtypes: float64(1), object(3)
memory usage: 1.2+ MB


In [61]:
df_r.isnull().sum()

category    0
rating      0
label       0
text_       0
dtype: int64

In [62]:
df_r['text_'].duplicated().sum()

np.int64(20)

In [63]:
df_r = df_r.drop_duplicates(subset='text_').reset_index(drop=True)

In [64]:
df_r['text_'].duplicated().sum()

np.int64(0)

In [65]:
df_r['label'].value_counts()

label
OR    20215
CG    20197
Name: count, dtype: int64

In [66]:
df_r.columns

Index(['category', 'rating', 'label', 'text_'], dtype='object')

#### Pre Pre-processing


In [67]:
def capital_letter_ratio(text):
    if not isinstance(text, str):
        return 0.0

    letters = [c for c in text if c.isalpha()]
    if len(letters) == 0:
        return 0.0

    capital_letters = [c for c in letters if c.isupper()]
    return len(capital_letters) / len(letters)

In [68]:
df_r['capital_ratio'] = df_r['text_'].apply(capital_letter_ratio)
df_r[['text_', 'capital_ratio']].head(10)

Unnamed: 0,text_,capital_ratio
0,"Love this! Well made, sturdy, and very comfor...",0.070175
1,"love it, a great upgrade from the original. I...",0.016393
2,This pillow saved my back. I love the look and...,0.038462
3,"Missing information on how to use it, but it i...",0.032258
4,Very nice set. Good quality. We have had the s...,0.045455
5,I WANTED DIFFERENT FLAVORS BUT THEY ARE NOT.,1.0
6,They are the perfect touch for me and the only...,0.028571
7,These done fit well and look great. I love th...,0.029851
8,"Great big numbers & easy to read, the only thi...",0.032787
9,My son loves this comforter and it is very wel...,0.035088


In [69]:
def punctuation_ratio(text_):
    if not isinstance(text_, str) or len(text_) == 0:
        return 0.0

    punct_count = len(re.findall(r"[^\w\s]", text_))
    return punct_count / len(text_)

In [70]:
df_r['punctuation_ratio'] = df_r['text_'].apply(punctuation_ratio)
df_r[['text_', 'punctuation_ratio']].sample(10)

Unnamed: 0,text_,punctuation_ratio
12097,They don't make 'em like that. The movie is ju...,0.050633
24059,Was totally surprised that there was an entire...,0.010204
18720,This is the third generation of this device an...,0.024922
22610,The store-n-feed is perfect for keeping my pup...,0.026316
28306,Damien and Nikki are happily married and livin...,0.029294
4727,"They fit my Rock island, and the quality is go...",0.025974
37686,Great flip flop! Ive had 4 pair and talked my...,0.010309
2326,The product is just as described and sturdily ...,0.025532
10438,Sits on my desk under my desk and the cord is ...,0.026201
9305,The sd case is durable and securely holds the ...,0.013514


In [71]:
def is_excessive_punctuation(text_, threshold=0.1):
    """
    Returns True if punctuation ratio exceeds the threshold, else False.
    
    Parameters:
    - text_: input string
    - threshold: ratio above which punctuation is considered excessive (default 0.1 = 10%)
    """
    ratio = punctuation_ratio(text_)
    return ratio > threshold

In [72]:

df_r['is_excessive_punctuation'] = df_r['text_'].apply(is_excessive_punctuation)
df_r[['text_', 'is_excessive_punctuation']].sample(10)

Unnamed: 0,text_,is_excessive_punctuation
35890,So the balloons do fill up quickly as advertis...,False
38047,When I first put them on I thought they were a...,False
27745,Selena's whole family has been affected by the...,False
16335,Received two of the left hand. Form doesn't a...,False
29049,Intriguing characters and an interesting plot....,False
39159,Perfect! It's exactly what I wanted and he lo...,False
37198,I have bought Ultra Club shirts for years. Th...,False
4762,Sent these to family members that are Greenbay...,False
18537,These are beautiful. We just built a new home...,False
27495,"After the many crimes reported, the family get...",False


In [73]:
df_r[['text_', 'capital_ratio', 'punctuation_ratio', 'is_excessive_punctuation']].sample(10)

Unnamed: 0,text_,capital_ratio,punctuation_ratio,is_excessive_punctuation
20302,"Great quality and colors, easy to use quick-re...",0.022222,0.04386,False
36887,I can wear light socks with them wishing I got...,0.05,0.018519,False
34023,I got this just for the minifigures. The Tree ...,0.031579,0.02459,False
39259,this sneaker runs small in the toe box. I've ...,0.018072,0.031532,False
28189,This is the first book in a series by the auth...,0.036325,0.02,False
2910,We saw and used these while at a resort in Pun...,0.032468,0.028278,False
27750,"I did not care for this book, as it was too sh...",0.030928,0.024032,False
30837,What a thing to have in a book. I've read a l...,0.024752,0.026515,False
11165,This is the first time I have had an external ...,0.049861,0.014768,False
30420,A history lesson that's hard to put down. Not...,0.034014,0.031414,False


#### Pre-processing


In [74]:
# contraction expansion
import contractions
def expand_contractions(text_):
    if pd.isna(text_):
        return ""
    return contractions.fix(text_)

In [75]:
test_sentences = [
    "I don't like this product",
    "It's not what I've expected",
    "You're going to love it",
    "They can't believe it's true",
    "This is fine"
]

for s in test_sentences:
    print("BEFORE:", s)
    print("AFTER :", expand_contractions(s))
    print("-" * 40)

BEFORE: I don't like this product
AFTER : I do not like this product
----------------------------------------
BEFORE: It's not what I've expected
AFTER : It is not what I have expected
----------------------------------------
BEFORE: You're going to love it
AFTER : You are going to love it
----------------------------------------
BEFORE: They can't believe it's true
AFTER : They cannot believe it is true
----------------------------------------
BEFORE: This is fine
AFTER : This is fine
----------------------------------------


In [76]:
# cleaning text - lowercase, url, html tags, punctiation, whitespaces
def clean_text(text_):
    if pd.isna(text_):
        return ""
    
    text_ = text_.lower()
    
    text_ = re.sub(r'http\S+|www\S+', '', text_)
    text_ = re.sub(r'<.*?>', '', text_)
    
    # remove punctuation (letters + spaces only)
    text_ = re.sub(r'[^a-z\s]', '', text_)
    
    text_ = re.sub(r'\s+', ' ', text_).strip()
    
    return text_

In [77]:
df_r['expanded_text'] = df_r['text_'].apply(expand_contractions)

In [78]:
df_r['clean_text'] = df_r['expanded_text'].apply(clean_text)

In [79]:
df_r[['text_', 'expanded_text', 'clean_text']].sample(5)

Unnamed: 0,text_,expanded_text,clean_text
38099,"I really liked this ring, it has the wide widt...","I really liked this ring, it has the wide widt...",i really liked this ring it has the wide width...
1229,I collect angels so have a bunch of them. I a...,I collect angels so have a bunch of them. I a...,i collect angels so have a bunch of them i als...
24305,The storyline were great. The characters are h...,The storyline were great. The characters are h...,the storyline were great the characters are he...
20704,We have the dispenser for our puppy and she lo...,We have the dispenser for our puppy and she lo...,we have the dispenser for our puppy and she lo...
12712,Good effects and some decent acting.\nI enjoye...,Good effects and some decent acting.\nI enjoye...,good effects and some decent acting i enjoyed ...


In [80]:
test_cases = [
    "WOW!!! 10/10 would buy again!!! üòç",
    "<p>Best product ever</p>",
    "Visit http://example.com NOW",
    "   Multiple     spaces   ",
    None
]

for t in test_cases:
    print("INPUT :", t)
    print("OUTPUT:", clean_text(t))
    print("-" * 30)

INPUT : WOW!!! 10/10 would buy again!!! üòç
OUTPUT: wow would buy again
------------------------------
INPUT : <p>Best product ever</p>
OUTPUT: best product ever
------------------------------
INPUT : Visit http://example.com NOW
OUTPUT: visit now
------------------------------
INPUT :    Multiple     spaces   
OUTPUT: multiple spaces
------------------------------
INPUT : None
OUTPUT: 
------------------------------


In [81]:
# lemmatization
import nltk

# nltk resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LOQ\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\LOQ\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\LOQ\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\LOQ\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [82]:
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\LOQ\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LOQ\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\LOQ\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\LOQ\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [83]:
# import lemmatization tools
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\LOQ\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [84]:
def adjective_ratio(text):
    if not isinstance(text, str) or text.strip() == "":
        return 0
    
    tokens = word_tokenize(text)        
    pos_tags = pos_tag(tokens)          
    
    adj_count = sum(1 for word, tag in pos_tags if tag.startswith('JJ'))
    total_words = len(tokens)
    
    return adj_count / total_words if total_words > 0 else 0

In [85]:
df_r['adjective_ratio'] = df_r['clean_text'].apply(adjective_ratio)

In [86]:
df_r[['clean_text', 'adjective_ratio']].head()

Unnamed: 0,clean_text,adjective_ratio
0,love this well made sturdy and very comfortabl...,0.083333
1,love it a great upgrade from the original i ha...,0.117647
2,this pillow saved my back i love the look and ...,0.0
3,missing information on how to use it but it is...,0.058824
4,very nice set good quality we have had the set...,0.111111


In [87]:
from nltk.sentiment import SentimentIntensityAnalyzer

# Initialize analyzer
sia = SentimentIntensityAnalyzer()

In [88]:
def sentiment_score(text):
    if not isinstance(text, str) or text.strip() == "":
        return 0.0
    
    # Compound score ranges from -1 (very negative) to +1 (very positive)
    return sia.polarity_scores(text)['compound']

In [89]:
df_r['sentiment_score'] = df_r['clean_text'].apply(sentiment_score)

In [90]:
df_r[['clean_text', 'sentiment_score']].head(10)

Unnamed: 0,clean_text,sentiment_score
0,love this well made sturdy and very comfortabl...,0.9538
1,love it a great upgrade from the original i ha...,0.891
2,this pillow saved my back i love the look and ...,0.7906
3,missing information on how to use it but it is...,0.7227
4,very nice set good quality we have had the set...,0.7397
5,i wanted different flavors but they are not,0.0
6,they are the perfect touch for me and the only...,0.7506
7,these done fit well and look great i love the ...,0.9169
8,great big numbers easy to read the only thing ...,0.7087
9,my son loves this comforter and it is very wel...,0.858


In [91]:
df_r['text_length'] = df_r['clean_text'].str.split().str.len()

In [92]:
import spacy

nlp = spacy.load("en_core_web_sm")


In [93]:
import pandas as pd

def lemmatize_text(text_):
    if pd.isna(text_) or text_ == "":
        return ""
    
    doc = nlp(text_)
    
    lemmatized_words = [
        token.lemma_
        for token in doc
        if not token.is_space
    ]
    
    return " ".join(lemmatized_words)

In [94]:
test_sentences = [
    "running faster than others",
    "better products were bought",
    "he was buying expensive items"
]

for s in test_sentences:
    print("BEFORE:", s)
    print("AFTER :", lemmatize_text(s))
    print("-" * 40)

BEFORE: running faster than others
AFTER : run fast than other
----------------------------------------
BEFORE: better products were bought
AFTER : well product be buy
----------------------------------------
BEFORE: he was buying expensive items
AFTER : he be buy expensive item
----------------------------------------


In [95]:
df_r['lemmatized_text'] = df_r['clean_text'].apply(lemmatize_text)

In [96]:
df_r[['text_', 'expanded_text', 'clean_text', 'lemmatized_text']].sample(5)

Unnamed: 0,text_,expanded_text,clean_text,lemmatized_text
21062,My cat will play for about 30 seconds and leav...,My cat will play for about 30 seconds and leav...,my cat will play for about seconds and leaves ...,my cat will play for about second and leave it...
13170,Thought provoking. Excellent acting. The mov...,Thought provoking. Excellent acting. The mov...,thought provoking excellent acting the movie i...,think provoke excellent act the movie be a goo...
1599,I placed this order on a whim and thought it w...,I placed this order on a whim and thought it w...,i placed this order on a whim and thought it w...,I place this order on a whim and think it woul...
18899,The seat is easy to install and aesthetically ...,The seat is easy to install and aesthetically ...,the seat is easy to install and aesthetically ...,the seat be easy to install and aesthetically ...
16004,It does the job. I hope it lasts better than ...,It does the job. I hope it lasts better than ...,it does the job i hope it lasts better than my...,it do the job I hope it last well than my prev...


In [97]:
df_r.rename(columns={'lemmatized_text': 'review'}, inplace=True)

In [98]:
df_r['label'] = df_r['label'].map({'CG': 0, 'OR': 1})

In [99]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    min_df=0.05,
    max_df=0.9,
    stop_words=None
)

X_tfidf = tfidf.fit_transform(df_r['review'])

In [100]:
X_tfidf.shape

(40412, 201)

In [101]:
tfidf.get_feature_names_out()[:20]

array(['about', 'after', 'all', 'also', 'an', 'and', 'and be', 'and have',
       'and it', 'and the', 'any', 'anyone', 'as', 'at', 'be', 'be bit',
       'be good', 'be great', 'be just', 'be little'], dtype=object)

In [103]:
X_tfidf                   
df_r[['adjective_ratio',
      'sentiment_score',
      'text_length',
      'capital_ratio',
      'punctuation_ratio']].head()

Unnamed: 0,adjective_ratio,sentiment_score,text_length,capital_ratio,punctuation_ratio
0,0.083333,0.9538,12,0.070175,0.066667
1,0.117647,0.891,17,0.016393,0.0375
2,0.0,0.7906,14,0.038462,0.029851
3,0.058824,0.7227,17,0.032258,0.024691
4,0.111111,0.7397,18,0.045455,0.023529


In [105]:
from scipy.sparse import hstack
X_extra = df_r[
    ['adjective_ratio',
     'sentiment_score',
     'text_length',
     'capital_ratio',
     'punctuation_ratio']
].values

In [106]:
X_extra.shape

(40412, 5)

In [107]:
X_final = hstack([X_tfidf, X_extra])

In [108]:
X_tfidf.shape
X_extra.shape
X_final.shape

(40412, 206)

#### Preprocessed dataset

In [109]:
df_r.columns

Index(['category', 'rating', 'label', 'text_', 'capital_ratio',
       'punctuation_ratio', 'is_excessive_punctuation', 'expanded_text',
       'clean_text', 'adjective_ratio', 'sentiment_score', 'text_length',
       'review'],
      dtype='object')

In [110]:
pre_df = df_r[
    ['text_','clean_text', 'review', 'rating','label', 'text_length',
     'capital_ratio', 'punctuation_ratio', 'is_excessive_punctuation',
     'adjective_ratio', 'sentiment_score']
]

# Save as CSV
pre_df.to_csv("preprocessed_dataset.csv", index=False)
print("Preprocessed dataset saved as CSV!")

Preprocessed dataset saved as CSV!
