In [28]:
%pip install numpy pandas nltk spacy regex contractions scikit-learn 
 

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [29]:
import os
import pandas as pd
import re

#### Loading Dataset


In [30]:

file_path = "raw_dataset.csv"
print("File exists:", os.path.exists(file_path))
df_r=pd.read_csv(file_path)


File exists: True


In [31]:
df_r.head()

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...


In [32]:
df_r.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40432 entries, 0 to 40431
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   category  40432 non-null  object 
 1   rating    40432 non-null  float64
 2   label     40432 non-null  object 
 3   text_     40432 non-null  object 
dtypes: float64(1), object(3)
memory usage: 1.2+ MB


In [33]:
df_r.isnull().sum()

category    0
rating      0
label       0
text_       0
dtype: int64

In [34]:
df_r['text_'].duplicated().sum()

np.int64(20)

In [35]:
df_r = df_r.drop_duplicates(subset='text_').reset_index(drop=True)

In [36]:
df_r['text_'].duplicated().sum()

np.int64(0)

In [37]:
df_r['label'].value_counts()

label
OR    20215
CG    20197
Name: count, dtype: int64

In [38]:
df_r.columns

Index(['category', 'rating', 'label', 'text_'], dtype='object')

#### Pre Pre-processing


In [39]:
def capital_letter_ratio(text):
    if not isinstance(text, str):
        return 0.0

    letters = [c for c in text if c.isalpha()]
    if len(letters) == 0:
        return 0.0

    capital_letters = [c for c in letters if c.isupper()]
    return len(capital_letters) / len(letters)

In [40]:
df_r['capital_ratio'] = df_r['text_'].apply(capital_letter_ratio)
df_r[['text_', 'capital_ratio']].head(10)

Unnamed: 0,text_,capital_ratio
0,"Love this! Well made, sturdy, and very comfor...",0.070175
1,"love it, a great upgrade from the original. I...",0.016393
2,This pillow saved my back. I love the look and...,0.038462
3,"Missing information on how to use it, but it i...",0.032258
4,Very nice set. Good quality. We have had the s...,0.045455
5,I WANTED DIFFERENT FLAVORS BUT THEY ARE NOT.,1.0
6,They are the perfect touch for me and the only...,0.028571
7,These done fit well and look great. I love th...,0.029851
8,"Great big numbers & easy to read, the only thi...",0.032787
9,My son loves this comforter and it is very wel...,0.035088


In [41]:
def punctuation_ratio(text_):
    if not isinstance(text_, str) or len(text_) == 0:
        return 0.0

    punct_count = len(re.findall(r"[^\w\s]", text_))
    return punct_count / len(text_)

In [42]:
df_r['punctuation_ratio'] = df_r['text_'].apply(punctuation_ratio)
df_r[['text_', 'punctuation_ratio']].sample(10)

Unnamed: 0,text_,punctuation_ratio
9557,Great sound. Much louder than I expected. The ...,0.020833
24222,This one was good. It had a lot going on but i...,0.041237
966,Super cute because it looks nice. The only pro...,0.021277
15865,Works great with my garage door opener and the...,0.021505
35545,Pretty good but quickly gets loose. I am still...,0.018182
1335,"Does the job, I unhooked them for cooking and ...",0.045455
3253,This is very good product at an exceptional pr...,0.023555
23480,I purchased this for one of my cats. It is a l...,0.025961
26201,I typically stick to the vampire paranormal ge...,0.035821
30467,Our local shelter for families in crisis is pr...,0.028226


In [43]:
def is_excessive_punctuation(text_, threshold=0.1):
    """
    Returns True if punctuation ratio exceeds the threshold, else False.
    
    Parameters:
    - text_: input string
    - threshold: ratio above which punctuation is considered excessive (default 0.1 = 10%)
    """
    ratio = punctuation_ratio(text_)
    return ratio > threshold

In [44]:

df_r['is_excessive_punctuation'] = df_r['text_'].apply(is_excessive_punctuation)
df_r[['text_', 'is_excessive_punctuation']].sample(10)

Unnamed: 0,text_,is_excessive_punctuation
24712,This book is entertainment to the family :) Ca...,False
11713,I will say that I use it as an external HDD fo...,False
30229,Although it introduces interesting ideas about...,False
13418,My husband and I both really like this old mov...,False
11272,I don't much care for having a removable lens ...,False
31563,I thought this book was a real page turner. I ...,False
37802,These are great. They are soft and comfortable...,False
22617,I hoped...and hoped...and tried this product i...,False
18343,This Plane is solid and gives an initial good ...,False
8528,Too small for a passport which is interesting....,False


In [45]:
df_r[['text_', 'capital_ratio', 'punctuation_ratio', 'is_excessive_punctuation']].sample(10)

Unnamed: 0,text_,capital_ratio,punctuation_ratio,is_excessive_punctuation
4432,"The sturdiest knife, with a blade of the same ...",0.014706,0.023529,False
4950,Great slippers. Have used them for a couple of...,0.041096,0.021739,False
26975,I bought this book at Christmas this year to f...,0.024938,0.013807,False
1111,Solid metal materials. Pretty easy to assemble...,0.041237,0.032787,False
6272,Gloves work ok but they are a little wide and ...,0.036765,0.022099,False
11275,"I bought this to replace a bulky, wide-angle l...",0.080745,0.019231,False
8057,I like the blue lights and it works really well.,0.026316,0.020833,False
3108,"People beware! First off, this is NOT an oil ...",0.035857,0.03003,False
21154,My young dogs LOVE these. They have saved coun...,0.053435,0.018293,False
16321,Put 2 of these in the new kitchen exhaust hood...,0.027778,0.031915,False


#### Pre-processing


In [46]:
# contraction expansion
import contractions
def expand_contractions(text_):
    if pd.isna(text_):
        return ""
    return contractions.fix(text_)

In [47]:
test_sentences = [
    "I don't like this product",
    "It's not what I've expected",
    "You're going to love it",
    "They can't believe it's true",
    "This is fine"
]

for s in test_sentences:
    print("BEFORE:", s)
    print("AFTER :", expand_contractions(s))
    print("-" * 40)

BEFORE: I don't like this product
AFTER : I do not like this product
----------------------------------------
BEFORE: It's not what I've expected
AFTER : It is not what I have expected
----------------------------------------
BEFORE: You're going to love it
AFTER : You are going to love it
----------------------------------------
BEFORE: They can't believe it's true
AFTER : They cannot believe it is true
----------------------------------------
BEFORE: This is fine
AFTER : This is fine
----------------------------------------


In [48]:
# cleaning text - lowercase, url, html tags, punctiation, whitespaces
def clean_text(text_):
    if pd.isna(text_):
        return ""
    
    text_ = text_.lower()
    
    text_ = re.sub(r'http\S+|www\S+', '', text_)
    text_ = re.sub(r'<.*?>', '', text_)
    
    # remove punctuation (letters + spaces only)
    text_ = re.sub(r'[^a-z\s]', '', text_)
    
    text_ = re.sub(r'\s+', ' ', text_).strip()
    
    return text_

In [49]:
df_r['expanded_text'] = df_r['text_'].apply(expand_contractions)

In [50]:
df_r['clean_text'] = df_r['expanded_text'].apply(clean_text)

In [51]:
df_r[['text_', 'expanded_text', 'clean_text']].sample(5)

Unnamed: 0,text_,expanded_text,clean_text
29573,"What a great love story, and an entertaining r...","What a great love story, and an entertaining r...",what a great love story and an entertaining re...
5293,"built may rivet builds, decided to try a screw...","built may rivet builds, decided to try a screw...",built may rivet builds decided to try a screw ...
37820,Shoe is at least one size larger than the othe...,Shoe is at least one size larger than the othe...,shoe is at least one size larger than the othe...
12222,"Awesome series, I've watched it more than once...","Awesome series, I have watched it more than on...",awesome series i have watched it more than onc...
22837,"I'm sure it is great, but I wouldn't recommend...","I am sure it is great, but I would not recomme...",i am sure it is great but i would not recommen...


In [52]:
test_cases = [
    "WOW!!! 10/10 would buy again!!! üòç",
    "<p>Best product ever</p>",
    "Visit http://example.com NOW",
    "   Multiple     spaces   ",
    None
]

for t in test_cases:
    print("INPUT :", t)
    print("OUTPUT:", clean_text(t))
    print("-" * 30)

INPUT : WOW!!! 10/10 would buy again!!! üòç
OUTPUT: wow would buy again
------------------------------
INPUT : <p>Best product ever</p>
OUTPUT: best product ever
------------------------------
INPUT : Visit http://example.com NOW
OUTPUT: visit now
------------------------------
INPUT :    Multiple     spaces   
OUTPUT: multiple spaces
------------------------------
INPUT : None
OUTPUT: 
------------------------------


In [53]:
# lemmatization
import nltk

# nltk resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [54]:
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [55]:
# import lemmatization tools
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [56]:
def adjective_ratio(text):
    if not isinstance(text, str) or text.strip() == "":
        return 0
    
    tokens = word_tokenize(text)        
    pos_tags = pos_tag(tokens)          
    
    adj_count = sum(1 for word, tag in pos_tags if tag.startswith('JJ'))
    total_words = len(tokens)
    
    return adj_count / total_words if total_words > 0 else 0

In [None]:
df_r['adjective_ratio'] = df_r['clean_text'].apply(adjective_ratio)

In [None]:
df_r[['clean_text', 'adjective_ratio']].head()

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer

# Initialize analyzer
sia = SentimentIntensityAnalyzer()

In [None]:
def sentiment_score(text):
    if not isinstance(text, str) or text.strip() == "":
        return 0.0
    
    # Compound score ranges from -1 (very negative) to +1 (very positive)
    return sia.polarity_scores(text)['compound']

In [None]:
df_r['sentiment_score'] = df_r['clean_text'].apply(sentiment_score)

In [None]:
df_r[['clean_text', 'sentiment_score']].head(10)

In [None]:
df_r['text_length'] = df_r['clean_text'].str.split().str.len()

In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")


In [None]:
import pandas as pd

def lemmatize_text(text_):
    if pd.isna(text_) or text_ == "":
        return ""
    
    doc = nlp(text_)
    
    lemmatized_words = [
        token.lemma_
        for token in doc
        if not token.is_space
    ]
    
    return " ".join(lemmatized_words)

In [None]:
test_sentences = [
    "running faster than others",
    "better products were bought",
    "he was buying expensive items"
]

for s in test_sentences:
    print("BEFORE:", s)
    print("AFTER :", lemmatize_text(s))
    print("-" * 40)

In [None]:
df_r['lemmatized_text'] = df_r['clean_text'].apply(lemmatize_text)

In [None]:
df_r[['text_', 'expanded_text', 'clean_text', 'lemmatized_text']].sample(5)

In [None]:
df_r.rename(columns={'lemmatized_text': 'review'}, inplace=True)

In [None]:
df_r['label'] = df_r['label'].map({'CG': 0, 'OR': 1})

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    min_df=0.05,
    max_df=0.9,
    stop_words=None
)

X_tfidf = tfidf.fit_transform(df_r['review'])

In [None]:
X_tfidf.shape

In [None]:
tfidf.get_feature_names_out()[:20]

In [None]:
X_tfidf                   
df_r[['adjective_ratio',
      'sentiment_score',
      'text_length',
      'capital_ratio',
      'punctuation_count']].head()

In [None]:
from scipy.sparse import hstack
X_extra = df_r[
    ['adjective_ratio',
     'sentiment_score',
     'text_length',
     'capital_ratio',
     'punctuation_count']
].values

In [None]:
X_extra.shape

In [None]:
X_final = hstack([X_tfidf, X_extra])

In [None]:
X_tfidf.shape
X_extra.shape
X_final.shape

#### Preprocessed dataset

In [None]:
df_r.columns

In [None]:
pre_df = df_r[
    ['text_','clean_text', 'review', 'rating','label', 'text_length',
     'capital_ratio', 'punctuation_ratio', 'is_excessive_punctuation',
     'adjective_ratio', 'sentiment_score']
]

# Save as CSV
pre_df.to_csv("preprocessed_dataset.csv", index=False)
print("Preprocessed dataset saved as CSV!")