In [1]:
!pip install numpy pandas nltk 



In [2]:
!pip install regex
!pip install contractions
!pip install scikit-learn



#### Loading Dataset


In [3]:
import os
file_path = "raw_dataset.csv"
print("File exists:", os.path.exists(file_path))

File exists: True


In [4]:
import pandas as pd
df_r=pd.read_csv(file_path)

In [5]:
df_r.head()

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...


In [6]:
df_r.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40432 entries, 0 to 40431
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   category  40432 non-null  object 
 1   rating    40432 non-null  float64
 2   label     40432 non-null  object 
 3   text_     40432 non-null  object 
dtypes: float64(1), object(3)
memory usage: 1.2+ MB


In [7]:
df_r.isnull().sum()

category    0
rating      0
label       0
text_       0
dtype: int64

In [8]:
df_r['text_'].duplicated().sum()

np.int64(20)

In [9]:
df_r = df_r.drop_duplicates(subset='text_').reset_index(drop=True)

In [10]:
df_r['text_'].duplicated().sum()

np.int64(0)

In [11]:
df_r['label'].value_counts()

label
OR    20215
CG    20197
Name: count, dtype: int64

In [12]:
import re

In [13]:
df_r.columns

Index(['category', 'rating', 'label', 'text_'], dtype='object')

#### Pre Pre-processing


In [14]:
def capital_letter_ratio(text):
    if not isinstance(text, str):
        return 0.0

    letters = [c for c in text if c.isalpha()]
    if len(letters) == 0:
        return 0.0

    capital_letters = [c for c in letters if c.isupper()]
    return len(capital_letters) / len(letters)

In [15]:
test_texts = [
    "This product is amazing",
    "THIS PRODUCT IS AMAZING",
    "Amazing Product!!! MUST BUY",
    "bAd",
    "",
    "OKAY"
]
for text in test_texts:
    print(f"Text: {text}")
    print("Capital Ratio:", capital_letter_ratio(text))
    print("-" * 40)

Text: This product is amazing
Capital Ratio: 0.05
----------------------------------------
Text: THIS PRODUCT IS AMAZING
Capital Ratio: 1.0
----------------------------------------
Text: Amazing Product!!! MUST BUY
Capital Ratio: 0.42857142857142855
----------------------------------------
Text: bAd
Capital Ratio: 0.3333333333333333
----------------------------------------
Text: 
Capital Ratio: 0.0
----------------------------------------
Text: OKAY
Capital Ratio: 1.0
----------------------------------------


In [16]:
df_r['capital_ratio'] = df_r['text_'].apply(capital_letter_ratio)
df_r[['text_', 'capital_ratio']].head(10)

Unnamed: 0,text_,capital_ratio
0,"Love this! Well made, sturdy, and very comfor...",0.070175
1,"love it, a great upgrade from the original. I...",0.016393
2,This pillow saved my back. I love the look and...,0.038462
3,"Missing information on how to use it, but it i...",0.032258
4,Very nice set. Good quality. We have had the s...,0.045455
5,I WANTED DIFFERENT FLAVORS BUT THEY ARE NOT.,1.0
6,They are the perfect touch for me and the only...,0.028571
7,These done fit well and look great. I love th...,0.029851
8,"Great big numbers & easy to read, the only thi...",0.032787
9,My son loves this comforter and it is very wel...,0.035088


In [17]:
def punctuation_count(text):
    if not isinstance(text, str):
        return 0

    return len(re.findall(r"[^\w\s]", text))

In [18]:
df_r['punctuation_count'] = df_r['text_'].apply(punctuation_count)
df_r[['text_', 'punctuation_count']].head(10)

Unnamed: 0,text_,punctuation_count
0,"Love this! Well made, sturdy, and very comfor...",5
1,"love it, a great upgrade from the original. I...",3
2,This pillow saved my back. I love the look and...,2
3,"Missing information on how to use it, but it i...",2
4,Very nice set. Good quality. We have had the s...,2
5,I WANTED DIFFERENT FLAVORS BUT THEY ARE NOT.,1
6,They are the perfect touch for me and the only...,1
7,These done fit well and look great. I love th...,1
8,"Great big numbers & easy to read, the only thi...",3
9,My son loves this comforter and it is very wel...,1


In [19]:
def excessive_punctuation_score(text):
    if not isinstance(text, str):
        return 0
    matches = re.findall(r"[!?]{2,}", text)
    return len(matches)

In [20]:
df_r['excessive_punctuation'] = df_r['text_'].apply(excessive_punctuation_score)
df_r[['text_', 'excessive_punctuation']].sample(10)

Unnamed: 0,text_,excessive_punctuation
11569,"but, then, I realized that it has no memory ex...",0
14013,Actually am a little disappointed that it didn...,0
19741,Dogs love these treats - especially the suds. ...,0
25018,I Iove the unique ways that an author can weav...,0
19152,"Considering all the tools in my garage, this p...",0
24274,I loved reading this book! I couldn't put it ...,0
33406,Bought 2 for Christmas presents. They are ver...,0
12876,I sat through the entire run of Angel but this...,0
33986,My 3 year old loved this set and the materials...,0
19062,Great lights. However the 18650 batteries WILL...,0


In [21]:
df_r[['text_', 'capital_ratio', 'punctuation_count', 'excessive_punctuation']].head(10)

Unnamed: 0,text_,capital_ratio,punctuation_count,excessive_punctuation
0,"Love this! Well made, sturdy, and very comfor...",0.070175,5,0
1,"love it, a great upgrade from the original. I...",0.016393,3,0
2,This pillow saved my back. I love the look and...,0.038462,2,0
3,"Missing information on how to use it, but it i...",0.032258,2,0
4,Very nice set. Good quality. We have had the s...,0.045455,2,0
5,I WANTED DIFFERENT FLAVORS BUT THEY ARE NOT.,1.0,1,0
6,They are the perfect touch for me and the only...,0.028571,1,0
7,These done fit well and look great. I love th...,0.029851,1,0
8,"Great big numbers & easy to read, the only thi...",0.032787,3,0
9,My son loves this comforter and it is very wel...,0.035088,1,0


#### Pre-processing


In [22]:
# contraction expansion
import contractions
def expand_contractions(text_):
    if pd.isna(text_):
        return ""
    return contractions.fix(text_)

In [23]:
test_sentences = [
    "I don't like this product",
    "It's not what I've expected",
    "You're going to love it",
    "They can't believe it's true",
    "This is fine"
]

for s in test_sentences:
    print("BEFORE:", s)
    print("AFTER :", expand_contractions(s))
    print("-" * 40)

BEFORE: I don't like this product
AFTER : I do not like this product
----------------------------------------
BEFORE: It's not what I've expected
AFTER : It is not what I have expected
----------------------------------------
BEFORE: You're going to love it
AFTER : You are going to love it
----------------------------------------
BEFORE: They can't believe it's true
AFTER : They cannot believe it is true
----------------------------------------
BEFORE: This is fine
AFTER : This is fine
----------------------------------------


In [24]:
# cleaning text - lowercase, url, html tags, punctiation, whitespaces
def clean_text(text_):
    if pd.isna(text_):
        return ""
    
    text_ = text_.lower()
    
    text_ = re.sub(r'http\S+|www\S+', '', text_)
    text_ = re.sub(r'<.*?>', '', text_)
    
    # remove punctuation (letters + spaces only)
    text_ = re.sub(r'[^a-z\s]', '', text_)
    
    text_ = re.sub(r'\s+', ' ', text_).strip()
    
    return text_

In [25]:
df_r['expanded_text'] = df_r['text_'].apply(expand_contractions)

In [26]:
df_r['clean_text'] = df_r['expanded_text'].apply(clean_text)

In [27]:
df_r[['text_', 'expanded_text', 'clean_text']].sample(5)

Unnamed: 0,text_,expanded_text,clean_text
20396,This is awesome! It's very soft and easy to pu...,This is awesome! It is very soft and easy to p...,this is awesome it is very soft and easy to pu...
19341,"After reading reviews on these, I decided to t...","After reading reviews on these, I decided to t...",after reading reviews on these i decided to tr...
39836,"I really, really loved these. The quality is ...","I really, really loved these. The quality is ...",i really really loved these the quality is jus...
2991,Con-Tact paper has come a long way since my ea...,Con-Tact paper has come a long way since my ea...,contact paper has come a long way since my ear...
14543,But I couldn't stop watching. I kept hoping t...,But I could not stop watching. I kept hoping ...,but i could not stop watching i kept hoping th...


In [28]:
test_cases = [
    "WOW!!! 10/10 would buy again!!! üòç",
    "<p>Best product ever</p>",
    "Visit http://example.com NOW",
    "   Multiple     spaces   ",
    None
]

for t in test_cases:
    print("INPUT :", t)
    print("OUTPUT:", clean_text(t))
    print("-" * 30)


INPUT : WOW!!! 10/10 would buy again!!! üòç
OUTPUT: wow would buy again
------------------------------
INPUT : <p>Best product ever</p>
OUTPUT: best product ever
------------------------------
INPUT : Visit http://example.com NOW
OUTPUT: visit now
------------------------------
INPUT :    Multiple     spaces   
OUTPUT: multiple spaces
------------------------------
INPUT : None
OUTPUT: 
------------------------------


In [29]:
# lemmatization
import nltk

# nltk resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\prati\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\prati\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\prati\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\prati\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [30]:
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\prati\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\prati\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\prati\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\prati\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [31]:
# import lemmatization tools
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\prati\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [32]:
def adjective_ratio(text):
    if not isinstance(text, str) or text.strip() == "":
        return 0
    
    tokens = word_tokenize(text)        
    pos_tags = pos_tag(tokens)          
    
    adj_count = sum(1 for word, tag in pos_tags if tag.startswith('JJ'))
    total_words = len(tokens)
    
    return adj_count / total_words if total_words > 0 else 0

In [33]:
df_r['adjective_ratio'] = df_r['clean_text'].apply(adjective_ratio)

In [34]:
df_r[['clean_text', 'adjective_ratio']].head()

Unnamed: 0,clean_text,adjective_ratio
0,love this well made sturdy and very comfortabl...,0.083333
1,love it a great upgrade from the original i ha...,0.117647
2,this pillow saved my back i love the look and ...,0.0
3,missing information on how to use it but it is...,0.058824
4,very nice set good quality we have had the set...,0.111111


In [35]:
from nltk.sentiment import SentimentIntensityAnalyzer

# Initialize analyzer
sia = SentimentIntensityAnalyzer()

In [36]:
def sentiment_score(text):
    if not isinstance(text, str) or text.strip() == "":
        return 0.0
    
    # Compound score ranges from -1 (very negative) to +1 (very positive)
    return sia.polarity_scores(text)['compound']

In [37]:
df_r['sentiment_score'] = df_r['clean_text'].apply(sentiment_score)

In [38]:
df_r[['clean_text', 'sentiment_score']].head(10)

Unnamed: 0,clean_text,sentiment_score
0,love this well made sturdy and very comfortabl...,0.9538
1,love it a great upgrade from the original i ha...,0.891
2,this pillow saved my back i love the look and ...,0.7906
3,missing information on how to use it but it is...,0.7227
4,very nice set good quality we have had the set...,0.7397
5,i wanted different flavors but they are not,0.0
6,they are the perfect touch for me and the only...,0.7506
7,these done fit well and look great i love the ...,0.9169
8,great big numbers easy to read the only thing ...,0.7087
9,my son loves this comforter and it is very wel...,0.858


In [39]:
df_r['text_length'] = df_r['clean_text'].str.split().str.len()

In [40]:
df_r[['clean_text', 'text_length']].head()

Unnamed: 0,clean_text,text_length
0,love this well made sturdy and very comfortabl...,12
1,love it a great upgrade from the original i ha...,17
2,this pillow saved my back i love the look and ...,14
3,missing information on how to use it but it is...,17
4,very nice set good quality we have had the set...,18


In [41]:
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def lemmatize_text(text_):
    if pd.isna(text_) or text_ == "":
        return ""
    
    tokens = wordpunct_tokenize(text_)
    pos_tags = pos_tag(tokens)
    
    lemmatized_words = [
        lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in pos_tags
    ]
    
    return " ".join(lemmatized_words)


In [42]:
test_sentences = [
    "running faster than others",
    "better products were bought",
    "he was buying expensive items"
]

for s in test_sentences:
    print("BEFORE:", s)
    print("AFTER :", lemmatize_text(s))
    print("-" * 40)

BEFORE: running faster than others
AFTER : run faster than others
----------------------------------------
BEFORE: better products were bought
AFTER : well product be buy
----------------------------------------
BEFORE: he was buying expensive items
AFTER : he be buy expensive item
----------------------------------------


In [43]:
df_r['lemmatized_text'] = df_r['clean_text'].apply(lemmatize_text)

In [44]:
df_r[['text_', 'expanded_text', 'clean_text', 'lemmatized_text']].sample(5)

Unnamed: 0,text_,expanded_text,clean_text,lemmatized_text
7112,Pros: Light and durable. won't hurt if get b...,Pros: Light and durable. will not hurt if ge...,pros light and durable will not hurt if get be...,pro light and durable will not hurt if get bea...
26663,"Great story, with lovable characters, both mai...","Great story, with lovable characters, both mai...",great story with lovable characters both main ...,great story with lovable character both main a...
22931,I have ordered many tubes of this and the mate...,I have ordered many tubes of this and the mate...,i have ordered many tubes of this and the mate...,i have order many tube of this and the materia...
7582,I purchased this holster for my 9mm Glock 17. ...,I purchased this holster for my 9mm Glock 17. ...,i purchased this holster for my mm glock i als...,i purchase this holster for my mm glock i also...
18838,I have not purchased a single set of these too...,I have not purchased a single set of these too...,i have not purchased a single set of these too...,i have not purchase a single set of these tool...


In [45]:
df_r.rename(columns={'lemmatized_text': 'review'}, inplace=True)

In [46]:
df_r['label'] = df_r['label'].map({'CG': 0, 'OR': 1})

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.9,
    stop_words='english'
)

X_tfidf = tfidf.fit_transform(df_r['review'])

In [48]:
X_tfidf.shape

(40412, 5000)

In [49]:
tfidf.get_feature_names_out()[:20]

array(['aa', 'ability', 'able', 'able read', 'able use', 'abrupt',
       'absolute', 'absolutely', 'absolutely love', 'absorb', 'absorbent',
       'abuse', 'ac', 'academic', 'accent', 'accept', 'access',
       'accessory', 'accident', 'accidentally'], dtype=object)

In [50]:
X_tfidf                   
df_r[['adjective_ratio',
      'sentiment_score',
      'text_length',
      'capital_ratio',
      'punctuation_count']].head()

Unnamed: 0,adjective_ratio,sentiment_score,text_length,capital_ratio,punctuation_count
0,0.083333,0.9538,12,0.070175,5
1,0.117647,0.891,17,0.016393,3
2,0.0,0.7906,14,0.038462,2
3,0.058824,0.7227,17,0.032258,2
4,0.111111,0.7397,18,0.045455,2


In [51]:
from scipy.sparse import hstack
X_extra = df_r[
    ['adjective_ratio',
     'sentiment_score',
     'text_length',
     'capital_ratio',
     'punctuation_count']
].values

In [52]:
X_extra.shape

(40412, 5)

In [53]:
X_final = hstack([X_tfidf, X_extra])

In [54]:
X_tfidf.shape
X_extra.shape
X_final.shape

(40412, 5005)

#### Preprocessed dataset

In [56]:
df_r.columns

Index(['category', 'rating', 'label', 'text_', 'capital_ratio',
       'punctuation_count', 'excessive_punctuation', 'expanded_text',
       'clean_text', 'adjective_ratio', 'sentiment_score', 'text_length',
       'review'],
      dtype='object')

In [55]:
# pre_df = df_r[
#     ['rating', 'review', 'label', 'text_length',
#      'capital_ratio', 'punctuation_count', 'excessive_punctuation']
# ]

# # Save as CSV
# pre_df.to_csv("preprocessed_dataset.csv", index=False)
# print("Preprocessed dataset saved as CSV!")