In [25]:
%pip install numpy pandas nltk spacy regex contractions scikit-learn 
 

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [26]:
import os
import pandas as pd
import re

#### Loading Dataset


In [27]:

file_path = "raw_dataset.csv"
print("File exists:", os.path.exists(file_path))
df_r=pd.read_csv(file_path)


File exists: True


In [28]:
df_r.head()

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...


In [29]:
df_r.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40432 entries, 0 to 40431
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   category  40432 non-null  object 
 1   rating    40432 non-null  float64
 2   label     40432 non-null  object 
 3   text_     40432 non-null  object 
dtypes: float64(1), object(3)
memory usage: 1.2+ MB


In [30]:
df_r.isnull().sum()

category    0
rating      0
label       0
text_       0
dtype: int64

In [31]:
df_r['text_'].duplicated().sum()

np.int64(20)

In [32]:
df_r = df_r.drop_duplicates(subset='text_').reset_index(drop=True)

In [33]:
df_r['text_'].duplicated().sum()

np.int64(0)

In [34]:
df_r['label'].value_counts()

label
OR    20215
CG    20197
Name: count, dtype: int64

In [35]:
df_r.columns

Index(['category', 'rating', 'label', 'text_'], dtype='object')

#### Pre Pre-processing


In [36]:
def capital_letter_ratio(text):
    if not isinstance(text, str):
        return 0.0

    letters = [c for c in text if c.isalpha()]
    if len(letters) == 0:
        return 0.0

    capital_letters = [c for c in letters if c.isupper()]
    return len(capital_letters) / len(letters)

In [37]:
df_r['capital_ratio'] = df_r['text_'].apply(capital_letter_ratio)
df_r[['text_', 'capital_ratio']].head(10)

Unnamed: 0,text_,capital_ratio
0,"Love this! Well made, sturdy, and very comfor...",0.070175
1,"love it, a great upgrade from the original. I...",0.016393
2,This pillow saved my back. I love the look and...,0.038462
3,"Missing information on how to use it, but it i...",0.032258
4,Very nice set. Good quality. We have had the s...,0.045455
5,I WANTED DIFFERENT FLAVORS BUT THEY ARE NOT.,1.0
6,They are the perfect touch for me and the only...,0.028571
7,These done fit well and look great. I love th...,0.029851
8,"Great big numbers & easy to read, the only thi...",0.032787
9,My son loves this comforter and it is very wel...,0.035088


In [None]:
def punctuation_ratio(text_):
    if not isinstance(text_, str) or len(text_) == 0:
        return 0.0

    punct_count = len(re.findall(r"[^\w\s]", text_))
    return punct_count / len(text_)

In [57]:
df_r['punctuation_ratio'] = df_r['text_'].apply(punctuation_ratio)
df_r[['text_', 'punctuation_ratio']].head(10)

Unnamed: 0,text_,punctuation_ratio
0,"Love this! Well made, sturdy, and very comfor...",0.066667
1,"love it, a great upgrade from the original. I...",0.0375
2,This pillow saved my back. I love the look and...,0.029851
3,"Missing information on how to use it, but it i...",0.024691
4,Very nice set. Good quality. We have had the s...,0.023529
5,I WANTED DIFFERENT FLAVORS BUT THEY ARE NOT.,0.022727
6,They are the perfect touch for me and the only...,0.011236
7,These done fit well and look great. I love th...,0.011765
8,"Great big numbers & easy to read, the only thi...",0.037037
9,My son loves this comforter and it is very wel...,0.013514


In [63]:
def excessive_punctuation_ratio(text):
    if not isinstance(text, str) or len(text) == 0:
        return 0.0

    matches = re.findall(r"[!?]{2,}", text)
    excessive_count = sum(len(m) for m in matches)

    return excessive_count / len(text)


def excessive_punctuation_flag(text, threshold=0.05):
    if(excessive_punctuation_ratio(text) > threshold):
        return True
    else:
        return False


In [71]:
df_r['excessive_punctuation_ratio'] = df_r['text_'].apply(
    excessive_punctuation_ratio
)

In [42]:
df_r[['text_', 'capital_ratio', 'punctuation_count', 'excessive_punctuation']].head(10)

Unnamed: 0,text_,capital_ratio,punctuation_count,excessive_punctuation
0,"Love this! Well made, sturdy, and very comfor...",0.070175,5,0
1,"love it, a great upgrade from the original. I...",0.016393,3,0
2,This pillow saved my back. I love the look and...,0.038462,2,0
3,"Missing information on how to use it, but it i...",0.032258,2,0
4,Very nice set. Good quality. We have had the s...,0.045455,2,0
5,I WANTED DIFFERENT FLAVORS BUT THEY ARE NOT.,1.0,1,0
6,They are the perfect touch for me and the only...,0.028571,1,0
7,These done fit well and look great. I love th...,0.029851,1,0
8,"Great big numbers & easy to read, the only thi...",0.032787,3,0
9,My son loves this comforter and it is very wel...,0.035088,1,0


#### Pre-processing


In [43]:
# contraction expansion
import contractions
def expand_contractions(text_):
    if pd.isna(text_):
        return ""
    return contractions.fix(text_)

In [44]:
test_sentences = [
    "I don't like this product",
    "It's not what I've expected",
    "You're going to love it",
    "They can't believe it's true",
    "This is fine"
]

for s in test_sentences:
    print("BEFORE:", s)
    print("AFTER :", expand_contractions(s))
    print("-" * 40)

BEFORE: I don't like this product
AFTER : I do not like this product
----------------------------------------
BEFORE: It's not what I've expected
AFTER : It is not what I have expected
----------------------------------------
BEFORE: You're going to love it
AFTER : You are going to love it
----------------------------------------
BEFORE: They can't believe it's true
AFTER : They cannot believe it is true
----------------------------------------
BEFORE: This is fine
AFTER : This is fine
----------------------------------------


In [45]:
# cleaning text - lowercase, url, html tags, punctiation, whitespaces
def clean_text(text_):
    if pd.isna(text_):
        return ""
    
    text_ = text_.lower()
    
    text_ = re.sub(r'http\S+|www\S+', '', text_)
    text_ = re.sub(r'<.*?>', '', text_)
    
    # remove punctuation (letters + spaces only)
    text_ = re.sub(r'[^a-z\s]', '', text_)
    
    text_ = re.sub(r'\s+', ' ', text_).strip()
    
    return text_

In [46]:
df_r['expanded_text'] = df_r['text_'].apply(expand_contractions)

In [47]:
df_r['clean_text'] = df_r['expanded_text'].apply(clean_text)

In [48]:
df_r[['text_', 'expanded_text', 'clean_text']].sample(5)

Unnamed: 0,text_,expanded_text,clean_text
40101,"Since we can never have enough umbrellas, I wa...","Since we can never have enough umbrellas, I wa...",since we can never have enough umbrellas i was...
35870,ThinkFun provided me with this new Daily Puzzl...,ThinkFun provided me with this new Daily Puzzl...,thinkfun provided me with this new daily puzzl...
38275,"WAY too large, I have a 9 1/2"" wrist and it fi...","WAY too large, I have a 9 1/2"" wrist and it fi...",way too large i have a wrist and it fits just ...
27254,This was an Awesome read. The characters were...,This was an Awesome read. The characters were...,this was an awesome read the characters were w...
7427,Bought this to try for reducing wind noise whi...,Bought this to try for reducing wind noise whi...,bought this to try for reducing wind noise whi...


In [49]:
test_cases = [
    "WOW!!! 10/10 would buy again!!! üòç",
    "<p>Best product ever</p>",
    "Visit http://example.com NOW",
    "   Multiple     spaces   ",
    None
]

for t in test_cases:
    print("INPUT :", t)
    print("OUTPUT:", clean_text(t))
    print("-" * 30)

INPUT : WOW!!! 10/10 would buy again!!! üòç
OUTPUT: wow would buy again
------------------------------
INPUT : <p>Best product ever</p>
OUTPUT: best product ever
------------------------------
INPUT : Visit http://example.com NOW
OUTPUT: visit now
------------------------------
INPUT :    Multiple     spaces   
OUTPUT: multiple spaces
------------------------------
INPUT : None
OUTPUT: 
------------------------------


In [50]:
# lemmatization
import nltk

# nltk resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')

KeyboardInterrupt: 

In [None]:
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')

In [None]:
# import lemmatization tools
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
nltk.download('vader_lexicon')

In [None]:
def adjective_ratio(text):
    if not isinstance(text, str) or text.strip() == "":
        return 0
    
    tokens = word_tokenize(text)        
    pos_tags = pos_tag(tokens)          
    
    adj_count = sum(1 for word, tag in pos_tags if tag.startswith('JJ'))
    total_words = len(tokens)
    
    return adj_count / total_words if total_words > 0 else 0

In [None]:
df_r['adjective_ratio'] = df_r['clean_text'].apply(adjective_ratio)

In [None]:
df_r[['clean_text', 'adjective_ratio']].head()

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer

# Initialize analyzer
sia = SentimentIntensityAnalyzer()

In [None]:
def sentiment_score(text):
    if not isinstance(text, str) or text.strip() == "":
        return 0.0
    
    # Compound score ranges from -1 (very negative) to +1 (very positive)
    return sia.polarity_scores(text)['compound']

In [None]:
df_r['sentiment_score'] = df_r['clean_text'].apply(sentiment_score)

In [None]:
df_r[['clean_text', 'sentiment_score']].head(10)

In [None]:
df_r['text_length'] = df_r['clean_text'].str.split().str.len()

In [53]:
import spacy

nlp = spacy.load("en_core_web_sm")


In [54]:
import pandas as pd

def lemmatize_text(text_):
    if pd.isna(text_) or text_ == "":
        return ""
    
    doc = nlp(text_)
    
    lemmatized_words = [
        token.lemma_
        for token in doc
        if not token.is_space
    ]
    
    return " ".join(lemmatized_words)

In [55]:
test_sentences = [
    "running faster than others",
    "better products were bought",
    "he was buying expensive items"
]

for s in test_sentences:
    print("BEFORE:", s)
    print("AFTER :", lemmatize_text(s))
    print("-" * 40)

BEFORE: running faster than others
AFTER : run fast than other
----------------------------------------
BEFORE: better products were bought
AFTER : well product be buy
----------------------------------------
BEFORE: he was buying expensive items
AFTER : he be buy expensive item
----------------------------------------


In [None]:
df_r['lemmatized_text'] = df_r['clean_text'].apply(lemmatize_text)

In [None]:
df_r[['text_', 'expanded_text', 'clean_text', 'lemmatized_text']].sample(5)

In [None]:
df_r.rename(columns={'lemmatized_text': 'review'}, inplace=True)

In [None]:
df_r['label'] = df_r['label'].map({'CG': 0, 'OR': 1})

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    min_df=0.05,
    max_df=0.9,
    stop_words=None
)

X_tfidf = tfidf.fit_transform(df_r['review'])

In [None]:
X_tfidf.shape

In [None]:
tfidf.get_feature_names_out()[:20]

In [None]:
X_tfidf                   
df_r[['adjective_ratio',
      'sentiment_score',
      'text_length',
      'capital_ratio',
      'punctuation_count']].head()

In [None]:
from scipy.sparse import hstack
X_extra = df_r[
    ['adjective_ratio',
     'sentiment_score',
     'text_length',
     'capital_ratio',
     'punctuation_count']
].values

In [None]:
X_extra.shape

In [None]:
X_final = hstack([X_tfidf, X_extra])

In [None]:
X_tfidf.shape
X_extra.shape
X_final.shape

#### Preprocessed dataset

In [None]:
df_r.columns

In [None]:
pre_df = df_r[
    ['text_','clean_text', 'review', 'rating','label', 'text_length',
     'capital_ratio', 'punctuation_count', 'excessive_punctuation',
     'adjective_ratio', 'sentiment_score']
]

# Save as CSV
pre_df.to_csv("preprocessed_dataset.csv", index=False)
print("Preprocessed dataset saved as CSV!")