In [None]:
%pip install numpy pandas nltk spacy regex contractions scikit-learn 
 

In [None]:
import os
import pandas as pd
import re

#### Loading Dataset


In [None]:

# Path of the raw CSV; report whether it is present, then read it into a DataFrame.
file_path = "raw_dataset.csv"
file_found = os.path.exists(file_path)
print("File exists:", file_found)
df_r = pd.read_csv(file_path)


File exists: True


In [None]:
df_r.head()

In [None]:
df_r.info()

In [None]:
df_r.isnull().sum()

In [None]:
df_r['text_'].duplicated().sum()

In [None]:
df_r = df_r.drop_duplicates(subset='text_').reset_index(drop=True)

In [None]:
df_r['text_'].duplicated().sum()

In [None]:
df_r['label'].value_counts()

In [None]:
df_r.columns

#### Pre-pre-processing: stylistic features computed on the raw text


In [None]:
def capital_letter_ratio(text):
    """Return the share of alphabetic characters in ``text`` that are uppercase.

    Args:
        text: Value to inspect; anything that is not a ``str`` is treated as empty.

    Returns:
        A float in [0.0, 1.0]. 0.0 is returned for non-string input and for
        strings that contain no alphabetic characters.
    """
    if not isinstance(text, str):
        return 0.0

    alpha_total = 0
    upper_total = 0
    for ch in text:
        if ch.isalpha():
            alpha_total += 1
            if ch.isupper():
                upper_total += 1

    # Avoid dividing by zero when there is nothing alphabetic to measure.
    if alpha_total == 0:
        return 0.0
    return upper_total / alpha_total

In [None]:
# Spot-check capital_letter_ratio on a few hand-written inputs
# (mixed case, all caps, a single capital, empty string).
test_texts = [
    "This product is amazing",
    "THIS PRODUCT IS AMAZING",
    "Amazing Product!!! MUST BUY",
    "bAd",
    "",
    "OKAY"
]
separator = "-" * 40
for sample in test_texts:
    print(f"Text: {sample}")
    print("Capital Ratio:", capital_letter_ratio(sample))
    print(separator)

In [None]:
df_r['capital_ratio'] = df_r['text_'].apply(capital_letter_ratio)
df_r[['text_', 'capital_ratio']].head(10)

In [None]:
def punctuation_count(text):
    """Count the punctuation/symbol characters in ``text``.

    A character is counted when it is neither whitespace nor a letter/digit.
    The underscore is matched explicitly: the regex word class treats it as a
    word character, so the negated class alone would never count it.

    Args:
        text: Value to inspect; anything that is not a ``str`` counts as 0.

    Returns:
        The number of matching characters as an ``int``.
    """
    if not isinstance(text, str):
        return 0

    return len(re.findall(r"[^\w\s]|_", text))

In [None]:
df_r['punctuation_count'] = df_r['text_'].apply(punctuation_count)
df_r[['text_', 'punctuation_count']].head(10)

In [None]:
def excessive_punctuation_score(text):
    """Count runs of two or more consecutive ``!``/``?`` characters in ``text``.

    Each maximal run (e.g. "!!!", "?!", "??") contributes 1 to the score.
    Non-string input scores 0.
    """
    if isinstance(text, str):
        return sum(1 for _ in re.finditer(r"[!?]{2,}", text))
    return 0

In [None]:
df_r['excessive_punctuation'] = df_r['text_'].apply(excessive_punctuation_score)
# Fixed seed so the displayed rows are the same on every Restart & Run All.
df_r[['text_', 'excessive_punctuation']].sample(10, random_state=42)

In [None]:
df_r[['text_', 'capital_ratio', 'punctuation_count', 'excessive_punctuation']].head(10)

#### Pre-processing


In [None]:
# contraction expansion
import contractions
def expand_contractions(text_):
    """Expand English contractions in ``text_`` using ``contractions.fix``.

    Args:
        text_: Review text. Anything that is not a ``str`` (None, NaN,
            numbers, ...) is treated as missing.

    Returns:
        The expanded string, or "" for non-string input. The ``isinstance``
        guard matches the other feature helpers in this notebook and keeps
        non-string values from reaching ``contractions.fix``.
    """
    if not isinstance(text_, str):
        return ""
    return contractions.fix(text_)

In [None]:
# Spot-check expand_contractions: four sentences with contractions, one without.
test_sentences = [
    "I don't like this product",
    "It's not what I've expected",
    "You're going to love it",
    "They can't believe it's true",
    "This is fine"
]

divider = "-" * 40
for sentence in test_sentences:
    print("BEFORE:", sentence)
    print("AFTER :", expand_contractions(sentence))
    print(divider)

In [None]:
# cleaning text - lowercase, url, html tags, punctuation, whitespaces
def clean_text(text_):
    """Normalize review text to lowercase ASCII letters separated by single spaces.

    Steps, in order: lowercase; drop URLs; drop HTML-like tags; drop
    apostrophes; turn every remaining non-letter into a space; collapse
    whitespace.

    URLs, tags and punctuation are replaced by a space rather than deleted so
    that the words on either side are not fused together
    ("great!!!love" becomes "great love", not "greatlove"). Apostrophes are
    the exception: they are deleted so a possessive such as "john's" stays
    one token.

    Args:
        text_: Review text. Anything that is not a ``str`` (None, NaN, ...)
            is treated as missing.

    Returns:
        The cleaned string, or "" for non-string input.
    """
    if not isinstance(text_, str):
        return ""

    text_ = text_.lower()

    # URLs: anchored at a word boundary so "www" inside a word (e.g. "awwww")
    # is not mistaken for a link.
    text_ = re.sub(r'\bhttps?://\S+|\bwww\.\S+', ' ', text_)
    # HTML-like tags.
    text_ = re.sub(r'<.*?>', ' ', text_)

    # Apostrophes (straight and curly) are removed without leaving a gap.
    text_ = re.sub(r"['’]", '', text_)
    # Everything else that is not a-z or whitespace becomes a separator.
    text_ = re.sub(r'[^a-z\s]', ' ', text_)

    text_ = re.sub(r'\s+', ' ', text_).strip()

    return text_

In [None]:
df_r['expanded_text'] = df_r['text_'].apply(expand_contractions)

In [None]:
df_r['clean_text'] = df_r['expanded_text'].apply(clean_text)

In [None]:
# Fixed seed so the displayed rows are the same on every Restart & Run All.
df_r[['text_', 'expanded_text', 'clean_text']].sample(5, random_state=42)

In [None]:
test_cases = [
    "WOW!!! 10/10 would buy again!!! üòç",
    "<p>Best product ever</p>",
    "Visit http://example.com NOW",
    "   Multiple     spaces   ",
    None
]

for t in test_cases:
    print("INPUT :", t)
    print("OUTPUT:", clean_text(t))
    print("-" * 30)


In [None]:
# lemmatization
import nltk

# nltk resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')

In [None]:
# 'punkt' and 'averaged_perceptron_tagger' are already fetched by the previous cell,
# so only the additional resource names are downloaded here.
# NOTE(review): these two are presumably the names newer NLTK releases look up — confirm
# against the installed NLTK version.
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('punkt_tab')

In [None]:
# import lemmatization tools
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
nltk.download('vader_lexicon')

In [None]:
def adjective_ratio(text):
    """Return the fraction of tokens in ``text`` that are tagged as adjectives.

    The text is tokenized with ``word_tokenize`` and tagged with ``pos_tag``;
    a token counts as an adjective when its tag starts with 'JJ'.

    Args:
        text: Text to analyse; non-string or blank input yields 0.

    Returns:
        adjective tokens / total tokens, or 0 when there is nothing to count.
    """
    if not isinstance(text, str):
        return 0
    if not text.strip():
        return 0

    words = word_tokenize(text)
    tagged_words = pos_tag(words)

    word_total = len(words)
    if word_total == 0:
        return 0

    adjectives = [token for token, tag in tagged_words if tag.startswith('JJ')]
    return len(adjectives) / word_total

In [None]:
df_r['adjective_ratio'] = df_r['clean_text'].apply(adjective_ratio)

In [None]:
df_r[['clean_text', 'adjective_ratio']].head()

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer

# Initialize analyzer
sia = SentimentIntensityAnalyzer()

In [None]:
def sentiment_score(text):
    """Return the 'compound' polarity score of ``text`` from the module-level ``sia`` analyzer.

    Args:
        text: Text to score; non-string or blank input yields 0.0.

    Returns:
        The value stored under the 'compound' key of ``sia.polarity_scores(text)``
        (ranging from -1, very negative, to +1, very positive).
    """
    is_blank = not isinstance(text, str) or not text.strip()
    if is_blank:
        return 0.0

    scores = sia.polarity_scores(text)
    return scores['compound']

In [None]:
# VADER uses capitalization, punctuation (e.g. "!!!") and emoticons as intensity cues.
# clean_text lowercases the text and strips every non-letter, so score the
# contraction-expanded text, which still carries those cues.
df_r['sentiment_score'] = df_r['expanded_text'].apply(sentiment_score)

In [None]:
df_r[['clean_text', 'sentiment_score']].head(10)

In [None]:
df_r['text_length'] = df_r['clean_text'].str.split().str.len()

In [None]:
df_r[['clean_text', 'text_length']].head()

In [None]:
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(tag):
    """Translate a POS tag string into the matching WordNet part-of-speech constant.

    Tags starting with 'J' map to adjective, 'V' to verb and 'R' to adverb.
    Every other tag (including those starting with 'N') maps to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wn_pos
    # Nouns and anything unrecognized share the same result.
    return wordnet.NOUN

def lemmatize_text(text_):
    """Lemmatize every token of ``text_`` and join the results with single spaces.

    The text is split with ``wordpunct_tokenize`` and tagged with ``pos_tag``.
    Each token is then lemmatized with the module-level ``lemmatizer``, using
    the WordNet part of speech derived from its tag by ``get_wordnet_pos``.

    Args:
        text_: Text to lemmatize; missing values and "" yield "".

    Returns:
        The space-joined lemmas.
    """
    if pd.isna(text_) or text_ == "":
        return ""

    tagged_tokens = pos_tag(wordpunct_tokenize(text_))

    lemmas = []
    for token, token_tag in tagged_tokens:
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(token_tag)))

    return " ".join(lemmas)


In [None]:
# Spot-check lemmatize_text on a few short phrases.
test_sentences = [
    "running faster than others",
    "better products were bought",
    "he was buying expensive items"
]

divider = "-" * 40
for sentence in test_sentences:
    print("BEFORE:", sentence)
    print("AFTER :", lemmatize_text(sentence))
    print(divider)

In [None]:
df_r['lemmatized_text'] = df_r['clean_text'].apply(lemmatize_text)

In [None]:
# Fixed seed so the displayed rows are the same on every Restart & Run All.
df_r[['text_', 'expanded_text', 'clean_text', 'lemmatized_text']].sample(5, random_state=42)

In [None]:
# The lemmatized text is the final model input; expose it under the name 'review'.
# Reassignment is used instead of inplace=True so the cell follows the same
# "produce a new frame" pattern as the drop_duplicates cell above.
df_r = df_r.rename(columns={'lemmatized_text': 'review'})

In [None]:
# Encode labels as integers: 'CG' -> 0, 'OR' -> 1.
# Values that are not keys of the mapping (including the already-encoded 0/1) pass
# through unchanged, so re-running this cell does not turn the column into NaN.
label_codes = {'CG': 0, 'OR': 1}
df_r['label'] = df_r['label'].map(lambda value: label_codes.get(value, value))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams of the lemmatized reviews, capped at 5000 features,
# with scikit-learn's built-in English stop-word list.
tfidf_settings = {
    "max_features": 5000,
    "ngram_range": (1, 2),
    "min_df": 5,
    "max_df": 0.9,
    "stop_words": 'english',
}
tfidf = TfidfVectorizer(**tfidf_settings)

# Learn the vocabulary and build the sparse document-term matrix in one pass.
X_tfidf = tfidf.fit_transform(df_r['review'])

In [None]:
X_tfidf.shape

In [None]:
tfidf.get_feature_names_out()[:20]

In [None]:
# Preview the hand-crafted numeric features that accompany the TF-IDF matrix.
df_r[['adjective_ratio',
      'sentiment_score',
      'text_length',
      'capital_ratio',
      'punctuation_count']].head()

In [None]:
from scipy.sparse import hstack
# NumPy array of the hand-crafted numeric features, columns in the order listed.
extra_feature_columns = [
    'adjective_ratio',
    'sentiment_score',
    'text_length',
    'capital_ratio',
    'punctuation_count',
]
X_extra = df_r[extra_feature_columns].to_numpy()

In [None]:
X_extra.shape

In [None]:
X_final = hstack([X_tfidf, X_extra])

In [None]:
# Only the last expression of a cell is displayed, so show all three shapes together.
X_tfidf.shape, X_extra.shape, X_final.shape

#### Preprocessed dataset

In [None]:
df_r.columns

In [None]:
# pre_df = df_r[
#     ['rating', 'review', 'label', 'text_length',
#      'capital_ratio', 'punctuation_count', 'excessive_punctuation']
# ]

# # Save as CSV
# pre_df.to_csv("preprocessed_dataset.csv", index=False)
# print("Preprocessed dataset saved as CSV!")