In [2]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from rapidfuzz import process, fuzz
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np
import emoji

In [None]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\S rajiv
[nltk_data]     gandhi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\S rajiv
[nltk_data]     gandhi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Load data
reviews_df = pd.read_excel("mid_size_suv.xlsx")
features_df = pd.read_excel("predefined_features.xlsx")

In [None]:
features_list = features_df.iloc[:, 0].dropna().str.lower().str.strip().unique().tolist()

In [None]:
# text cleaning function
def clean_text(text):
    if pd.isnull(text):
        return ""
    text = text.lower()  # lowercase
    text = re.sub(r'\d+', '', text)  # remove digits
    text = text.translate(str.maketrans("", "", string.punctuation))  # remove punctuation
    tokens = word_tokenize(text)  # tokenize
    stop_words = set(stopwords.words("english"))
    tokens = [word for word in tokens if word not in stop_words]  # remove stopwords
    return " ".join(tokens)

# cleaning
reviews_df["Cleaned_Review"] = reviews_df["Review"].apply(clean_text)

In [None]:
# cleaned data
print(reviews_df[["Review", "Cleaned_Review", "Car", "Company"]].head())
print(f"\nTotal predefined features loaded: {len(features_list)}")

                                              Review  \
0  The Creta Is Generally Well Recieved,of Ten Pr...   
1  This Car Is\nThe hyundai creta is well regarde...   
2  Awesome Looking\nGood experience from the last...   
3  Review Regarding The Car\nOverall the car is g...   
4  BEST IN SEGMENT...TOTAL BEAST Have A Nice Look...   

                                      Cleaned_Review    Car  Company  
0  creta generally well recievedof ten praised st...  Creta  Hyundai  
1  car hyundai creta well regarded compact suv te...  Creta  Hyundai  
2  awesome looking good experience last years dri...  Creta  Hyundai  
3  review regarding car overall car great driving...  Creta  Hyundai  
4  best segmenttotal beast nice look family car s...  Creta  Hyundai  

Total predefined features loaded: 204


In [None]:
# Exact and Fuzzy Matching Function
def match_features_in_review(review, threshold=90):
    matched_features = set()

    # Exact matches
    for feature in features_list:
        if feature in review:
            matched_features.add(feature)

    # Fuzzy matching
    if not matched_features:
        words = review.split()
        for word in words:
            match, score, _ = process.extractOne(word, features_list, scorer=fuzz.partial_ratio)
            if score >= threshold:
                matched_features.add(match)

    return list(matched_features)

output_rows = []

In [None]:
for index, row in reviews_df.iterrows():
    review_text = row["Cleaned_Review"]
    car = row["Car"]
    company = row["Company"]
    
    matched = match_features_in_review(review_text)
    
    for feature in matched:
        output_rows.append({
            "Feature": feature,
            "Car": car,
            "Company": company,
            "Review": row["Review"] 
        })

matched_df = pd.DataFrame(output_rows)


print(matched_df.head())
print(f"\nTotal matched rows: {len(matched_df)}")

      Feature    Car  Company  \
0      engine  Creta  Hyundai   
1  disc brake  Creta  Hyundai   
2          ac  Creta  Hyundai   
3          ac  Creta  Hyundai   
4     sunroof  Creta  Hyundai   

                                              Review  
0  The Creta Is Generally Well Recieved,of Ten Pr...  
1  The Creta Is Generally Well Recieved,of Ten Pr...  
2  The Creta Is Generally Well Recieved,of Ten Pr...  
3  This Car Is\nThe hyundai creta is well regarde...  
4  Awesome Looking\nGood experience from the last...  

Total matched rows: 3685


In [None]:
# RoBERTa sentiment model
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

labels = ['negative', 'neutral', 'positive']

In [None]:
# Clean text before feeding into RoBERTa
def preprocess_roberta(text):
    text = re.sub(r"http\S+", "", text)  
    text = emoji.demojize(text)          
    text = text.strip()
    return text

In [None]:

def get_sentiment_score(text):
    text = preprocess_roberta(text)
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    probs = torch.nn.functional.softmax(logits, dim=1).numpy()[0]
    score = round(probs[2] * 1 + probs[1] * 0 + probs[0] * -1, 4)
    return score

In [13]:
def get_sentiment_label(text):
    text = preprocess_roberta(text)
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    probs = torch.nn.functional.softmax(logits, dim=1).numpy()[0]
    return labels[np.argmax(probs)]

In [None]:

def extract_feature_context(review, feature):
    sentences = sent_tokenize(review)
    feature = feature.lower()
    for sentence in sentences:
        if feature in sentence.lower():
            return f"In my experience, {sentence}"
    return f"In my experience, {review}"  

matched_df["Context"] = matched_df.apply(lambda row: extract_feature_context(row["Review"], row["Feature"]), axis=1)

In [None]:

def assign_category(score):
    if score >= 0.96:
        return "Highly Rated"
    elif score >= 0.90:
        return "Quality Rated"
    elif score < 0.4:
        return "Needs Improvement"
    else:
        return "Medium Rated"

In [16]:
matched_df["Sentiment_Score"] = matched_df["Context"].apply(get_sentiment_score)
matched_df["Sentiment_Label"] = matched_df["Context"].apply(get_sentiment_label)
matched_df["Category"] = matched_df["Sentiment_Score"].apply(assign_category)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
final_df = matched_df[["Feature", "Sentiment_Score", "Car", "Company", "Category"]]

In [None]:
final_df.to_excel("mid_size_suv_output.xlsx", index=False)