In [48]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from rapidfuzz import process, fuzz
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np
import emoji

In [None]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\S rajiv
[nltk_data]     gandhi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\S rajiv
[nltk_data]     gandhi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [86]:
# Load data
reviews_df = pd.read_excel("hatchback.xlsx")
features_df = pd.read_excel("predefined_features.xlsx")

In [None]:
features_list = features_df.iloc[:, 0].dropna().str.lower().str.strip().unique().tolist()

In [None]:
# text cleaning function
def clean_text(text):
    if pd.isnull(text):
        return ""
    text = text.lower() 
    text = re.sub(r'\d+', '', text)  
    text = text.translate(str.maketrans("", "", string.punctuation)) 
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words("english"))
    tokens = [word for word in tokens if word not in stop_words] 
    return " ".join(tokens)

# cleaning
reviews_df["Cleaned_Review"] = reviews_df["Review"].apply(clean_text)

In [None]:
# cleaned data
print(reviews_df[["Review", "Cleaned_Review", "Car", "Company"]].head())
print(f"\nTotal predefined features loaded: {len(features_list)}")

                                              Review  \
0  Value For Money\nThe best thing of this suzuki...   
1  Must Buy Good Vehicle In The Range Of Price Or...   
2  ALTO K10 REVIEW\nThis car is very much amazing...   
3  WONDERFUL CAR\nWONDERFUL CAR that I have i fee...   
4  Great In Budget.\nIt's a nice comfortable car ...   

                                      Cleaned_Review       Car        Company  
0  value money best thing suzuki k budget friendl...  Alto K10  Maruti Suzuki  
1  must buy good vehicle range price seg excellen...  Alto K10  Maruti Suzuki  
2  alto k review car much amazing also comes affo...  Alto K10  Maruti Suzuki  
3  wonderful car wonderful car feel comfortable s...  Alto K10  Maruti Suzuki  
4  great budget nice comfortable car spacious per...  Alto K10  Maruti Suzuki  

Total predefined features loaded: 204


In [None]:
# Exact and Fuzzy Matching Function
def match_features_in_review(review, threshold=90):
    matched_features = set()

    # Exact matches
    for feature in features_list:
        if feature in review:
            matched_features.add(feature)

    # Fuzzy matching
    if not matched_features:
        words = review.split()
        for word in words:
            match, score, _ = process.extractOne(word, features_list, scorer=fuzz.partial_ratio)
            if score >= threshold:
                matched_features.add(match)

    return list(matched_features)

output_rows = []

In [None]:
for index, row in reviews_df.iterrows():
    review_text = row["Cleaned_Review"]
    car = row["Car"]
    company = row["Company"]
    
    matched = match_features_in_review(review_text)
    
    for feature in matched:
        output_rows.append({
            "Feature": feature,
            "Car": car,
            "Company": company,
            "Review": row["Review"] 
        })

matched_df = pd.DataFrame(output_rows)


print(matched_df.head())
print(f"\nTotal matched rows: {len(matched_df)}")

                      Feature       Car        Company  \
0   accelerator pedal linkage  Alto K10  Maruti Suzuki   
1            fuel consumption  Alto K10  Maruti Suzuki   
2  childproof safety features  Alto K10  Maruti Suzuki   
3              a/c compressor  Alto K10  Maruti Suzuki   
4                          ac  Alto K10  Maruti Suzuki   

                                              Review  
0  Value For Money\nThe best thing of this suzuki...  
1  Value For Money\nThe best thing of this suzuki...  
2  Value For Money\nThe best thing of this suzuki...  
3  Value For Money\nThe best thing of this suzuki...  
4  Must Buy Good Vehicle In The Range Of Price Or...  

Total matched rows: 3527


In [None]:
# RoBERTa sentiment model
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

labels = ['negative', 'neutral', 'positive']

In [None]:
def preprocess_roberta(text):
    text = re.sub(r"http\S+", "", text)  
    text = emoji.demojize(text)        
    text = text.strip()
    return text

In [None]:
def get_sentiment_score(text):
    text = preprocess_roberta(text)
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    probs = torch.nn.functional.softmax(logits, dim=1).numpy()[0]
    score = round(probs[2] * 1 + probs[1] * 0 + probs[0] * -1, 4)
    return score

In [106]:
def get_sentiment_label(text):
    text = preprocess_roberta(text)
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    probs = torch.nn.functional.softmax(logits, dim=1).numpy()[0]
    return labels[np.argmax(probs)]

In [None]:
def extract_feature_context(review, feature):
    sentences = sent_tokenize(review)
    feature = feature.lower()
    for sentence in sentences:
        if feature in sentence.lower():
            return sentence
    return review  # fallback

matched_df["Context"] = matched_df.apply(lambda row: extract_feature_context(row["Review"], row["Feature"]), axis=1)

In [None]:
def assign_category(score):
    if score >= 0.85:
        return "Highly Rated"
    elif score >= 0.65:
        return "Quality Rated"
    elif score < 0.4:
        return "Needs Improvement"
    else:
        return "Medium Rated"

In [109]:
matched_df["Sentiment_Score"] = matched_df["Context"].apply(get_sentiment_score)
matched_df["Sentiment_Label"] = matched_df["Context"].apply(get_sentiment_label)
matched_df["Category"] = matched_df["Sentiment_Score"].apply(assign_category)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
final_df = matched_df[["Feature", "Sentiment_Score", "Car", "Company", "Category"]]

In [None]:
final_df.to_excel("hatchback_output.xlsx", index=False)