In [1]:
from pathlib import Path

import pandas as pd
import torch
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer, util

# Load models
# kw_model extracts candidate keyphrases from each review; embedding_model
# embeds keyphrases and bucket labels so they can be compared by cosine similarity.
kw_model = KeyBERT(model='all-mpnet-base-v2')  # Improved keyword extraction
embedding_model = SentenceTransformer('paraphrase-mpnet-base-v2')  # Enhanced semantic similarity

# Synonym-enriched buckets
# Each label is a candidate category; near-synonyms (e.g. "Fuel efficiency" /
# "Fuel economy" / "Mileage") are listed as separate labels on purpose.
buckets = [
    "Alloy", "Bluetooth", "Clearance", "Color", "Comfort", "Console", "Dealer",
    "Engine", "Engine performance", "Engine sound", "Mileage", "Experience", "Exterior",
    "Fuel efficiency", "Fuel economy", "Gear", "Gearbox", "Performance", "Suspension",
    "Ground clearance", "Handling", "Headlamp", "Insurance", "KMPL", "LED", "Lights",
    "Looks", "Luggage", "Maintenance", "Maintenance cost", "Navigation", "Pickup",
    "Power", "Price", "Affordable", "Value", "Cost", "RPM", "Rear", "Rear seat",
    "Safety", "Safety feature", "Seat", "Seat cover", "Service", "Service center",
    "Service cost", "Showroom", "Spare part", "Speed", "Style", "Design", "Test drive",
    "Torque", "Transmission", "Turning radius", "Tyres", "Vent", "Wheel", "Boot", "Ride quality"
]

# File details
file_path = "bajaj_avenger_220_street_reviews.csv"  # Replace with your actual file name

# Derive the bike name from the file name alone: use the stem so that a
# directory prefix or the extension never leaks into the name, and drop the
# trailing "reviews" token only when it is actually present.
name_tokens = Path(file_path).stem.split("_")
if name_tokens and name_tokens[-1].lower() == "reviews":
    name_tokens = name_tokens[:-1]
bike_name = " ".join(name_tokens)  # e.g. "bajaj avenger 220 street"
bike_name_words = bike_name.lower().split()  # Split bike name into individual words

# Load the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Check if 'Review_Text' column exists
if 'Review_Text' not in df.columns:
    raise ValueError("The CSV file must contain a 'Review_Text' column.")

# Function to extract keywords for a single review
def extract_keywords_without_bike_name(text, bike_name_words, top_n=10):
    """Extract keyphrases from one review, dropping phrases that name the bike.

    Args:
        text: Review text. A missing value (``None``/NaN as judged by
            ``pd.isna``) yields an empty list.
        bike_name_words: Iterable of words that make up the bike name.
            Matching is case-insensitive.
        top_n: Number of keyphrases requested from KeyBERT before filtering,
            so the returned list may be shorter.

    Returns:
        A list of keyphrase strings (unigrams and bigrams) in the order
        KeyBERT returned them, excluding every phrase that contains a
        bike-name word as a whole token.
    """
    if pd.isna(text):  # Handle missing or NaN values
        return []

    # Extract keywords using KeyBERT (returns (phrase, score) pairs)
    keywords = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 2),
        stop_words='english',
        top_n=top_n,
    )

    # Compare whole tokens rather than substrings: with substring matching a
    # short name word such as "rc" would also discard unrelated phrases like
    # "source" or "force".
    excluded_words = {word.lower() for word in bike_name_words}
    return [
        keyword for keyword, _score in keywords
        if excluded_words.isdisjoint(keyword.lower().split())
    ]

# Function to find the best matching bucket

# Bucket embeddings keyed by (model identity, bucket labels). Encoding the
# labels is independent of the review, so it is done once and reused instead
# of once per call. The model is stored alongside the tensor so it stays
# alive and its id() cannot be recycled for a different object.
_BUCKET_EMBEDDING_CACHE = {}


def _get_bucket_embeddings(buckets, embedding_model):
    """Return the embedding tensor for ``buckets``, encoding on first use only."""
    cache_key = (id(embedding_model), tuple(buckets))
    entry = _BUCKET_EMBEDDING_CACHE.get(cache_key)
    if entry is None:
        entry = (embedding_model, embedding_model.encode(buckets, convert_to_tensor=True))
        _BUCKET_EMBEDDING_CACHE[cache_key] = entry
    return entry[1]


def assign_bucket(keywords, buckets, embedding_model, similarity_threshold=0.5):  # Lowered threshold
    """Return the bucket label that best matches the first sufficiently similar keyword.

    Keywords are tried in the given order; for each one the most similar
    bucket label (cosine similarity of embeddings) is found, and the first
    keyword whose best similarity exceeds ``similarity_threshold`` decides
    the result.

    Args:
        keywords: Sequence of keyword strings for one review.
        buckets: Sequence of bucket label strings.
        embedding_model: Object exposing ``encode(..., convert_to_tensor=True)``
            (a SentenceTransformer in this script).
        similarity_threshold: Strict lower bound on the cosine similarity
            required to accept a match.

    Returns:
        The matching label from ``buckets``, or ``"Other"`` when there are no
        keywords, no buckets, or no keyword clears the threshold.
    """
    # Nothing to match against: avoid calling the encoder / torch.max on empty input.
    if not keywords or not buckets:
        return "Other"

    bucket_embeddings = _get_bucket_embeddings(buckets, embedding_model)

    # Keywords are encoded lazily, one at a time, so that an early match
    # skips the encoding cost of the remaining keywords.
    for keyword in keywords:
        keyword_embedding = embedding_model.encode(keyword, convert_to_tensor=True)
        # Compute similarity scores against every bucket label
        similarities = util.pytorch_cos_sim(keyword_embedding, bucket_embeddings)
        max_similarity, best_bucket_index = torch.max(similarities, dim=1)

        # Check if the similarity is above the threshold
        if max_similarity.item() > similarity_threshold:
            return buckets[best_bucket_index.item()]

    return "Other"  # If no match found

# Run the pipeline: keywords per review -> bucket per review -> CSV export.
def _review_keywords(review_text):
    """Keywords for one review, using the bike-name filter and ten candidates."""
    return extract_keywords_without_bike_name(review_text, bike_name_words, top_n=10)


def _review_bucket(keyword_list):
    """Bucket label for one review's keyword list (relaxed 0.5 threshold)."""
    return assign_bucket(keyword_list, buckets, embedding_model, similarity_threshold=0.5)


def _keywords_as_text(keyword_list):
    """Comma-separated rendering of a keyword list; empty list gives an empty string."""
    if not keyword_list:
        return ""
    return ", ".join(keyword_list)


df['Keywords'] = df['Review_Text'].apply(_review_keywords)
df['Assigned_Bucket'] = df['Keywords'].apply(_review_bucket)

# Flatten the keyword lists only after bucketing, since bucketing needs the lists.
df['Keywords'] = df['Keywords'].apply(_keywords_as_text)

# Write the selected columns, in this order, to the result file.
output_file = "synonym_enriched_reviews_with_10_keywords.csv"
export_columns = ['Keywords', 'Assigned_Bucket', 'Review_Text']
export_frame = df[export_columns]
export_frame.to_csv(output_file, index=False)

print(f"Synonym-enriched bucketized reviews with keywords saved to {output_file}.")


2024-11-27 09:23:51.732942: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-27 09:23:51.782136: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-27 09:23:51.783296: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


FileNotFoundError: [Errno 2] No such file or directory: 'bajaj_avenger_220_street_reviews.csv'