In [None]:
import pandas as pd
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer, util

# Load the sentence-embedding model once and share it: KeyBERT accepts an
# already-constructed SentenceTransformer, so the same checkpoint does not
# have to be loaded into memory twice.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')  # For semantic similarity
kw_model = KeyBERT(model=sbert_model)  # For keyword extraction (reuses sbert_model)

# Input file; the bike name is derived from the file name itself
file_path = "bajaj_avenger_220_street_reviews.csv"  # Replace with your actual file name

# Every "_"-separated token except the last one ('reviews.csv') is part of the bike name
_name_parts = file_path.split("_")
bike_name = " ".join(_name_parts[:-1])

# Lower-cased individual words of the bike name
bike_name_words = [part.lower() for part in bike_name.split()]

# Candidate buckets (topic labels) a review keyword can be assigned to
buckets = [
    "Alloy", "Bluetooth", "Clearance", "Color", "Comfort", "Console",
    "Dealer", "Engine", "Engine performance", "Engine sound", "Mileage",
    "Experience", "Exterior", "Fuel efficiency", "Gear", "Gearbox",
    "Performance", "Suspension", "Ground clearance", "Headlamp",
    "Insurance", "KMPL", "LED", "Lights", "Looks", "Luggage",
    "Maintenance", "Maintenance cost", "Navigation", "Pickup", "Power",
    "Price", "RPM", "Rear", "Rear seat", "Safety", "Safety feature",
    "Seat", "Seat cover", "Service", "Service center", "Service cost",
    "Showroom", "Spare part", "Speed", "Style", "Test drive", "Torque",
    "Transmission", "Turning radius", "Tyres", "Vent", "Wheel", "Boot",
]

# Embed all bucket labels once up front so each review only needs to embed its own keyword
bucket_embeddings = sbert_model.encode(buckets, convert_to_tensor=True)

# Read the reviews into a DataFrame
df = pd.read_csv(file_path)

# Fail fast if the column holding the review text is absent
missing_columns = {'Review_Text'} - set(df.columns)
if missing_columns:
    raise ValueError("The CSV file must contain a 'Review_Text' column.")

# Function to extract top keyword and categorize it
def categorize_top_keyword(text, bike_name_words, bucket_embeddings, buckets, top_n=5,
                           threshold=0.5):
    """Extract the top keyword of a review and map it to the closest bucket.

    Args:
        text: Review text. Missing values (NaN/None) and blank strings yield
            ``("others", "")``; any other non-string value is converted with
            ``str`` before keyword extraction.
        bike_name_words: Lower-cased words of the bike name. A keyword is
            discarded when any of these words occurs in it (substring match).
        bucket_embeddings: Embeddings of ``buckets`` (same order), as produced
            by ``sbert_model.encode(buckets, convert_to_tensor=True)``.
        buckets: Bucket labels to choose from.
        top_n: Number of candidate keywords requested from KeyBERT.
        threshold: The best bucket is only assigned when its cosine
            similarity to the keyword is strictly greater than this value.

    Returns:
        A ``(bucket, top_keyword)`` tuple. ``bucket`` is ``"others"`` when no
        bucket is similar enough; ``top_keyword`` is ``""`` when no usable
        keyword was found.
    """
    if pd.isna(text):  # Handle missing or NaN values
        return "others", ""

    # A column that parsed as numbers would otherwise hand KeyBERT a non-string
    text = str(text)
    if not text.strip():  # Nothing to extract from; skip the model call
        return "others", ""

    # Extract candidate keywords (1- and 2-word phrases) using KeyBERT
    keywords = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 2),
        stop_words='english',
        top_n=top_n,
    )

    # Drop keywords that mention the bike name; keep only the keyword strings
    filtered_keywords = [
        keyword for keyword, _score in keywords
        if not any(word in keyword.lower() for word in bike_name_words)
    ]
    if not filtered_keywords:
        return "others", ""

    # filtered_keywords holds plain strings, so the first element IS the top
    # keyword; indexing it again would keep only its first character.
    top_keyword = filtered_keywords[0]

    # Compute similarity between the top keyword and every bucket
    keyword_embedding = sbert_model.encode(top_keyword, convert_to_tensor=True)
    similarities = util.pytorch_cos_sim(keyword_embedding, bucket_embeddings)[0]

    # Find the best matching bucket
    best_bucket_idx = similarities.argmax().item()
    similarity_score = similarities[best_bucket_idx].item()

    # Assign the bucket only when it is similar enough; otherwise 'others'
    if similarity_score > threshold:
        return buckets[best_bucket_idx], top_keyword
    return "others", top_keyword

# Categorize every review in the 'Review_Text' column.
# Plain lists are used instead of returning a pd.Series per row from .apply:
# this avoids building one Series per review and does not rely on .apply
# expanding its result into a two-column DataFrame, which cannot happen when
# the CSV has no data rows.
results = [
    categorize_top_keyword(review, bike_name_words, bucket_embeddings, buckets, top_n=5)
    for review in df['Review_Text']
]
df['Bucket'] = [bucket for bucket, _ in results]
df['Top_Keyword'] = [keyword for _, keyword in results]

# Rearrange columns to Bucket, Top_Keyword, and Review_Text (other columns are dropped)
df = df[['Bucket', 'Top_Keyword', 'Review_Text']]

# Save the DataFrame to a new CSV file
output_file = "output_bajaj_avenger_220_street_keywords_with_buckets.csv"
df.to_csv(output_file, index=False)

print(f"Keywords categorized into buckets based on meaning and saved to {output_file}.")


Keywords and Review_Text saved to output_bajaj_avenger_220_street_keywords.csv in the correct order.
