In [19]:
import re
import pandas as pd

# Remove restaurant names from the reviews so the text reflects sentiment rather than general info.

def compile_restaurant_pattern(restaurant_names):
    """Compile one case-insensitive regex that matches any restaurant name.

    Args:
        restaurant_names: Iterable of restaurant names. Entries that are not
            strings (for example NaN coming from a pandas column) and entries
            that are empty or whitespace-only are ignored.

    Returns:
        A compiled ``re.Pattern`` whose single capture group is the matched
        name. When no usable name is supplied the returned pattern never
        matches, so substituting with it leaves the text unchanged.
    """
    print("Compiling regex pattern for restaurant names...")

    # Keep only real, non-blank strings: re.escape() raises on NaN/None, and an
    # empty alternative would match at every position of the text.
    usable_names = {
        name for name in restaurant_names
        if isinstance(name, str) and name.strip()
    }
    if not usable_names:
        # "(?!)" is a negative lookahead on the empty string: it can never match.
        return re.compile(r'(?!)')

    # Regex alternation takes the first alternative that succeeds, so longer
    # names must come first; otherwise "joe's" would win over "joe's pizza" and
    # leave " pizza" behind. The name itself is a tie-breaker so the generated
    # pattern is deterministic.
    ordered_names = sorted(usable_names, key=lambda name: (-len(name), name))
    alternation = '|'.join(re.escape(name) for name in ordered_names)

    # Lookarounds instead of \b: they behave exactly like \b for names that
    # start and end with a word character, but also work for names that start
    # or end with punctuation (where \b would demand an adjacent word char).
    return re.compile(r'(?<!\w)(' + alternation + r')(?!\w)', re.IGNORECASE)

def replace_restaurant_names(text, pattern):
    """Return ``text`` with every match of ``pattern`` swapped for a placeholder.

    Args:
        text: The review text to scan.
        pattern: A compiled regex whose matches are restaurant names.

    Returns:
        The text with each match replaced by the ``<RESTAURANT>`` token.
    """
    placeholder = '<RESTAURANT>'
    masked_text = pattern.sub(placeholder, text)
    return masked_text

def preprocess_reviews(data):
    """Add a ``review_no_restaurant`` column with restaurant names masked out.

    Every restaurant name found in the ``restaurant_name`` column is replaced,
    case-insensitively, in each ``cleaned_review`` value.

    Args:
        data: DataFrame containing the ``restaurant_name`` and
            ``cleaned_review`` columns.

    Returns:
        The same DataFrame object that was passed in; it is modified in place
        by adding the ``review_no_restaurant`` column. Missing reviews stay
        missing in the new column.

    Raises:
        KeyError: If ``restaurant_name`` or ``cleaned_review`` is absent.
    """
    print("Starting preprocessing...")

    # Unique lowercase names. Missing and blank names are dropped here so the
    # pattern builder never receives NaN (which re.escape cannot handle) or an
    # empty alternative (which would match at every position).
    lowered_names = data['restaurant_name'].str.lower().dropna()
    restaurant_names = lowered_names[lowered_names.str.strip() != ''].unique()

    if len(restaurant_names) == 0:
        # No names to mask: the reviews are carried over unchanged.
        data['review_no_restaurant'] = data['cleaned_review']
    else:
        # Compile the regex once and reuse it for every review.
        pattern = compile_restaurant_pattern(restaurant_names)

        # na_action='ignore' propagates missing reviews instead of handing NaN
        # to the regex engine, which would raise TypeError.
        data['review_no_restaurant'] = data['cleaned_review'].map(
            lambda review: replace_restaurant_names(review, pattern),
            na_action='ignore',
        )

    print("Preprocessing completed.")
    return data



In [20]:
# Input: reviews that already carry the 'cleaned_review' column.
cleaned_reviews_csv = "../data/New_York_reviews_cleaned_with_spacy.csv"
# Output: same reviews plus the restaurant-name-free text.
masked_reviews_csv = "../data/New_York_reviews_with_no_restaurantname_in_Review.csv"

df_cleanreview = pd.read_csv(cleaned_reviews_csv)

# Mask restaurant names, then expose the 'sample' column under the name 'sentiment'.
preprocessed_data = preprocess_reviews(df_cleanreview)
preprocessed_data.rename({'sample': 'sentiment'}, axis='columns', inplace=True)

# Persist the result without the index column.
preprocessed_data.to_csv(masked_reviews_csv, index=False)


Starting preprocessing...
Compiling regex pattern for restaurant names...
Preprocessing completed.


In [21]:
# Sanity check: list the columns of the preprocessed frame.
print(preprocessed_data.columns)

Index(['restaurant_name', 'rating_review', 'sentiment', 'review_id',
       'title_review', 'review_preview', 'review_full', 'date', 'city',
       'url_restaurant', 'author_id', 'review_length', 'cleaned_review',
       'review_no_restaurant'],
      dtype='object')
