In [11]:
import pandas as pd
import re

def find_missing_word_rows(data, missing_words, review_column='cleaned_review', restaurant_column='restaurant_name'):
    """
    Find the rows whose review text contains each missing word as a whole token.

    Matching is case-insensitive. A word counts as present when it is not
    directly preceded or followed by a word character (letter, digit or
    underscore). Unlike a plain ``\\b`` boundary, this also works for tokens
    that start or end with punctuation, such as ``<UNK>``.

    Args:
    - data: DataFrame, dataset containing the reviews.
    - missing_words: list, words missing in the embedding matrix.
    - review_column: str, column name for review text.
    - restaurant_column: str, column name for restaurant names.

    Returns:
    - word_to_rows: dict, one entry per missing word. Each value is a dict keyed
      by the DataFrame index label of a matching row, holding
      ``{restaurant_column: ..., review_column: ...}`` for that row. Words with
      no matching rows map to an empty dict.

    Notes:
    - Rows whose review is missing (NaN/None) never match (``na=False``).
    - ``to_dict('index')`` requires the index labels of the matching rows to be
      unique; pandas raises ``ValueError`` otherwise.
    """
    reviews = data[review_column]
    wanted_columns = [restaurant_column, review_column]

    word_to_rows = {}
    for word in missing_words:
        # `\b` only fires between a word and a non-word character, so
        # r'\b<UNK>\b' can never match "<UNK>" at the start/end of a review or
        # next to a space. Spell the edges out instead: start-of-text or a
        # non-word character before, a non-word character or end-of-text after.
        # Non-capturing groups keep pandas from warning about match groups.
        pattern = rf'(?:^|\W){re.escape(word)}(?:\W|$)'
        matches = reviews.str.contains(pattern, case=False, na=False)
        word_to_rows[word] = data.loc[matches, wanted_columns].to_dict('index')  # Capture both columns
    return word_to_rows

# Load the reviews dataset and show which columns it provides.
df_cleanreview_nonames = pd.read_csv("../data/New_York_reviews_with_no_restaurantname_in_Review.csv")
print(df_cleanreview_nonames.columns)

# Call the function: map every missing word to the rows whose review mentions it.
missing_words = ['<UNK>', 'chatwal', 'masalawala']  # Replace with actual missing words
word_to_rows_mapping = find_missing_word_rows(df_cleanreview_nonames, missing_words)

# Save the mapping to a log file (the ../logs directory must already exist).
# The encoding is pinned to UTF-8 so review text with non-ASCII characters is
# written the same way on every platform instead of depending on the locale's
# default encoding (which can raise UnicodeEncodeError).
with open('../logs/missing_word_rows_with_restaurant.log', 'w', encoding='utf-8') as log_file:
    for word, rows in word_to_rows_mapping.items():
        # One "Word:" header per missing word, followed by one line per matching row.
        log_file.write(f"\nWord: {word}\n")
        for idx, content in rows.items():
            log_file.write(f"Row {idx} | Restaurant: {content['restaurant_name']} | Review: {content['cleaned_review']}\n")








Index(['restaurant_name', 'rating_review', 'sample', 'review_id',
       'title_review', 'review_preview', 'review_full', 'date', 'city',
       'url_restaurant', 'author_id', 'review_length', 'cleaned_review',
       'review_no_restaurant'],
      dtype='object')


In [13]:
# Pick the review to inspect; swap 32116 for any other row number.
row_number = 32116
# .iloc is positional (0-based), so this is the row at that position in the frame.
specific_row = df_cleanreview_nonames.iloc[row_number]

# Print every field of the selected row.
print(f"Row Content:\n {specific_row}")


Row Content:
 restaurant_name                                       Benjamin_Steakhouse
rating_review                                                           5
sample                                                           Positive
review_id                                                review_132679649
title_review                       Barney & Fred would feel @ home here!!
review_preview          The Chatwal Hotel Suggested this to us.A typic...
review_full             The Chatwal Hotel Suggested this to us. A typi...
date                                                        June 24, 2012
city                                               New_York_City_New_York
url_restaurant          https://www.tripadvisor.com/Restaurant_Review-...
author_id                                                       UID_27717
review_length                                                         759
cleaned_review          chatwal hotel suggested typical new york steak...
review_no_restaurant    