In [10]:
from datetime import datetime
import spacy

# Load spaCy's English tokenizer with optimizations
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])  # Disable unnecessary components

# Define an optimized cleaning function using spaCy
def clean_text_spacy_optimized(text):
    doc = nlp(text.lower())  # Lowercase conversion before tokenizing
    tokens = [token.text for token in doc if not token.is_stop and token.is_alpha]  # Filter stopwords and non-alphabetic tokens
    return " ".join(tokens)

# Example data
mock_data = ["This is a sample review! Amazing food, great service."] * 1000

# Start timing
start_time = datetime.now()

# Apply cleaning to the dataset
cleaned_reviews = [clean_text_spacy_optimized(review) for review in mock_data]

# End timing
end_time = datetime.now()

# Print timing information
print(f"Start Time: {start_time}")
print(f"End Time: {end_time}")
print(f"Processing Time: {end_time - start_time}")


Start Time: 2025-01-01 17:16:07.675393
End Time: 2025-01-01 17:16:52.019125
Processing Time: 0:00:44.343732


In [12]:
from datetime import datetime
import dask.dataframe as dd
import spacy

# Load spaCy's English tokenizer with optimizations
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])  # Disable unnecessary components

# Define an optimized cleaning function using spaCy
def clean_text_spacy_optimized(text):
    doc = nlp(text.lower())  # Lowercase conversion before tokenizing
    tokens = [token.text for token in doc if not token.is_stop and token.is_alpha]  # Filter stopwords and non-alphabetic tokens
    return " ".join(tokens)

# Load dataset using Dask
ddf = dd.read_csv("../data/New_York_reviews_cleaned.csv")  # Replace with your dataset file path

# Start timing
start_time = datetime.now()

# Apply the cleaning function using Dask's map
ddf['cleaned_review'] = ddf['review_full'].map(clean_text_spacy_optimized, meta=('review_full', 'str'))

# Execute the computation and convert back to Pandas
df_cleaned = ddf.compute()

# End timing
end_time = datetime.now()

# Print timing information
print(f"Start Time: {start_time}")
print(f"End Time: {end_time}")
print(f"Processing Time: {end_time - start_time}")

# Save the cleaned dataset
df_cleaned.to_csv("../data/New_York_reviews_cleaned_with_spacy.csv", index=False)


Start Time: 2025-01-01 18:23:41.324082
End Time: 2025-01-01 19:20:30.406917
Processing Time: 0:56:49.082835


NameError: name 'df_cleaned' is not defined

In [6]:
import pandas as pd
df_test = pd.read_csv('../data/New_York_reviews_cleaned_with_spacy.csv')


In [8]:
print(df_test[['review_full', 'cleaned_review']].head())


                                         review_full  \
0  My wife and I have been eating dinner frequent...   
1  Came with family for Labor Day weekend brunch ...   
2  Food was mediocre at best. The lamb chops are ...   
3  My co-workers were volunteering at a foodbank ...   
4  Lido is an intimate boutique style restaurant....   

                                      cleaned_review  
0  wife eating dinner frequently lido virtually d...  
1  came family labor day weekend brunch daughter ...  
2  food mediocre best lamb chops image feature we...  
3  co workers volunteering foodbank corner came l...  
4  lido intimate boutique style restaurant servin...  


In [23]:
print(df_test.columns) 

Index(['restaurant_name', 'rating_review', 'sample', 'review_id',
       'title_review', 'review_preview', 'review_full', 'date', 'city',
       'url_restaurant', 'author_id', 'review_length', 'cleaned_review'],
      dtype='object')


In [24]:
print(df_test.head()) 

  restaurant_name  rating_review    sample         review_id  \
0            Lido              5  Positive  review_773559838   
1            Lido              4  Positive  review_769429529   
2            Lido              1  Negative  review_745700258   
3            Lido              5  Positive  review_728859349   
4            Lido              5  Positive  review_728429643   

              title_review                                     review_preview  \
0          A Regular Treat  My wife and I have been eating dinner frequent...   
1  Good neighborhood spot!  Came with family for Labor Day weekend brunch ...   
2            Disappointing  Food was mediocre at best.  The lamb chops are...   
3    What a find in Harlem  My co-workers were volunteering at a foodbank ...   
4                    Lunch  Lido is an intimate boutique style restaurant....   

                                         review_full               date  \
0  My wife and I have been eating dinner frequent... 

In [11]:
print(df_test['cleaned_review'].isnull().sum())  # Count missing values


0


In [10]:
df_test['cleaned_review'] = df_test['cleaned_review'].fillna('')


In [13]:
df_test['cleaned_review'] = df_test['cleaned_review'].astype(str)


In [20]:
df_test.to_csv("../data/New_York_reviews_cleaned_with_spacy.csv", index=False)


In [21]:
print(df_test.shape)

(509612, 13)


In [16]:
print(df_test['cleaned_review'].isnull().sum())  # Check for NaN values
print(df_test['cleaned_review'].dtype)          # Check the column data type


0
object


In [22]:
# Count empty strings
print((df_test['cleaned_review'] == "").sum())  # Count exact empty strings

# Count whitespace-only strings
print((df_test['cleaned_review'].str.isspace()).sum())  # Count strings with only spaces


0
0


In [18]:
#Drop Rows with Empty Strings
df_test['cleaned_review'] = df_test['cleaned_review'].replace("", "[empty_review]")
