## **1. Setting up the API and dataset**

In [2]:
import os
os.environ['KAGGLE_USERNAME'] = "XXXXX"
os.environ['KAGGLE_KEY'] = "XXXXX"
!kaggle datasets download -d yelp-dataset/yelp-dataset
!mkdir reviews #creating a new directory for the dataset
!unzip yelp-dataset.zip -d reviews #unzipping the dataset

Downloading yelp-dataset.zip to /content
100% 4.07G/4.07G [00:55<00:00, 10.6MB/s]
100% 4.07G/4.07G [00:55<00:00, 78.4MB/s]
mkdir: cannot create directory ‘reviews’: File exists
Archive:  yelp-dataset.zip
  inflating: reviews/Dataset_User_Agreement.pdf  
  inflating: reviews/yelp_academic_dataset_business.json  
  inflating: reviews/yelp_academic_dataset_checkin.json  
  inflating: reviews/yelp_academic_dataset_review.json  
  inflating: reviews/yelp_academic_dataset_tip.json  
  inflating: reviews/yelp_academic_dataset_user.json  


## **2. Pre-Processing Methods**

#### *2.1 Data Cleaning*
*   Lowercase All Text
*   Removing Punctuation
*   Word Lemmatisation
*   Removing Stop-words



In [1]:
#!pip install datasketch
#!pip install nltk

import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import pandas as pd
from tqdm import tqdm
from datasketch import MinHash, MinHashLSH

# Fetch the NLTK resources used by the cleaning step (stop-word list, WordNet).
for resource_name in ('stopwords', 'wordnet'):
    nltk.download(resource_name)

# Shared module-level helpers used by clean_data().
lemma = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))  # set -> constant-time membership tests

# Translation table that deletes every character in string.punctuation.
# Built once at definition time instead of on every call: the table never
# changes, and remove_punctuation() is invoked once per review.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def remove_punctuation(text):
    """Return ``text`` with every ``string.punctuation`` character removed.

    Only the ASCII punctuation listed in ``string.punctuation`` is deleted;
    all other characters (letters, digits, whitespace, non-ASCII symbols)
    are left untouched.

    Args:
        text: The string to strip.

    Returns:
        A new string without ASCII punctuation characters.
    """
    return text.translate(_PUNCTUATION_TABLE)

def clean_data(text):
    """Normalise a piece of review text for shingling.

    Steps, in order: lower-case the text, strip punctuation via
    remove_punctuation(), split on whitespace, discard tokens found in
    ``stop_words`` and lemmatise the remaining tokens with ``lemma``.

    Args:
        text: Raw review text.

    Returns:
        The surviving, lemmatised tokens joined by single spaces.
    """
    # NOTE(review): punctuation is stripped before the stop-word check, so a
    # contraction reaches the check without its apostrophe ("don't" -> "dont").
    # Whether it is still filtered depends on the contents of stop_words.
    normalised = remove_punctuation(text.lower())

    kept_tokens = []
    for token in normalised.split():
        if token in stop_words:
            continue
        kept_tokens.append(lemma.lemmatize(token))

    return ' '.join(kept_tokens)



Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


#### *2.2 K-Shingling*

In [2]:
def k_shingle(text, k=2):
    """Yield every run of ``k`` consecutive words in ``text`` as one string.

    Words are obtained with ``text.split()`` (any whitespace separates
    words) and each shingle is its ``k`` words joined by a single space.

    Args:
        text: The (already cleaned) text to shingle.
        k: Number of words per shingle. Defaults to 2, i.e. word bigrams.

    Yields:
        Shingles in order of appearance. Nothing is yielded when the text
        contains fewer than ``k`` words.

    Raises:
        ValueError: If ``k`` is smaller than 1 (raised on first iteration,
            because this is a generator).
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    words = text.split()
    # range() is empty when there are fewer than k words, so short texts
    # simply produce no shingles.
    for start in range(len(words) - k + 1):
        yield ' '.join(words[start:start + k])

## **3. Processing Reviews & Building MinHashLSH Index**



In [3]:
# --- Tunable parameters -----------------------------------------------------
chunk_size = 10000  # reviews parsed per chunk
num_perm = 128      # number of permutations per MinHash signature
threshold = 0.75    # similarity threshold passed to the LSH index
num_chunks = 60     # chunks to process (at most chunk_size * num_chunks reviews)

lsh = MinHashLSH(num_perm=num_perm, threshold=threshold) #initialise lsh

processed_reviews = []     # one dict per review, including its MinHash signature
num_reviews_processed = 0  # running count of reviews indexed so far

# The reader is opened as a context manager so the underlying file handle is
# closed even though only the first `num_chunks` chunks are consumed.
with tqdm(total=num_chunks, desc='Processing Chunks', unit='chunk') as pbar, \
        pd.read_json("reviews/yelp_academic_dataset_review.json", lines=True, chunksize=chunk_size) as reader:
    # zip() stops as soon as range(num_chunks) is exhausted, so no extra chunk
    # is parsed just to discover that the limit has been reached.
    for chunk_no, chunk in zip(range(num_chunks), reader):
        chunk['pre_cleaned_text'] = chunk['text']  #add pre cleaned text column
        chunk['text'] = chunk['text'].apply(clean_data)

        processed_chunk = []
        for ind, row in chunk.iterrows():
            shingles = list(k_shingle(row['text'])) #k-shingling
            minhash = MinHash(num_perm=num_perm)
            for shingle in shingles:
                minhash.update(shingle.encode('utf-8'))
            processed_chunk.append({
                'review_id': row['review_id'],
                'user_id': row['user_id'],
                'business_id': row['business_id'],
                'stars': row['stars'],
                'pre_cleaned_text': row['pre_cleaned_text'],
                'shingles': shingles,
                'minhash': minhash
            })

        if chunk_no < 1:
            print("First 5 Processed Reviews:") #printing first 5 processed reviews
            # Slicing (rather than indexing 0..4) keeps the preview from raising
            # IndexError when the first chunk holds fewer than five reviews.
            for position, review in enumerate(processed_chunk[:5], start=1):
                print(f"Review {position}:")
                print(f"Review ID: {review['review_id']}")
                print(f"User ID: {review['user_id']}")
                print(f"Business ID: {review['business_id']}")
                print(f"Stars: {review['stars']}")
                print(f"Pre-cleaned Text: {review['pre_cleaned_text']}")
                print(f"Shingles: {review['shingles']}")
                print()

        for item in processed_chunk:
            lsh.insert(item['review_id'], item['minhash']) #index minhash objects in lsh

        processed_reviews.extend(processed_chunk)
        num_reviews_processed += len(processed_chunk)
        pbar.update(1)

Processing Chunks:   0%|          | 0/60 [00:00<?, ?chunk/s]

First 5 Processed Reviews:
Review 1:
Review ID: KU_O5udG6zpxOg-VcAEodg
User ID: mh_-eMZ6K5RLWhZyISBhwA
Business ID: XQfwVwDr-v0ZS3_CbbE5Xw
Stars: 3
Pre-cleaned Text: If you decide to eat here, just be aware it is going to take about 2 hours from beginning to end. We have tried it multiple times, because I want to like it! I have been to it's other locations in NJ and never had a bad experience. 

The food is good, but it takes a very long time to come out. The waitstaff is very young, but usually pleasant. We have just had too many experiences where we spent way too long waiting. We usually opt for another diner or restaurant on the weekends, in order to be done quicker.
Shingles: ['decide eat', 'eat aware', 'aware going', 'going take', 'take 2', '2 hour', 'hour beginning', 'beginning end', 'end tried', 'tried multiple', 'multiple time', 'time want', 'want like', 'like location', 'location nj', 'nj never', 'never bad', 'bad experience', 'experience food', 'food good', 'good take', 'tak

Processing Chunks: 100%|██████████| 60/60 [30:12<00:00, 30.22s/chunk]



## **4. Finding Similar Pairs**

In [4]:
# Query the LSH index with the MinHash signature that was already computed and
# stored for every review in `processed_reviews`. This avoids re-reading the
# JSON file and re-cleaning / re-shingling / re-hashing every review a second
# time; the signatures, and the order they are queried in, are the same.
similar_pairs = []
for review in processed_reviews:
    review_id = review['review_id']

    # lsh.query returns the keys (review ids) of indexed signatures that the
    # index reports as candidates for the queried signature.
    for candidate_id in lsh.query(review['minhash']):
        if candidate_id != review_id:  #remove self-matches
            # NOTE(review): every review is both indexed and queried, so a
            # match is expected to be recorded from both sides, i.e. as
            # (a, b) and (b, a) - confirm whether downstream wants both.
            similar_pairs.append((review_id, candidate_id)) #appending similar pairs

## **5. Similar Pairs Dataframe**

In [5]:
data_clean = pd.DataFrame(processed_reviews)

# Only these review attributes end up in the final table, so they are the only
# ones joined (user_id / business_id were previously merged and then dropped).
review_fields = data_clean[['review_id', 'stars', 'pre_cleaned_text']]


def _attach_review_fields(pairs, reviews, side):
    """Left-join the star rating and raw text of one side of each pair.

    Args:
        pairs: DataFrame holding a ``review_id_<side>`` column.
        reviews: DataFrame with ``review_id``, ``stars`` and
            ``pre_cleaned_text`` columns.
        side: 1 or 2; selects the id column to join on and the suffix given
            to the added columns.

    Returns:
        A new DataFrame equal to ``pairs`` plus ``stars_<side>`` and
        ``pre_cleaned_text_<side>``; row order follows ``pairs``.
    """
    id_column = f'review_id_{side}'
    side_fields = reviews.rename(columns={
        'review_id': id_column,
        'stars': f'stars_{side}',
        'pre_cleaned_text': f'pre_cleaned_text_{side}',
    })
    return pairs.merge(side_fields, on=id_column, how='left')


similar_pairs_df = (
    pd.DataFrame(similar_pairs, columns=['review_id_1', 'review_id_2']) #create df with similar pairs
    .pipe(_attach_review_fields, review_fields, 1)
    .pipe(_attach_review_fields, review_fields, 2)
    [['review_id_1', 'stars_1', 'pre_cleaned_text_1',
      'review_id_2', 'stars_2', 'pre_cleaned_text_2']] #rearrange columns
)

## **6. File Export**

In [6]:
csv_filename = 'similar_pairs.csv'

# Let pandas write the file directly: this avoids materialising the whole CSV
# as one in-memory string and lets pandas control the encoding and line
# endings instead of relying on a text-mode open() with platform defaults.
similar_pairs_df.to_csv(csv_filename, index=False, encoding='utf-8')

# Colab-only helper: triggers a browser download of the exported file.
from google.colab import files
files.download(csv_filename) #download csv file

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>