<a href="https://colab.research.google.com/github/simhermansson/NLP_Yelp_Reviews/blob/main/NLP_Yelp_Reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Read the two datasets to dataframes

We have one dataset for businesses, and one for reviews.
The businesses dataset is 113 MB, while the review dataset is almost 5 GB!

We must therefore be careful when loading the review dataset into a dataframe, so that we do not run out of RAM.

In [13]:
import pandas as pd

from google.colab import drive


# Specify whether training final model or only using some of the data
FINAL_MODEL = False

# Mount Google Drive
#drive.mount("/content/drive")

# Read business data to dataframe
business_df = pd.read_json("/content/drive/MyDrive/Colab Data/yelp_academic_dataset_business.json",
                           lines=True)
# Keep only strictly necessary columns
business_df = business_df[["business_id", "stars", "review_count"]]

# Stream the review file in chunks of one million rows through a JsonReader,
# due to memory constraints, and keep only the necessary columns of each chunk.
with pd.read_json("/content/drive/MyDrive/Colab Data/yelp_academic_dataset_review.json",
                  lines=True,
                  chunksize=1000000) as reader:
    review_chunks = [chunk[["business_id", "stars", "text"]] for chunk in reader]

# Combine the trimmed chunks in a single concat. Growing the dataframe with
# pd.concat inside the loop would re-copy every previously read row per chunk.
review_df = pd.concat(review_chunks, ignore_index=True)
del review_chunks

# Use a smaller review dataset if not training the final model
if not FINAL_MODEL:
    review_df = review_df.sample(frac=0.01, random_state=42).reset_index(drop=True)

# Create train and validation sets for the review data (80% / 20%).
# The validation set is built by dropping the sampled training index labels,
# so review_df must still carry its unique 0..n-1 index at this point.
train_review_df = review_df.sample(frac=0.8, random_state=42)
valid_review_df = review_df.drop(train_review_df.index).reset_index(drop=True)
train_review_df = train_review_df.reset_index(drop=True)
del review_df

#### Save the development train and validation dataframes to Google Drive to avoid having to read the entire dataset every time.

In [14]:
# Persist the development split as CSV files (without the index column).
# Nothing is written when the final model is being trained.
if not FINAL_MODEL:
    dev_split_targets = [
        ("/content/drive/MyDrive/Colab Data/dev_train_review_df.csv", train_review_df),
        ("/content/drive/MyDrive/Colab Data/dev_valid_review_df.csv", valid_review_df),
    ]
    for csv_path, split_df in dev_split_targets:
        split_df.to_csv(csv_path, index=False)

#### Read the previously saved training and validation dataframes from Google Drive

In [20]:
# Reload the cached development split only for development runs. When
# FINAL_MODEL is True, the split built from the full review dataset must be
# kept; reading the dev CSVs here would silently replace it with the 1% sample.
if not FINAL_MODEL:
    train_review_df = pd.read_csv("/content/drive/MyDrive/Colab Data/dev_train_review_df.csv")
    valid_review_df = pd.read_csv("/content/drive/MyDrive/Colab Data/dev_valid_review_df.csv")

print(train_review_df)

                  business_id  stars  \
0      3db1R99-mdSvx3QwryJSNQ      5   
1      ZpgVL2z1kgRi954c9m9INw      5   
2      oklGt-CSgK6EyAIUT94Ukg      1   
3      VRGYwKE_Z77frm5NwLvJhw      5   
4      cfWjbL7WaBMJZEBad0I1uw      1   
...                       ...    ...   
55917  1H8ReY5GlGcHJz7umVidkg      3   
55918  OWOOc0YjU_kioLeEgo5VCA      3   
55919  PoI93be9xfASm33sTSBpHQ      5   
55920  dybSe2rNDk1hNMpes9y7Rw      5   
55921  DFymbOEhIpWVE_d6BnHFmw      3   

                                                    text  
0      This is an awesome breakfast place. Food is al...  
1      Wide variety of coffee and tea drinks. Simple ...  
2      Horrible customer service. The managers are no...  
3      My favorite burger joint in Nashville. The pri...  
4      Cooper's Garage had my Mini for about two week...  
...                                                  ...  
55917  Jonathan Best Gourmet Grocer is an old-school,...  
55918  I was visiting Nashville with my boyfrie