<a href="https://colab.research.google.com/github/ramiz11/Yelp-EDA/blob/main/yelp_data_selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# *yelp* dataset - data selection for EDA
This part was run locally to work around GitHub's upload size limit of 25MB per file.
Here we load the Yelp datasets, build a subset containing all restaurants in one selected city, and save it for upload to GitHub.

In [None]:
import pandas as pd
import json
from pathlib import Path
from os.path import join
import pickle                 

### Read & save the pre-downloaded Yelp JSON files

In [None]:
def load_rows(filepath, skip=0, nrows=None):
    """Load a slice of a JSON-lines file into a DataFrame.

    Parameters
    ----------
    filepath : str or path-like
        UTF-8 text file holding one JSON object per line.
    skip : int, default 0
        Number of leading records to pass over without parsing them.
    nrows : int or None, default None
        Maximum number of records to load; ``None`` loads through end of file.

    Returns
    -------
    pandas.DataFrame
        One row per loaded record, in file order (empty if nothing was loaded).

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line inside the requested slice is not valid JSON.

    Notes
    -----
    Blank / whitespace-only lines (e.g. a trailing newline at end of file) are
    ignored: they are neither parsed nor counted towards ``skip`` or ``nrows``,
    so consecutive ``(skip, nrows)`` slices of one file never overlap.
    """
    objs = []
    records_seen = 0
    with open(filepath, encoding='utf8') as json_file:
        for line in json_file:
            # Stop before touching the next line once the quota is filled
            # (also covers nrows == 0).
            if nrows is not None and len(objs) >= nrows:
                break
            if not line.strip():
                # json.loads would raise on an empty line; it carries no record.
                continue
            records_seen += 1
            if records_seen <= skip:
                continue
            objs.append(json.loads(line))
            # Lightweight progress report for multi-million-line files.
            if len(objs) % 10000 == 0:
                print(len(objs), 'loaded')
    return pd.DataFrame(objs)

# Local folders: `dirname` holds the raw Yelp JSON dumps and their full-size pickles,
# `EDA_dir` is where the reduced subset is written at the end of the notebook.
dirname = 'c:/data/yelp/'
EDA_dir = 'c:/data/yelp/eda/'

# Each JSON-lines dump is parsed once and cached as a pickle next to it.
business = pd.read_json(Path(dirname, 'yelp_academic_dataset_business.json'), lines=True)
business.to_pickle(Path(dirname, 'business.pkl'))

checkin = pd.read_json(Path(dirname, 'yelp_academic_dataset_checkin.json'), lines=True)
checkin.to_pickle(Path(dirname, 'checkin.pkl'))

# The review dump is read in 2 parts due to its size: the first
# `review_chunk_rows` records, then everything after them. One shared name
# keeps the two calls from drifting apart.
review_chunk_rows = 4000000
rev1 = load_rows(Path(dirname, 'yelp_academic_dataset_review.json'), 0, review_chunk_rows)
rev2 = load_rows(Path(dirname, 'yelp_academic_dataset_review.json'), review_chunk_rows)
# ignore_index=True: both parts come back numbered from 0, so a plain concat
# would leave duplicate index labels in `review`.
review = pd.concat([rev1, rev2], ignore_index=True)
review.to_pickle(Path(dirname, 'review.pkl'))

tip = pd.read_json(Path(dirname, 'yelp_academic_dataset_tip.json'), lines=True)
tip.to_pickle(Path(dirname, 'tip.pkl'))

user = pd.read_json(Path(dirname, 'yelp_academic_dataset_user.json'), lines=True)
user.to_pickle(Path(dirname, 'user.pkl'))

### Subset the data for our EDA

##### Focus on restaurants: 
How many restaurants are there out of all businesses?


In [None]:
# Flag each business whose category text mentions "Restaurant";
# rows with no categories at all count as non-restaurants.
is_rest = business.categories.str.contains('Restaurant', na=False)
# Share of each flag value, smallest share first, rounded to two decimals.
is_rest.value_counts(ascending=True, normalize=True).round(2)

##### So it's about a third. Let's subset and check the expected file size

In [None]:
# Keep restaurant rows only (missing categories are treated as "not a restaurant"),
# then report how many rows remain and how much memory they occupy.
business = business.loc[business['categories'].str.contains('Restaurant', na=False)]
df_bytes = business.memory_usage(deep=True).sum()
print('business rows:', f'{len(business):,}', 'bytes in memory:', f'{df_bytes:,}')

##### Subset further - look for a medium sized city

In [None]:
# Number of restaurants per city, most frequent city first.
business['city'].value_counts()

##### Select Cleveland, subset all dataframes according to its businesses and users

In [None]:
# Narrow the restaurants to Cleveland, then keep only the check-ins, tips and
# reviews that refer to one of the remaining businesses.
business = business.loc[business['city'] == 'Cleveland']
checkin = checkin.loc[checkin['business_id'].isin(business['business_id'])]
tip = tip.loc[tip['business_id'].isin(business['business_id'])]
review = review.loc[review['business_id'].isin(business['business_id'])]

#### Reviews is potentially another big file, so check its size...

In [None]:
# Size check for the review subset. The bytes must be measured on `review`
# itself so that the figure matches the row count printed next to it.
df_bytes = review.memory_usage(deep=True).sum()
print('review rows:', f'{len(review):,}', 'bytes in memory:', f'{df_bytes:,}')

##### Reduce the reviews' size by keeping only those from the two most recent years: 2018 and 2019



In [None]:
# Keep only reviews dated 2018 or 2019. `.dt.year` gives the calendar year
# directly, so no Period -> str -> int round trip is needed; a row with a
# missing date has no year and is dropped by `isin` instead of breaking an int cast.
review = review[pd.to_datetime(review.date).dt.year.isin([2018, 2019])]

##### Continue with users data, and check its size

In [None]:
# Keep only the users who wrote at least one of the retained reviews,
# then report the resulting row count and memory footprint.
user = user.loc[user['user_id'].isin(review['user_id'])]
df_bytes = user.memory_usage(deep=True).sum()
print('user rows:', f'{len(user):,}', 'bytes in memory:', f'{df_bytes:,}')

##### Looks good! Save as pickle files to be uploaded to GitHub

In [None]:
# Write the reduced frames to the EDA folder, one pickle per frame.
# The folder is created first: to_pickle does not create missing directories.
Path(EDA_dir).mkdir(parents=True, exist_ok=True)
business.to_pickle(Path(EDA_dir, 'business.pkl'))
checkin.to_pickle(Path(EDA_dir, 'checkin.pkl'))
review.to_pickle(Path(EDA_dir, 'review.pkl'))
tip.to_pickle(Path(EDA_dir, 'tip.pkl'))
user.to_pickle(Path(EDA_dir, 'user.pkl'))