In [19]:
import os 
import requests
from dotenv import load_dotenv
from hashlib import md5
import json

import pandas as pd

from urllib.parse import urljoin

In [4]:
# Folder where the fetched Yelp tables are written as CSV files.
DATA_PATH = 'data/0_yelp_data/'

In [27]:
# Base URL of the Yelp v3 REST API; endpoint paths are joined onto it.
API_URL = 'https://api.yelp.com/v3/'
# Endpoint of the Yelp GraphQL API, used by the restaurant fetch function.
API_URL_GRAPHQL = "https://api.yelp.com/v3/graphql"

### Chargement des variables d'environnement

In [23]:
# Load the .env file (presumably into the process environment via python-dotenv),
# then read the three Yelp settings; a variable that is not set is left as None.
load_dotenv()
YELP_CLIENT_ID, YELP_API_KEY, YELP_APP = (
    os.environ.get(variable_name)
    for variable_name in ('YELP_CLIENT_ID', 'YELP_API_KEY', 'YELP_APP')
)

In [44]:
# Fail fast when a required Yelp setting is missing or empty.
# The original third check tested YELP_API_KEY a second time, so a missing
# YELP_APP was never reported; driving the checks from one table prevents
# that kind of copy-paste slip.
for var_name, var_value, description in (
    ('YELP_CLIENT_ID', YELP_CLIENT_ID, 'Yelp Client ID'),
    ('YELP_API_KEY', YELP_API_KEY, 'Yelp API Key'),
    ('YELP_APP', YELP_APP, 'Yelp App'),
):
    if not var_value:
        raise Exception(
            f"""No {var_name} in environment variable.
        You need a environment variable corresponding to a {description}.
        Then set it in a .env file in the same folder as this script.""")

In [45]:
# Path of the business search endpoint, relative to API_URL.
uri = 'businesses/search'
# Absolute URL of the search endpoint, used by the test request.
url = urljoin(API_URL, uri)

### Premier test

In [59]:
# HTTP headers of the test request: the API key is sent as a bearer token.
headers = {
    'Authorization': 'Bearer {}'.format(YELP_API_KEY),
    'Content-Type': 'application/graphql',
}

In [60]:
# Query-string parameters of the test search: the first 50 restaurants in Paris.
params = dict(
    term='restaurants',
    location='Paris',
    offset=0,
    limit=50,
)

In [61]:
# Send the test search request. Without an explicit timeout `requests` may
# wait forever on an unresponsive server, which would block the notebook.
request = requests.get(
    url,
    params=params,
    headers=headers,
    timeout=30,  # seconds
)

In [62]:
# Show the HTTP status code returned by the test request.
print('The status code is {}'.format(request.status_code))

The status code is 200


In [50]:
# Keep the raw body of the test response as text
# (not referenced again in the cells visible here).
reponse = request.text

GET https://api.yelp.com/v3/businesses/{id}/reviews

### Fonction Fetch

In [51]:
def _build_search_query(location, offset, limit):
    """Return the GraphQL query text for one page of the restaurant search.

    ``location`` is serialised with ``json.dumps`` so that quotes or
    backslashes in it are escaped instead of terminating the GraphQL string.
    """
    return f"""{{
    search(
        categories: "restaurants",
        location: {json.dumps(location)},
        offset: {offset},
        limit: {limit}
    ) {{
        business {{
            alias
            review_count
            rating
            price
            location {{
                city
                state
                country
            }}
            categories {{
                alias
                parent_categories {{
                    alias
                }}
            }}
            photos
            reviews {{
                text
                rating
            }}
        }}
    }}
}}"""


def _unique(values):
    """Return ``values`` as a list without duplicates, keeping first-seen order."""
    return list(dict.fromkeys(values))


def fetch_restaurants_data(location, count=200, limit=50, api_key=None):
    """Fetch restaurants, their reviews and their photos from the Yelp GraphQL API.

    The search is paginated: pages of ``limit`` businesses are requested at
    offsets ``0, limit, 2 * limit, ...`` below ``count``; fetching stops early
    as soon as a page comes back empty.

    Parameters
    ----------
    location : str
        Free-text location passed to the Yelp search (e.g. ``'Paris'``).
    count : int, default 200
        Upper bound (exclusive) on the offsets requested.
    limit : int, default 50
        Page size; must be a non-zero integer (it is the ``range`` step).
    api_key : str, optional
        Bearer token to use. Defaults to the module-level ``YELP_API_KEY``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(businesses, reviews, photos)``:

        * ``businesses`` - one row per business: alias, review count, rating,
          price level (length of the price string, 0 when missing), city,
          country, JSON-encoded category aliases, JSON-encoded parent category
          aliases, and state.
        * ``reviews`` - one row per review: business alias, text, rating.
        * ``photos`` - one row per photo: business alias, photo URL, and a
          file name of the form ``<alias>_<md5 of the URL>.jpg``.

    Raises
    ------
    Exception
        If the HTTP status is not 200 or the response contains ``"errors"``.
    """
    # Column order matches what the row-by-row concatenation used to produce
    # ("business_state" and "file_name" were appended after the declared ones).
    business_columns = [
        "business_alias",
        "business_review_count",
        "business_rating",
        "business_price",
        "business_city",
        "business_country",
        "business_categories",
        "business_parent_categories",
        "business_state",
    ]
    review_columns = ["business_alias", "review_text", "review_rating"]
    photo_columns = ["business_alias", "photo_url", "file_name"]

    # The original code referenced an undefined `yelp_key` here.
    token = YELP_API_KEY if api_key is None else api_key
    headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/graphql",
    }

    # Rows are accumulated in plain lists and turned into DataFrames once at
    # the end; concatenating a DataFrame per row is quadratic.
    business_rows, review_rows, photo_rows = [], [], []

    for offset in range(0, count, limit):
        response = requests.post(
            API_URL_GRAPHQL,
            headers=headers,
            data=_build_search_query(location, offset, limit),
            timeout=30,  # seconds; avoids hanging forever on a stalled connection
        )

        if response.status_code != 200:
            raise Exception(
                "Yelp API request failed with status code "
                + str(response.status_code)
                + f" . Response text: { response.text }"
            )

        payload = response.json()

        if "errors" in payload:
            raise Exception(
                f"API request failed with errors: { payload['errors'] }")

        # Any level may be present but null in the JSON, hence the `or` fallbacks.
        page = ((payload.get("data") or {}).get("search") or {}).get("business") or []
        if not page:
            break  # no more results: later offsets would be empty as well

        for business in page:
            alias = business.get("alias")
            place = business.get("location") or {}
            categories = business.get("categories") or []
            price = business.get("price")

            business_rows.append({
                "business_alias": alias,
                "business_review_count": business.get("review_count"),
                "business_rating": business.get("rating"),
                "business_price": len(price) if price is not None else 0,
                "business_city": place.get("city"),
                "business_country": place.get("country"),
                "business_categories": json.dumps(
                    _unique(category.get("alias") for category in categories)
                ),
                "business_parent_categories": json.dumps(
                    _unique(
                        parent.get("alias")
                        for category in categories
                        for parent in category.get("parent_categories") or []
                    )
                ),
                "business_state": place.get("state"),
            })

            for photo in business.get("photos") or []:
                # md5 only derives a stable file name from the URL; it is not
                # used for anything security related.
                digest = md5(photo.encode("utf-8")).hexdigest()  # nosec: B303
                photo_rows.append({
                    "business_alias": alias,
                    "photo_url": photo,
                    "file_name": alias + "_" + digest + ".jpg",
                })

            for review in business.get("reviews") or []:
                review_rows.append({
                    "business_alias": alias,
                    "review_text": review.get("text"),
                    "review_rating": review.get("rating"),
                })

    return (
        pd.DataFrame(business_rows, columns=business_columns),
        pd.DataFrame(review_rows, columns=review_columns),
        pd.DataFrame(photo_rows, columns=photo_columns),
    )
    
    

### Fetch

In [53]:
# Fetch the businesses, reviews and photos tables for restaurants in Paris.
businesses, reviews, photos = fetch_restaurants_data('Paris')

### Sauvegarde en csv

In [40]:
# Write the businesses table to <DATA_PATH>/business.csv.
# `exist_ok=True` replaces the racy exists-then-create check, and
# `os.path.join` builds a filesystem path (urljoin follows URL rules and would
# drop the last folder if DATA_PATH lost its trailing slash). A dedicated name
# also keeps the API `url` variable from being overwritten.
os.makedirs(DATA_PATH, exist_ok=True)
businesses_csv_path = os.path.join(DATA_PATH, 'business.csv')
businesses.to_csv(businesses_csv_path)

In [41]:
# Write the reviews table to <DATA_PATH>/reviews.csv.
# `exist_ok=True` replaces the racy exists-then-create check, and
# `os.path.join` builds a filesystem path (urljoin follows URL rules and would
# drop the last folder if DATA_PATH lost its trailing slash). A dedicated name
# also keeps the API `url` variable from being overwritten.
os.makedirs(DATA_PATH, exist_ok=True)
reviews_csv_path = os.path.join(DATA_PATH, 'reviews.csv')
reviews.to_csv(reviews_csv_path)

In [42]:
# Write the photos table to <DATA_PATH>/photos.csv.
# `exist_ok=True` replaces the racy exists-then-create check, and
# `os.path.join` builds a filesystem path (urljoin follows URL rules and would
# drop the last folder if DATA_PATH lost its trailing slash). A dedicated name
# also keeps the API `url` variable from being overwritten.
os.makedirs(DATA_PATH, exist_ok=True)
photos_csv_path = os.path.join(DATA_PATH, 'photos.csv')
photos.to_csv(photos_csv_path)