**CUISINE-AND-LOCATION-CHOICES**

In [None]:
"""
### *Cuisine:*

American, Italian, Chinese, Japanese, Mexican, French, Indian, Thai, Greek, Middle Eastern, Korean, Jewish, Ethiopian, Vegan, Seafood

### Location:

1. Times Square
2. Chinatown
3. Little Italy
4. Greenwich Village
5. Chelsea Market
6. Lower East Side
7. Midtown
8. Harlem
9. Williamsburg, Brooklyn
10. Astoria, Queens
11. Flushing, Queens
12. DUMBO, Brooklyn
13. Murray Hill
14. Food Halls
15. Food Trucks
16. The Bronx
17. Queens Night Market

"""

**EXTRACT DATA USING YELP-API**

In [None]:
from __future__ import print_function

import csv
import os
from urllib.parse import quote

import requests


# Constants
# The Yelp API key is read from the YELP_API_KEY environment variable so the
# secret does not live in the notebook. Export it before running this cell;
# when it is unset the key is an empty string.
# NOTE(review): any key that was ever committed in this file should be
# treated as leaked and rotated.
API_KEY = os.environ.get('YELP_API_KEY', '')
API_HOST = 'https://api.yelp.com'
SEARCH_PATH = '/v3/businesses/search'
SEARCH_LIMIT = 50  # value sent as the 'limit' query parameter on each search

def request(host, path, api_key, url_params=None, timeout=30):
    """Send a GET request to the Yelp API and return the decoded JSON body.

    Args:
        host: Scheme and host of the API, e.g. ``https://api.yelp.com``.
        path: Endpoint path; it is percent-quoted and appended to ``host``.
        api_key: Token sent as ``Authorization: Bearer <api_key>``.
        url_params: Optional dict of query-string parameters. ``requests``
            URL-encodes these, so pass raw (unencoded) values.
        timeout: Seconds to wait for the server before ``requests`` raises
            a timeout error. Without it a stalled connection would block
            the whole crawl indefinitely.

    Returns:
        Whatever ``response.json()`` decodes. HTTP error statuses are not
        raised here; callers inspect the returned payload.
    """
    url_params = url_params or {}
    url = f'{host}{quote(path.encode("utf8"))}'
    headers = {
        'Authorization': f'Bearer {api_key}',
    }
    print(f'Querying {url} ...')
    response = requests.request(
        'GET', url, headers=headers, params=url_params, timeout=timeout
    )
    return response.json()

def search_businesses(api_key, term, location, offset):
    """Fetch one page of search results for a term and location.

    Args:
        api_key: Yelp API bearer token.
        term: Free-text search term, e.g. ``"Thai restaurants"``.
        location: Free-text location, e.g. ``"Staten Island"``.
        offset: Value sent as the ``offset`` query parameter.

    Returns:
        The decoded JSON payload returned by ``request``.
    """
    # Pass the raw strings: request() hands these to requests via `params=`,
    # which URL-encodes them. Pre-replacing spaces with '+' would send a
    # literal plus sign (encoded as %2B) instead of a space.
    url_params = {
        'term': term,
        'location': location,
        'offset': offset,
        'limit': SEARCH_LIMIT,
    }
    return request(API_HOST, SEARCH_PATH, api_key, url_params=url_params)

def get_total(api_key, term, location):
    """Return the ``total`` field of a search for the term and location.

    Args:
        api_key: Yelp API bearer token.
        term: Free-text search term, e.g. ``"Thai restaurants"``.
        location: Free-text location, e.g. ``"Staten Island"``.

    Returns:
        The value stored under ``'total'`` in the response payload, or
        ``None`` when the payload has no such key (for example an error
        payload).
    """
    # Raw strings on purpose: the `params=` mechanism in requests performs
    # the URL encoding, so a manual ' ' -> '+' replacement would be encoded
    # a second time and reach the server as a literal plus sign.
    url_params = {
        'term': term,
        'location': location,
        'limit': SEARCH_LIMIT,
    }
    response = request(API_HOST, SEARCH_PATH, api_key, url_params=url_params)
    return response.get('total')


def process_businesses(businesses, filename, cuisine, location):
    """Flatten pages of businesses and append them to a CSV file.

    Args:
        businesses: List of pages; each page is a list of business dicts
            carrying the keys ``id``, ``name``, ``location``,
            ``review_count``, ``rating`` and (optionally) ``coordinates``.
        filename: Path of the CSV file to append to.
        cuisine: Label written in the last column of every row.
        location: Search location; only used in the progress message.

    The header row is written only when the file does not exist yet or is
    empty, so calling this once per cuisine/location batch produces a single
    header instead of one per call. All rows are written through one file
    handle.
    """
    list_headers = ["bID", "name", "address", "cord", "numOfReview", "rating", "zipcode", "cuisine"]

    rows = []
    for page in businesses:
        for b in page:
            coords = b.get('coordinates') or {}
            lat = coords.get('latitude')
            lng = coords.get('longitude')
            # Compare against None so a legitimate 0.0 coordinate is kept.
            if lat is not None and lng is not None:
                cord = f"{lat}, {lng}"
            else:
                cord = None

            loc = b['location']
            rows.append([
                b['id'],
                b['name'],
                ', '.join(loc['display_address']),
                cord,
                int(b['review_count']),
                float(b['rating']),
                loc.get('zip_code') or None,  # missing/empty zip -> blank cell
                cuisine,
            ])

    needs_header = not os.path.exists(filename) or os.path.getsize(filename) == 0

    # Explicit UTF-8 so non-ASCII business names are written identically on
    # every platform.
    with open(filename, "a", newline='', encoding='utf-8') as fp:
        wr = csv.writer(fp, dialect='excel')
        if needs_header:
            wr.writerow(list_headers)
        wr.writerows(rows)

    if not businesses:
        return

    print(f"Added {cuisine} restaurants at location: {location}")

def query_api():
    """Crawl Yelp for every cuisine/location pair and append to a CSV.

    For each pair the reported total is fetched first, then result pages of
    ``SEARCH_LIMIT`` businesses are requested and handed to
    ``process_businesses``, which appends them to ``Restaurants.csv``.
    """
    cuisines = ['American', 'Italian', 'Chinese', 'Japanese', 'Mexican', 'French', 'Indian', 'Thai', 'Greek', 'Middle Eastern', 'Korean', 'Vegan', 'Seafood']
    locations = ['Manhattan', 'Bronx', 'Queens', 'Brooklyn', 'Staten Island']
    filename = "Restaurants.csv"
    # Upper bound on pages fetched per pair (20 pages x 50 = 1000 results).
    # NOTE(review): presumably chosen to stay within Yelp's offset ceiling;
    # confirm against the current API documentation.
    max_pages = 20

    for loc in locations:
        for cuisine in cuisines:
            new_term = f'{cuisine} restaurants'
            total = get_total(API_KEY, new_term, loc)
            print(total, cuisine)

            # get_total returns None when the payload has no 'total' (e.g.
            # an error response); dividing None would raise TypeError.
            # A total of 0 means there is nothing to fetch either.
            if not total:
                continue

            # Ceiling division: exactly as many pages as needed, so no
            # trailing request is spent on a page known to be empty.
            pages = min(max_pages, -(-total // SEARCH_LIMIT))
            businesses = []

            for page in range(pages):
                response = search_businesses(API_KEY, new_term, loc, page * SEARCH_LIMIT)
                page_items = response.get('businesses')
                if page_items is None:
                    # Payload without 'businesses': stop paging this pair.
                    break
                businesses.append(page_items)

            process_businesses(businesses, filename, cuisine, loc)

# Start the crawl only when this code runs as the main program, not when it
# is imported as a module.
if __name__ == "__main__":
    query_api()


**DEDUP-Restaurants**
- dedup using pandas
- print-stats Restaurants vs Cuisines

In [None]:
import pandas as pd

# Read the raw crawl output into a DataFrame
df = pd.read_csv('/content/Restaurants.csv')

# A CSV built by repeated appends can contain extra copies of the header
# line in the middle of the data; drop those rows so they are not treated
# as restaurants.
df = df[df['bID'] != 'bID']

# Keep the first occurrence of each business id
df_deduplicated = df.drop_duplicates(subset=['bID'])
# Cuisine-ordered view for inspection only; the file saved below keeps the
# original row order.
df_sorted = df_deduplicated.sort_values(by='cuisine')


# Save the deduplicated DataFrame to a new CSV file
df_deduplicated.to_csv('restaurant_dedup.csv', index=False)

# Report the file name that was actually written.
print('Deduplication completed. Saved to restaurant_dedup.csv.')

Unnamed: 0,bID,name,address,cord,numOfReview,rating,zipcode,cuisine
0,Q-zg3X3OQ0gNABmwlgiczw,The Naked Pig,"922 3rd Ave, New York, NY 10022","40.759429, -73.968546",22,4.0,10022,American
1,lu0XMmcPyFGvIgqRcbU_4g,Le Jardinier,"610 Lexington Ave, New York, NY 10022","40.7582033, -73.9713074",67,4.0,10022,American
2,11Nbr2oTRB5Uqi36L8if8g,The Consulate - Midtown,"44 W 56th St, New York, NY 10019","40.76309, -73.97661",55,4.5,10019,American
3,gZz9A8k8ORC_xl0aHxtY4w,Monkey Bar,"60 E 54th St, New York, NY 10022","40.75998, -73.97313",110,4.0,10022,American
4,BNaveJmi-OUGKG9sRfrIhA,Mr. Sun,"48 W 48th St, New York, NY 10036","40.75809, -73.98026",251,4.5,10036,American


In [None]:
# Number of deduplicated restaurants per cuisine, as a two-column table
# ('cuisine', 'count').
rows_per_cuisine = df_deduplicated.groupby('cuisine').size()
cuisine_counts = rows_per_cuisine.reset_index(name='count')
print(cuisine_counts)

Unnamed: 0,cuisine,count
0,American,1959
1,Chinese,1975
2,French,932
3,Greek,721
4,Indian,895
5,Italian,1448
6,Japanese,1196
7,Korean,658
8,Mexican,1648
9,Middle Eastern,357


**OPENSEARCH DATA-UPLOAD SCRIPT**

In [None]:
import json

# Build a newline-delimited bulk file: one action line followed by one
# document line per restaurant.
file_path = '/content/rest_index.json'

# 'w' rather than 'a': re-running the cell regenerates the file instead of
# appending a second copy of every action.
with open(file_path, 'w') as file:
    # Use the deduplicated frame so each business id is emitted once, with
    # the same cuisine that is saved to restaurant_dedup.csv.
    for bID, cuisine in zip(df_deduplicated['bID'], df_deduplicated['cuisine']):
        action = {"index": {"_index": "yelpdata", "_id": str(bID)}}
        document = {"Restaurant": str(bID), "cuisine": str(cuisine)}
        # json.dumps handles quoting/escaping that manual %-formatting
        # would get wrong for values containing quotes or backslashes.
        file.write(json.dumps(action) + '\n')
        file.write(json.dumps(document) + '\n')

**DYNAMO-DB INSERTION CODE VIA LAMBDA FUNCTION**

In [None]:
import json
import boto3
import datetime
import csv
from decimal import Decimal

from botocore.vendored import requests

# Constants
CSV_FILE_PATH = 'restaurant_dedup.csv'
DYNAMODB_REGION = 'us-east-1'
DYNAMODB_TABLE_NAME = 'YelpRestaurants'

# Function to read CSV data
def read_restaurant_data(csv_path=None):
    """Read restaurant rows from a CSV file, skipping the header line.

    Args:
        csv_path: Path of the CSV file to read. Defaults to the module-level
            ``CSV_FILE_PATH`` when omitted.

    Returns:
        A list of rows, each a list of strings. Blank lines are dropped, and
        an empty file yields an empty list.
    """
    if csv_path is None:
        csv_path = CSV_FILE_PATH

    with open(csv_path, newline='', encoding='utf-8') as f:
        reader = csv.reader(f)
        # Skip header; the default avoids StopIteration on an empty file.
        next(reader, None)
        # csv.reader yields [] for blank lines; those would break callers
        # that index into each row.
        restaurants = [row for row in reader if row]
    return restaurants

# Function to insert data into DynamoDB
def insert_data(restaurants):
    """Write restaurant rows to the DynamoDB table using batched puts.

    Args:
        restaurants: Iterable of rows indexed positionally as
            0 = business id, 1 = name, 2 = address, 3 = coordinates,
            4 = review count, 5 = rating, 6 = zip code, 7 = cuisine.

    Raises:
        ValueError: If a review count cannot be parsed as an integer.
        decimal.InvalidOperation: If a rating cannot be parsed as a Decimal.
    """
    dynamodb = boto3.resource('dynamodb', region_name=DYNAMODB_REGION)
    table = dynamodb.Table(DYNAMODB_TABLE_NAME)

    # batch_writer buffers items and sends them in batch requests instead of
    # one network round trip per restaurant.
    # NOTE(review): assumes rows do not repeat the table's primary key within
    # a batch (the input is expected to be deduplicated) - confirm against
    # the table's key schema.
    with table.batch_writer() as batch:
        for restaurant in restaurants:
            batch.put_item(Item={
                'insertedAtTimestamp': str(datetime.datetime.now()),
                'bID': restaurant[0],
                'name': restaurant[1],
                'address': restaurant[2],
                'coord': restaurant[3],
                # NOTE(review): attribute name looks like a typo for
                # 'numOfReview'; kept as-is because readers of this table
                # may depend on it.
                'numOfReviewreview': int(restaurant[4]),
                # Decimal is built from the CSV string, not from a float.
                'rating': Decimal(restaurant[5]),
                'zipcode': restaurant[6],
                'cuisine': restaurant[7],
            })

def lambda_handler(event, context):
    """Lambda entry point: load the restaurant CSV into DynamoDB.

    Prints the incoming event, reads the rows with ``read_restaurant_data``,
    passes them to ``insert_data`` and returns ``{'statusCode': 200}``.
    ``context`` is not used.
    """
    print(event)
    insert_data(read_restaurant_data())
    return {'statusCode': 200}