In [None]:
import requests
import pandas as pd
import time
import os
import json
import datetime

# Booking window used for the hotel search: check-in is the day the notebook
# runs, check-out is seven days after that. Both are YYYY-MM-DD strings.
today = datetime.date.today()
one_week_later = today + datetime.timedelta(weeks=1)

checkin_date = f"{today:%Y-%m-%d}"
checkout_date = f"{one_week_later:%Y-%m-%d}"

# French cities to geocode and search hotels for.
cities = [
    "Mont Saint Michel",
    "St Malo",
    "Bayeux",
    "Le Havre",
    "Rouen",
    "Paris",
    "Amiens",
    "Lille",
    "Strasbourg",
    "Chateau du Haut Koenigsbourg",
    "Colmar",
    "Eguisheim",
    "Besancon",
    "Dijon",
    "Annecy",
    "Grenoble",
    "Lyon",
    "Gorges du Verdon",
    "Bormes les Mimosas",
    "Cassis",
    "Marseille",
    "Aix en Provence",
    "Avignon",
    "Uzes",
    "Nimes",
    "Aigues Mortes",
    "Saintes Maries de la mer",
    "Collioure",
    "Carcassonne",
    "Ariege",
    "Toulouse",
    "Montauban",
    "Biarritz",
    "Bayonne",
    "La Rochelle",
]

# Get the API keys from environment variables so real credentials never have
# to be written into the notebook:
#   WEATHER_API_KEY -> OpenWeatherMap key used by get_weather()
#   RAPIDAPI_KEY    -> RapidAPI key sent with every hotels-com-provider call
# When a variable is unset the previous placeholder string is kept, so the
# notebook still runs end to end (the remote API will presumably reject the
# placeholder -- set the variables before a real run).
OWM_API_KEY = os.environ.get("WEATHER_API_KEY", "WEATHER_API_KEY")

base_url = "https://hotels-com-provider.p.rapidapi.com"
headers = {
    "X-RapidAPI-Key": os.environ.get("RAPIDAPI_KEY", "API_KEY"),
    "X-RapidAPI-Host": "hotels-com-provider.p.rapidapi.com"
}


def get_gps_coordinates(city):
    """Look up a city's coordinates through the Nominatim search endpoint.

    Args:
        city: City name; sent as the ``city`` query parameter.

    Returns:
        ``(lat, lon)`` taken from the first search result, exactly as they
        appear in the JSON payload, or ``(None, None)`` when the search
        returns no result or the request / JSON decoding fails.
    """
    # Distinct name so the module-level RapidAPI ``headers`` is not shadowed.
    nominatim_headers = {"User-Agent": "touristicsapp"}
    try:
        response = requests.get(
            "https://nominatim.openstreetmap.org/search",
            # Let requests URL-encode the name (spaces, accents, '&', ...)
            # instead of splicing it into the URL by hand.
            params={"city": city, "format": "json"},
            headers=nominatim_headers,
            timeout=30,  # one stuck lookup must not hang the whole run
        )
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        # Same best-effort contract as get_weather(): report and carry on.
        print(f"Could not geocode {city}: {exc}")
        return None, None

    if data:
        return data[0]["lat"], data[0]["lon"]
    return None, None

def get_weather(lat, lon):
    """Fetch the current weather at a coordinate from OpenWeatherMap.

    Args:
        lat: Latitude, passed through as the ``lat`` query parameter.
        lon: Longitude, passed through as the ``lon`` query parameter.

    Returns:
        ``(weather_description, temperature)`` read from
        ``weather[0]["description"]`` and ``main["temp"]`` of the response
        (the request asks for ``units=metric``), or ``(None, None)`` when the
        request fails or the payload lacks those fields.
    """
    try:
        response = requests.get(
            # HTTPS: the API key rides in the query string and must not be
            # sent in clear text.
            "https://api.openweathermap.org/data/2.5/weather",
            params={"lat": lat, "lon": lon, "appid": OWM_API_KEY, "units": "metric"},
            timeout=30,  # one stuck call must not hang the whole run
        )
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        print(f"Weather request failed for ({lat}, {lon}): {exc}")
        return None, None

    # Error payloads (bad key, rate limit, ...) carry no 'main'/'weather'.
    if isinstance(data, dict) and 'main' in data and data.get("weather"):
        weather_description = data["weather"][0]["description"]
        temperature = data["main"]["temp"]
        return weather_description, temperature

    print(f"Unexpected response from weather API: {data}")
    return None, None

# Geocode every city and attach its current weather. Cities for which
# get_gps_coordinates() returns no coordinates are left out.
city_data = []
for city in cities:
    lat, lon = get_gps_coordinates(city)
    time.sleep(1)  # Delay to prevent heavy usage
    if lat and lon:
        weather, temperature = get_weather(lat, lon)
        city_data.append({
            'city': city,
            'lat': lat,
            'lon': lon,
            'weather': weather,
            'temperature': temperature
        })

# Convert the city data into a pandas DataFrame once, after the loop has
# finished (it used to be rebuilt on every iteration, and was never defined
# when `cities` was empty). Explicit columns keep the frame well-formed even
# when no city could be geocoded.
df_cities = pd.DataFrame(city_data, columns=['city', 'lat', 'lon', 'weather', 'temperature'])

# Fetch hotels data
# Column layout of the output CSV. Fixing it up front keeps every appended
# chunk aligned with the header even when some hotels have no name/tagline.
HOTEL_COLUMNS = ['city', 'lat', 'lon', 'weather', 'temperature',
                 'id', 'overall_rank', 'availability', 'name', 'tag']
HOTELS_CSV = 'frenchhotels.csv'
MAX_HOTELS_PER_CITY = 15  # stop detailing hotels for a city after this many
REQUEST_TIMEOUT = 30      # seconds; one stuck call must not hang the run

# For every geocoded city: resolve its region id, search available 3-5 star
# hotels, enrich each hit with its summary and review rating, then append the
# city's rows to the CSV.
for index, row in df_cities.iterrows():
    hotel_data = []
    city = row['city']
    lat = row['lat']
    lon = row['lon']
    weather = row['weather']
    temperature = row['temperature']

    print(f"Fetching region data for city: {city}")
    querystring = {"locale": "fr_FR", "query": city, "domain": "FR"}
    response = requests.get(f"{base_url}/v2/regions", headers=headers,
                            params=querystring, timeout=REQUEST_TIMEOUT)
    region_data = response.json()
    time.sleep(1)  # 1 second delay here (to prevent heavy usage)

    # Reset per city. Without this, a city with no 'CITY' region silently
    # reused the previous city's id (or raised NameError on the first city),
    # attributing another city's hotels to it.
    region_id = None
    regions = region_data.get('data') if isinstance(region_data, dict) else None
    for region in regions or []:
        if region.get('type') == 'CITY':
            region_id = region.get('gaiaId')
            break
    if region_id is None:
        print(f"No CITY region found for {city}; skipping hotel search")
        continue

    print(f"Fetching hotel data for city: {city}")
    hotel_count = 0
    querystring = {
        "domain": "FR",
        "sort_order": "RECOMMENDED",
        "locale": "fr_FR",
        "region_id": region_id,
        "checkin_date": checkin_date,
        "checkout_date": checkout_date,
        "adults_number": "1",
        "available_filter": "SHOW_AVAILABLE_ONLY",
        "star_rating_ids": "3,4,5"}

    response = requests.get(f"{base_url}/v2/hotels/search", headers=headers,
                            params=querystring, timeout=REQUEST_TIMEOUT)
    response_json = response.json()
    properties = response_json.get('properties') if isinstance(response_json, dict) else None

    for hotel in properties or []:
        if (hotel.get('availability') or {}).get('available'):
            # Rounded search-result rating; replaced below when the review
            # summary provides a value.
            overall_rank = round(hotel['averageOverallRating']['raw']) if 'averageOverallRating' in hotel else None
            hotel_data.append({
                'city': city,
                'lat': lat,
                'lon': lon,
                'weather': weather,
                'temperature': temperature,
                'id': hotel['id'],
                'overall_rank': overall_rank,
                'availability': hotel['availability']['available'],
            })

            response = requests.get(f"{base_url}/v2/hotels/summary", headers=headers, params={
                "domain": "FR",
                "locale": "fr_FR",
                "hotel_id": hotel['id']
            }, timeout=REQUEST_TIMEOUT)

            summary_data = response.json()
            summary = (summary_data.get('summary') or {}) if isinstance(summary_data, dict) else {}
            hotel_name = summary.get('name')
            hotel_tag = summary.get('tagline')

            # Name and tagline are only recorded when both are present.
            if hotel_name and hotel_tag:
                hotel_data[-1].update({
                    'name': hotel_name,
                    'tag': hotel_tag
                })

            print(f"Fetching review summary for hotel ID {hotel['id']}...")
            response = requests.get(f"{base_url}/v2/hotels/reviews/summary", headers=headers, params={
                "domain": "FR",
                "locale": "fr_FR",
                "hotel_id": hotel['id']
            }, timeout=REQUEST_TIMEOUT)

            review_data = response.json()

            # NOTE(review): the payload is indexed as a list whose first item
            # holds 'averageOverallRating' -- confirm against the API docs.
            if isinstance(review_data, list) and review_data:
                overall_rating = (review_data[0].get('averageOverallRating') or {}).get('raw')
                if overall_rating is not None:
                    hotel_data[-1]['overall_rank'] = overall_rating

            hotel_count += 1
            if hotel_count >= MAX_HOTELS_PER_CITY:
                break

        time.sleep(1)

    # hotel data to be stored; another script transfers it to S3. Null values
    # should be checked before that transfer.
    df_hotels = pd.DataFrame(hotel_data, columns=HOTEL_COLUMNS)
    if not df_hotels.empty:
        # Append so rows from earlier cities (and earlier runs) are kept; the
        # header is written only when the file is first created. Cities with
        # no hotels are skipped so they cannot create a header-less file.
        write_header = not os.path.isfile(HOTELS_CSV)
        df_hotels.to_csv(HOTELS_CSV, mode='a', header=write_header, index=False)


In [3]:
import pandas as pd

# Read the exported hotel rows back from disk and print the frame summary
# (column dtypes and non-null counts) to spot missing values.
hotels_csv_path = 'frenchhotels.csv'
dataframe = pd.read_csv(hotels_csv_path)
dataframe.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 423 entries, 0 to 422
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   city          423 non-null    object 
 1   lat           423 non-null    float64
 2   lon           423 non-null    float64
 3   weather       423 non-null    object 
 4   temperature   423 non-null    float64
 5   id            423 non-null    int64  
 6   overall_rank  420 non-null    float64
 7   availability  423 non-null    bool   
 8   name          423 non-null    object 
 9   tag           423 non-null    object 
dtypes: bool(1), float64(4), int64(1), object(4)
memory usage: 30.3+ KB
