In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
from bs4 import BeautifulSoup 
import requests
from datetime import datetime
import plotly.express as px
import os
import logging
import scrapy
from scrapy.crawler import CrawlerProcess
import json
import boto3
import io
from sqlalchemy import create_engine

### Part 1 - Top 5 cities to visit based on the weather information

In [2]:
cities = ["Mont Saint Michel",
          "St Malo",
          "Bayeux",
          "Le Havre",
          "Rouen",
          "Paris",
          "Amiens",
          "Lille",
          "Strasbourg",
          "Chateau du Haut Koenigsbourg",
          "Colmar",
          "Eguisheim",
          "Besancon",
          "Dijon",
          "Annecy",
          "Grenoble",
          "Lyon",
          "Gorges du Verdon",
          "Bormes les Mimosas",
          "Cassis",
          "Marseille",
          "Aix en Provence",
          "Avignon",
          "Uzes",
          "Nimes",
          "Aigues Mortes",
          "Saintes Maries de la mer",
          "Collioure",
          "Carcassonne",
          "Ariege",
          "Toulouse",
          "Montauban",
          "Biarritz",
          "Bayonne",
          "La Rochelle"]

cities.sort()

print(cities)

['Aigues Mortes', 'Aix en Provence', 'Amiens', 'Annecy', 'Ariege', 'Avignon', 'Bayeux', 'Bayonne', 'Besancon', 'Biarritz', 'Bormes les Mimosas', 'Carcassonne', 'Cassis', 'Chateau du Haut Koenigsbourg', 'Collioure', 'Colmar', 'Dijon', 'Eguisheim', 'Gorges du Verdon', 'Grenoble', 'La Rochelle', 'Le Havre', 'Lille', 'Lyon', 'Marseille', 'Mont Saint Michel', 'Montauban', 'Nimes', 'Paris', 'Rouen', 'Saintes Maries de la mer', 'St Malo', 'Strasbourg', 'Toulouse', 'Uzes']


1.1 - Get the GPS coordinates and informations about the weather (for the 5 next days)

In [3]:
params_gps = {
    "countrycodes" : "fr",
    "format" : "json",
    "limit" : 1
}

params_weather = {
    "appid" : "cddda077b8f76aaba51ee11f810adb2e",
    "units" : "metric"
}

list_of_forecasts = []

id = 0 

for city in cities:
    id += 1
    # Call the GPS coordinates API
    r_gps = requests.get(f"https://nominatim.openstreetmap.org/search/{city}", params=params_gps).json()[0]

    # Call the weather API
    r_weather = requests.get(f"https://api.openweathermap.org/data/2.5/forecast?lat={r_gps['lat']}&lon={r_gps['lon']}", params=params_weather).json()["list"]

    for i in range(len(r_weather)):
        r_l = r_weather[i]
        if "rain" in r_l:
            rain = r_l["rain"]["3h"]
        else:
            rain = None
        list_of_forecasts.extend([{
            "id" : str(id),
            "city" : city,
            "lat" : r_gps["lat"],
            "lon" : r_gps["lon"],
            "dt" : r_l["dt_txt"],
            "temp" : r_l["main"]["temp_min"],
            "feels_like" : r_l["main"]["feels_like"],
            "humidity" : r_l["main"]["humidity"],
            "weather" : r_l["weather"][0]["main"],
            "pop" : r_l["pop"],
            "rain" : rain,
            "wind_speed" : r_l["wind"]["speed"],
        }])

# Create a dataframe
df = pd.json_normalize(list_of_forecasts)

# Dataframe info
print(df.shape)
print("A sample of the dataframe")
display(df.head())
print("Informations about the colums in the dataframe")
display(df.info())

(1400, 12)
A sample of the dataframe


Unnamed: 0,id,city,lat,lon,dt,temp,feels_like,humidity,weather,pop,rain,wind_speed
0,1,Aigues Mortes,43.5658225,4.1912837,2023-05-04 00:00:00,12.95,12.6,88,Clear,0.0,,2.23
1,1,Aigues Mortes,43.5658225,4.1912837,2023-05-04 03:00:00,13.23,12.85,86,Clouds,0.0,,2.5
2,1,Aigues Mortes,43.5658225,4.1912837,2023-05-04 06:00:00,13.8,13.32,80,Clouds,0.0,,2.53
3,1,Aigues Mortes,43.5658225,4.1912837,2023-05-04 09:00:00,18.44,17.88,59,Clouds,0.0,,3.7
4,1,Aigues Mortes,43.5658225,4.1912837,2023-05-04 12:00:00,19.27,18.74,57,Clouds,0.0,,4.64


Informations about the colums in the dataframe
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1400 entries, 0 to 1399
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          1400 non-null   object 
 1   city        1400 non-null   object 
 2   lat         1400 non-null   object 
 3   lon         1400 non-null   object 
 4   dt          1400 non-null   object 
 5   temp        1400 non-null   float64
 6   feels_like  1400 non-null   float64
 7   humidity    1400 non-null   int64  
 8   weather     1400 non-null   object 
 9   pop         1400 non-null   float64
 10  rain        425 non-null    float64
 11  wind_speed  1400 non-null   float64
dtypes: float64(5), int64(1), object(6)
memory usage: 131.4+ KB


None

In [4]:
# Convert latitude and longitude to numeric
df["lat"] = df["lat"].astype(float)
df["lon"] = df["lon"].astype(float)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1400 entries, 0 to 1399
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          1400 non-null   object 
 1   city        1400 non-null   object 
 2   lat         1400 non-null   float64
 3   lon         1400 non-null   float64
 4   dt          1400 non-null   object 
 5   temp        1400 non-null   float64
 6   feels_like  1400 non-null   float64
 7   humidity    1400 non-null   int64  
 8   weather     1400 non-null   object 
 9   pop         1400 non-null   float64
 10  rain        425 non-null    float64
 11  wind_speed  1400 non-null   float64
dtypes: float64(7), int64(1), object(4)
memory usage: 131.4+ KB


In [5]:
# Calculate the mean for numeric columns 
df_agg = df[["city", "id", "pop", "rain", "humidity", "temp", "wind_speed"]].groupby(["id", "city"], as_index=False).mean().fillna(0).round(2)
df_agg.head()

Unnamed: 0,id,city,pop,rain,humidity,temp,wind_speed
0,1,Aigues Mortes,0.07,0.92,68.72,17.59,2.55
1,10,Biarritz,0.07,0.31,80.97,15.56,2.6
2,11,Bormes les Mimosas,0.02,0.42,67.47,17.4,2.41
3,12,Carcassonne,0.18,0.48,71.8,17.64,4.09
4,13,Cassis,0.02,0.12,62.25,18.44,2.54


In [6]:
print("Values available for the weather variable : ", df.weather.unique())

# Calculate the mode for the categorical variable weather
df_weather = df.groupby(["id"], as_index=False)["weather"].agg(lambda x: pd.Series.mode(x)[0])

df_agg = df_agg.merge(df_weather, how="inner", on="id").merge(df[["id", "lat", "lon"]].drop_duplicates(), how="inner", on="id")
df_agg.head()

Values available for the weather variable :  ['Clear' 'Clouds' 'Rain']


Unnamed: 0,id,city,pop,rain,humidity,temp,wind_speed,weather,lat,lon
0,1,Aigues Mortes,0.07,0.92,68.72,17.59,2.55,Clouds,43.565823,4.191284
1,10,Biarritz,0.07,0.31,80.97,15.56,2.6,Clouds,43.471144,-1.552727
2,11,Bormes les Mimosas,0.02,0.42,67.47,17.4,2.41,Clouds,43.150697,6.341928
3,12,Carcassonne,0.18,0.48,71.8,17.64,4.09,Clouds,43.213036,2.349107
4,13,Cassis,0.02,0.12,62.25,18.44,2.54,Clouds,43.214036,5.539632


In [7]:
df_agg = df_agg[["id", "city", "lat", "lon", "weather", "pop", "rain", "humidity", "temp", "wind_speed"]]\
                 .sort_values(["weather", "temp", "pop", "rain", "humidity", "wind_speed"],
              ascending = [True, False, True, True, True, True]).reset_index(drop=True)
df_agg.head()

Unnamed: 0,id,city,lat,lon,weather,pop,rain,humidity,temp,wind_speed
0,2,Aix en Provence,43.529842,5.447474,Clouds,0.01,0.18,52.0,18.89,2.59
1,6,Avignon,43.949249,4.805901,Clouds,0.04,0.34,58.1,18.64,3.53
2,25,Marseille,43.296174,5.369953,Clouds,0.01,0.63,62.7,18.6,2.97
3,15,Collioure,42.52505,3.083155,Clouds,0.07,0.91,69.45,18.47,3.94
4,13,Cassis,43.214036,5.539632,Clouds,0.02,0.12,62.25,18.44,2.54


In [8]:
# Export weather informations to a .csv file
df_agg.to_csv("weather_infos.csv", index=False)

In [9]:
# Create a dataframe with the top 5 best destinations
df_agg["rank"] = [len(df_agg) - x for x in df_agg.index]
df_agg.head()

Unnamed: 0,id,city,lat,lon,weather,pop,rain,humidity,temp,wind_speed,rank
0,2,Aix en Provence,43.529842,5.447474,Clouds,0.01,0.18,52.0,18.89,2.59,35
1,6,Avignon,43.949249,4.805901,Clouds,0.04,0.34,58.1,18.64,3.53,34
2,25,Marseille,43.296174,5.369953,Clouds,0.01,0.63,62.7,18.6,2.97,33
3,15,Collioure,42.52505,3.083155,Clouds,0.07,0.91,69.45,18.47,3.94,32
4,13,Cassis,43.214036,5.539632,Clouds,0.02,0.12,62.25,18.44,2.54,31


1.2 -  Top 5 cities to visit 

In [10]:
print("We recommend you these 5 destinations : ", df_agg.city.iloc[0:5].tolist())

We recommend you these 5 destinations :  ['Aix en Provence', 'Avignon', 'Marseille', 'Collioure', 'Cassis']


In [11]:
fig = px.scatter_mapbox(df_agg, 
                         lat="lat", 
                         lon="lon", 
                         color="temp", 
                         text = "city", 
                         size = "rank", 
                         title = "Recommended destinations based on the weather", 
                         width=1000, 
                         height=600,
                         mapbox_style="carto-positron", 
                         zoom = 4.3, 
                         color_continuous_scale = "Bluered")
fig.show()

### Part 2 - Get hotels informations

#### /!\ Restart the kernel if re launching

2.1 - Get avaiable hotels for the dates we have weather info about

In [12]:
checkin = df.dt.min().split()[0]
checkout = df.dt.max().split()[0]

class ScrapeBooking(scrapy.Spider):

    name = "scrape_booking"

    start_urls = ["https://www.booking.com/",
                  ]

    def parse(self, response):
        for self.city in cities:
            yield scrapy.FormRequest.from_response(
                response,
                formdata = {"ss" : self.city,
                            "checkin" : checkin,
                            "checkout" : checkout
                            },
                dont_filter = True,
                callback=self.search
                )
            
    def search(self, response):
        for hotel in response.css("div.b978843432"):
            yield {
                 "city" : response.request.url.split("ss=")[1].split("&checkin")[0].replace("+", " ").strip(),
                 "name" : hotel.css("div.fcab3ed991.a23c043802::text").get(),
                 "stars" : hotel.css("div.e4755bbd60::attr(aria-label)").get(),
                 "rating" : hotel.css("div.b5cd09854e.d10a6220b4::text").get(),
                 "price" : hotel.css("span.fcab3ed991.fbd1d3018c.e729ed5ab6::text").get(),
                 "link" : hotel.css("a.e13098a59f::attr(href)").get()
                 }

        for i in range(25, 76, 25):
            try: 
                next_page_url = response.request.url.split("&offset=")[0] + "&offset=" + str(i)
            except KeyError:
                logging.info('No next page.')
            else:
                yield response.follow(next_page_url, callback=self.search)

filename = "hotels.json"

if filename in os.listdir():
        os.remove(filename)

process = CrawlerProcess(settings = {
    "USER_AGENT": "Chrome/97.0",
    "LOG_LEVEL": logging.INFO,
    "FEEDS": {
        filename : {"format": "json"},
    },
    "AUTOTHROTTLE_ENABLED": True,
})

process.crawl(ScrapeBooking)
process.start()

2023-05-04 00:33:45 [scrapy.utils.log] INFO: Scrapy 2.6.1 started (bot: scrapybot)
2023-05-04 00:33:45 [scrapy.utils.log] INFO: Versions: lxml 4.8.0.0, libxml2 2.9.12, cssselect 1.1.0, parsel 1.6.0, w3lib 1.21.0, Twisted 22.2.0, Python 3.9.12 (main, Apr  5 2022, 01:53:17) - [Clang 12.0.0 ], pyOpenSSL 21.0.0 (OpenSSL 1.1.1t  7 Feb 2023), cryptography 3.4.8, Platform macOS-10.16-x86_64-i386-64bit
2023-05-04 00:33:45 [scrapy.crawler] INFO: Overridden settings:
{'AUTOTHROTTLE_ENABLED': True, 'LOG_LEVEL': 20, 'USER_AGENT': 'Chrome/97.0'}
2023-05-04 00:33:45 [scrapy.extensions.telnet] INFO: Telnet Password: 169f278cba65906a
2023-05-04 00:33:45 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats',
 'scrapy.extensions.throttle.AutoThrottle']
2023-05-04 00:33:45 [scrapy.middleware] INFO: Enab

In [13]:
print("checkin date : ", checkin)
print("checkiout date : ", checkout)

checkin date :  2023-05-04
checkiout date :  2023-05-08


In [14]:
# Convert to dataframe
df_hotels = pd.read_json("hotels.json")
df_hotels.loc[df_hotels.city=="Aigues Mortes", :].city.value_counts()

Aigues Mortes    100
Name: city, dtype: int64

In [15]:
# Convert to dataframe
df_hotels = pd.read_json("hotels.json")

# Drop missing values
df_hotels.dropna(inplace=True)
print("Shape :", df_hotels.shape)
print()

display(df_hotels.head())
print()

print("Number of cities with available hotels : ", df_hotels.city.nunique())

Shape : (2031, 6)



Unnamed: 0,city,name,stars,rating,price,link
0,Aigues Mortes,Hotel Canal Aigues Mortes,3 out of 5,8.5,€ 491,https://www.booking.com/hotel/fr/canal-aigues-...
1,Aigues Mortes,Maison Diderot,3 out of 5,9.5,€ 954,https://www.booking.com/hotel/fr/maison-didero...
2,Aigues Mortes,Hôtel Saint Louis,3 out of 5,8.4,€ 681,https://www.booking.com/hotel/fr/saint-louis-a...
3,Aigues Mortes,Hôtel Le Médiéval,2 out of 5,8.6,€ 618,https://www.booking.com/hotel/fr/le-medieval.e...
4,Aigues Mortes,Noemys Aigues-Mortes - ex Mona Lisa Royal Hôtel,2 out of 5,7.0,€ 414,https://www.booking.com/hotel/fr/le-royal-hote...



Number of cities with available hotels :  35


In [16]:
df_hotels["rating"] = df_hotels["rating"].astype(float)
df_hotels["price"] = df_hotels["price"].apply(lambda x : x.split()[1]).apply(lambda x : x.replace(",","")).astype(int)
df_hotels["stars"] = df_hotels["stars"].apply(lambda x : x.split()[0]).astype(int)
display(df_hotels.head())
print()

print("Number of hotels available per city")
df_hotels.city.value_counts().sort_values().head()

Unnamed: 0,city,name,stars,rating,price,link
0,Aigues Mortes,Hotel Canal Aigues Mortes,3,8.5,491,https://www.booking.com/hotel/fr/canal-aigues-...
1,Aigues Mortes,Maison Diderot,3,9.5,954,https://www.booking.com/hotel/fr/maison-didero...
2,Aigues Mortes,Hôtel Saint Louis,3,8.4,681,https://www.booking.com/hotel/fr/saint-louis-a...
3,Aigues Mortes,Hôtel Le Médiéval,2,8.6,618,https://www.booking.com/hotel/fr/le-medieval.e...
4,Aigues Mortes,Noemys Aigues-Mortes - ex Mona Lisa Royal Hôtel,2,7.0,414,https://www.booking.com/hotel/fr/le-royal-hote...



Number of hotels available per city


Montauban          21
Aix en Provence    26
Besancon           27
Bayeux             29
Amiens             32
Name: city, dtype: int64

In [17]:
# Top 20 best hotels for recommended destinations (based on the star ratings, reviews and prices)
df_hotels = df_hotels.sort_values(["city", "rating", "price", "stars"],
                                  ascending=[True, False, False, False])

df_chosen_hotels = df_hotels.groupby("city", as_index=False).head(20)
df_chosen_hotels.reset_index(drop=True, inplace=True)
df_chosen_hotels.head()

Unnamed: 0,city,name,stars,rating,price,link
0,Aigues Mortes,L'Aube Ensoleillée entre Plage et Pinède,4,10.0,500,https://www.booking.com/hotel/fr/sunny-dawn.en...
1,Aigues Mortes,Marcelle en Camargue,4,9.9,858,https://www.booking.com/hotel/fr/marcelle-en-c...
2,Aigues Mortes,Mazet du pêcheur,4,9.7,1453,https://www.booking.com/hotel/fr/mazet-du-pech...
3,Aigues Mortes,Boutique Hôtel des Remparts & Spa,5,9.5,1943,https://www.booking.com/hotel/fr/les-remparts-...
4,Aigues Mortes,Maison Diderot,3,9.5,954,https://www.booking.com/hotel/fr/maison-didero...


### Get hotels informations for best destinations

In [18]:
hotels_info = []

for i in range(len(df_chosen_hotels)):
    url = df_chosen_hotels.iloc[i,5]

    r_hotels = requests.get(url)
    soup = BeautifulSoup(r_hotels.text)
    hotels_info_dic = {}
    hotels_info_dic["city"] = df_chosen_hotels.iloc[i,0]
    hotels_info_dic["hotel_names"] = df_chosen_hotels.iloc[i,1]
    hotels_info_dic["stars"] = df_chosen_hotels.iloc[i,2]
    hotels_info_dic["rating"] = df_chosen_hotels.iloc[i,3]
    hotels_info_dic["price"] = df_chosen_hotels.iloc[i,4]
    hotels_info_dic["hotel_lat"] = float(soup.find("a", attrs={"id" : "hotel_address"})["data-atlas-latlng"].split(",")[0])
    hotels_info_dic["hotel_lon"] = float(soup.find("a", attrs={"id" : "hotel_address"})["data-atlas-latlng"].split(",")[1])
    hotels_info_dic["adresses"] = soup.find("span", attrs={"class" : "hp_address_subtitle"}).text.strip()
    hotels_info_dic["descriptions"] = soup.find("div", attrs={"id" : "property_description_content"}).text.split("\n")[2].strip()
    hotels_info_dic["link"] = df_chosen_hotels.iloc[i,5]

    hotels_info.append(hotels_info_dic)

In [19]:
df_hotels_info = pd.DataFrame(hotels_info)
df_hotels_info.head()

Unnamed: 0,city,hotel_names,stars,rating,price,hotel_lat,hotel_lon,adresses,descriptions,link
0,Aigues Mortes,L'Aube Ensoleillée entre Plage et Pinède,4,10.0,500,43.550511,4.12597,"1140 Avenue de la Pinède Résidence L'Aube, 302...","The apartment has 2 bedrooms, a kitchen with f...",https://www.booking.com/hotel/fr/sunny-dawn.en...
1,Aigues Mortes,Marcelle en Camargue,4,9.9,858,43.566156,4.192345,"40 Rue Pasteur, 30220 Aigues-Mortes, France","At the guest house, rooms are equipped with a ...",https://www.booking.com/hotel/fr/marcelle-en-c...
2,Aigues Mortes,Mazet du pêcheur,4,9.7,1453,43.571975,4.220203,"Les Courèges, 30220 Saint-Laurent-dʼAigouze, F...","This holiday home is fitted with 3 bedrooms, a...",https://www.booking.com/hotel/fr/mazet-du-pech...
3,Aigues Mortes,Boutique Hôtel des Remparts & Spa,5,9.5,1943,43.568036,4.190344,"6, Place Anatole France, 30220 Aigues-Mortes, ...",Located in a former military station dating fr...,https://www.booking.com/hotel/fr/les-remparts-...
4,Aigues Mortes,Maison Diderot,3,9.5,954,43.567637,4.192471,"7 Boulevard Diderot, 30220 Aigues-Mortes, France","This holiday home is fitted with 4 bedrooms, a...",https://www.booking.com/hotel/fr/maison-didero...


In [20]:
# Export weather informations to a .csv file
df_hotels_info.to_csv("hotels_info.csv", index=False)

In [21]:
# Map the top 20 hotels per selected cities
for city in df_agg.city.iloc[0:5].tolist():
    fig2 = px.scatter_mapbox(df_hotels_info.loc[df_hotels_info.city==city, :], 
                         lat="hotel_lat", 
                         lon="hotel_lon", 
                         color="rating",
                         size="rating",
                         title = f"Selected hotels for {city}", 
                         width=1000, 
                         height=600,
                         mapbox_style="carto-positron", 
                         zoom = 8)
    fig3 = px.scatter_mapbox(df_agg.loc[df_agg.city==city, :], 
                         lat="lat", 
                         lon="lon")
    
    fig_final = fig2.add_trace(fig3.data[0])
    fig_final.show()

In [22]:
# Aggregate hotels' data per city
df_hotels_info_agg = df_hotels_info.groupby("city", as_index=False).agg(lambda x: list(x))
print(df_hotels_info_agg.shape)
display(df_hotels_info_agg.head())

(35, 10)


Unnamed: 0,city,hotel_names,stars,rating,price,hotel_lat,hotel_lon,adresses,descriptions,link
0,Aigues Mortes,"[L'Aube Ensoleillée entre Plage et Pinède, Mar...","[4, 4, 4, 5, 3, 4, 3, 5, 3, 3, 3, 3, 3, 3, 2, ...","[10.0, 9.9, 9.7, 9.5, 9.5, 9.5, 9.2, 9.1, 9.1,...","[500, 858, 1453, 1943, 954, 680, 608, 1353, 56...","[43.5505113, 43.5661556, 43.5719752, 43.568035...","[4.1259703, 4.192345, 4.2202027, 4.1903438, 4....","[1140 Avenue de la Pinède Résidence L'Aube, 30...","[The apartment has 2 bedrooms, a kitchen with ...",[https://www.booking.com/hotel/fr/sunny-dawn.e...
1,Aix en Provence,"[Pavillon de Beauregard, Château de la Gaude, ...","[3, 5, 3, 5, 4, 4, 4, 3, 4, 4, 3, 4, 3, 4, 3, ...","[9.5, 9.0, 9.0, 8.9, 8.4, 8.4, 8.4, 8.2, 8.2, ...","[1047, 3586, 420, 2363, 1102, 632, 632, 508, 4...","[43.5445128, 43.56879, 43.61781771, 43.5211698...","[5.4681052, 5.47851, 5.44888895, 5.49659729, 5...","[1541 chemin de Beauregard, 13100 Aix-en-Prove...","[Offering an outdoor pool and a garden, Pavill...",[https://www.booking.com/hotel/fr/pavillon-de-...
2,Amiens,[AMIENS: SUPER STUDIO COCOONING - 2 min DE LA ...,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[9.5, 9.0, 9.0, 9.0, 8.7, 8.7, 8.6, 8.5, 8.5, ...","[336, 305, 222, 220, 629, 326, 1348, 413, 384,...","[49.88979237, 49.902548, 49.902548, 49.9065269...","[2.30541916, 2.276519, 2.276519, 2.2692149, 2....","[12 Boulevard de Belfort, 80000 Amiens, France...","[This apartment features 1 bedroom, a flat-scr...",[https://www.booking.com/hotel/fr/amiens-super...
3,Annecy,[Kiss Cool - 2 bedroom apartment with terrace ...,"[4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 3, 3, 3, ...","[10.0, 9.2, 9.0, 8.8, 8.8, 8.7, 8.7, 8.7, 8.7,...","[1096, 1692, 756, 1483, 592, 1168, 943, 656, 5...","[45.9076295, 45.8983616, 45.9055642, 45.902643...","[6.1322113, 6.1246552, 6.1308521, 6.1268332, 6...","[7 rue Thomas Ruphy, 74000 Annecy, France, 17 ...",[The apartment with a balcony and city views h...,[https://www.booking.com/hotel/fr/kiss-cool-ap...
4,Ariege,"[Château de Sibra, Les appartements de Clélia,...","[3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[10.0, 9.8, 9.7, 9.7, 9.7, 9.7, 9.5, 9.5, 9.5,...","[942, 368, 545, 520, 473, 373, 604, 460, 378, ...","[43.04315926, 42.72085404, 42.7292549, 43.1006...","[1.92457817, 1.83361059, 1.8440466, 1.87055, 1...","[Château de Sibra 95 Sibra Hameau, 09500 Lagar...","[The daily breakfast offers à la carte, contin...",[https://www.booking.com/hotel/fr/chateau-de-s...


Store the data on an S3 bucket

In [23]:
aws_access_key_id = input("Please, enter your access key")
aws_secret_access_key = input("Please, enter your secret access key")

In [24]:
session = boto3.Session(aws_access_key_id=aws_access_key_id, 
                        aws_secret_access_key=aws_secret_access_key)

s3 = session.resource("s3")
bucket = s3.create_bucket(Bucket="jedha-certification-kayak-project")

# Store the file in the S3 bucket
bucket.put_object(Key="hotels_infos.csv", Body=df_hotels_info_agg.to_csv(index=False))
bucket.put_object(Key="weather_infos.csv", Body=df_agg.to_csv(index=False))

s3.Object(bucket_name='jedha-certification-kayak-project', key='weather_infos.csv')

Create an engine

In [None]:
# Get data from the s3 bucket
obj = s3.Object("jedha-certification-kayak-project", "hotels_infos.csv")

data=obj.get()["Body"].read()
df_get = pd.read_csv(io.BytesIO(data))
print(data)
df_get.head(1)

In [None]:
DBUSER = input("Please, enter your username")
DBPASS = input("Please, enter your password")

# creates the engine to establish connection between the database and python
engine = create_engine(f"mysql+pymysql://{DBUSER}:{DBPASS}@kayak-db.cgrb10cco1po.eu-west-3.rds.amazonaws.com:3306/", echo=True)

engine.execute("CREATE DATABASE weather_dbase")
engine.execute("USE weather_dbase")

In [None]:
# Store data in the database data and sends it to db
df_get.to_sql("hotels_infos", engine, if_exists = "replace", index= False)

In [28]:
list_hotels = pd.read_sql("SELECT hotel_names FROM hotels_infos WHERE city='Paris'", engine).iloc[0,0]

2023-05-04 01:03:33,178 INFO sqlalchemy.engine.Engine SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = %(table_schema)s AND table_name = %(table_name)s


2023-05-04 01:03:33 [sqlalchemy.engine.Engine] INFO: SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = %(table_schema)s AND table_name = %(table_name)s


2023-05-04 01:03:33,178 INFO sqlalchemy.engine.Engine [cached since 8.871s ago] {'table_schema': 'None', 'table_name': "SELECT hotel_names FROM hotels_infos WHERE city='Paris'"}


2023-05-04 01:03:33 [sqlalchemy.engine.Engine] INFO: [cached since 8.871s ago] {'table_schema': 'None', 'table_name': "SELECT hotel_names FROM hotels_infos WHERE city='Paris'"}


2023-05-04 01:03:33,198 INFO sqlalchemy.engine.Engine SELECT hotel_names FROM hotels_infos WHERE city='Paris'


2023-05-04 01:03:33 [sqlalchemy.engine.Engine] INFO: SELECT hotel_names FROM hotels_infos WHERE city='Paris'


2023-05-04 01:03:33,199 INFO sqlalchemy.engine.Engine [raw sql] {}


2023-05-04 01:03:33 [sqlalchemy.engine.Engine] INFO: [raw sql] {}


In [None]:
print(list_hotels)