# Load

In [62]:
import pandas as pd
from sqlalchemy import create_engine
from azure.storage.blob import BlobServiceClient
import unicodedata

## Data Import from Blob Storage

In [63]:
# Azure Blob Storage
connection_string = "DefaultEndpointsProtocol=https;AccountName=datalakestoragerentscape;AccountKey=w6Edf3np1A18vQIei31unvKWjGpyDUBqexvVauAwCeqOmnF1Bq7WsIEVplSEW+hT0q4ZzDi2KNh4+AStrOcI6g==;EndpointSuffix=core.windows.net"
container_name = "cleansed-layer-airbnb"

In [47]:
try:
    blob_service_client = BlobServiceClient.from_connection_string(connection_string)
    container_client = blob_service_client.get_container_client(container_name)
    print("Connected to Azure Blob Storage.")
except Exception as e:
    print("Failed to connect to Azure Blob Storage:", e)

Connected to Azure Blob Storage.


In [48]:
blob_list = container_client.list_blobs()
print(container_name)
for blob in blob_list:
    print(f"+---{blob.name}")

cleansed-layer-airbnb
+---cities_amenities
+---cities_listings.csv
+---cities_neighbourhood_features.csv
+---cities_relevant_features_classification.csv
+---cities_reviews.csv
+---city_bcn_distances
+---city_prg_distances


In [49]:
# Function to load a CSV file from Azure Blob Storage into a Pandas DataFrame
def load_csv_from_blob(blob_path):
    blob_client = container_client.get_blob_client(blob_path)
    stream = BytesIO(blob_client.download_blob().readall())
    return pd.read_csv(stream)

In [50]:
# Load the Prague and Barcelona listings
try:
    cities_amenities = load_csv_from_blob("cities_amenities")
    cities_listings = load_csv_from_blob("cities_listings.csv")
    cities_neighbourhood_features = load_csv_from_blob("cities_neighbourhood_features.csv")
    cities_relevant_features_classification = load_csv_from_blob("cities_relevant_features_classification.csv")
    cities_reviews = load_csv_from_blob("cities_reviews.csv")
    city_bcn_distances = load_csv_from_blob("city_bcn_distances")
    city_prg_distances = load_csv_from_blob("city_prg_distances")
    print("Data loaded successfully!")
except Exception as e:
    print("Failed to load data from Azure Blob Storage:", e)

  return pd.read_csv(stream)


Data loaded successfully!


## Checking column names and dtypes for creating SQL Tables

In [51]:
# Store all dataframes in a dictionary for easy iteration
dataframes = {
    "cities_amenities": cities_amenities,
    "cities_listings": cities_listings,
    "cities_neighbourhood_features": cities_neighbourhood_features,
    "cities_relevant_features_classification": cities_relevant_features_classification,
    "cities_reviews": cities_reviews,
    "city_bcn_distances": city_bcn_distances,
    "city_prg_distances": city_prg_distances
}

# Iterate over each dataframe and print column names with their types
for df_name, df in dataframes.items():
    print(f"Dataframe: {df_name}")
    print(df.dtypes)  # Prints the column types, along with column names
    print()  # Blank line for readability


Dataframe: cities_amenities
id                       int64
restaurant_count         int64
hotel_count              int64
museum_count             int64
shopping_center_count    int64
dtype: object

Dataframe: cities_listings
id                                                int64
listing_url                                      object
scrape_id                                         int64
last_scraped                                     object
source                                           object
                                                 ...   
calculated_host_listings_count_entire_homes       int64
calculated_host_listings_count_private_rooms      int64
calculated_host_listings_count_shared_rooms       int64
reviews_per_month                               float64
city                                             object
Length: 76, dtype: object

Dataframe: cities_neighbourhood_features
unique_neighbourhoods    object
exposure_score            int64
rising_star              ob

### CREATE TABLE - SQL Scripts

-- Drop the table if it exists
DROP TABLE IF EXISTS dbo.cities_amenities;

-- Recreate the table with `id` as BIGINT
CREATE TABLE dbo.cities_amenities (
    id BIGINT NOT NULL PRIMARY KEY,
    restaurant_count INT NULL,
    hotel_count INT NULL,
    museum_count INT NULL,
    shopping_center_count INT NULL
);

-- Drop the table if it exists
DROP TABLE IF EXISTS dbo.cities_listings;

-- Recreate the table with `id` as BIGINT
CREATE TABLE dbo.cities_listings (
    id BIGINT NOT NULL PRIMARY KEY,
    listing_url NVARCHAR(MAX) NULL,
    scrape_id BIGINT NULL,
    last_scraped NVARCHAR(50) NULL,
    source NVARCHAR(50) NULL,
    name NVARCHAR(MAX) NULL,
    description NVARCHAR(MAX) NULL,
    neighborhood_overview NVARCHAR(MAX) NULL,
    picture_url NVARCHAR(MAX) NULL,
    host_id BIGINT NULL,
    host_url NVARCHAR(MAX) NULL,
    host_name NVARCHAR(255) NULL,
    host_since NVARCHAR(50) NULL,
    host_location NVARCHAR(MAX) NULL,
    host_about NVARCHAR(MAX) NULL,
    host_response_time NVARCHAR(50) NULL,
    host_response_rate NVARCHAR(50) NULL,
    host_acceptance_rate NVARCHAR(50) NULL,
    host_is_superhost BIT NULL,
    host_thumbnail_url NVARCHAR(MAX) NULL,
    host_picture_url NVARCHAR(MAX) NULL,
    host_neighbourhood NVARCHAR(MAX) NULL,
    host_listings_count INT NULL,
    host_total_listings_count INT NULL,
    host_verifications NVARCHAR(MAX) NULL,
    host_has_profile_pic BIT NULL,
    host_identity_verified BIT NULL,
    neighbourhood NVARCHAR(MAX) NULL,
    neighbourhood_cleansed NVARCHAR(MAX) NULL,
    neighbourhood_group_cleansed NVARCHAR(MAX) NULL,
    latitude FLOAT NULL,
    longitude FLOAT NULL,
    property_type NVARCHAR(255) NULL,
    room_type NVARCHAR(255) NULL,
    accommodates INT NULL,
    bathrooms FLOAT NULL,
    bathrooms_text NVARCHAR(255) NULL,
    bedrooms FLOAT NULL,
    beds FLOAT NULL,
    amenities NVARCHAR(MAX) NULL,
    price FLOAT NULL,
    minimum_nights INT NULL,
    maximum_nights INT NULL,
    minimum_minimum_nights INT NULL,
    maximum_minimum_nights INT NULL,
    minimum_maximum_nights INT NULL,
    maximum_maximum_nights INT NULL,
    minimum_nights_avg_ntm FLOAT NULL,
    maximum_nights_avg_ntm FLOAT NULL,
    calendar_updated NVARCHAR(50) NULL,
    has_availability BIT NULL,
    availability_30 INT NULL,
    availability_60 INT NULL,
    availability_90 INT NULL,
    availability_365 INT NULL,
    calendar_last_scraped NVARCHAR(50) NULL,
    number_of_reviews INT NULL,
    number_of_reviews_ltm INT NULL,
    number_of_reviews_l30d INT NULL,
    first_review NVARCHAR(50) NULL,
    last_review NVARCHAR(50) NULL,
    review_scores_rating FLOAT NULL,
    review_scores_accuracy FLOAT NULL,
    review_scores_cleanliness FLOAT NULL,
    review_scores_checkin FLOAT NULL,
    review_scores_communication FLOAT NULL,
    review_scores_location FLOAT NULL,
    review_scores_value FLOAT NULL,
    license NVARCHAR(255) NULL,
    instant_bookable BIT NULL,
    calculated_host_listings_count INT NULL,
    calculated_host_listings_count_entire_homes INT NULL,
    calculated_host_listings_count_private_rooms INT NULL,
    calculated_host_listings_count_shared_rooms INT NULL,
    reviews_per_month FLOAT NULL,
    city NVARCHAR(255) NULL
);

-- Drop the table if it exists
DROP TABLE IF EXISTS dbo.cities_neighbourhood_features;

-- Recreate the table
CREATE TABLE dbo.cities_neighbourhood_features (
    unique_neighbourhoods NVARCHAR(255) NOT NULL PRIMARY KEY,
    exposure_score INT NULL,
    rising_star NVARCHAR(50) NULL,
    regulations NVARCHAR(MAX) NULL
);

-- Drop the table if it exists
DROP TABLE IF EXISTS dbo.cities_neighbourhood_features;

-- Recreate the table
CREATE TABLE dbo.cities_neighbourhood_features (
    unique_neighbourhoods NVARCHAR(255) NOT NULL PRIMARY KEY,
    exposure_score INT NULL,
    rising_star NVARCHAR(50) NULL,
    regulations NVARCHAR(MAX) NULL
);

-- Drop the table if it exists
DROP TABLE IF EXISTS dbo.cities_reviews;

-- Recreate the table with `id` as BIGINT
CREATE TABLE dbo.cities_reviews (
    listing_id BIGINT NOT NULL,
    id BIGINT NOT NULL PRIMARY KEY,
    date NVARCHAR(50) NULL,
    reviewer_id BIGINT NULL,
    reviewer_name NVARCHAR(255) NULL,
    comments NVARCHAR(MAX) NULL,
    city NVARCHAR(255) NULL
);

-- Drop the table if it exists
DROP TABLE IF EXISTS dbo.city_bcn_distances;

-- Recreate the table with `id` as BIGINT
CREATE TABLE dbo.city_bcn_distances (
    id BIGINT NOT NULL PRIMARY KEY,
    dist_to_sagrada_familia_km FLOAT NULL,
    dist_to_casa_mila_km FLOAT NULL,
    dist_to_casa_batllo_km FLOAT NULL,
    dist_to_parc_guell_km FLOAT NULL,
    dist_to_la_rambla_km FLOAT NULL,
    dist_to_montjuic_km FLOAT NULL
);

-- Drop the table if it exists
DROP TABLE IF EXISTS dbo.city_prg_distances;

-- Recreate the table with `id` as BIGINT
CREATE TABLE dbo.city_prg_distances (
    id BIGINT NOT NULL PRIMARY KEY,
    dist_to_prague_castle_km FLOAT NULL,
    dist_to_charles_bridge_km FLOAT NULL,
    dist_to_old_town_square_km FLOAT NULL,
    dist_to_st_vitus_cathedral_km FLOAT NULL,
    dist_to_vysehrad_km FLOAT NULL,
    dist_to_petrin_tower_km FLOAT NULL
);

# Data Load to SQL AZURE DB

In [64]:
# Azure SQL connection details
server = 'servidorparadatalakes.database.windows.net'
database = 'datalakeshslu@outlook.com'
username = 'datalakeshslu@outlook.com@servidorparadatalakes'
password = 'victordanielrodigo.1'  # Replace with your actual password

# SQLAlchemy connection string
connection_string = f"mssql+pymssql://{username}:{password}@{server}/{database}"

# Create the SQLAlchemy engine
try:
    engine = create_engine(connection_string)
    print("SQLAlchemy engine created successfully!")

    # Test the connection
    with engine.connect() as conn:
        print("Connection to the database was successful!")
except Exception as e:
    print(f"Error creating SQLAlchemy engine or connecting: {e}")


SQLAlchemy engine created successfully!
Connection to the database was successful!


In [57]:
# Insert DataFrame into the database
try:
    # Create the SQLAlchemy engine
    engine = create_engine(connection_string)
    print("SQLAlchemy engine created successfully!")

    # Insert the DataFrame into the SQL table
    cities_amenities.to_sql(name='cities_amenities', con=engine, schema='dbo', if_exists='append', index=False)

    print("Data inserted successfully into 'cities_amenities' table!")
except Exception as e:
    print(f"Error: {e}")
finally:
    # Dispose of the engine to close the connection
    if 'engine' in locals():
        engine.dispose()
        print("Database connection closed.")

SQLAlchemy engine created successfully!
Data inserted successfully into 'cities_amenities' table!
Database connection closed.


In [59]:
# Insert DataFrame into the database
try:
    # Create the SQLAlchemy engine
    engine = create_engine(connection_string)
    print("SQLAlchemy engine created successfully!")

    # Insert the DataFrame into the SQL table
    cities_listings.to_sql(name='cities_listings', con=engine, schema='dbo', if_exists='append', index=False)

    print("Data inserted successfully into 'cities_listings' table!")
except Exception as e:
    print(f"Error: {e}")
finally:
    # Dispose of the engine to close the connection
    if 'engine' in locals():
        engine.dispose()
        print("Database connection closed.")

SQLAlchemy engine created successfully!
Data inserted successfully into 'cities_listings' table!
Database connection closed.


In [66]:
try:
    # Create the SQLAlchemy engine
    engine = create_engine(connection_string)
    print("SQLAlchemy engine created successfully!")

    # Insert the DataFrame into the SQL table
    cities_neighbourhood_features.to_sql(name='cities_neighbourhood_features', con=engine, schema='dbo', if_exists='append', index=False)

    print("Data inserted successfully into 'cities_neighbourhood_features' table!")
except Exception as e:
    print(f"Error: {e}")
finally:
    # Dispose of the engine to close the connection
    if 'engine' in locals():
        engine.dispose()
        print("Database connection closed.")


SQLAlchemy engine created successfully!
Data inserted successfully into 'cities_neighbourhood_features' table!
Database connection closed.


In [67]:
try:
    # Create the SQLAlchemy engine
    engine = create_engine(connection_string)
    print("SQLAlchemy engine created successfully!")

    # Insert the DataFrame into the SQL table
    cities_relevant_features_classification.to_sql(name='cities_relevant_features_classification', con=engine, schema='dbo', if_exists='append', index=False)

    print("Data inserted successfully into 'cities_relevant_features_classification' table!")
except Exception as e:
    print(f"Error: {e}")
finally:
    # Dispose of the engine to close the connection
    if 'engine' in locals():
        engine.dispose()
        print("Database connection closed.")


SQLAlchemy engine created successfully!
Data inserted successfully into 'cities_relevant_features_classification' table!
Database connection closed.


In [79]:
cities_reviews

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,city
0,23163,101588,2010-09-20,227165,Nathan,Incredible apartment in an ideal location. The...,Prague
1,23163,157152,2010-12-22,286036,Hugh,"The apartment was huge, we felt like we were s...",Prague
2,23163,1512585,2012-06-18,2634312,Sergey,"Отличная квартира и сервис, большое спасибо.",Prague
3,23163,1871730,2012-08-02,2538194,Sônia,Great place to combine the good things of an a...,Prague
4,23163,24542545,2014-12-30,21860409,Rachel,Residence Karolina and Prague City Apartments ...,Prague
...,...,...,...,...,...,...,...
1578928,1222289574891792402,1225948271188658620,2024-08-18,594356493,Christian,"muy agradable la estancia, las personas que no...",Barcelona
1578929,1222305480654104141,1225952926997345716,2024-08-18,589580500,Ornella Gianina,"Buena ubicación, habitación un poco pequeña, p...",Barcelona
1578930,1226644265166272585,1237537410696005705,2024-09-03,395263368,Sandra,Incroyable,Barcelona
1578931,1227947729562300326,1228946929992698903,2024-08-22,29612403,Avelina,Cozy and spacious room with a beautiful view. ...,Barcelona


In [80]:
try:
    # Create the SQLAlchemy engine
    engine = create_engine(connection_string)
    print("SQLAlchemy engine created successfully!")

    # Insert the DataFrame into the SQL table in batches of 100,000 rows
    batch_size = 100000
    cities_reviews.to_sql(
        name='cities_reviews',        # Name of the SQL table
        con=engine,                   # SQLAlchemy engine
        schema='dbo',                 # Schema (use 'dbo' for default)
        if_exists='append',           # Append data if the table already exists
        index=False,                  # Do not include the DataFrame index as a column
        chunksize=batch_size          # Number of rows per batch
    )

    print(f"Data inserted successfully into 'cities_reviews' table in batches of {batch_size} rows!")
except Exception as e:
    print(f"Error: {e}")
finally:
    # Dispose of the engine to close the connection
    if 'engine' in locals():
        engine.dispose()
        print("Database connection closed.")


SQLAlchemy engine created successfully!
Data inserted successfully into 'cities_reviews' table in batches of 100000 rows!
Database connection closed.


In [69]:
try:
    # Create the SQLAlchemy engine
    engine = create_engine(connection_string)
    print("SQLAlchemy engine created successfully!")

    # Insert the DataFrame into the SQL table
    city_bcn_distances.to_sql(name='city_bcn_distances', con=engine, schema='dbo', if_exists='append', index=False)

    print("Data inserted successfully into 'city_bcn_distances' table!")
except Exception as e:
    print(f"Error: {e}")
finally:
    # Dispose of the engine to close the connection
    if 'engine' in locals():
        engine.dispose()
        print("Database connection closed.")


SQLAlchemy engine created successfully!
Data inserted successfully into 'city_bcn_distances' table!
Database connection closed.


In [75]:
try:
    # Create the SQLAlchemy engine
    engine = create_engine(connection_string)
    print("SQLAlchemy engine created successfully!")

    # Insert the DataFrame into the SQL table
    city_prg_distances.to_sql(name='city_prg_distances', con=engine, schema='dbo', if_exists='append', index=False)

    print("Data inserted successfully into 'city_prg_distances' table!")
except Exception as e:
    print(f"Error: {e}")
finally:
    # Dispose of the engine to close the connection
    if 'engine' in locals():
        engine.dispose()
        print("Database connection closed.")


SQLAlchemy engine created successfully!
Data inserted successfully into 'city_prg_distances' table!
Database connection closed.
