# Transform

In [1]:
import requests
import pandas as pd
import os
import math
from time import sleep
from azure.storage.blob import BlobServiceClient
from io import BytesIO
from concurrent.futures import ThreadPoolExecutor, as_completed
import unicodedata

## Data Import from Blob Storage

In [2]:
# Azure Blob Storage
# The connection string contains the storage account key, so it is read from the
# environment instead of being written into the notebook.
# NOTE: any account key that was ever saved in this notebook should be rotated.
connection_string = os.environ.get("AZURE_STORAGE_CONNECTION_STRING", "")
if not connection_string:
    print("AZURE_STORAGE_CONNECTION_STRING is not set; connecting to Azure Blob Storage will fail.")
container_name = "mapsapi-rentscape-blob"

In [3]:
# Build the service and container clients that the later cells rely on.
try:
    blob_service_client = BlobServiceClient.from_connection_string(connection_string)
    container_client = blob_service_client.get_container_client(container_name)
    print("Connected to Azure Blob Storage.")
except Exception as e:
    print("Failed to connect to Azure Blob Storage:", e)
    # Re-raise so execution stops here: swallowing the error would leave
    # `container_client` undefined and later cells would fail with a NameError.
    raise

Connected to Azure Blob Storage.


In [4]:
# Print the container name followed by a tree-style line per blob it holds.
blob_list = container_client.list_blobs()
print(container_name)
for entry in blob_list:
    print(f"+---{entry.name}")

mapsapi-rentscape-blob
+---bcn_places.csv
+---prg_places.csv


In [6]:
def load_csv_from_blob(blob_path):
    """Download one blob and parse it as CSV.

    `blob_path` is the blob's name inside the container behind the
    notebook-level `container_client`. The blob's full content is read into
    memory and handed to `pd.read_csv`; the resulting DataFrame is returned.
    """
    raw_bytes = container_client.get_blob_client(blob_path).download_blob().readall()
    return pd.read_csv(BytesIO(raw_bytes))

In [12]:
# Load the Prague and Barcelona listings
try:
    bcn_places = load_csv_from_blob("bcn_places.csv")
    prg_places = load_csv_from_blob("prg_places.csv")
    print("Data loaded successfully!")
except Exception as e:
    print("Failed to load data from Azure Blob Storage:", e)
    # Re-raise: without both frames the transformation cells below cannot run,
    # and continuing would only surface a less helpful NameError later.
    raise

Data loaded successfully!


## Data Transformation

In [24]:
# Columns kept for each output table. Both cities share the same amenity
# columns, so that list is declared once.
bcn_distance_columns = ['id', 'dist_to_sagrada_familia', 'dist_to_casa_milà', 'dist_to_casa_batlló',
                        'dist_to_parc_güell', 'dist_to_la_rambla', 'dist_to_montjuïc']
prg_distance_columns = ['id', 'dist_to_prague_castle', 'dist_to_charles_bridge', 'dist_to_old_town_square',
                        'dist_to_st._vitus_cathedral', 'dist_to_vysehrad', 'dist_to_petrin_tower']
amenity_columns = ['id', 'restaurant_count', 'hotel_count', 'museum_count', 'shopping_center_count']

# Select first, then copy: only the kept columns are duplicated, and each
# result is independent of the loaded `bcn_places` / `prg_places` frames.
city_bcn_distances = bcn_places[bcn_distance_columns].copy()
city_bcn_amenities = bcn_places[amenity_columns].copy()

city_prg_distances = prg_places[prg_distance_columns].copy()
city_prg_amenities = prg_places[amenity_columns].copy()

# Stack both cities' amenity rows into one table with a fresh 0..n-1 index.
cities_amenities = pd.concat([city_bcn_amenities, city_prg_amenities], ignore_index=True)

## Cities Amenities

No further transformation is needed here: the two city tables are only appended together, and the irrelevant `park_count` column is dropped.

In [27]:
# Display the combined Barcelona + Prague amenities table built by the concat above.
cities_amenities

Unnamed: 0,id,restaurant_count,hotel_count,museum_count,shopping_center_count
0,18674,444,18,2,0
1,23197,30,3,0,0
2,32711,460,18,1,0
3,34241,617,128,26,3
4,34981,591,106,24,3
...,...,...,...,...,...
28543,38023891,421,160,26,0
28544,38044675,421,160,26,0
28545,38056289,18,1,0,0
28546,38062553,397,108,32,0


## Distances Data Transformation

In [28]:
def normalize_column_name(col_name):
    """Return `col_name` with accents and other diacritic marks stripped.

    The name is decomposed with Unicode NFKD, which splits an accented letter
    into its base letter plus combining marks; every combining mark is then
    dropped and the remaining characters are joined back together.
    """
    decomposed = unicodedata.normalize('NFKD', col_name)
    # combining() returns 0 for ordinary characters and non-zero for marks
    base_chars = filter(lambda ch: unicodedata.combining(ch) == 0, decomposed)
    return "".join(base_chars)

## PRG - Distances

In [25]:

# Rename the Prague distance columns (strip diacritics, tag the unit) and round
# the distance values to two decimals.
old_columns = city_prg_distances.columns
new_columns = []

for col in old_columns:
    # Normalize column name (remove accents / diacritic marks)
    col_normalized = normalize_column_name(col)

    # Append '_km' to 'dist_to_' columns, but only when it is missing, so that
    # re-running this cell does not produce names like 'dist_to_x_km_km'.
    if col_normalized.startswith('dist_to_') and not col_normalized.endswith('_km'):
        col_normalized = col_normalized + '_km'
    new_columns.append(col_normalized)

# Assign the new column names
city_prg_distances.columns = new_columns

# Round all dist_to_*_km columns to two decimals
dist_columns = [col for col in new_columns if col.startswith('dist_to_')]

for col in dist_columns:
    city_prg_distances[col] = city_prg_distances[col].round(2)


In [26]:
# Display the Prague distance table after the column renaming and rounding.
city_prg_distances

Unnamed: 0,id,dist_to_prague_castle_km,dist_to_charles_bridge_km,dist_to_old_town_square_km,dist_to_st._vitus_cathedral_km,dist_to_vysehrad_km,dist_to_petrin_tower_km
0,23163,1.39,0.55,0.70,1.35,2.00,1.41
1,23169,1.60,0.83,0.15,1.57,2.67,1.99
2,26755,2.26,1.46,0.76,2.22,2.71,2.60
3,30762,1.85,1.07,0.37,1.82,2.70,2.23
4,42514,1.28,0.49,0.80,1.24,2.03,1.26
...,...,...,...,...,...,...,...
9061,38023891,2.36,1.48,1.11,2.31,1.69,2.35
9062,38044675,2.36,1.48,1.11,2.31,1.69,2.35
9063,38056289,7.91,7.48,6.86,7.90,8.78,8.58
9064,38062553,1.92,1.21,0.54,1.89,2.94,2.37


In [29]:
# Rename the Barcelona distance columns (strip diacritics, tag the unit) and
# round the distance values to two decimals.
old_columns = city_bcn_distances.columns
new_columns = []

for col in old_columns:
    # Normalize column name (remove accents / diacritic marks)
    col_normalized = normalize_column_name(col)

    # Append '_km' to 'dist_to_' columns, but only when it is missing, so that
    # re-running this cell does not produce names like 'dist_to_x_km_km'.
    if col_normalized.startswith('dist_to_') and not col_normalized.endswith('_km'):
        col_normalized = col_normalized + '_km'
    new_columns.append(col_normalized)

# Assign the new column names
city_bcn_distances.columns = new_columns

# Round all dist_to_*_km columns to two decimals
dist_columns = [col for col in new_columns if col.startswith('dist_to_')]

for col in dist_columns:
    city_bcn_distances[col] = city_bcn_distances[col].round(2)


In [30]:
# Display the Barcelona distance table after the column renaming and rounding.
city_bcn_distances

Unnamed: 0,id,dist_to_sagrada_familia_km,dist_to_casa_mila_km,dist_to_casa_batllo_km,dist_to_parc_guell_km,dist_to_la_rambla_km,dist_to_montjuic_km
0,18674,0.26,1.45,1.67,1.94,2.64,4.97
1,23197,3.91,5.19,5.12,5.60,5.21,7.83
2,32711,0.42,1.34,1.61,1.76,2.66,4.91
3,34241,2.56,1.97,1.50,4.21,0.26,2.69
4,34981,2.65,2.10,1.63,4.33,0.38,2.69
...,...,...,...,...,...,...,...
19477,1031058238634065003,1.84,1.22,0.76,3.43,0.60,3.08
19478,1025437767908153304,2.46,1.21,1.49,2.26,2.53,3.47
19479,1025633790000062773,4.21,2.82,2.72,4.40,2.81,1.74
19480,1025643602289861008,4.19,2.81,2.71,4.38,2.81,1.77


## Load to Cleansed Layer Airbnb

In [33]:
# SAS token and container URL
# The SAS token grants access to the storage account, so it is read from the
# environment instead of being written into the notebook.
# NOTE: any token that was ever saved in this notebook should be revoked.
sas_token = os.environ.get("AZURE_STORAGE_SAS_TOKEN", "")
if not sas_token:
    print("AZURE_STORAGE_SAS_TOKEN is not set; uploads to Azure Blob Storage will fail.")
container_url = "https://datalakestoragerentscape.blob.core.windows.net"

def upload_dataframe_to_blob(df, blob_name, container="cleansed-layer-airbnb"):
    """Save `df` as a CSV file and upload that file to Azure Blob Storage.

    The frame is first written (without its index) to ``<blob_name>.csv`` in
    the current working directory; the file is then uploaded and left on disk.
    The blob is stored under `blob_name` exactly as given (no ``.csv`` suffix
    is added) and overwrites any existing blob with that name.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to export.
    blob_name : str
        Name of the destination blob, also used as the local file stem.
    container : str, optional
        Destination container. Defaults to ``"cleansed-layer-airbnb"``, the
        container this notebook has always written to.

    Uses the notebook-level `container_url` and `sas_token` to authenticate.
    """
    # Save DataFrame to a local CSV file
    file_path = f"{blob_name}.csv"
    df.to_csv(file_path, index=False)

    # Connect to the destination container. Distinct local names are used so
    # they are not confused with the notebook-level source-container clients.
    upload_service = BlobServiceClient(account_url=container_url, credential=sas_token)
    target_container = upload_service.get_container_client(container)

    # Upload the CSV file to the Blob Storage
    with open(file_path, "rb") as data:
        target_container.upload_blob(name=blob_name, data=data, overwrite=True)

    print(f"Uploaded {blob_name} to {container_url}/{container}")


# Push the three cleansed tables, each under a blob name matching its variable.
for table, target_name in (
    (cities_amenities, "cities_amenities"),
    (city_bcn_distances, "city_bcn_distances"),
    (city_prg_distances, "city_prg_distances"),
):
    upload_dataframe_to_blob(table, target_name)


Uploaded cities_amenities to https://datalakestoragerentscape.blob.core.windows.net/cleansed-layer-airbnb
Uploaded city_bcn_distances to https://datalakestoragerentscape.blob.core.windows.net/cleansed-layer-airbnb
Uploaded city_prg_distances to https://datalakestoragerentscape.blob.core.windows.net/cleansed-layer-airbnb
