In [2]:
from azure.storage.blob import BlobServiceClient
import pandas as pd
import io
from concurrent.futures import ThreadPoolExecutor, as_completed

In [3]:
from azure.storage.blob import BlobServiceClient
import pandas as pd
import io
import pyarrow.parquet as pq
from concurrent.futures import ThreadPoolExecutor, as_completed
import os

# SAS token and storage-account URL.
# The SAS token is a credential, so it is read from the environment rather than
# being embedded in the notebook (set RENTSCAPE_SOURCE_SAS_TOKEN before starting
# the kernel). Any token that was previously committed here should be treated
# as compromised and rotated.
sas_token = os.environ["RENTSCAPE_SOURCE_SAS_TOKEN"]
# NOTE: despite its name this holds the storage *account* URL; the container is
# selected separately below.
container_url = "https://datalakestoragerentscape.blob.core.windows.net"

# Client for the storage account, authenticated with the SAS token
blob_service_client = BlobServiceClient(account_url=container_url, credential=sas_token)

# Client bound to the container that holds the source files
container_client = blob_service_client.get_container_client(container="openai-rentscape-blob")

# Function to download blob and load it into a DataFrame
def download_and_load_blob(blob_name):
    """Download one blob and expose it as a module-level DataFrame.

    CSV blobs are decoded as UTF-8 and parsed with ``pd.read_csv``; Parquet
    blobs are parsed with ``pd.read_parquet``. The resulting frame is stored
    in ``globals()`` under the blob's base file name without directory or
    extension (``"raw/prague_listings.csv"`` -> ``prague_listings``) and that
    name is printed.

    Blobs with any other extension are skipped *before* anything is
    downloaded, and the extension check ignores case, so ``.CSV`` and
    ``.PARQUET`` files are loaded as well.

    Args:
        blob_name: Name of the blob inside the container behind
            ``container_client``.

    Returns:
        None. Results are published through ``globals()``. Failures are
        reported with ``print`` instead of being raised, so one bad blob does
        not stop the remaining ones from loading.
    """
    try:
        # Decide on the parser first so unsupported blobs cost no network traffic.
        normalized_name = blob_name.lower()
        is_csv = normalized_name.endswith('.csv')
        is_parquet = normalized_name.endswith('.parquet')
        if not (is_csv or is_parquet):
            print(f"Skipping unsupported file type: {blob_name}")
            return

        blob_client = container_client.get_blob_client(blob_name)
        blob_data = blob_client.download_blob().readall()

        if is_csv:
            df = pd.read_csv(io.StringIO(blob_data.decode('utf-8')))
        else:
            df = pd.read_parquet(io.BytesIO(blob_data))

        # Name the variable after the file only (directory path and extension removed).
        variable_name = os.path.splitext(os.path.basename(blob_name))[0]

        # NOTE: blobs sharing a base name in different folders overwrite each
        # other here, and names that are not valid identifiers are reachable
        # only through globals()[...].
        globals()[variable_name] = df
        print(f"{variable_name}")
    except Exception as e:
        print(f"Failed to process {blob_name}: {e}")

# Collect the name of every blob in the container up front
blob_names = [entry.name for entry in container_client.list_blobs()]

# Download and parse the blobs concurrently on a pool of 10 worker threads
with ThreadPoolExecutor(max_workers=10) as executor:
    # Remember which blob each submitted task is loading
    futures = {
        executor.submit(download_and_load_blob, name): name
        for name in blob_names
    }

    # Block until every task is done; result() surfaces anything a worker let escape
    for future in as_completed(futures):
        future.result()

# After this, each file in the blob will be loaded as a DataFrame
# The variable names will correspond to just the file names (excluding extensions).



barcelona_neighborhoods_details
prague_neighborhoods_details
prague_listings
barcelona_listings


In [4]:
# Function to split features and rename columns with a prefix, preserving the original ID column
def split_and_rename_features_with_id(df, id_column_index, feature_column_index, prefix):
    """Expand a ';'-separated feature column into one column per feature.

    Args:
        df: Source DataFrame.
        id_column_index: Positional index of the identifier column to keep.
        feature_column_index: Positional index of the string column to split on ";".
        prefix: Prefix for the generated columns, which are named
            ``{prefix}_feature_1``, ``{prefix}_feature_2``, ...

    Returns:
        A new DataFrame holding the identifier column followed by the split
        feature columns. Rows with fewer features than the widest row are
        padded with missing values.
    """
    ids = df.iloc[:, id_column_index]

    # expand=True yields one column per ';'-separated piece
    pieces = df.iloc[:, feature_column_index].str.split(";", expand=True)
    pieces.columns = [
        f"{prefix}_feature_{position}"
        for position in range(1, pieces.shape[1] + 1)
    ]

    return pd.concat([ids, pieces], axis=1)

# Split the feature column (position 1) of each city's listings, keeping the id
# column (position 0). A city is skipped when its frame was not loaded.
if "barcelona_listings" in globals():
    barcelona_listings_split_with_id = split_and_rename_features_with_id(
        df=barcelona_listings,
        id_column_index=0,
        feature_column_index=1,
        prefix="description",
    )

if "prague_listings" in globals():
    prague_listings_split_with_id = split_and_rename_features_with_id(
        df=prague_listings,
        id_column_index=0,
        feature_column_index=1,
        prefix="description",
    )

In [5]:
# Display the split Barcelona listings (id plus description_feature_N columns)
barcelona_listings_split_with_id

Unnamed: 0,id,description_feature_1,description_feature_2,description_feature_3
0,18674,Near major landmarks,Family-friendly,Quiet area
1,23197,Central location,Family-friendly,Spacious living areas
2,32711,Near major landmarks,Family-friendly,Cozy atmosphere
3,34241,Central location,Close to cultural attractions,Charming old-town vibe
4,34981,Spacious living areas,Family-friendly,Fully equipped kitchen
...,...,...,...,...
19477,1234621063113396232,Central location,Fully equipped kitchen,Luxury apartment
19478,1234633310288552263,Central location,Ideal for couples,Luxury apartment
19479,1234636058660290258,Central location,Fully equipped kitchen,Luxury apartment
19480,1234659029532006287,Cozy atmosphere,Includes cleaning service,Ideal for couples


In [6]:
# Display the Prague neighbourhood details frame as loaded from blob storage
prague_neighborhoods_details

Unnamed: 0,Unique_Neighbourhoods,Exposure_Score,Rising_Star,Regulations
0,Anděl,4.0,Yes,Yes
1,Barrandov,3.0,Yes,Yes
2,Belárie,3.0,Yes,Yes.
3,Bližná,3.0,Yes,No.
4,Bohnice,2.0,No,Yes.
...,...,...,...,...
124,Řepy,2.0,No,Yes.
125,Şişli,4.0,Yes,Yes.
126,Štěrboholy,2.0,No,Yes
127,Žižkov,4.0,Yes,Yes


Append

In [7]:
# Stack the per-city frames row-wise (Prague first, then Barcelona) and
# renumber the rows so the combined index runs 0..n-1.
relevant_features_description = pd.concat(
    [prague_listings_split_with_id, barcelona_listings_split_with_id],
    axis=0,
    ignore_index=True,
)
neighbourhood_features = pd.concat(
    [prague_neighborhoods_details, barcelona_neighborhoods_details],
    axis=0,
    ignore_index=True,
)

neighbourhood_features

Unnamed: 0,Unique_Neighbourhoods,Exposure_Score,Rising_Star,Regulations
0,Anděl,4.0,Yes,Yes
1,Barrandov,3.0,Yes,Yes
2,Belárie,3.0,Yes,Yes.
3,Bližná,3.0,Yes,No.
4,Bohnice,2.0,No,Yes.
...,...,...,...,...
271,Zona 9,3.0,No,Yes
272,el Fort Pienc,3.0,Yes,Yes
273,la Sagrada Familia,4.0,Yes,Yes
274,la Sagrada Família,4.0,Yes,Yes


In [8]:
# Discard rows that have no neighbourhood name, then renumber the remaining rows
neighbourhood_features = (
    neighbourhood_features
    .dropna(subset=['Unique_Neighbourhoods'])
    .reset_index(drop=True)
)

neighbourhood_features

Unnamed: 0,Unique_Neighbourhoods,Exposure_Score,Rising_Star,Regulations
0,Anděl,4.0,Yes,Yes
1,Barrandov,3.0,Yes,Yes
2,Belárie,3.0,Yes,Yes.
3,Bližná,3.0,Yes,No.
4,Bohnice,2.0,No,Yes.
...,...,...,...,...
269,Zona 5,4.0,Yes,Yes.
270,Zona 9,3.0,No,Yes
271,el Fort Pienc,3.0,Yes,Yes
272,la Sagrada Familia,4.0,Yes,Yes


In [9]:
# Check dtypes and non-null counts of the combined neighbourhood frame
neighbourhood_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 274 entries, 0 to 273
Data columns (total 4 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unique_Neighbourhoods  274 non-null    object 
 1   Exposure_Score         274 non-null    float64
 2   Rising_Star            274 non-null    object 
 3   Regulations            274 non-null    object 
dtypes: float64(1), object(3)
memory usage: 8.7+ KB


Cleaning

In [10]:
def clean_combined_dataframe(neighbourhood_features, columns):
    """
    Return a cleaned copy of the neighbourhood DataFrame.

    Every '.' character is removed from the listed text columns (for example
    "Yes." becomes "Yes"). If an 'Exposure_Score' column is present it is
    converted to int; values that are missing or cannot be parsed as numbers
    become 0.

    The input frame is left untouched: all changes are applied to a copy, so
    re-running the calling cell or inspecting the original afterwards gives
    consistent results.

    Args:
        neighbourhood_features: DataFrame to clean.
        columns: Names of string columns from which '.' should be removed.

    Returns:
        A new, cleaned DataFrame.
    """
    cleaned = neighbourhood_features.copy()

    for column in columns:
        # regex=False: '.' is a literal dot, not the "any character" pattern
        cleaned[column] = cleaned[column].str.replace('.', '', regex=False)

    if 'Exposure_Score' in cleaned.columns:
        cleaned['Exposure_Score'] = (
            pd.to_numeric(cleaned['Exposure_Score'], errors='coerce')
            .fillna(0)
            .astype(int)
        )

    return cleaned


In [11]:
# Clean the Yes/No text columns (and Exposure_Score) of the combined
# neighbourhood frame, or report that the frame has not been built yet.
if 'neighbourhood_features' not in globals():
    print("The 'neighbourhood_features' DataFrame is not defined.")
else:
    neighbourhood_features = clean_combined_dataframe(
        neighbourhood_features=neighbourhood_features,
        columns=['Rising_Star', 'Regulations'],
    )


In [12]:
# Function to convert all column names to snake_case (underscore)
def convert_columns_to_snake_case(df):
    """Rename the columns of ``df`` in place to lower-case, underscore-separated names.

    Each column label has surrounding whitespace stripped, spaces replaced by
    underscores, and is lower-cased. The same DataFrame object is returned so
    the call can be used in an assignment.
    """
    trimmed = df.columns.str.strip()
    underscored = trimmed.str.replace(" ", "_")
    df.columns = underscored.str.lower()
    return df

# Normalise the column names of whichever combined frames exist in this session
if 'relevant_features_description' in globals():
    relevant_features_description = convert_columns_to_snake_case(
        df=relevant_features_description
    )

if 'neighbourhood_features' in globals():
    neighbourhood_features = convert_columns_to_snake_case(
        df=neighbourhood_features
    )


# Keep only the first occurrence of each fully identical row
neighbourhood_features = neighbourhood_features.drop_duplicates(keep='first')



Final

In [13]:
# Check dtypes and non-null counts of the combined listing-features frame
relevant_features_description.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28548 entries, 0 to 28547
Data columns (total 4 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id                     28548 non-null  int64 
 1   description_feature_1  28288 non-null  object
 2   description_feature_2  27365 non-null  object
 3   description_feature_3  27362 non-null  object
dtypes: int64(1), object(3)
memory usage: 892.3+ KB


In [14]:
# Display the cleaned neighbourhood frame (snake_case columns, duplicates dropped)
neighbourhood_features

Unnamed: 0,unique_neighbourhoods,exposure_score,rising_star,regulations
0,Anděl,4,Yes,Yes
1,Barrandov,3,Yes,Yes
2,Belárie,3,Yes,Yes
3,Bližná,3,Yes,No
4,Bohnice,2,No,Yes
...,...,...,...,...
269,Zona 5,4,Yes,Yes
270,Zona 9,3,No,Yes
271,el Fort Pienc,3,Yes,Yes
272,la Sagrada Familia,4,Yes,Yes


In [15]:
# Output directory, relative to the current working directory.
# Built with os.path.join so the folders are nested on every OS; a literal
# backslash-separated string only nests on Windows and elsewhere creates a
# single directory whose name contains backslashes.
desired_path = os.path.join("datalakesairbnb", "rodrigo", "DW")

# Create the directory tree if it is missing (no error when it already exists)
os.makedirs(desired_path, exist_ok=True)

# Write both frames as CSV without the DataFrame index
relevant_features_description.to_csv(os.path.join(desired_path, "relevant_features_description.csv"), index=False)
neighbourhood_features.to_csv(os.path.join(desired_path, "neighbourhood_features.csv"), index=False)

print(f"Files saved to {desired_path}")

Files saved to datalakesairbnb\rodrigo\DW


-- Target table for relevant_features_description.csv produced by this notebook.
-- id is the identifier column carried over from the *_listings frames (int64, never null).
-- The three feature columns are nullable because some rows have fewer than three features.
CREATE TABLE relevant_features_description (
    id BIGINT NOT NULL,
    description_feature_1 NVARCHAR(MAX),
    description_feature_2 NVARCHAR(MAX),
    description_feature_3 NVARCHAR(MAX)
);

-- Target table for neighbourhood_features.csv produced by this notebook.
-- unique_neighbourhoods is NOT NULL: rows without a neighbourhood name are dropped in the notebook.
-- exposure_score is NOT NULL: the notebook casts it to int, replacing missing values with 0.
-- rising_star and regulations hold the cleaned "Yes"/"No" text values.
CREATE TABLE neighbourhood_features (
    unique_neighbourhoods NVARCHAR(MAX) NOT NULL,
    exposure_score INT NOT NULL,
    rising_star NVARCHAR(MAX),
    regulations NVARCHAR(MAX)
);


In [31]:
from azure.storage.blob import BlobServiceClient
import pandas as pd

# SAS token and storage-account URL for the cleansed layer.
# The SAS token is a credential, so it is read from the environment rather than
# being embedded in the notebook (set RENTSCAPE_CLEANSED_SAS_TOKEN before
# starting the kernel; `os` is imported earlier in the notebook). Any token that
# was previously committed here should be treated as compromised and rotated.
sas_token = os.environ["RENTSCAPE_CLEANSED_SAS_TOKEN"]
# NOTE: despite its name this holds the storage *account* URL, not a container URL.
container_url = "https://datalakestoragerentscape.blob.core.windows.net"

# Function to upload a DataFrame to Blob Storage
def upload_dataframe_to_blob(df, blob_name):
    """Serialize ``df`` to CSV and upload it to the ``cleansed-layer-airbnb`` container.

    The CSV is built in memory and uploaded directly, so no intermediate file
    is left in the working directory (writing to ``f"{blob_name}.csv"`` would
    create a stray ``<name>.csv.csv`` whenever ``blob_name`` already ends in
    ``.csv``). An existing blob with the same name is overwritten.

    Args:
        df: DataFrame to upload; it is written without its index.
        blob_name: Destination blob name, used verbatim.

    Returns:
        None. A confirmation line is printed after the upload.
    """
    # Same bytes a UTF-8 CSV file on disk would contain
    csv_bytes = df.to_csv(index=False).encode("utf-8")

    # Connect to the target container using the module-level URL and SAS token
    blob_service_client = BlobServiceClient(account_url=container_url, credential=sas_token)
    container_client = blob_service_client.get_container_client("cleansed-layer-airbnb")

    container_client.upload_blob(name=blob_name, data=csv_bytes, overwrite=True)

    print(f"Uploaded {blob_name} to {container_url}/cleansed-layer-airbnb")

# Publish both cleaned frames to the cleansed layer under their final blob names
upload_dataframe_to_blob(
    df=relevant_features_description,
    blob_name="cities_relevant_features_classification.csv",
)
upload_dataframe_to_blob(
    df=neighbourhood_features,
    blob_name="cities_neighbourhood_features.csv",
)



Uploaded cities_relevant_features_classification.csv to https://datalakestoragerentscape.blob.core.windows.net/cleansed-layer-airbnb
Uploaded cities_neighbourhood_features.csv to https://datalakestoragerentscape.blob.core.windows.net/cleansed-layer-airbnb
