### LIBRARIES

In [8]:
import pandas as pd
import zstandard as zstd
import io

### FILE PATHS

In [17]:
# Paths to the raw zstd-compressed CSV files under ../data/raw/v1
# (relative to the directory the notebook is run from).
CALENDAR_DATA_V1_PATH = "../data/raw/v1/calendar.csv.zst"
LISTINGS_DATA_V1_PATH = "../data/raw/v1/listings.csv.zst"
REVIEWS_DATA_V1_PATH = "../data/raw/v1/reviews.csv.zst"
SESSIONS_DATA_V1_PATH = "../data/raw/v1/sessions.csv.zst"
USERS_DATA_V1_PATH = "../data/raw/v1/users.csv.zst"



#### LOAD DATA FROM csv.zst

In [None]:
# Build a DataFrame from a Zstandard-compressed CSV file (.csv.zst)
def create_dataframe_from_csv_zst(filepath: str) -> pd.DataFrame:
    """Read a ``.csv.zst`` file into a pandas DataFrame.

    The file is opened in binary mode, decompressed as a stream with
    ``zstandard``, decoded as UTF-8 text and handed to ``pd.read_csv``.

    Args:
        filepath: Path of the Zstandard-compressed CSV file.

    Returns:
        The parsed contents of the CSV as a DataFrame.
    """
    with open(filepath, 'rb') as raw_file:
        with zstd.ZstdDecompressor().stream_reader(raw_file) as byte_stream:
            # pandas is fed text here, so wrap the decompressed byte stream
            # in a UTF-8 decoding layer before parsing.
            text_stream = io.TextIOWrapper(byte_stream, encoding='utf-8')
            # low_memory=False makes pandas parse the file in one pass rather
            # than in internal chunks.
            frame = pd.read_csv(text_stream, low_memory=False)
    return frame


In [19]:
# Load every raw v1 file into its own DataFrame. Each call reads and
# decompresses the whole file, so this cell can take a while on the larger
# datasets. The resulting names are used by the inspection cells below.
calendar_dataframe = create_dataframe_from_csv_zst(CALENDAR_DATA_V1_PATH)
listings_dataframe = create_dataframe_from_csv_zst(LISTINGS_DATA_V1_PATH)
reviews_dataframe = create_dataframe_from_csv_zst(REVIEWS_DATA_V1_PATH)
sessions_dataframe = create_dataframe_from_csv_zst(SESSIONS_DATA_V1_PATH)
users_dataframe = create_dataframe_from_csv_zst(USERS_DATA_V1_PATH)

In [27]:
# Report missing values for each column of a DataFrame
def check_dataframe_missing_values(df: pd.DataFrame) -> None:
    """Print the number and percentage of missing values per column.

    One line is printed for every column of ``df``, in column order, in the
    form ``Column: <name>, Missing Values: <count>, Missing Percentage: <pct>%``.
    A value counts as missing when ``Series.isnull()`` flags it (NaN/None/NaT).

    Args:
        df: The DataFrame to inspect. It is not modified.

    Returns:
        None. The report is written to standard output.
    """
    total_values = len(df)

    for col in df.columns:
        missing_values = df[col].isnull().sum()
        # A frame with zero rows has nothing missing; guard the division so
        # it reports 0.00% instead of dividing by zero (nan / RuntimeWarning).
        if total_values:
            missing_percentage = (missing_values / total_values) * 100
        else:
            missing_percentage = 0.0
        print(f"Column: {col}, Missing Values: {missing_values}, Missing Percentage: {missing_percentage:.2f}%")

### CALENDAR DF

In [22]:
calendar_dataframe.head(5)

Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
0,30142520.0,2025-02-04,t,$80.00,,2.0,1125.0
1,49766630.0,2025-01-23,t,$125.00,,1.0,1125.0
2,49300560.0,2025-07-30,,$100.00,,,
3,7.431842e+17,2025-03-06,t,,,1.0,1125.0
4,,,t,,,5.0,30.0


#### Missing values

In [28]:
check_dataframe_missing_values(calendar_dataframe)

Column: listing_id, Missing Values: 320571, Missing Percentage: 19.99%
Column: date, Missing Values: 320764, Missing Percentage: 20.01%
Column: available, Missing Values: 320643, Missing Percentage: 20.00%
Column: price, Missing Values: 320420, Missing Percentage: 19.99%
Column: adjusted_price, Missing Values: 1603286, Missing Percentage: 100.00%
Column: minimum_nights, Missing Values: 320421, Missing Percentage: 19.99%
Column: maximum_nights, Missing Values: 321261, Missing Percentage: 20.04%


### LISTINGS DF

In [21]:
listings_dataframe.head(5)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,30419470.0,https://www.nocarz.pl/rooms/30419466,20241230000000.0,2024-12-25,,Athenian Niche in Plaka | Athenian Homes,,,https://a0.muscache.com/pictures/f6649b43-fcf6...,135482103.0,...,4.97,,4.91,287535.0,t,12.0,12.0,0.0,0.0,2.51
1,49982680.0,https://www.nocarz.pl/rooms/49982681,20241230000000.0,2024-12-26,city scrape,Athenian Apartments - Syntagma sq. #1,Experience the vibrant heartbeat of Athens fro...,There are plenty of sights to see in Athens in...,,,...,,4.91,,2033718.0,t,70.0,70.0,0.0,0.0,
2,48800720.0,https://www.nocarz.pl/rooms/48800718,20241230000000.0,2024-12-25,city scrape,"Modern Penthouse with Parking, Terrace & BBQ",You will be staying at a unique rooftop Duplex...,Koukaki is a residential area of Athens but at...,https://a0.muscache.com/pictures/miso/Hosting-...,113548208.0,...,4.79,4.71,4.74,3035440.0,t,90.0,,0.0,0.0,3.08
3,1.101264e+18,https://www.nocarz.pl/rooms/1101264403993187936,20241230000000.0,2024-12-25,,,At Ma Maison Downtown Villa you will discover ...,Gazi is ideal for discovering Athens because o...,https://a0.muscache.com/pictures/hosting/Hosti...,118181401.0,...,5.0,4.66,4.86,2489817.0,t,8.0,8.0,,0.0,3.56
4,,,20241230000000.0,2024-12-25,city scrape,Stylish Contemporary Penthouse - Hip Athens Ce...,"Located in Kolonaki, Central Athens’ upmarket ...",,https://a0.muscache.com/pictures/miso/Hosting-...,4899687.0,...,4.99,4.91,4.81,,t,1.0,1.0,0.0,0.0,3.76


In [29]:
check_dataframe_missing_values(listings_dataframe)

Column: id, Missing Values: 909, Missing Percentage: 20.70%
Column: listing_url, Missing Values: 881, Missing Percentage: 20.06%
Column: scrape_id, Missing Values: 891, Missing Percentage: 20.29%
Column: last_scraped, Missing Values: 864, Missing Percentage: 19.67%
Column: source, Missing Values: 852, Missing Percentage: 19.40%
Column: name, Missing Values: 888, Missing Percentage: 20.22%
Column: description, Missing Values: 899, Missing Percentage: 20.47%
Column: neighborhood_overview, Missing Values: 2540, Missing Percentage: 57.83%
Column: picture_url, Missing Values: 883, Missing Percentage: 20.10%
Column: host_id, Missing Values: 892, Missing Percentage: 20.31%
Column: host_url, Missing Values: 872, Missing Percentage: 19.85%
Column: host_name, Missing Values: 854, Missing Percentage: 19.44%
Column: host_since, Missing Values: 921, Missing Percentage: 20.97%
Column: host_location, Missing Values: 1945, Missing Percentage: 44.29%
Column: host_about, Missing Values: 2584, Missing Pe

### REVIEWS DF

In [23]:
reviews_dataframe.head(5)

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,15029864.0,1.208519e+18,2024-07-25,,Daniel,"ניקוס מאוד עזר, וזמן תגובה ממש מהיר. המיקום נה..."
1,23124958.0,8.396945e+17,2023-03-04,501218607.0,Ivancillo,Everything was perfect and the location was fa...
2,22865582.0,6.338012e+17,,282184671.0,Isak,Nice
3,51336810.0,6.279988e+17,2022-05-16,,Kenna,We had such a great time staying here! The loc...
4,,1.038245e+18,,,Charly,"Très bon logement. <br/>Bien situé, propre. <..."


In [30]:
check_dataframe_missing_values(reviews_dataframe)

Column: listing_id, Missing Values: 46517, Missing Percentage: 19.96%
Column: id, Missing Values: 46902, Missing Percentage: 20.12%
Column: date, Missing Values: 46494, Missing Percentage: 19.95%
Column: reviewer_id, Missing Values: 46497, Missing Percentage: 19.95%
Column: reviewer_name, Missing Values: 46743, Missing Percentage: 20.06%
Column: comments, Missing Values: 46397, Missing Percentage: 19.91%


### SESSIONS DF

In [32]:
sessions_dataframe.head(5)

Unnamed: 0,action,user_id,timestamp,listing_id
0,browse_listings,,2024-06-30T14:44:43.340297,
1,view_listing,24106857.0,2024-06-30T15:08:59.340297,51696960.0
2,view_listing,,2024-06-30T15:13:38.340297,9.023092e+17
3,view_listing,24106857.0,,1.126008e+18
4,,24106857.0,,


In [33]:
check_dataframe_missing_values(sessions_dataframe)

Column: action, Missing Values: 629445, Missing Percentage: 20.00%
Column: user_id, Missing Values: 628980, Missing Percentage: 19.99%
Column: timestamp, Missing Values: 629690, Missing Percentage: 20.01%
Column: listing_id, Missing Values: 815573, Missing Percentage: 25.92%


### USERS DF

In [25]:
users_dataframe.head(5)

Unnamed: 0,id,name,surname,city,street,street_number,postal_code
0,432845450.0,Julita,,Malbork,,,29-597
1,45257839.0,Apolonia,Jaroch,Sosnowiec,Kwiatowa,29/56,
2,484782310.0,Kalina,Dubik,Będzin,Dobra,97,
3,486291306.0,,Leonowicz,Luboń,Partyzantów,757,80-705
4,,Marianna,Ciołczyk,Oleśnica,Kreta,,89-001


In [34]:
check_dataframe_missing_values(users_dataframe)

Column: id, Missing Values: 43826, Missing Percentage: 19.97%
Column: name, Missing Values: 44086, Missing Percentage: 20.09%
Column: surname, Missing Values: 43815, Missing Percentage: 19.96%
Column: city, Missing Values: 43863, Missing Percentage: 19.98%
Column: street, Missing Values: 44045, Missing Percentage: 20.07%
Column: street_number, Missing Values: 43690, Missing Percentage: 19.91%
Column: postal_code, Missing Values: 43905, Missing Percentage: 20.00%
