In [1]:
import pandas as pd
import numpy as np

# Load the raw Geneva listings file and report its size and column names.
listings_path = "../Data/geneva_listings.csv"
df = pd.read_csv(listings_path)
print("Shape of dataset:", df.shape)
print("Column names:\n", list(df.columns))

# Preview the first two rows (rich display as the cell's last expression).
df.head(2)


Shape of dataset: (2458, 75)
Column names:
 ['id', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'name', 'description', 'neighborhood_overview', 'picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude', 'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price', 'minimum_nights', 'maximum_nights', 'minimum_minimum_nights', 'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'calendar_updated', 'has_availability', 'availability_30', 'ava

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,42515.0,https://www.airbnb.com/rooms/42515,20230900000000.0,9/23/2023,city scrape,Rental unit in Geneva · ★4.73 · 1 bedroom · 1 ...,<b>The space</b><br />This is a private room w...,,https://a0.muscache.com/pictures/10640277/ff1d...,185647,...,4.83,4.84,4.51,,f,1,0,1,0,0.5
1,107438.0,https://www.airbnb.com/rooms/107438,20230900000000.0,9/23/2023,city scrape,Rental unit in Geneva · ★4.87 · 1 bedroom · 1 ...,<b>The space</b><br />Version Française et Ang...,,https://a0.muscache.com/pictures/93bb00cc-03bb...,556499,...,4.76,4.57,4.57,,f,1,0,1,0,0.18


In [2]:
# Per-column count of missing cells, and the same figure as a percentage of rows.
missing_counts = df.isna().sum()
missing_percent = missing_counts.div(len(df)).mul(100)

# Keep only columns that have at least one missing value, worst offenders first.
missing_data = (
    pd.DataFrame({'Missing Count': missing_counts, 'Missing %': missing_percent})
    .loc[lambda summary: summary['Missing Count'] > 0]
    .sort_values(by='Missing %', ascending=False)
)
missing_data.head(10)

Unnamed: 0,Missing Count,Missing %
bathrooms,2458,100.0
license,2458,100.0
neighbourhood_group_cleansed,2458,100.0
calendar_updated,2458,100.0
host_neighbourhood,2422,98.535395
neighbourhood,1273,51.790073
neighborhood_overview,1273,51.790073
host_about,1243,50.569569
bedrooms,842,34.255492
host_response_time,534,21.72498


In [3]:
# Drop columns with > 50% missing values.
# All eight columns above the threshold in the missing-value summary are listed,
# including `bathrooms` (100% missing) and `neighborhood_overview` (~52% missing).
# `bathrooms` is entirely empty, so it is dropped rather than imputed: there is no
# observed value to impute from, and borrowing the `bedrooms` median would only
# fabricate a constant column. `bathrooms_text` is not touched by this step.
cols_to_drop = ['bathrooms', 'license', 'neighbourhood_group_cleansed',
                'calendar_updated', 'host_neighbourhood',
                'neighbourhood', 'neighborhood_overview', 'host_about']
df_clean = df.drop(columns=cols_to_drop)

# Moderate missingness: impute instead of dropping so that no rows are lost.
# Numeric column -> median; categorical column -> most frequent value (mode).
df_clean['bedrooms'] = df_clean['bedrooms'].fillna(df_clean['bedrooms'].median())
df_clean['host_response_time'] = df_clean['host_response_time'].fillna(df_clean['host_response_time'].mode()[0])

### Step 1: Cleaning Up Missing Data

After loading the Geneva Airbnb dataset, we explored the extent of missing values across all columns. Several columns — including `bathrooms`, `license`, `calendar_updated`, and `neighbourhood_group_cleansed` — were missing data in 100% of the rows. Additionally, `host_neighbourhood`, `neighbourhood`, `neighborhood_overview`, and `host_about` were missing in over 50% of the records. Since these fields provided limited usable information, we decided to drop them from the dataset.

For columns with moderate missingness, we opted for imputation to preserve as much data as possible:
- `bedrooms`, being a numerical variable, was imputed using the median to avoid the influence of outliers.
- `host_response_time`, a categorical feature, was filled using the mode, assuming the most frequently observed behavior when data was unavailable.

This preprocessing step removes no rows: all 2,458 listings are retained, and `bedrooms` and `host_response_time` no longer contain missing values, which keeps the dataset complete and consistent for further analysis.


In [4]:
# Remove every "$" and "," from `price`, then cast the column to float.
# presumably raw prices are strings such as "$1,234.00" — confirm against the raw file
df_clean['price'] = (
    df_clean['price']
    .replace(r'[\$,]', '', regex=True)
    .astype(float)
)

# Save cleaned data
# NOTE(review): the raw file is read from "../Data/" but this writes to "../data/".
# On a case-sensitive filesystem these are different directories — confirm which
# casing is intended.
cleaned_path = '../data/airbnb_cleaned.csv'
df_clean.to_csv(cleaned_path, index=False)
print("Cleaned dataset saved to", cleaned_path)


Cleaned dataset saved to ../data/airbnb_cleaned.csv
