In [1]:
# data_cleaning.ipynb

# Importing necessary libraries
import pandas as pd

# Read the two raw CSV exports. Both files are decoded as ISO-8859-1
# instead of the pandas default (UTF-8).
csv_read_options = {'encoding': 'ISO-8859-1'}
airbnb_data = pd.read_csv('../data/airbnb_data.csv', **csv_read_options)
hotel_data = pd.read_csv('../data/hotel_data.csv', **csv_read_options)

# Preview the top rows of each frame under its own heading.
for heading, frame in (("Airbnb Data:", airbnb_data), ("Hotel Data:", hotel_data)):
    print(heading)
    display(frame.head())

# Per-column count of null cells in each raw dataset.
for heading, frame in (
    ("Missing values in Airbnb Data:", airbnb_data),
    ("Missing values in Hotel Data:", hotel_data),
):
    print(heading)
    print(frame.isnull().sum())

# Handling missing values
# Airbnb: price is critical for the analysis, so rows without one are dropped.
airbnb_data = airbnb_data.dropna(subset=['price'])

# Hotel: fill missing high/low rates with the mean of the respective column.
# The filled Series is assigned back to the column rather than calling
# `hotel_data[col].fillna(..., inplace=True)`. That in-place form operates on a
# column selection (chained assignment): it emits a FutureWarning on pandas 2.x
# and leaves `hotel_data` unmodified when Copy-on-Write is active (the default
# from pandas 3.0), so the imputation would silently stop working.
for rate_column in ('high_rate', 'low_rate'):
    column_mean = hotel_data[rate_column].mean()
    hotel_data[rate_column] = hotel_data[rate_column].fillna(column_mean)

# NOTE(review): the null report printed by this notebook shows no missing
# high_rate/low_rate values but one missing star_rating, which is not handled
# here - confirm whether star_rating should be imputed or the row dropped.

# Re-run the per-column null counts to confirm the effect of the cleaning step.
for heading, frame in (
    ("Missing values in Airbnb Data after cleaning:", airbnb_data),
    ("Missing values in Hotel Data after cleaning:", hotel_data),
):
    print(heading)
    print(frame.isnull().sum())

# Normalise the column labels of both frames: lower-case them and replace
# spaces with underscores so the two datasets follow the same convention.
for frame in (airbnb_data, hotel_data):
    lowered = frame.columns.str.lower()
    frame.columns = lowered.str.replace(' ', '_')

# Preview the top rows of each cleaned frame under its own heading.
for heading, frame in (
    ("Cleaned Airbnb Data:", airbnb_data),
    ("Cleaned Hotel Data:", hotel_data),
):
    print(heading)
    display(frame.head())

# Write each cleaned frame to its CSV file, omitting the index and using the
# same ISO-8859-1 encoding that was used when reading the raw files.
for frame, destination in (
    (airbnb_data, '../data/cleaned_airbnb_data.csv'),
    (hotel_data, '../data/cleaned_hotel_data.csv'),
):
    frame.to_csv(destination, index=False, encoding='ISO-8859-1')

print("Cleaned data saved to 'cleaned_airbnb_data.csv' and 'cleaned_hotel_data.csv'.")







Airbnb Data:


Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


Hotel Data:


Unnamed: 0,ean_hotel_id,name,address1,city,state_province,postal_code,latitude,longitude,star_rating,high_rate,low_rate
0,269955,Hilton Garden Inn Albany/SUNY Area,1389 Washington Ave,Albany,NY,12206,42.68751,-73.81643,3.0,154.0272,124.0216
1,113431,Courtyard by Marriott Albany Thruway,1455 Washington Avenue,Albany,NY,12206,42.68971,-73.82021,3.0,179.01,134.0
2,108151,Radisson Hotel Albany,205 Wolf Rd,Albany,NY,12205,42.7241,-73.79822,3.0,134.17,84.16
3,254756,Hilton Garden Inn Albany Medical Center,62 New Scotland Ave,Albany,NY,12208,42.65157,-73.77638,3.0,308.2807,228.4597
4,198232,CrestHill Suites SUNY University Albany,1415 Washington Avenue,Albany,NY,12206,42.68873,-73.81854,3.0,169.39,89.39


Missing values in Airbnb Data:
id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64
Missing values in Hotel Data:
ean_hotel_id      0
name              0
address1          0
city              0
state_province    0
postal_code       0
latitude          0
longitude         0
star_rating       1
high_rate         0
low_rate          0
dtype: int64
Missing values in Airbnb Data after cleaning:
id     

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


Cleaned Hotel Data:


Unnamed: 0,ean_hotel_id,name,address1,city,state_province,postal_code,latitude,longitude,star_rating,high_rate,low_rate
0,269955,Hilton Garden Inn Albany/SUNY Area,1389 Washington Ave,Albany,NY,12206,42.68751,-73.81643,3.0,154.0272,124.0216
1,113431,Courtyard by Marriott Albany Thruway,1455 Washington Avenue,Albany,NY,12206,42.68971,-73.82021,3.0,179.01,134.0
2,108151,Radisson Hotel Albany,205 Wolf Rd,Albany,NY,12205,42.7241,-73.79822,3.0,134.17,84.16
3,254756,Hilton Garden Inn Albany Medical Center,62 New Scotland Ave,Albany,NY,12208,42.65157,-73.77638,3.0,308.2807,228.4597
4,198232,CrestHill Suites SUNY University Albany,1415 Washington Avenue,Albany,NY,12206,42.68873,-73.81854,3.0,169.39,89.39


Cleaned data saved to 'cleaned_airbnb_data.csv' and 'cleaned_hotel_data.csv'.
