# Data Exploration

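In this notebook, we explore the datasets provided for the project: `train.csv`, `test.csv`, `reviews.csv`, `calendar.csv`, and `sample_submission.csv`. We load each file, inspect its structure, and check for missing values; we then clean the `price` column, look at its distribution, and check the date ranges and the train/test ID overlap.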

In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Plot and display configuration for the rest of the notebook.
# `sns.set_theme` is the preferred interface; `sns.set` is only a legacy alias
# for it and may be removed in a future seaborn release.
sns.set_theme(style="whitegrid")
# None removes the column limit so wide DataFrames are rendered in full.
pd.set_option('display.max_columns', None)

## List Data Files

In [23]:
# Folder that holds the project data files, relative to this notebook.
data_dir = '../data'

# Lazily build the full path of every file under data_dir (recursing into
# sub-folders in os.walk order) and print them one per line.
data_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk(data_dir)
    for file_name in file_names
)
for data_file_path in data_file_paths:
    print(data_file_path)

../data/reviews.csv
../data/test.csv
../data/calendar.csv
../data/train.csv
../data/sample_submission.csv


## Load Datasets

In [24]:
# Read each project CSV from data_dir into its own DataFrame.
# A missing file is reported and then re-raised: swallowing the error would
# leave the *_df names undefined, and every later cell would then fail with a
# NameError that hides the real cause.
try:
    train_df = pd.read_csv(os.path.join(data_dir, 'train.csv'))
    test_df = pd.read_csv(os.path.join(data_dir, 'test.csv'))
    reviews_df = pd.read_csv(os.path.join(data_dir, 'reviews.csv'))
    calendar_df = pd.read_csv(os.path.join(data_dir, 'calendar.csv'))
    sample_submission_df = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'))
    print("All datasets loaded successfully.")
except FileNotFoundError as e:
    print(f"Error loading files: {e}")
    raise

All datasets loaded successfully.


## Explore Train Data

In [25]:
# Report the (rows, columns) size of the training set, then preview its first rows.
train_shape = train_df.shape
print("Train Data Shape:", train_shape)
train_preview = train_df.head()
display(train_preview)

Train Data Shape: (24153, 58)


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,host_url,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_thumbnail_url,host_picture_url,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,neighbourhood,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,minimum_nights,maximum_nights,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms
0,0,0,25436,https://www.airbnb.com/rooms/25436,20250627144659,2025-06-28,previous scrape,In the forest Sea view Two minutes to the city.,Our house is in Baby Koru. If you want village...,Clean and elegand friends . U can find everyth...,https://a0.muscache.com/pictures/b64c3ef0-2256...,105823,https://www.airbnb.com/users/show/105823,Yesim,2010-04-09,"İstanbul, Turkey","Merhabalar . Kuş sesi , akan su sesi ile uyan...",,,,f,https://a0.muscache.com/im/pictures/user/067a8...,https://a0.muscache.com/im/pictures/user/067a8...,Beşiktaş,1.0,1.0,"['email', 'phone']",t,t,"Beşiktaş/ bebek, İstanbul, Turkey",Besiktas,,41.07883,29.03863,Entire rental unit,Entire home/apt,3,,1 bath,2.0,,"[""Room-darkening shades"", ""Wine glasses"", ""Sel...",,100,160,,,,,,,,,f,1,1,0,0
1,1,1,34177,https://www.airbnb.com/rooms/34177,20250627144659,2025-07-01,city scrape,PETIT HOUSE,My petit house is located in the bosphorous an...,the neighbourhood is very calm comparing the...,https://a0.muscache.com/pictures/47356451/c288...,147330,https://www.airbnb.com/users/show/147330,,,,,,,,f,,,,,,,,,"Beşiktaş, İstanbul, Turkey",Besiktas,,41.06681,29.04035,Entire home,Entire home/apt,3,1.0,1 bath,1.0,2.0,"[""Free dryer \u2013 In unit"", ""Extra pillows a...",1794.0,100,365,4.69,4.69,4.88,4.88,4.81,4.69,4.81,,f,5,5,0,0
2,2,2,42835,https://www.airbnb.com/rooms/42835,20250627144659,2025-06-28,previous scrape,Cozy apartment in the heart of Istanbul,Welcome to our cozy 2-bedroom apartment locate...,,https://a0.muscache.com/pictures/miso/Hosting-...,187026,https://www.airbnb.com/users/show/187026,Attila,2010-07-31,"Istanbul, Turkey",I am from Istanbul/ Turkey. I will do my best ...,,,,f,https://a0.muscache.com/im/users/187026/profil...,https://a0.muscache.com/im/users/187026/profil...,Şişli,3.0,3.0,"['email', 'phone']",t,t,,Sisli,,41.04303,28.98531,Entire rental unit,Entire home/apt,4,,2 baths,1.0,,"[""Wine glasses"", ""Laundromat nearby"", ""Cleanin...",,100,730,,,,,,,,,f,3,1,2,0
3,3,3,73477,https://www.airbnb.com/rooms/73477,20250627144659,2025-07-01,city scrape,Sea View Apartment in Taksim/ Center of Istanbul,Sea View Apartment in Taksim/ Center of Istanbul,Gumussuyu,https://a0.muscache.com/pictures/409813ff-21c0...,383789,https://www.airbnb.com/users/show/383789,Berat,2011-02-09,"İstanbul, Turkey",Im a young professional who lives both in Ista...,within an hour,100%,100%,t,https://a0.muscache.com/im/pictures/user/User-...,https://a0.muscache.com/im/pictures/user/User-...,Taksim,2.0,2.0,"['email', 'phone']",t,t,"Beyoğlu, İstanbul, Turkey",Beyoglu,,41.034447,28.987555,Entire rental unit,Entire home/apt,2,1.0,1 bath,1.0,2.0,"[""Window AC unit"", ""Extra pillows and blankets...",2331.0,100,360,5.0,5.0,4.82,5.0,5.0,5.0,4.91,,f,2,2,0,0
4,4,4,77292,https://www.airbnb.com/rooms/77292,20250627144659,2025-06-30,previous scrape,3 Bedroom Apartment with Terrace Bosphorus View,Who doesn't want to stay in a quiet place that...,"Gümüssuyu 3+1 Apartment with Terrace, offers y...",https://a0.muscache.com/pictures/1070c15c-8d09...,414003,https://www.airbnb.com/users/show/414003,Omer Faruk,2011-02-28,"İstanbul, Turkey","Merkezi konumda, çevresinde şık kafe ve restor...",within a few hours,100%,48%,f,https://a0.muscache.com/im/pictures/user/User-...,https://a0.muscache.com/im/pictures/user/User-...,Taksim,14.0,18.0,"['email', 'phone']",t,t,"İstanbul, Istanbul, Turkey",Beyoglu,,41.03583,28.98989,Entire rental unit,Entire home/apt,6,,2 baths,3.0,,"[""Dishwasher"", ""Cooking basics"", ""TV with stan...",,2,730,4.36,4.14,4.36,4.86,4.71,4.64,4.21,34-554,f,13,10,3,0


In [26]:
# Print the column list with non-null counts and dtypes for the training set.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24153 entries, 0 to 24152
Data columns (total 58 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   Unnamed: 0.1                                  24153 non-null  int64  
 1   Unnamed: 0                                    24153 non-null  int64  
 2   id                                            24153 non-null  int64  
 3   listing_url                                   24153 non-null  object 
 4   scrape_id                                     24153 non-null  int64  
 5   last_scraped                                  24153 non-null  object 
 6   source                                        24153 non-null  object 
 7   name                                          24153 non-null  object 
 8   description                                   23501 non-null  object 
 9   neighborhood_overview                         6905 non-null  

In [27]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
train_df.describe()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,scrape_id,host_id,host_listings_count,host_total_listings_count,neighbourhood_group_cleansed,latitude,longitude,accommodates,bathrooms,bedrooms,beds,minimum_nights,maximum_nights,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms
count,24153.0,24153.0,24153.0,24153.0,24153.0,23485.0,23485.0,0.0,24153.0,24153.0,24153.0,20839.0,23639.0,20871.0,24153.0,24153.0,15052.0,15050.0,15052.0,15050.0,15052.0,15050.0,15049.0,24153.0,24153.0,24153.0,24153.0
mean,12076.0,14850.732621,8.485073e+17,20250630000000.0,351288700.0,26.02657,29.657654,,41.02932,28.972679,3.538981,1.21263,1.521384,2.095875,58.22138,470.528878,4.582673,4.615631,4.536973,4.720405,4.740084,4.644599,4.541681,23.77183,21.446156,2.179522,0.033039
std,6972.514862,8554.309302,4.72101e+17,4.57822,204564400.0,82.050365,100.810116,,0.048364,0.154331,2.177697,0.901427,1.643558,2.20429,61.847969,310.405535,0.691445,0.666244,0.708628,0.612142,0.597183,0.595877,0.684091,79.558098,79.817644,5.48471,0.300686
min,0.0,0.0,25436.0,20250630000000.0,105823.0,1.0,1.0,,40.81546,28.00757,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,6038.0,7445.0,6.634897e+17,20250630000000.0,165956000.0,2.0,2.0,,41.005032,28.96528,2.0,1.0,1.0,1.0,2.0,365.0,4.51,4.57,4.47,4.75,4.77,4.57,4.5,1.0,1.0,0.0,0.0
50%,12076.0,14932.0,9.490009e+17,20250630000000.0,411013600.0,5.0,7.0,,41.031507,28.98028,3.0,1.0,1.0,2.0,100.0,365.0,4.8,4.83,4.76,4.92,4.94,4.83,4.74,5.0,2.0,0.0,0.0
75%,18114.0,22277.0,1.233129e+18,20250630000000.0,508532700.0,14.0,17.0,,41.048843,29.004379,4.0,1.0,2.0,3.0,100.0,365.0,5.0,5.0,5.0,5.0,5.0,5.0,4.94,13.0,10.0,2.0,0.0
max,24152.0,29636.0,1.452164e+18,20250630000000.0,703416900.0,750.0,5373.0,,41.48668,29.87079,16.0,50.0,50.0,54.0,999.0,9999.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,530.0,530.0,60.0,8.0


In [28]:
# Count nulls per column once (the original scanned the frame twice), then
# show only the columns that have at least one missing value.
train_null_counts = train_df.isnull().sum()
print("Missing Values in Train Data:")
print(train_null_counts[train_null_counts > 0])

Missing Values in Train Data:
description                       652
neighborhood_overview           17248
host_name                         660
host_since                        668
host_location                    8794
host_about                      14797
host_response_time               8186
host_response_rate               8186
host_acceptance_rate             7207
host_is_superhost                 480
host_thumbnail_url                668
host_picture_url                  668
host_neighbourhood              21544
host_listings_count               668
host_total_listings_count         668
host_verifications                668
host_has_profile_pic              668
host_identity_verified            668
neighbourhood                   17248
neighbourhood_group_cleansed    24153
bathrooms                        3314
bathrooms_text                    104
bedrooms                          514
beds                             3282
price                            3349
review_scores_rating

## Explore Test Data

In [None]:
# Report the (rows, columns) size of the test set, then preview its first rows.
test_shape = test_df.shape
print("Test Data Shape:", test_shape)
test_preview = test_df.head()
display(test_preview)

In [None]:
# Print the column list with non-null counts and dtypes for the test set.
test_df.info()

In [None]:
# Count nulls per column once (the original scanned the frame twice), then
# show only the columns that have at least one missing value.
test_null_counts = test_df.isnull().sum()
print("Missing Values in Test Data:")
print(test_null_counts[test_null_counts > 0])

## Explore Reviews Data

In [None]:
# Report the (rows, columns) size of the reviews table, then preview its first rows.
reviews_shape = reviews_df.shape
print("Reviews Data Shape:", reviews_shape)
reviews_preview = reviews_df.head()
display(reviews_preview)

In [None]:
# Print the column list with non-null counts and dtypes for the reviews table.
reviews_df.info()

## Explore Calendar Data

In [None]:
# Report the (rows, columns) size of the calendar table, then preview its first rows.
calendar_shape = calendar_df.shape
print("Calendar Data Shape:", calendar_shape)
calendar_preview = calendar_df.head()
display(calendar_preview)

In [None]:
# Print the column list with non-null counts and dtypes for the calendar table.
calendar_df.info()

## Explore Sample Submission

In [None]:
# Report the (rows, columns) size of the sample submission, then preview its first rows.
sample_submission_shape = sample_submission_df.shape
print("Sample Submission Shape:", sample_submission_shape)
sample_submission_preview = sample_submission_df.head()
display(sample_submission_preview)

## Data Cleaning & Further Exploration

In [None]:
# Clean 'price' column in Train Data
if 'price' in train_df.columns:
    # Strip '$' and ',' from the values, then convert the column to float.
    # The pattern is a raw string: '\$' inside a normal string literal is an
    # invalid escape sequence (SyntaxWarning on Python 3.12+). Inside a
    # character class '$' is literal, so no escaping is needed.
    train_df['price'] = (
        train_df['price']
        .replace(r'[$,]', '', regex=True)
        .astype(float)
    )
    print("Price column cleaned and converted to float.")

    # Display basic statistics for Price
    print(train_df['price'].describe())

In [None]:
# Visualize Price Distribution
plt.figure(figsize=(10, 6))
sns.histplot(train_df['price'], bins=50, kde=True)
plt.title('Distribution of Price')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Check Date Ranges in Calendar and Reviews
print("Calendar Date Range:", calendar_df['date'].min(), "to", calendar_df['date'].max())
print("Reviews Date Range:", reviews_df['date'].min(), "to", reviews_df['date'].max())

In [None]:
# Check for overlapping IDs between Train and Test
train_ids = set(train_df['id'])
test_ids = set(test_df['id'])
overlap = train_ids.intersection(test_ids)
print(f"Number of overlapping IDs between Train and Test: {len(overlap)}")