# Import Necessary Libraries

In [1841]:
import pandas as pd
import gzip
import shutil

# Exploratory Data Analysis (EDA)

## Calendar Data

In [1842]:
#Read the gzip-compressed calendar export into a DataFrame and display it
raw_calendar_data = pd.read_csv(
    '../Data/AirBnB/Asheville/2022/asheville_calendar.csv',
    compression='gzip',
)
raw_calendar_data

Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
0,5868695,2022-09-14,f,$300.00,$240.00,4,1125
1,5868695,2022-09-15,f,$300.00,$240.00,4,1125
2,5868695,2022-09-16,f,$400.00,$320.00,4,1125
3,5868695,2022-09-17,f,$400.00,$320.00,4,1125
4,5868695,2022-09-18,f,$300.00,$240.00,4,1125
...,...,...,...,...,...,...,...
1098280,11510567,2023-09-09,t,$130.00,$130.00,2,1125
1098281,11510567,2023-09-10,t,$130.00,$130.00,2,1125
1098282,11510567,2023-09-11,t,$130.00,$130.00,2,1125
1098283,11510567,2023-09-12,t,$130.00,$130.00,2,1125


In [1843]:
#Check the info / types of data: row count, non-null count and dtype of every column
raw_calendar_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1098285 entries, 0 to 1098284
Data columns (total 7 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   listing_id      1098285 non-null  int64 
 1   date            1098285 non-null  object
 2   available       1098285 non-null  object
 3   price           1098285 non-null  object
 4   adjusted_price  1098285 non-null  object
 5   minimum_nights  1098285 non-null  int64 
 6   maximum_nights  1098285 non-null  int64 
dtypes: int64(3), object(4)
memory usage: 58.7+ MB


In [1844]:
#Work on a copy so the raw frame is never mutated and this cell can be re-run safely
mod_calendar_data = raw_calendar_data.copy()

#Convert date column to date-times type
mod_calendar_data['date'] = pd.to_datetime(mod_calendar_data['date'])

#Confirm the info / types of data
mod_calendar_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1098285 entries, 0 to 1098284
Data columns (total 7 columns):
 #   Column          Non-Null Count    Dtype         
---  ------          --------------    -----         
 0   listing_id      1098285 non-null  int64         
 1   date            1098285 non-null  datetime64[ns]
 2   available       1098285 non-null  object        
 3   price           1098285 non-null  object        
 4   adjusted_price  1098285 non-null  object        
 5   minimum_nights  1098285 non-null  int64         
 6   maximum_nights  1098285 non-null  int64         
dtypes: datetime64[ns](1), int64(3), object(3)
memory usage: 58.7+ MB


In [1845]:
#Report the date range of the calendar from the frame whose 'date' column was converted to datetime
print(f"Earliest date of data set {mod_calendar_data['date'].min()}")
print(f"Latest date of data set {mod_calendar_data['date'].max()}")

Earliest date of data set 2022-09-14 00:00:00
Latest date of data set 2023-09-14 00:00:00


In [1846]:
#Check the distinct values of the 'available' column and how often each occurs
mod_calendar_data['available'].value_counts()

t    602662
f    495623
Name: available, dtype: int64

In [1847]:
#Encode the 'f' / 't' availability flags as 0 / 1
mod_calendar_data['available'] = mod_calendar_data['available'].replace({'f': 0, 't': 1})

#Confirm the column now holds integers
mod_calendar_data['available'].value_counts()

1    602662
0    495623
Name: available, dtype: int64

In [1848]:
#Re-check column dtypes after recoding 'available'
mod_calendar_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1098285 entries, 0 to 1098284
Data columns (total 7 columns):
 #   Column          Non-Null Count    Dtype         
---  ------          --------------    -----         
 0   listing_id      1098285 non-null  int64         
 1   date            1098285 non-null  datetime64[ns]
 2   available       1098285 non-null  int64         
 3   price           1098285 non-null  object        
 4   adjusted_price  1098285 non-null  object        
 5   minimum_nights  1098285 non-null  int64         
 6   maximum_nights  1098285 non-null  int64         
dtypes: datetime64[ns](1), int64(4), object(2)
memory usage: 58.7+ MB


In [1849]:
#Inspect the raw 'price' strings and their frequencies
mod_calendar_data['price'].value_counts()

$125.00      28816
$150.00      28256
$100.00      22632
$95.00       20149
$75.00       16726
             ...  
$2,382.00        1
$2,390.00        1
$2,191.00        1
$2,485.00        1
$2,047.00        1
Name: price, Length: 2497, dtype: int64

In [1850]:
#Inspect the raw 'adjusted_price' strings and their frequencies
mod_calendar_data['adjusted_price'].value_counts()

$125.00      28238
$150.00      28155
$100.00      22623
$95.00       20165
$75.00       16680
             ...  
$2,386.00        1
$1,809.00        1
$2,304.00        1
$2,406.00        1
$2,047.00        1
Name: adjusted_price, Length: 2493, dtype: int64

In [1851]:
#Build an integer 'daily_price' column from the '$1,234.00'-style 'price' strings,
#then drop the two original string price columns.
#'$' and ',' are removed as literal characters (regex=False) so '$' is never interpreted
#as a regex anchor; only the whole-dollar part before the '.' is kept.
mod_calendar_data = (
    mod_calendar_data
    .assign(
        daily_price=lambda df: (
            df['price']
            .str.replace('$', '', regex=False)
            .str.replace(',', '', regex=False)
            .str.split('.')
            .str[0]
            .astype('int')
        )
    )
    .drop(columns=['adjusted_price', 'price'])
)
mod_calendar_data

  mod_calendar_data['daily_price'] = mod_calendar_data['daily_price'].str.replace('$', '')


Unnamed: 0,listing_id,date,available,minimum_nights,maximum_nights,daily_price
0,5868695,2022-09-14,0,4,1125,300
1,5868695,2022-09-15,0,4,1125,300
2,5868695,2022-09-16,0,4,1125,400
3,5868695,2022-09-17,0,4,1125,400
4,5868695,2022-09-18,0,4,1125,300
...,...,...,...,...,...,...
1098280,11510567,2023-09-09,1,2,1125,130
1098281,11510567,2023-09-10,1,2,1125,130
1098282,11510567,2023-09-11,1,2,1125,130
1098283,11510567,2023-09-12,1,2,1125,130


In [1852]:
#Re-check column dtypes after replacing the price strings with 'daily_price'
mod_calendar_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1098285 entries, 0 to 1098284
Data columns (total 6 columns):
 #   Column          Non-Null Count    Dtype         
---  ------          --------------    -----         
 0   listing_id      1098285 non-null  int64         
 1   date            1098285 non-null  datetime64[ns]
 2   available       1098285 non-null  int64         
 3   minimum_nights  1098285 non-null  int64         
 4   maximum_nights  1098285 non-null  int64         
 5   daily_price     1098285 non-null  int64         
dtypes: datetime64[ns](1), int64(5)
memory usage: 50.3 MB


In [1853]:
#For purposes of this project/the stakeholder, the availability flag and the
#minimum / maximum nights columns are not needed, so drop them
mod_calendar_data = mod_calendar_data.drop(
    columns=['available', 'minimum_nights', 'maximum_nights']
)

In [1854]:
#Check the remaining columns and their dtypes after the drop
mod_calendar_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1098285 entries, 0 to 1098284
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype         
---  ------       --------------    -----         
 0   listing_id   1098285 non-null  int64         
 1   date         1098285 non-null  datetime64[ns]
 2   daily_price  1098285 non-null  int64         
dtypes: datetime64[ns](1), int64(2)
memory usage: 25.1 MB


In [1855]:
# mod_calendar_data_id_avail = mod_calendar_data.groupby(['listing_id']).sum()
# mod_calendar_data_id_avail = mod_calendar_data_id_avail.reset_index()
# mod_calendar_data_id_avail = mod_calendar_data_id_avail.drop(['price'], axis = 1)
# mod_calendar_data_id_avail['perc_year_avail'] = mod_calendar_data_id_avail['available'] / 365
# mod_calendar_data_id_avail = mod_calendar_data_id_avail.drop(['available'], axis = 1)
# mod_calendar_data_id_avail

In [1856]:
# mod_calendar_data_id_avail.info()

## Listings Data

In [1857]:
#Load the raw listings export and preview its first rows
raw_listings_data = pd.read_csv(
    '../Data/AirBnB/Asheville/2022/asheville_listings.csv'
)
raw_listings_data.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,108061,https://www.airbnb.com/rooms/108061,20220914194751,2022-09-14,city scrape,Walk to stores/parks/downtown. Fenced yard/Pet...,Walk to town in ten minutes! Monthly rental $1...,"I love my neighborhood! Its friendly, easy-goi...",https://a0.muscache.com/pictures/41011975/0cdf...,320564,...,4.79,4.84,4.48,,f,2,2,0,0,0.67
1,155305,https://www.airbnb.com/rooms/155305,20220914194751,2022-09-14,city scrape,Cottage! BonPaul + Sharky's Hostel,<b>The space</b><br />Private cottage located ...,"We are within easy walk of pubs, breweries, mu...",https://a0.muscache.com/pictures/8880711/cf38d...,746673,...,4.74,4.93,4.54,,t,7,1,2,4,2.72
2,156805,https://www.airbnb.com/rooms/156805,20220914194751,2022-09-14,previous scrape,"Private Room ""Ader"" at BPS Hostel",<b>The space</b><br />Private Rooms at Bon Pau...,"Easy walk to pubs, cafes, bakery, breweries, l...",https://a0.muscache.com/pictures/23447d55-fa7e...,746673,...,4.61,4.84,4.46,,t,7,1,2,4,0.5
3,156926,https://www.airbnb.com/rooms/156926,20220914194751,2022-09-14,city scrape,"Mixed Dorm ""Top Bunk #1"" at BPS Hostel",This is a top bunk in the mixed dorm room<br /...,,https://a0.muscache.com/pictures/98f4e655-c4d6...,746673,...,4.8,4.79,4.79,,t,7,1,2,4,2.34
4,197263,https://www.airbnb.com/rooms/197263,20220914194751,2022-09-14,city scrape,Tranquil Room & Private Bath,"This is a comfy, peaceful and clean room with ...",,https://a0.muscache.com/pictures/miso/Hosting-...,961396,...,4.92,4.82,4.98,,f,2,1,1,0,0.49


In [1858]:
#Check row count, non-null counts and dtypes of the raw listings columns
raw_listings_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3009 entries, 0 to 3008
Data columns (total 75 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            3009 non-null   int64  
 1   listing_url                                   3009 non-null   object 
 2   scrape_id                                     3009 non-null   int64  
 3   last_scraped                                  3009 non-null   object 
 4   source                                        3009 non-null   object 
 5   name                                          3009 non-null   object 
 6   description                                   3004 non-null   object 
 7   neighborhood_overview                         2290 non-null   object 
 8   picture_url                                   3009 non-null   object 
 9   host_id                                       3009 non-null   i

In [1859]:
#Remove irrelevant review sub-score columns, plus 'calendar_updated' and 'license'
mod_listing_data = raw_listings_data.drop(
    columns=[
        'review_scores_accuracy',
        'review_scores_cleanliness',
        'review_scores_checkin',
        'review_scores_communication',
        'review_scores_location',
        'calendar_updated',
        'license',
    ]
)

In [1860]:
#Remove irrelevant host columns, plus the first / last review date columns
mod_listing_data = mod_listing_data.drop(
    columns=[
        'host_name',
        'host_since',
        'host_location',
        'host_about',
        'host_response_time',
        'host_acceptance_rate',
        'host_thumbnail_url',
        'host_picture_url',
        'host_neighbourhood',
        'host_listings_count',
        'host_total_listings_count',
        'host_verifications',
        'host_has_profile_pic',
        'first_review',
        'last_review',
    ]
)

In [1861]:
#Remove the remaining columns that are not kept for the analysis: scrape metadata,
#alternative neighbourhood fields, listing-level price / night limits / availability,
#extra review counts, host listing counts and free-text descriptions
mod_listing_data = mod_listing_data.drop(
    columns=[
        'scrape_id',
        'last_scraped',
        'source',
        'neighbourhood_cleansed',
        'neighbourhood_group_cleansed',
        'property_type',
        'bathrooms',
        'price',
        'minimum_minimum_nights',
        'maximum_minimum_nights',
        'minimum_maximum_nights',
        'maximum_maximum_nights',
        'minimum_nights_avg_ntm',
        'maximum_nights_avg_ntm',
        'has_availability',
        'availability_30',
        'availability_60',
        'availability_90',
        'availability_365',
        'calendar_last_scraped',
        'number_of_reviews_ltm',
        'number_of_reviews_l30d',
        'calculated_host_listings_count',
        'calculated_host_listings_count_entire_homes',
        'calculated_host_listings_count_private_rooms',
        'calculated_host_listings_count_shared_rooms',
        'neighborhood_overview',
        'reviews_per_month',
        'review_scores_value',
        'host_response_rate',
        'description',
    ]
)

In [1862]:
#Check which columns remain after the drops and how many values are missing
mod_listing_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3009 entries, 0 to 3008
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      3009 non-null   int64  
 1   listing_url             3009 non-null   object 
 2   name                    3009 non-null   object 
 3   picture_url             3009 non-null   object 
 4   host_id                 3009 non-null   int64  
 5   host_url                3009 non-null   object 
 6   host_is_superhost       3009 non-null   object 
 7   host_identity_verified  3009 non-null   object 
 8   neighbourhood           2291 non-null   object 
 9   latitude                3009 non-null   float64
 10  longitude               3009 non-null   float64
 11  room_type               3009 non-null   object 
 12  accommodates            3009 non-null   int64  
 13  bathrooms_text          3008 non-null   object 
 14  bedrooms                2850 non-null   

In [1863]:
#Listings with no neighbourhood text get the label 'Asheville, North Carolina, United States'
mod_listing_data = mod_listing_data.assign(
    neighbourhood=mod_listing_data['neighbourhood'].fillna('Asheville, North Carolina, United States')
)
#Count the missing values that are left
mod_listing_data['neighbourhood'].isna().sum()

0

In [1864]:
#Keep only the text before the first comma of 'neighbourhood' as the new 'neighborhood'
#column, then drop the original column.
#Splitting once (n=1) and taking element 0 works for any number of commas, so nothing
#depends on how many pieces the longest value happens to split into.
mod_listing_data = (
    mod_listing_data
    .assign(neighborhood=lambda df: df['neighbourhood'].str.split(',', n=1).str[0])
    .drop(columns=['neighbourhood'])
)
mod_listing_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3009 entries, 0 to 3008
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      3009 non-null   int64  
 1   listing_url             3009 non-null   object 
 2   name                    3009 non-null   object 
 3   picture_url             3009 non-null   object 
 4   host_id                 3009 non-null   int64  
 5   host_url                3009 non-null   object 
 6   host_is_superhost       3009 non-null   object 
 7   host_identity_verified  3009 non-null   object 
 8   latitude                3009 non-null   float64
 9   longitude               3009 non-null   float64
 10  room_type               3009 non-null   object 
 11  accommodates            3009 non-null   int64  
 12  bathrooms_text          3008 non-null   object 
 13  bedrooms                2850 non-null   float64
 14  beds                    2980 non-null   

In [1865]:
#Remove listings that are missing bedrooms, beds, review rating or bathroom text
required_columns = ['bedrooms', 'beds', 'review_scores_rating', 'bathrooms_text']
mod_listing_data = mod_listing_data.dropna(subset=required_columns)

In [1866]:
#Count the missing values left in each column
mod_listing_data.isna().sum()

id                        0
listing_url               0
name                      0
picture_url               0
host_id                   0
host_url                  0
host_is_superhost         0
host_identity_verified    0
latitude                  0
longitude                 0
room_type                 0
accommodates              0
bathrooms_text            0
bedrooms                  0
beds                      0
amenities                 0
minimum_nights            0
maximum_nights            0
number_of_reviews         0
review_scores_rating      0
instant_bookable          0
neighborhood              0
dtype: int64

In [1867]:
#Encode the 'f' / 't' superhost flag as 0 / 1 and check the resulting counts
mod_listing_data['host_is_superhost'] = mod_listing_data['host_is_superhost'].replace({'f': 0, 't': 1})
mod_listing_data['host_is_superhost'].value_counts()

1    1786
0     849
Name: host_is_superhost, dtype: int64

In [1868]:
#Encode the 'f' / 't' identity-verified flag as 0 / 1 and check the resulting counts
mod_listing_data['host_identity_verified'] = mod_listing_data['host_identity_verified'].replace({'f': 0, 't': 1})
mod_listing_data['host_identity_verified'].value_counts()

1    2104
0     531
Name: host_identity_verified, dtype: int64

In [1869]:
#Encode the 'f' / 't' instant-bookable flag as 0 / 1 and check the resulting counts
mod_listing_data['instant_bookable'] = mod_listing_data['instant_bookable'].replace({'f': 0, 't': 1})
mod_listing_data['instant_bookable'].value_counts()

1    1535
0    1100
Name: instant_bookable, dtype: int64

In [1870]:
#Inspect the distinct 'bathrooms_text' descriptions and their frequencies
mod_listing_data['bathrooms_text'].value_counts()

1 bath               1282
2 baths               558
1 private bath        206
2.5 baths             143
3 baths               131
1.5 baths             100
1 shared bath          74
3.5 baths              48
4 baths                34
2 shared baths         12
4.5 baths              10
5 baths                 8
6 baths                 7
2.5 shared baths        6
1.5 shared baths        5
5.5 baths               5
0 baths                 1
Shared half-bath        1
Half-bath               1
7 baths                 1
9 baths                 1
Private half-bath       1
Name: bathrooms_text, dtype: int64

In [1871]:
#Show the distribution of the 'instant_bookable' column
mod_listing_data['instant_bookable'].value_counts()

1    1535
0    1100
Name: instant_bookable, dtype: int64

In [1872]:
#Show the distribution of the 'beds' column
mod_listing_data['beds'].value_counts()

1.0     883
2.0     775
3.0     429
4.0     242
5.0     123
6.0      79
7.0      49
8.0      16
11.0     11
9.0       9
12.0      6
10.0      6
15.0      3
13.0      1
18.0      1
21.0      1
16.0      1
Name: beds, dtype: int64

In [1873]:
#Re-check row count and dtypes after dropping incomplete rows and recoding the flag columns
mod_listing_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2635 entries, 0 to 2999
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      2635 non-null   int64  
 1   listing_url             2635 non-null   object 
 2   name                    2635 non-null   object 
 3   picture_url             2635 non-null   object 
 4   host_id                 2635 non-null   int64  
 5   host_url                2635 non-null   object 
 6   host_is_superhost       2635 non-null   int64  
 7   host_identity_verified  2635 non-null   int64  
 8   latitude                2635 non-null   float64
 9   longitude               2635 non-null   float64
 10  room_type               2635 non-null   object 
 11  accommodates            2635 non-null   int64  
 12  bathrooms_text          2635 non-null   object 
 13  bedrooms                2635 non-null   float64
 14  beds                    2635 non-null   

In [1874]:
#Count missing values in 'bathrooms_text'
mod_listing_data['bathrooms_text'].isna().sum()

0

In [1875]:
#List the 'bathrooms_text' descriptions again before recoding the half-bath labels
mod_listing_data['bathrooms_text'].value_counts()

1 bath               1282
2 baths               558
1 private bath        206
2.5 baths             143
3 baths               131
1.5 baths             100
1 shared bath          74
3.5 baths              48
4 baths                34
2 shared baths         12
4.5 baths              10
5 baths                 8
6 baths                 7
2.5 shared baths        6
1.5 shared baths        5
5.5 baths               5
0 baths                 1
Shared half-bath        1
Half-bath               1
7 baths                 1
9 baths                 1
Private half-bath       1
Name: bathrooms_text, dtype: int64

In [1876]:
#Replace the three half-bath descriptions, which carry no leading number, with 0.5
half_bath_values = {
    'Shared half-bath': 0.5,
    'Half-bath': 0.5,
    'Private half-bath': 0.5,
}
mod_listing_data['bathrooms_text'] = mod_listing_data['bathrooms_text'].replace(half_bath_values)

In [1877]:
#Count missing values in 'bathrooms_text' after the half-bath replacement
mod_listing_data['bathrooms_text'].isna().sum()

0

In [1878]:
#Inspect the 'bathrooms_text' values after the half-bath replacement
mod_listing_data['bathrooms_text'].value_counts()

1 bath              1282
2 baths              558
1 private bath       206
2.5 baths            143
3 baths              131
1.5 baths            100
1 shared bath         74
3.5 baths             48
4 baths               34
2 shared baths        12
4.5 baths             10
5 baths                8
6 baths                7
2.5 shared baths       6
5.5 baths              5
1.5 shared baths       5
0.5                    3
7 baths                1
0 baths                1
9 baths                1
Name: bathrooms_text, dtype: int64

In [1879]:
# mod_listing_data['bathrooms_text'] = mod_listing_data['bathrooms_text'].str.split(" ", n = 1, expand = True)

#Derive a numeric 'bathrooms' column from 'bathrooms_text' and drop the text column.
#The first whitespace-separated token holds the count (e.g. '2.5' in '2.5 shared baths').
#Entries with no parsable leading number (non-string values, or text such as 'Half-bath')
#become NaN and are then set to 0.5, as in the original fillna step.
mod_listing_data = (
    mod_listing_data
    .assign(
        bathrooms=lambda df: pd.to_numeric(
            df['bathrooms_text'].str.split(' ', n=1).str[0],
            errors='coerce',
        ).fillna(0.5)
    )
    .drop(columns=['bathrooms_text'])
)
mod_listing_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2635 entries, 0 to 2999
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      2635 non-null   int64  
 1   listing_url             2635 non-null   object 
 2   name                    2635 non-null   object 
 3   picture_url             2635 non-null   object 
 4   host_id                 2635 non-null   int64  
 5   host_url                2635 non-null   object 
 6   host_is_superhost       2635 non-null   int64  
 7   host_identity_verified  2635 non-null   int64  
 8   latitude                2635 non-null   float64
 9   longitude               2635 non-null   float64
 10  room_type               2635 non-null   object 
 11  accommodates            2635 non-null   int64  
 12  bedrooms                2635 non-null   float64
 13  beds                    2635 non-null   float64
 14  amenities               2635 non-null   

In [1880]:
#Amenity names to turn into indicator columns; each one is used both as the text to search for and as the new column name
target_amenities = ['Air conditioning', 'Wifi', 'TV', 'Kitchen', 'Washer', 'Dryer', 'Heating']

In [1881]:
def add_amenity_column(amenity_name):
    """Add a 0/1 indicator column for one amenity to ``mod_listing_data``.

    The new column is named ``amenity_name``. It is 1 where the listing's
    'amenities' text contains ``amenity_name`` as a literal, case-sensitive
    substring and 0 otherwise, including rows whose 'amenities' value is missing.

    Parameters
    ----------
    amenity_name : str
        Text to search for; also the name of the column that is created or overwritten.

    Returns
    -------
    None
        The module-level ``mod_listing_data`` frame is modified in place.
    """
    # na=False keeps the mask strictly boolean, so astype(int) always yields clean 0/1 values
    has_amenity = mod_listing_data['amenities'].str.contains(amenity_name, regex=False, na=False)
    mod_listing_data[amenity_name] = has_amenity.astype(int)

In [1882]:
#Create one indicator column per entry of target_amenities, in list order.
#Looping over the list keeps this cell in sync when amenities are added or removed.
for amenity in target_amenities:
    add_amenity_column(amenity)

In [1883]:
#Drop the 'amenities' column now that it is no longer needed
mod_listing_data = mod_listing_data.drop(columns=['amenities'])

In [1884]:
#Rename 'id' to 'listing_id' and make it the first column:
#pop() removes 'id' from the frame, insert() puts its values back at position 0 under the new name
listing_ids = mod_listing_data.pop('id')
mod_listing_data.insert(0, 'listing_id', listing_ids)
mod_listing_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2635 entries, 0 to 2999
Data columns (total 28 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   listing_id              2635 non-null   int64  
 1   listing_url             2635 non-null   object 
 2   name                    2635 non-null   object 
 3   picture_url             2635 non-null   object 
 4   host_id                 2635 non-null   int64  
 5   host_url                2635 non-null   object 
 6   host_is_superhost       2635 non-null   int64  
 7   host_identity_verified  2635 non-null   int64  
 8   latitude                2635 non-null   float64
 9   longitude               2635 non-null   float64
 10  room_type               2635 non-null   object 
 11  accommodates            2635 non-null   int64  
 12  bedrooms                2635 non-null   float64
 13  beds                    2635 non-null   float64
 14  minimum_nights          2635 non-null   

## Combine Calendar Data and Listings Data

In [1885]:
#Display the cleaned calendar frame
mod_calendar_data

Unnamed: 0,listing_id,date,daily_price
0,5868695,2022-09-14,300
1,5868695,2022-09-15,300
2,5868695,2022-09-16,400
3,5868695,2022-09-17,400
4,5868695,2022-09-18,300
...,...,...,...
1098280,11510567,2023-09-09,130
1098281,11510567,2023-09-10,130
1098282,11510567,2023-09-11,130
1098283,11510567,2023-09-12,130


In [1886]:
#Inspect the calendar frame's 'listing_id' column
mod_calendar_data['listing_id']

0           5868695
1           5868695
2           5868695
3           5868695
4           5868695
             ...   
1098280    11510567
1098281    11510567
1098282    11510567
1098283    11510567
1098284    11510567
Name: listing_id, Length: 1098285, dtype: int64

In [1887]:
#List the columns of the listings frame (printed) and of the calendar frame (cell output)
print(mod_listing_data.columns)
mod_calendar_data.columns

Index(['listing_id', 'listing_url', 'name', 'picture_url', 'host_id',
       'host_url', 'host_is_superhost', 'host_identity_verified', 'latitude',
       'longitude', 'room_type', 'accommodates', 'bedrooms', 'beds',
       'minimum_nights', 'maximum_nights', 'number_of_reviews',
       'review_scores_rating', 'instant_bookable', 'neighborhood', 'bathrooms',
       'Air conditioning', 'Wifi', 'TV', 'Kitchen', 'Washer', 'Dryer',
       'Heating'],
      dtype='object')


Index(['listing_id', 'date', 'daily_price'], dtype='object')

In [1888]:
#Check row count and dtypes of the calendar frame
mod_calendar_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1098285 entries, 0 to 1098284
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype         
---  ------       --------------    -----         
 0   listing_id   1098285 non-null  int64         
 1   date         1098285 non-null  datetime64[ns]
 2   daily_price  1098285 non-null  int64         
dtypes: datetime64[ns](1), int64(2)
memory usage: 25.1 MB


In [1889]:
#Check row count and dtypes of the listings frame
mod_listing_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2635 entries, 0 to 2999
Data columns (total 28 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   listing_id              2635 non-null   int64  
 1   listing_url             2635 non-null   object 
 2   name                    2635 non-null   object 
 3   picture_url             2635 non-null   object 
 4   host_id                 2635 non-null   int64  
 5   host_url                2635 non-null   object 
 6   host_is_superhost       2635 non-null   int64  
 7   host_identity_verified  2635 non-null   int64  
 8   latitude                2635 non-null   float64
 9   longitude               2635 non-null   float64
 10  room_type               2635 non-null   object 
 11  accommodates            2635 non-null   int64  
 12  bedrooms                2635 non-null   float64
 13  beds                    2635 non-null   float64
 14  minimum_nights          2635 non-null   

In [1890]:
#Attach the calendar rows to each listing on 'listing_id'; the left join keeps every
#row of the cleaned listings frame
asheville_data = mod_listing_data.merge(
    mod_calendar_data,
    on='listing_id',
    how='left',
)
asheville_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 961775 entries, 0 to 961774
Data columns (total 30 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   listing_id              961775 non-null  int64         
 1   listing_url             961775 non-null  object        
 2   name                    961775 non-null  object        
 3   picture_url             961775 non-null  object        
 4   host_id                 961775 non-null  int64         
 5   host_url                961775 non-null  object        
 6   host_is_superhost       961775 non-null  int64         
 7   host_identity_verified  961775 non-null  int64         
 8   latitude                961775 non-null  float64       
 9   longitude               961775 non-null  float64       
 10  room_type               961775 non-null  object        
 11  accommodates            961775 non-null  int64         
 12  bedrooms                961775