# Airbnb lab

In [1]:
import pandas as pd

In [2]:
# Load the raw listings CSV into a DataFrame; every later table is derived from it.
airbnb_csv = pd.read_csv(filepath_or_buffer='AB_NYC_2019.csv')

In [5]:
# Occupancy is the complement of availability over a 365-day year.
airbnb_csv['occupancy'] = airbnb_csv['availability_365'].rsub(365)

In [8]:
# airbnb_csv['occupancy']

In [10]:
# airbnb_csv

* host
    * includes the host name and host id
    
* A location belongs to a neighborhood
    * neighborhood_id, latitude, longitude
* A neighborhood belongs to a neighborhood group

* listing 
    * name, host_id, location_id, room_type, price, occupancy

* listing 
    * name, host_id, location_id, room_type, price, occupancy

In [11]:
airbnb_csv.columns

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365', 'occupancy'],
      dtype='object')

In [14]:
# Host attributes only; the CSV's host_id is carried forward under the name bnb_id.
host_df = airbnb_csv[['host_name', 'host_id']].rename(columns={'host_id': 'bnb_id'})

In [18]:
# Index the hosts by bnb_id while keeping bnb_id available as a regular column.
host_df = host_df.set_index('bnb_id', drop=False)

In [21]:
host_df.head()

Unnamed: 0_level_0,host_name,bnb_id
bnb_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2787,John,2787
2845,Jennifer,2845
4632,Elisabeth,4632
4869,LisaRoxanne,4869
7192,Laura,7192


In [27]:
# Distinct host ids as an array.
# NOTE(review): host_idx is not referenced by any later cell visible here — confirm it is still needed.
host_idx = host_df['bnb_id'].unique()

In [35]:
# Collapse repeated (host_name, bnb_id) rows, keeping the first occurrence of each.
host_df = host_df.drop_duplicates(subset=None, keep='first')

In [36]:
host_df.shape

(37457, 2)

In [51]:
import sqlite3
# Connect to the SQLite database file that the tables below are written to.
conn = sqlite3.connect('airbnb.db')
# Shared cursor reused by the later cells for ad-hoc queries.
cursor = conn.cursor()

In [52]:
# Write the `hosts` table: the frame's index (bnb_id) is stored as the `id`
# column and host_name is the only other column.
# if_exists='replace' makes the cell re-runnable; the default ('fail') raises
# ValueError as soon as the table already exists in airbnb.db.
host_df['host_name'].to_sql('hosts', conn, index = True, index_label = 'id', if_exists = 'replace')

In [53]:
cursor.execute('select * from hosts limit 1')

<sqlite3.Cursor at 0x11e883c70>

In [54]:
cursor.fetchall()

[(2787, 'John')]

* neighborhood group

In [61]:
location_cols = ['neighbourhood_group', 'neighbourhood', 'latitude', 'longitude']

In [65]:
# One row per distinct (neighbourhood_group, neighbourhood) pair found in the listings.
neighbourhood_pairs = airbnb_csv[['neighbourhood_group', 'neighbourhood']]
neighbor_df = neighbourhood_pairs.drop_duplicates()

In [67]:
neighbor_df.shape

(221, 2)

In [90]:
# The neighbourhood column becomes `name`; neighbourhood_group keeps its label.
neighbor_df = neighbor_df.rename(columns={'neighbourhood': 'name'})

In [95]:
# Clear any previous copy of the table before it is rewritten below.
# IF EXISTS keeps this from raising OperationalError on a database where the
# table has not been created yet (e.g. a fresh airbnb.db).
cursor.execute('drop table if exists neighborhoods;')

<sqlite3.Cursor at 0x11e883c70>

In [96]:
# Persist the neighbourhoods; the frame's index is written as the `id` column.
neighbor_df.loc[:, ['name', 'neighbourhood_group']].to_sql(
    name='neighborhoods',
    con=conn,
    index=True,
    index_label='id',
)

In [97]:
cursor.execute('PRAGMA table_info(neighborhoods)')

<sqlite3.Cursor at 0x11e883c70>

In [98]:
cursor.fetchall()

[(0, 'id', 'INTEGER', 0, None, 0),
 (1, 'name', 'TEXT', 0, None, 0),
 (2, 'neighbourhood_group', 'TEXT', 0, None, 0)]

* location

In [78]:
airbnb_csv[['neighbourhood_group', 'neighbourhood', 'longitude', 'latitude']].shape

(48895, 4)

In [83]:
# A location is a distinct (neighbourhood, longitude, latitude) combination.
coordinate_cols = ['neighbourhood', 'longitude', 'latitude']
location_df = airbnb_csv.loc[:, coordinate_cols].drop_duplicates()

In [84]:
location_df.head()

Unnamed: 0,neighbourhood,longitude,latitude
0,Kensington,-73.97237,40.64749
1,Midtown,-73.98377,40.75362
2,Harlem,-73.9419,40.80902
3,Clinton Hill,-73.95976,40.68514
4,East Harlem,-73.94399,40.79851


In [85]:
# 2-D array of the locations; each row is [neighbourhood, longitude, latitude].
location_arr = location_df.to_numpy()

In [99]:
location_arr

array([['Kensington', -73.97237, 40.647490000000005],
       ['Midtown', -73.98376999999999, 40.75362],
       ['Harlem', -73.9419, 40.809020000000004],
       ...,
       ['Harlem', -73.94866999999999, 40.81475],
       ["Hell's Kitchen", -73.99112, 40.757509999999996],
       ["Hell's Kitchen", -73.98933000000001, 40.76404]], dtype=object)

In [101]:
import numpy as np

# Resolve every location's neighbourhood name to the id stored in the
# `neighborhoods` table. The table is read once into a dict instead of issuing
# one SELECT per location row.
cursor.execute('select name, id from neighborhoods')
neighborhood_id_by_name = {}
for neighborhood_name, neighborhood_pk in cursor.fetchall():
    # setdefault keeps the first id read for a name, so a repeated name still
    # resolves to a single id rather than the last one read.
    neighborhood_id_by_name.setdefault(neighborhood_name, neighborhood_pk)

location_rows = []
for location_name, long, lat in location_arr:
    if location_name not in neighborhood_id_by_name:
        # Fail loudly with the offending name instead of an opaque TypeError.
        raise KeyError(f'no row in neighborhoods for {location_name!r}')
    neighborhood_id = neighborhood_id_by_name[location_name]
    # Mixing text and numbers in np.array turns every value into a string;
    # the ids are converted back to integers in a later cell.
    data_row = np.array([location_name, neighborhood_id, long, lat])
    location_rows.append(data_row)

In [102]:
# Stack the per-location rows into one 2-D array; every value (ids and
# coordinates included) is a string at this point.
location_arr = np.array(location_rows)

In [103]:
location_arr

array([['Kensington', '0', '-73.97237', '40.647490000000005'],
       ['Midtown', '1', '-73.98376999999999', '40.75362'],
       ['Harlem', '2', '-73.9419', '40.809020000000004'],
       ...,
       ['Harlem', '2', '-73.94866999999999', '40.81475'],
       ["Hell's Kitchen", '7', '-73.99112', '40.757509999999996'],
       ["Hell's Kitchen", '7', '-73.98933000000001', '40.76404']],
      dtype='<U26')

In [106]:
# Column 1 holds the neighborhood ids as strings; convert them back to integers.
neighborhood_ids = pd.to_numeric(location_arr[:, 1])

In [107]:
neighborhood_ids

array([0, 1, 2, ..., 2, 7, 7])

In [109]:
# Attach the resolved ids; the array is matched to the rows by position.
location_df = location_df.assign(neighborhood_id=neighborhood_ids)

In [111]:
location_df.dtypes

neighbourhood       object
longitude          float64
latitude           float64
neighborhood_id      int64
dtype: object

In [120]:
# Keep only the columns stored in the locations table (the name is replaced by its id).
location_table_cols = ['longitude', 'latitude', 'neighborhood_id']
location_df = location_df.loc[:, location_table_cols]

In [121]:
location_df.head()

Unnamed: 0,longitude,latitude,neighborhood_id
0,-73.97237,40.64749,0
1,-73.98377,40.75362,1
2,-73.9419,40.80902,2
3,-73.95976,40.68514,3
4,-73.94399,40.79851,4


In [122]:
# Clear any previous copy of the table before it is rewritten below.
# IF EXISTS keeps this from raising OperationalError when the table has not
# been created yet (e.g. a fresh airbnb.db).
conn.execute('drop table if exists locations;')

<sqlite3.Cursor at 0x12032b500>

In [123]:
# Persist the locations; the frame's index is written as the `id` column.
location_df.to_sql(
    name='locations',
    con=conn,
    index=True,
    index_label='id',
)

In [124]:
cursor.execute('PRAGMA table_info(locations)')

<sqlite3.Cursor at 0x11e883c70>

In [125]:
cursor.fetchall()

[(0, 'id', 'INTEGER', 0, None, 0),
 (1, 'longitude', 'REAL', 0, None, 0),
 (2, 'latitude', 'REAL', 0, None, 0),
 (3, 'neighborhood_id', 'INTEGER', 0, None, 0)]

In [126]:
# List the tables created so far.
# 'table' is written as a single-quoted SQL string literal: double quotes denote
# identifiers in SQL, and SQLite only tolerates them as strings as a legacy
# fallback that can be disabled at build time.
cursor.execute("SELECT name from sqlite_master where type = 'table'")
cursor.fetchall()

[('hosts',), ('neighborhoods',), ('locations',)]

In [127]:
airbnb_csv.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,occupancy
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365,0
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355,10
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365,0
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194,171
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0,365


In [133]:
# Listing attributes, plus the coordinates needed to look up each listing's location.
selected_bnb_df = airbnb_csv.loc[:, [
    'name',
    'host_id',
    'latitude',
    'longitude',
    'room_type',
    'price',
    'number_of_reviews',
    'calculated_host_listings_count',
    'occupancy',
]]

In [134]:
selected_bnb_df.shape

(48895, 9)

In [132]:
# Row count of the locations table (one row per distinct
# neighbourhood/longitude/latitude combination).
cursor.execute('select count(*) from locations;')
cursor.fetchall()

[(48871,)]

In [142]:
# Object array with one row per listing, in selected_bnb_df's column order.
selected_bnb_arr = selected_bnb_df.to_numpy()

In [143]:
selected_bnb_arr

array([['Clean & quiet apt home by the park', 2787, 40.647490000000005,
        ..., 9, 6, 0],
       ['Skylit Midtown Castle', 2845, 40.75362, ..., 45, 2, 10],
       ['THE VILLAGE OF HARLEM....NEW YORK !', 4632, 40.809020000000004,
        ..., 0, 1, 0],
       ...,
       ['Sunny Studio at Historical Neighborhood', 23492952, 40.81475,
        ..., 0, 1, 338],
       ['43rd St. Time Square-cozy single bed', 30985759,
        40.757509999999996, ..., 0, 6, 363],
       ["Trendy duplex in the very heart of Hell's Kitchen", 68119814,
        40.76404, ..., 0, 1, 342]], dtype=object)

In [157]:
selected_bnb_arr[:1]

array([['Clean & quiet apt home by the park', 2787, 40.647490000000005,
        -73.97237, 'Private room', 149, 9, 6, 0]], dtype=object)

In [158]:
# Attach a location to every listing by matching its exact (latitude, longitude)
# pair against the `locations` table.
# The table is read once into a dict; the previous per-listing SELECT scanned
# the whole table for every listing.
cursor.execute('select id, latitude, longitude from locations')
location_row_by_coords = {}
for stored_id, stored_latitude, stored_longitude in cursor.fetchall():
    # Values are 1-tuples, the shape cursor.fetchone() yields for a one-column
    # select, because the following cells unpack this field with [0].
    # setdefault keeps the first id read for a repeated coordinate pair.
    location_row_by_coords.setdefault((stored_latitude, stored_longitude), (stored_id,))

listing_rows = []
for name, host_id, latitude, longitude, room_type, price, number_of_reviews, calculated_host_listings_count, occupancy in selected_bnb_arr:
    # None when no location matches, exactly as fetchone() would report it.
    location_id = location_row_by_coords.get((latitude, longitude))
    # dtype=object is required: the row mixes scalars with a tuple, which
    # current NumPy refuses to turn into an array without an explicit object dtype.
    # Note the row order: longitude is placed before latitude.
    data_row = np.array([name, host_id, location_id, longitude, latitude, room_type, price, number_of_reviews, calculated_host_listings_count, occupancy], dtype=object)
    listing_rows.append(data_row)

In [159]:
# Stack the per-listing rows into a single 2-D object array (one row per listing).
listing_arr = np.array(listing_rows)

In [160]:
listing_arr.shape

(48895, 10)

In [164]:
# Column 2 holds each location id wrapped in a 1-tuple; unwrap to the bare id.
location_ids = []
for id_tuple in listing_arr[:, 2]:
    location_ids.append(id_tuple[0])
    

In [167]:
# 1-D array of location ids, one per listing.
# NOTE: this rebinds location_arr, which earlier held the per-location rows.
location_arr = np.array(location_ids)


In [168]:
location_arr.shape

(48895,)

In [175]:
listing_arr[:1]

array([['Clean & quiet apt home by the park', 2787, (0,), -73.97237,
        40.647490000000005, 'Private room', 149, 9, 6, 0]], dtype=object)

In [176]:
# Column labels for listing_arr. The rows were assembled with longitude BEFORE
# latitude, so the labels are listed in that same order (they were previously
# swapped, which mislabelled the two coordinate columns).
cols = ['name', 'host_id', 'location_id', 'longitude', 'latitude', 'room_type', 'price', 'number_of_reviews', 'calculated_host_listings_count', 'occupancy']
# 1-based index sized from the data rather than a hard-coded row count.
listing_updated_df = pd.DataFrame(listing_arr, columns = cols, index = range(1, len(listing_arr) + 1))


In [179]:
# Replace the tuple-valued location_id column with the unwrapped ids (matched by position).
listing_updated_df = listing_updated_df.assign(location_id=location_arr)

In [182]:
# Drop the raw coordinates; Index.difference returns the remaining labels sorted.
kept_columns = listing_updated_df.columns.difference(['latitude', 'longitude'])
selected_listing_df = listing_updated_df[kept_columns]

In [184]:
selected_listing_df.columns

Index(['calculated_host_listings_count', 'host_id', 'location_id', 'name',
       'number_of_reviews', 'occupancy', 'price', 'room_type'],
      dtype='object')

In [186]:
# Column order wanted for the listings table.
listing_cols = [
    'name',
    'host_id',
    'location_id',
    'number_of_reviews',
    'occupancy',
    'price',
    'room_type',
    'calculated_host_listings_count',
]

In [188]:
# Reorder the columns. The explicit .copy() makes the result an independent
# frame, so the column assignments in the following cells do not write into a
# slice of another DataFrame (the source of SettingWithCopyWarning).
selected_listing_df = selected_listing_df[listing_cols].copy()

In [189]:
selected_listing_df.columns

Index(['name', 'host_id', 'location_id', 'number_of_reviews', 'occupancy',
       'price', 'room_type', 'calculated_host_listings_count'],
      dtype='object')

In [190]:
# Shorten calculated_host_listings_count to host_listings_count; other labels are unchanged.
selected_listing_df = selected_listing_df.rename(
    columns={'calculated_host_listings_count': 'host_listings_count'}
)

In [193]:
selected_listing_df.dtypes.to_dict()

{'name': dtype('O'),
 'host_id': dtype('O'),
 'location_id': dtype('int64'),
 'number_of_reviews': dtype('O'),
 'occupancy': dtype('O'),
 'price': dtype('O'),
 'room_type': dtype('O'),
 'host_listings_count': dtype('O')}

In [195]:
# Target dtype for each listings column, usable with DataFrame.astype.
# 'object' must be lowercase to be a valid dtype name, and number_of_reviews /
# occupancy are integers, matching the int64 dtypes the notebook converts them to.
df_dtypes = {
    'name': 'object',
    'host_id': 'int64',
    'location_id': 'int64',
    'number_of_reviews': 'int64',
    'occupancy': 'int64',
    'price': 'int64',
    'room_type': 'object',
    'host_listings_count': 'int64',
}

In [197]:
selected_listing_df

In [200]:
# Numeric version of the object-typed price column.
price_ser = selected_listing_df['price'].pipe(pd.to_numeric)

In [201]:
# Numeric version of the object-typed host_listings_count column.
listings_count_ser = selected_listing_df['host_listings_count'].pipe(pd.to_numeric)

In [None]:
# Numeric version of the object-typed host_id column.
host_ser = selected_listing_df['host_id'].pipe(pd.to_numeric)

In [203]:
# Store the numeric host ids back on the frame (aligned on the index).
selected_listing_df = selected_listing_df.assign(host_id=host_ser)

In [204]:
# Store the numeric prices back on the frame (aligned on the index).
selected_listing_df = selected_listing_df.assign(price=price_ser)

In [205]:
# Store the numeric listing counts back on the frame (aligned on the index).
selected_listing_df = selected_listing_df.assign(host_listings_count=listings_count_ser)

In [206]:
selected_listing_df.dtypes

name                   object
host_id                 int64
location_id             int64
number_of_reviews      object
occupancy              object
price                   int64
room_type              object
host_listings_count     int64
dtype: object

In [207]:
# Numeric version of the object-typed occupancy column.
occup_ser = selected_listing_df['occupancy'].pipe(pd.to_numeric)

In [208]:
# Numeric version of the object-typed number_of_reviews column.
number_of_reviews_ser = selected_listing_df['number_of_reviews'].pipe(pd.to_numeric)

In [209]:
# Store the numeric review counts back on the frame (aligned on the index).
selected_listing_df = selected_listing_df.assign(number_of_reviews=number_of_reviews_ser)

In [210]:
# Store the numeric occupancy values back on the frame (aligned on the index).
selected_listing_df = selected_listing_df.assign(occupancy=occup_ser)

In [211]:
selected_listing_df.dtypes

name                   object
host_id                 int64
location_id             int64
number_of_reviews       int64
occupancy               int64
price                   int64
room_type              object
host_listings_count     int64
dtype: object

In [212]:
# Write the `listings` table; the frame's 1-based index is stored as the `id` column.
# if_exists='replace' makes the cell re-runnable; the default ('fail') raises
# ValueError as soon as the table already exists in airbnb.db.
selected_listing_df.to_sql('listings', conn, index = True, index_label = 'id', if_exists = 'replace')

In [213]:
# Inspect the schema SQLite recorded for the listings table (column names and declared types).
cursor.execute('PRAGMA table_info(listings);')
cursor.fetchall()

[(0, 'id', 'INTEGER', 0, None, 0),
 (1, 'name', 'TEXT', 0, None, 0),
 (2, 'host_id', 'INTEGER', 0, None, 0),
 (3, 'location_id', 'INTEGER', 0, None, 0),
 (4, 'number_of_reviews', 'INTEGER', 0, None, 0),
 (5, 'occupancy', 'INTEGER', 0, None, 0),
 (6, 'price', 'INTEGER', 0, None, 0),
 (7, 'room_type', 'TEXT', 0, None, 0),
 (8, 'host_listings_count', 'INTEGER', 0, None, 0)]

In [214]:
# List every table now present in the database.
# 'table' is written as a single-quoted SQL string literal: double quotes denote
# identifiers in SQL, and SQLite only tolerates them as strings as a legacy
# fallback that can be disabled at build time.
cursor.execute("SELECT name from sqlite_master where type = 'table'")
cursor.fetchall()

[('hosts',), ('neighborhoods',), ('locations',), ('listings',)]