### Loading Data

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings(action="ignore")

In [2]:
airbnb_data=pd.read_csv('AB_NYC_2019.csv')
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [9]:
airbnb_data.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,19-10-2018,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,21-05-2019,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,05-07-2019,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,19-11-2018,0.1,1,0


### Data Cleaning

In [19]:
airbnb_data.shape

(48895, 16)

In [28]:
airbnb_data.nunique()

id                                48895
name                              47896
host_id                           37457
host_name                         11452
neighbourhood_group                   5
neighbourhood                       221
latitude                          19048
longitude                         14718
room_type                             3
price                               674
minimum_nights                      109
number_of_reviews                   394
last_review                        1764
reviews_per_month                   937
calculated_host_listings_count       47
availability_365                    366
dtype: int64

In [20]:
airbnb_data.info('all')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48895 non-null  int64  
 1   name                            48879 non-null  object 
 2   host_id                         48895 non-null  int64  
 3   host_name                       48874 non-null  object 
 4   neighbourhood_group             48895 non-null  object 
 5   neighbourhood                   48895 non-null  object 
 6   latitude                        48895 non-null  float64
 7   longitude                       48895 non-null  float64
 8   room_type                       48895 non-null  object 
 9   price                           48895 non-null  int64  
 10  minimum_nights                  48895 non-null  int64  
 11  number_of_reviews               48895 non-null  int64  
 12  last_review                     

In [21]:
airbnb_data.describe()

Unnamed: 0,id,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
count,48895.0,48895.0,48895.0,48895.0,48895.0,48895.0,48895.0,48895.0,48895.0,48895.0
mean,19017140.0,67620010.0,40.728949,-73.95217,152.720687,7.029962,23.274466,1.095022,7.143982,112.781327
std,10983110.0,78610970.0,0.05453,0.046157,240.15417,20.51055,44.550582,1.594493,32.952519,131.622289
min,2539.0,2438.0,40.49979,-74.24442,0.0,1.0,0.0,0.01,1.0,0.0
25%,9471945.0,7822033.0,40.6901,-73.98307,69.0,1.0,1.0,0.04,1.0,0.0
50%,19677280.0,30793820.0,40.72307,-73.95568,106.0,3.0,5.0,0.37,1.0,45.0
75%,29152180.0,107434400.0,40.763115,-73.936275,175.0,5.0,24.0,1.58,2.0,227.0
max,36487240.0,274321300.0,40.91306,-73.71299,10000.0,1250.0,629.0,58.5,327.0,365.0


In [3]:
100*airbnb_data.isnull().mean()

id                                 0.000000
name                               0.032723
host_id                            0.000000
host_name                          0.042949
neighbourhood_group                0.000000
neighbourhood                      0.000000
latitude                           0.000000
longitude                          0.000000
room_type                          0.000000
price                              0.000000
minimum_nights                     0.000000
number_of_reviews                  0.000000
last_review                       20.558339
reviews_per_month                 20.558339
calculated_host_listings_count     0.000000
availability_365                   0.000000
dtype: float64

In [4]:
airbnb_data["last_review"]=airbnb_data["last_review"].fillna(airbnb_data["last_review"].mode()[0])
airbnb_data["reviews_per_month"]=airbnb_data["reviews_per_month"].fillna(airbnb_data["reviews_per_month"].median())
airbnb_data["name"]=airbnb_data["name"].fillna(airbnb_data["name"].mode()[0])
airbnb_data["host_name"]=airbnb_data["host_name"].fillna(airbnb_data["host_name"].mode()[0])

In [5]:
100*airbnb_data.isnull().mean()

id                                0.0
name                              0.0
host_id                           0.0
host_name                         0.0
neighbourhood_group               0.0
neighbourhood                     0.0
latitude                          0.0
longitude                         0.0
room_type                         0.0
price                             0.0
minimum_nights                    0.0
number_of_reviews                 0.0
last_review                       0.0
reviews_per_month                 0.0
calculated_host_listings_count    0.0
availability_365                  0.0
dtype: float64

### Dealing with data

In [6]:
airbnb_data['longitude']= abs(airbnb_data['longitude'])
airbnb_data['longitude'].describe()

count    48895.000000
mean        73.952170
std          0.046157
min         73.712990
25%         73.936275
50%         73.955680
75%         73.983070
max         74.244420
Name: longitude, dtype: float64

In [7]:
airbnb_data.sort_values(by='availability_365', ascending=False).head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,73.97237,Private room,149,1,9,19-10-2018,0.21,6,365
11591,9024549,Spacious & Lofted in Williamsburg,8313697,Koppel,Brooklyn,Park Slope,40.673,73.98076,Private room,120,20,0,23-06-2019,0.72,1,365
41757,32467835,SPACIOUS & COMFY 3 BDROM/2 BATH APT CLOSE 2 SU...,216419980,Billie,Manhattan,Harlem,40.8242,73.95106,Entire home/apt,112,30,0,23-06-2019,0.72,1,365
41747,32466179,Sonder | The Nash | Classic 1BR + Fitness Center,219517861,Sonder (NYC),Manhattan,Murray Hill,40.74898,73.97534,Entire home/apt,202,29,0,23-06-2019,0.72,327,365
41742,32463948,Amazing East Village Loft,238779678,Dustin,Manhattan,East Village,40.72704,73.98916,Private room,55,30,0,23-06-2019,0.72,9,365


In [45]:
airbnb_data['availability_365']=airbnb_data['availability_365'].apply(lambda x:365 if x>365 else x)

In [46]:
airbnb_data.sort_values(by='availability_365', ascending=False).head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,73.97237,Private room,149,1,9,19-10-2018,0.21,6,365
11591,9024549,Spacious & Lofted in Williamsburg,8313697,Koppel,Brooklyn,Park Slope,40.673,73.98076,Private room,120,20,0,23-06-2019,0.02,1,365
41757,32467835,SPACIOUS & COMFY 3 BDROM/2 BATH APT CLOSE 2 SU...,216419980,Billie,Manhattan,Harlem,40.8242,73.95106,Entire home/apt,112,30,0,23-06-2019,0.02,1,365
41747,32466179,Sonder | The Nash | Classic 1BR + Fitness Center,219517861,Sonder (NYC),Manhattan,Murray Hill,40.74898,73.97534,Entire home/apt,202,29,0,23-06-2019,0.02,327,365
41742,32463948,Amazing East Village Loft,238779678,Dustin,Manhattan,East Village,40.72704,73.98916,Private room,55,30,0,23-06-2019,0.02,9,365
