Understanding The Data Set

In [1]:
import pandas as pd

# Load the AB_NYC_2019 CSV (path is relative to the notebook's working
# directory) and preview the first five rows to check that it parsed.
listings_csv = "AB_NYC_2019.csv"
df = pd.read_csv(listings_csv)
df.head(5)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,19-10-2018,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,21-05-2019,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,05-07-2019,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,19-11-2018,0.1,1,0


Ensuring Correct Data Types

In [2]:
# Cast the identifier and coordinate columns to strings. As strings they are
# excluded from the numeric summaries (describe(), corr(numeric_only=True)).
# NOTE(review): latitude/longitude hold numeric coordinates; confirm that
# removing them from numeric analysis is intended.
for column in ("id", "host_id", "latitude", "longitude"):
    df[column] = df[column].astype(str)

Mathematical Summary of Data

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; a lower count in a column means it has missing values.
df.describe()

Unnamed: 0,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
count,48935.0,48935.0,48935.0,38883.0,48935.0,48935.0
mean,152.73804,7.030387,23.301339,1.372586,7.13945,112.846776
std,240.073239,20.503874,44.577954,1.6799,32.939447,131.633588
min,0.0,1.0,0.0,0.01,1.0,0.0
25%,69.0,1.0,1.0,0.19,1.0,0.0
50%,106.0,3.0,5.0,0.72,1.0,45.0
75%,175.0,5.0,24.0,2.02,2.0,227.0
max,10000.0,1250.0,629.0,58.5,327.0,365.0


Analyzing Categorical Data

In [4]:
# Number of distinct values in each column (missing values are not counted
# by default). Low counts point to categorical columns.
df.nunique()

id                                48895
name                              47896
host_id                           37457
host_name                         11452
neighbourhood_group                   5
neighbourhood                       221
latitude                          19048
longitude                         14718
room_type                             3
price                               674
minimum_nights                      109
number_of_reviews                   394
last_review                        1764
reviews_per_month                   937
calculated_host_listings_count       47
availability_365                    366
dtype: int64

In [5]:
# Number of rows for each room type, most frequent first.
df["room_type"].value_counts()

room_type
Entire home/apt    25437
Private room       22336
Shared room         1162
Name: count, dtype: int64

In [6]:
# Share of rows for each room type: normalize=True returns fractions that sum
# to 1 instead of raw counts.
df["room_type"].value_counts(normalize=True)

room_type
Entire home/apt    0.519812
Private room       0.456442
Shared room        0.023746
Name: proportion, dtype: float64

Neighborhood Group Analysis

In [7]:
# Number of rows in each neighbourhood group, most frequent first.
df["neighbourhood_group"].value_counts()

neighbourhood_group
Manhattan        21681
Brooklyn         20118
Queens            5672
Bronx             1091
Staten Island      373
Name: count, dtype: int64

Analyzing Numerical Data

In [13]:
# Split the observed price range into 5 equal-width bins and count the rows in
# each. With a long right tail almost every row lands in the first bin.
df["price"].value_counts(bins=5)

(-10.001, 2000.0]    48849
(2000.0, 4000.0]        54
(4000.0, 6000.0]        16
(6000.0, 8000.0]         9
(8000.0, 10000.0]        7
Name: count, dtype: int64

In [None]:
# Hand-picked price bin edges, finer at the low end where most rows sit.
# The first edge is below zero so that a price of 0 gets its own bin ending at
# 0 instead of being merged into the first positive bin.
bins = [-10, 0, 50, 100, 200, 500, 800, 2000, 4000, 10000]
# Results are ordered by count (largest first), not by bin position.
df["price"].value_counts(bins=bins)

(50.0, 100.0]        17375
(100.0, 200.0]       16599
(200.0, 500.0]        7352
(0.0, 50.0]           6554
(500.0, 800.0]         624
(800.0, 2000.0]        334
(2000.0, 4000.0]        54
(4000.0, 10000.0]       32
(-10.001, 0.0]          11
Name: count, dtype: int64

Measures of Central Tendency and Dispersion

In [23]:
# Mean, median and standard deviation of price, printed one value per line.
print(df["price"].mean(), df["price"].median(), df["price"].std(), sep="\n")

152.7380402574844
106.0
240.07323939965107


Measures of Shape (Skewness & Kurtosis)

In [25]:
# Skewness and kurtosis of price, printed one value per line. pandas reports
# excess kurtosis (Fisher's definition, where a normal distribution is 0).
print(df["price"].skew(), df["price"].kurt(), sep="\n")

19.1224884801201
585.9797075281948


Availability Analysis

In [26]:
# Count rows whose availability_365 is exactly 365: sum the boolean mask
# rather than building the filtered frame. int() keeps the plain-integer display.
int((df["availability_365"] == 365).sum())

1297

Correlation Analysis

In [28]:
# Pairwise correlation matrix over the numeric columns only (pandas' default
# method is Pearson); numeric_only=True skips the string/object columns.
df.corr(numeric_only=True)

Unnamed: 0,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
price,1.0,0.042791,-0.047706,-0.03059,0.05745,0.081958
minimum_nights,0.042791,1.0,-0.08019,-0.121789,0.127941,0.14416
number_of_reviews,-0.047706,-0.08019,1.0,0.549638,-0.072404,0.172335
reviews_per_month,-0.03059,-0.121789,0.549638,1.0,-0.009369,0.185489
calculated_host_listings_count,0.05745,0.127941,-0.072404,-0.009369,1.0,0.225505
availability_365,0.081958,0.14416,0.172335,0.185489,0.225505,1.0
