In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [10]:
# Load the listings CSV into the module-level frame `df`, which every later
# cell reads. The path is relative to the notebook's working directory.
# NOTE(review): file name suggests NYC Airbnb 2019 listings — confirm provenance.
df = pd.read_csv('data/AB_NYC_2019.csv')

# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()


Unnamed: 0,id,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
count,48895.0,48895.0,48895.0,48895.0,48895.0,48895.0,48895.0,38843.0,48895.0,48895.0
mean,19017140.0,67620010.0,40.728949,-73.95217,152.720687,7.029962,23.274466,1.373221,7.143982,112.781327
std,10983110.0,78610970.0,0.05453,0.046157,240.15417,20.51055,44.550582,1.680442,32.952519,131.622289
min,2539.0,2438.0,40.49979,-74.24442,0.0,1.0,0.0,0.01,1.0,0.0
25%,9471945.0,7822033.0,40.6901,-73.98307,69.0,1.0,1.0,0.19,1.0,0.0
50%,19677280.0,30793820.0,40.72307,-73.95568,106.0,3.0,5.0,0.72,1.0,45.0
75%,29152180.0,107434400.0,40.763115,-73.936275,175.0,5.0,24.0,2.02,2.0,227.0
max,36487240.0,274321300.0,40.91306,-73.71299,10000.0,1250.0,629.0,58.5,327.0,365.0


In [23]:
# Print the frame's (rows, columns) shape to stdout.
print(df.shape)
# Count missing values per column, listing the columns with the most gaps first.
df.isna().sum().sort_values(ascending=False)

(48895, 16)


reviews_per_month                 10052
last_review                       10052
host_name                            21
name                                 16
availability_365                      0
calculated_host_listings_count        0
number_of_reviews                     0
minimum_nights                        0
price                                 0
room_type                             0
longitude                             0
latitude                              0
neighbourhood                         0
neighbourhood_group                   0
host_id                               0
id                                    0
dtype: int64

In [11]:
# Preview the first five rows to see what each column holds.
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [44]:
from ipywidgets import interact, interactive, fixed, interact_manual
from IPython.display import display, Markdown
import ipywidgets as widgets

def barGraph(dim, limit):
    """Draw a bar chart of the most frequent values in one column of ``df``.

    Parameters
    ----------
    dim : str
        Name of the column in the module-level ``df`` whose values are counted.
    limit : int
        Number of leading entries of the frequency table to plot.

    Returns
    -------
    None
        The chart is drawn on a new matplotlib figure.
    """
    # Build the frequency table once and slice it once, instead of recounting
    # the whole column separately for the labels and for the bar heights.
    # value_counts() orders by descending frequency, so the first `limit`
    # rows are the most common values.
    top_counts = df[dim].value_counts().iloc[:limit]

    plt.figure(figsize=(15,10))
    plt.clf()
    plt.bar(top_counts.index, top_counts.values)
    plt.title(dim)
    # Category labels can be long; rotate them so they do not overlap.
    plt.xticks(rotation=90)
    
def scatter(x, y):
    """Scatter-plot two columns of the module-level ``df`` against each other.

    Parameters
    ----------
    x : str
        Column name plotted on the horizontal axis.
    y : str
        Column name plotted on the vertical axis.

    Returns
    -------
    None
        The chart is drawn on a new matplotlib figure.
    """
    canvas = plt.figure(figsize=(15, 10))
    canvas.clf()
    axes = canvas.gca()

    axes.scatter(df[x], df[y])
    axes.set_xlabel(x)
    axes.set_ylabel(y)
    axes.set_title(" vs ".join((x, y)))

def histogram(x, b):
    """Draw a histogram of the non-missing values in one column of ``df``.

    Parameters
    ----------
    x : str
        Name of the column in the module-level ``df`` to bin.
    b : int
        Number of bins, passed to matplotlib as ``bins``.

    Returns
    -------
    None
        The chart is drawn on a new matplotlib figure.
    """
    # Some columns contain missing values. Drop them explicitly so the bin
    # range is always computed from real numbers, rather than relying on how
    # the installed plotting stack happens to treat NaN.
    observed = df[x].dropna()

    plt.figure(figsize=(15,10))
    plt.clf()
    plt.hist(observed, bins=b)
    plt.title(x)


In [48]:
def _pick_default(options, preferred):
    """Return ``preferred`` if it is among ``options``, else the first option (None if empty)."""
    if preferred in options:
        return preferred
    return options[0] if options else None


display(Markdown('# Data Exploration'))
display(Markdown('## Select plot type'))

@interact(plotType=widgets.RadioButtons(options=['bar','scatter','histogram'], value='bar'))
def showPlot(plotType):
    """Show the column pickers and chart for the selected plot type.

    Used as the callback of the plot-type radio buttons. Each call builds a
    fresh nested ``interact`` whose controls drive ``barGraph``, ``scatter``
    or ``histogram``.

    Parameters
    ----------
    plotType : str
        One of ``'bar'``, ``'scatter'`` or ``'histogram'``; any value other
        than the first two is treated as ``'histogram'``.
    """
    # Initial selections are chosen by column name rather than by position,
    # so reordering or dropping columns in `df` cannot raise IndexError or
    # silently pick a different column. The column lists are converted to
    # plain Python lists before being handed to the widgets.
    if plotType == 'bar':
        display(Markdown('## Select categorical columns'))
        cols = df.select_dtypes(include='object').columns.tolist()
        interact(barGraph,
                 dim=widgets.Dropdown(options=cols, value=_pick_default(cols, 'neighbourhood'),
                                      description="Dimension"),
                 limit=widgets.IntSlider(value=10, min=1, max=30, step=1, description="Limit"))
    elif plotType == 'scatter':
        display(Markdown('## Select numeric columns'))
        cols = df.select_dtypes(include='number').columns.tolist()
        interact(scatter,
                 x=widgets.Dropdown(options=cols, value=_pick_default(cols, 'price'),
                                    description="x"),
                 y=widgets.Dropdown(options=cols, value=_pick_default(cols, 'reviews_per_month'),
                                    description="y"))
    else:
        display(Markdown('## Select numeric column'))
        cols = df.select_dtypes(include='number').columns.tolist()
        interact(histogram,
                 x=widgets.Dropdown(options=cols, value=_pick_default(cols, 'price'),
                                    description="x"),
                 b=widgets.IntSlider(value=10, min=1, max=30, step=1, description="Bins"))
    

# Data Exploration

## Select plot type

interactive(children=(RadioButtons(description='plotType', options=('bar', 'scatter', 'histogram'), value='bar…