## Importing W&B and loading the `sample.csv` artifact stored in W&B

In [1]:
 import wandb
 import pandas as pd

 # Start a W&B run for this EDA session (project "nyc_airbnb", group "eda").
 run = wandb.init(
     project="nyc_airbnb",
     group="eda",
     save_code=True,
 )

 # Look up the latest "sample.csv" artifact, get a local file path for it,
 # and load that file into a DataFrame.
 sample_artifact = wandb.use_artifact("sample.csv:latest")
 local_path = sample_artifact.file()
 df = pd.read_csv(local_path)

[34m[1mwandb[0m: Currently logged in as: [33mt-shakthi[0m. Use [1m`wandb login --relogin`[0m to force relogin


## Using `ydata_profiling` instead of `pandas_profiling`, which has been deprecated and renamed to `ydata-profiling`

## Inspecting the columns, then importing `ProfileReport` and generating a report

In [2]:
# List the column names of the loaded sample to see which fields are available.
df.columns

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365'],
      dtype='object')

In [3]:
# Inspect last_review: the displayed output shows date strings (dtype object)
# with NaN where no value is present.
df['last_review']

0        2019-05-26
1               NaN
2        2018-09-19
3        2019-05-24
4        2019-06-23
            ...    
19995    2016-08-27
19996    2019-05-21
19997    2019-05-23
19998    2019-07-01
19999    2019-04-28
Name: last_review, Length: 20000, dtype: object

In [4]:
# Inspect reviews_per_month: the displayed output is float64 and contains NaN entries.
df['reviews_per_month']

0        0.13
1         NaN
2        1.12
3        0.65
4        0.52
         ... 
19995    0.04
19996    0.50
19997    0.50
19998    1.48
19999    1.07
Name: reviews_per_month, Length: 20000, dtype: float64

In [5]:
# Build a profiling report for the sample as loaded (before the price filter
# and datetime conversion applied in a later cell).
from ydata_profiling import ProfileReport
profile = ProfileReport(df, title="Profiling Report")

In [6]:
# Render the report built in the previous cell as interactive notebook widgets.
profile.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

In [7]:
# Price window: rows whose price falls outside it are dropped.
min_price = 10
max_price = 350

# Boolean mask of rows with min_price <= price <= max_price
# (Series.between includes both endpoints by default).
idx = df['price'].between(min_price, max_price)

# Keep only the masked rows, then parse last_review into datetimes on the
# filtered copy; assign() returns a new frame, so the original is not modified.
df = df.loc[idx].assign(
    last_review=lambda kept: pd.to_datetime(kept['last_review'])
)

In [8]:
# Rebuild the profiling report, now on the price-filtered frame with
# last_review converted to datetime.
# NOTE(review): ProfileReport is already imported in an earlier cell, so this
# import is a repeat; the title is also the same as the first report's.
from ydata_profiling import ProfileReport
profile = ProfileReport(df, title="Profiling Report")

In [9]:
# Render the report for the filtered data as interactive notebook widgets.
profile.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
19990,5192459,Quiet Room in 4BR UWS Brownstone,10677483,Greg,Manhattan,Upper West Side,40.80173,-73.96625,Private room,70,1,0,NaT,,1,0
19991,1327940,Huge Gorgeous Park View Apartment!,3290436,Hadar,Brooklyn,Flatbush,40.65335,-73.96257,Entire home/apt,120,3,13,2016-08-27,0.28,2,327
19992,23612681,Shared Room 1 Stop from Manhattan on the F Train,55724558,Taylor,Queens,Long Island City,40.76006,-73.9408,Private room,55,4,2,2019-06-01,0.65,5,89
19993,34485745,Midtown Manhattan Stunner - Private room,261632622,Royalton,Manhattan,Theater District,40.75491,-73.98507,Private room,100,1,3,2019-06-16,3.0,9,318
19994,25616250,"Stylish, spacious, private 1BR apt in Ditmas Park",125396920,Adam,Brooklyn,Flatbush,40.64314,-73.95705,Entire home/apt,75,3,10,2019-01-03,0.84,1,0
19995,7094539,Tranquil haven in bubbly Brooklyn,2052211,Adriana,Brooklyn,Windsor Terrace,40.6536,-73.97546,Entire home/apt,143,14,2,2016-08-27,0.04,1,10
19996,4424261,Large 1 BR with backyard on UWS,3447311,Sarah,Manhattan,Upper West Side,40.80188,-73.96808,Entire home/apt,200,2,22,2019-05-21,0.5,1,0
19997,4545882,Amazing studio/Loft with a backyard,23569951,Kaveh,Manhattan,Upper East Side,40.7811,-73.94567,Entire home/apt,220,3,28,2019-05-23,0.5,1,293
19998,26518547,U2 comfortable double bed sleeps 2 guests,295128,Carol Gloria,Bronx,Clason Point,40.81225,-73.85502,Private room,80,1,4,2019-07-01,1.48,7,365
19999,33631782,Private Bedroom in Williamsburg Apt!,8569221,Andi,Brooklyn,Williamsburg,40.71829,-73.95819,Private room,109,3,3,2019-04-28,1.07,2,97


VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

In [11]:
# Build and render another profiling report for the current df, this time via
# the module-qualified name and without a title.
# NOTE(review): df does not appear to change between the previous report and
# this one, so this cell looks like a duplicate of it — confirm whether it is
# still needed. The execution count also jumps from 9 to 11 here.
import ydata_profiling
profile = ydata_profiling.ProfileReport(df)
profile.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

## Notes

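- All figures below refer to the 19,001 listings that remain after the price filter.
- `last_review` has 3758 missing values (19.8%).
- `reviews_per_month` has 3758 missing values (19.8%).
- `number_of_reviews` has 3758 zeros (19.8%). The three counts match, which suggests these are the same rows: listings with no reviews have neither a last review date nor a monthly review rate.
- `availability_365` has 6970 zeros (36.7%).
- `minimum_nights` is highly skewed.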

In [12]:
# Summarise the filtered frame: row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19001 entries, 0 to 19999
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   id                              19001 non-null  int64         
 1   name                            18994 non-null  object        
 2   host_id                         19001 non-null  int64         
 3   host_name                       18993 non-null  object        
 4   neighbourhood_group             19001 non-null  object        
 5   neighbourhood                   19001 non-null  object        
 6   latitude                        19001 non-null  float64       
 7   longitude                       19001 non-null  float64       
 8   room_type                       19001 non-null  object        
 9   price                           19001 non-null  int64         
 10  minimum_nights                  19001 non-null  int64         
 11  number_

In [13]:
# Finish the W&B run that was started in the first cell.
run.finish()