In [2]:
import wandb
import pandas as pd
import pandas_profiling

In [3]:
# Start a W&B run for this data check; save_code=True asks W&B to store
# the notebook code alongside the run.
run = wandb.init(
    project="nyc_airbnb",
    group="eda",
    job_type='data_check',
    save_code=True,
)

[34m[1mwandb[0m: Currently logged in as: [33mshashank-salian[0m. Use [1m`wandb login --relogin`[0m to force relogin


### Get data from WandB

In [4]:
# Declare the sample artifact as an input of the active run, get the path of
# its local copy, and load it into a DataFrame.
artifact = wandb.use_artifact("sample.csv:latest")
local_path = artifact.file()
df = pd.read_csv(local_path)
print(df.shape)

(20000, 16)


In [5]:
# Preview the first five rows to eyeball column names and value formats.
df.head(n=5)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,9138664,Private Lg Room 15 min to Manhattan,47594947,Iris,Queens,Sunnyside,40.74271,-73.92493,Private room,74,2,6,2019-05-26,0.13,1,5
1,31444015,TIME SQUARE CHARMING ONE BED IN HELL'S KITCHEN...,8523790,Johlex,Manhattan,Hell's Kitchen,40.76682,-73.98878,Entire home/apt,170,3,0,,,1,188
2,8741020,Voted #1 Location Quintessential 1BR W Village...,45854238,John,Manhattan,West Village,40.73631,-74.00611,Entire home/apt,245,3,51,2018-09-19,1.12,1,0
3,34602077,Spacious 1 bedroom apartment 15min from Manhattan,261055465,Regan,Queens,Astoria,40.76424,-73.92351,Entire home/apt,125,3,1,2019-05-24,0.65,1,13
4,23203149,Big beautiful bedroom in huge Bushwick apartment,143460,Megan,Brooklyn,Bushwick,40.69839,-73.92044,Private room,65,2,8,2019-06-23,0.52,2,8


### Pandas profiler to check the data properties

In [None]:
# Build a profiling report for the current DataFrame and render it as
# interactive notebook widgets.
profile = pandas_profiling.ProfileReport(df=df)
profile.to_widgets()

### Basic checks on the data 

In [6]:
# Remove price outliers: keep only the rows whose price lies inside the range
# given by business stakeholders (both bounds inclusive).
min_price, max_price = 10, 350
idx = (df['price'] >= min_price) & (df['price'] <= max_price)
# .copy() detaches the filtered frame so later column assignments do not
# operate on a view of the unfiltered data.
df = df.loc[idx].copy()


In [7]:
# Parse the last_review strings into datetime values (missing entries become NaT).
df = df.assign(last_review=pd.to_datetime(df['last_review']))

In [8]:
# Check room types: list the distinct values present in the column.
pd.unique(df['room_type'])

array(['Private room', 'Entire home/apt', 'Shared room'], dtype=object)

In [9]:
# Ensure all records have minimum_nights > 0: list every row that violates the
# rule (an empty result means the check passes). Zero nights is invalid too,
# so the filter is `<= 0` rather than `< 0`.
df[df['minimum_nights'] <= 0]

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365


In [10]:
# Report (rows, columns) of df as it stands after the cells above have run.
print(df.shape)

(19001, 16)


### Final check and end run

In [11]:
# Summarise column dtypes and non-null counts of the current df.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19001 entries, 0 to 19999
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   id                              19001 non-null  int64         
 1   name                            18994 non-null  object        
 2   host_id                         19001 non-null  int64         
 3   host_name                       18993 non-null  object        
 4   neighbourhood_group             19001 non-null  object        
 5   neighbourhood                   19001 non-null  object        
 6   latitude                        19001 non-null  float64       
 7   longitude                       19001 non-null  float64       
 8   room_type                       19001 non-null  object        
 9   price                           19001 non-null  int64         
 10  minimum_nights                  19001 non-null  int64         
 11  nu

In [12]:
# Close the W&B run that was opened with wandb.init at the top of the notebook.
run.finish()