## Data Cleaning

In [1]:
import pandas as pd
import string 
import re

In [2]:
df = pd.read_csv("listings.csv")

In [3]:
df.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,2843,https://www.airbnb.com/rooms/2843,20220912200208,2022-09-12,city scrape,Fabulous downtown/airport room,Come stay in this trendy area of downtown Mont...,This area of St Henri is actually downtown Mon...,https://a0.muscache.com/pictures/397204dc-4123...,2319,...,4.73,4.73,4.64,,t,2,1,1,0,0.97
1,29059,https://www.airbnb.com/rooms/29059,20220912200208,2022-09-13,city scrape,Lovely studio Quartier Latin,CITQ 267153<br />Lovely studio with 1 closed r...,,https://a0.muscache.com/pictures/736399/fa6c31...,125031,...,4.78,4.81,4.71,,f,2,2,0,0,2.58
2,298059,https://www.airbnb.com/rooms/298059,20220912200208,2022-09-13,city scrape,Superb cottage Mile-End Plateau !,<b>The space</b><br />Superb bright cottage de...,,https://a0.muscache.com/pictures/f53f3590-103d...,1536474,...,5.0,4.63,4.84,,f,1,1,0,0,0.4
3,29061,https://www.airbnb.com/rooms/29061,20220912200208,2022-09-13,city scrape,Maison historique - Quartier Latin,Lovely historic house with plenty of period ch...,,https://a0.muscache.com/pictures/9e59d417-4b6a...,125031,...,4.79,4.88,4.71,,f,2,2,0,0,0.84
4,309367,https://www.airbnb.com/rooms/309367,20220912200208,2022-09-13,city scrape,Charming and Clean ! 5 min Métro,"Feel like home in a dynamic area, close to eve...",,https://a0.muscache.com/pictures/3954937/65518...,1562348,...,4.91,4.32,4.59,,f,3,3,0,0,0.18


In [4]:
# Converting object type to date time
df["last_scraped"] = pd.to_datetime(df["last_scraped"])
df["first_review"] = pd.to_datetime(df["first_review"])
df["last_review"] = pd.to_datetime(df["last_review"])
df["calendar_last_scraped"] = pd.to_datetime(df["calendar_last_scraped"])
df["host_since"] = pd.to_datetime(df["host_since"])

In [5]:
# Dropping unneccessary columns
df =df.drop(columns=['neighbourhood','host_thumbnail_url','picture_url','neighborhood_overview', 'host_about', 'neighbourhood_group_cleansed', 'bathrooms', 'calendar_updated', 'license'])

In [6]:
# Replacing object type null values with "None"
df[["description", 'host_url', "host_location","host_response_rate","host_acceptance_rate","beds", "listing_url","bedrooms","first_review","last_review", "host_neighbourhood"]] = df[["description", 'host_url', "host_location","host_response_rate","host_acceptance_rate","beds","listing_url","bedrooms","first_review","last_review","host_neighbourhood"]].fillna("None")

In [7]:
# Replacing int type null values with 0
df[["host_response_time", "host_response_rate","host_acceptance_rate","beds", "bedrooms","review_scores_rating","review_scores_accuracy", "review_scores_cleanliness", "review_scores_checkin", "review_scores_communication", "review_scores_location", "review_scores_value", "reviews_per_month"]] = df[["host_response_time", "host_response_rate","host_acceptance_rate","beds", "bedrooms","review_scores_rating","review_scores_accuracy", "review_scores_cleanliness", "review_scores_checkin", "review_scores_communication", "review_scores_location", "review_scores_value", "reviews_per_month"]].fillna(0)

In [8]:
# Dropping null values less than 20 in each column and removing duplicates
df = df.dropna()
df.drop_duplicates()
print("Null values in df = ", df.isnull().sum().sum())

Null values in df =  0


In [9]:
# Removing emojis and special characters
df["description"] = df["description"].str.replace('[^A-Za-z0-9]', ' ', flags=re.UNICODE, regex=True)

In [10]:
# Converting Price to numeric
df['price'] = df['price'].str.replace('$', '').str.replace(',','')
df['price'] = pd.to_numeric(df['price'])

  df['price'] = df['price'].str.replace('$', '').str.replace(',','')


In [11]:
df["price"]

0         60.0
1        152.0
2        180.0
3        286.0
4         64.0
         ...  
13616     90.0
13617    188.0
13618     68.0
13619    136.0
13620     90.0
Name: price, Length: 13599, dtype: float64

In [12]:
# Saving cleaned csv into a new file.
df.to_csv("cleaned_airbnb.csv")

In [13]:
df1 = pd.read_csv("cleaned_airbnb.csv")

In [14]:
df1.head()

Unnamed: 0.1,Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,host_id,host_url,...,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,0,2843,https://www.airbnb.com/rooms/2843,20220912200208,2022-09-12,city scrape,Fabulous downtown/airport room,Come stay in this trendy area of downtown Mont...,2319,https://www.airbnb.com/users/show/2319,...,4.61,4.73,4.73,4.64,t,2,1,1,0,0.97
1,1,29059,https://www.airbnb.com/rooms/29059,20220912200208,2022-09-13,city scrape,Lovely studio Quartier Latin,CITQ 267153 br Lovely studio with 1 closed r...,125031,https://www.airbnb.com/users/show/125031,...,4.81,4.78,4.81,4.71,f,2,2,0,0,2.58
2,2,298059,https://www.airbnb.com/rooms/298059,20220912200208,2022-09-13,city scrape,Superb cottage Mile-End Plateau !,b The space b br Superb bright cottage de...,1536474,https://www.airbnb.com/users/show/1536474,...,4.96,5.0,4.63,4.84,f,1,1,0,0,0.4
3,3,29061,https://www.airbnb.com/rooms/29061,20220912200208,2022-09-13,city scrape,Maison historique - Quartier Latin,Lovely historic house with plenty of period ch...,125031,https://www.airbnb.com/users/show/125031,...,4.85,4.79,4.88,4.71,f,2,2,0,0,0.84
4,4,309367,https://www.airbnb.com/rooms/309367,20220912200208,2022-09-13,city scrape,Charming and Clean ! 5 min Métro,Feel like home in a dynamic area close to eve...,1562348,https://www.airbnb.com/users/show/1562348,...,4.86,4.91,4.32,4.59,f,3,3,0,0,0.18
