In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Read the listings CSV into a DataFrame.
# NOTE(review): this is an absolute path on one specific Windows machine; the
# notebook will not run elsewhere until the path is changed.
file_path = 'C:/Users/Reshma/OneDrive/Desktop/oasis/AB_NYC_2019.csv'
data = pd.read_csv(filepath_or_buffer=file_path)



In [3]:
# Preview the first five rows to sanity-check the load.
data.head(n=5)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [4]:
# Count the missing (NaN/None) entries in every column and print the tally.
missing_values = data.isna().sum(axis=0)
print(missing_values)


id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64


In [5]:
# Handle missing values in one pass:
#   1. drop rows that lack a 'name' or a 'host_name';
#   2. treat a missing 'reviews_per_month' as 0;
#   3. parse 'last_review' as datetimes (unparseable values become NaT) and
#      replace every missing date with the placeholder 2000-01-01.
data = (
    data
    .dropna(subset=['name', 'host_name'])
    .assign(
        reviews_per_month=lambda frame: frame['reviews_per_month'].fillna(0),
        last_review=lambda frame: pd.to_datetime(
            frame['last_review'], errors='coerce'
        ).fillna(pd.Timestamp('2000-01-01')),
    )
)


In [6]:
# Re-count the missing entries per column to confirm the cleanup above worked.
missing_values = data.isna().sum(axis=0)
print(missing_values)

id                                0
name                              0
host_id                           0
host_name                         0
neighbourhood_group               0
neighbourhood                     0
latitude                          0
longitude                         0
room_type                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
last_review                       0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
dtype: int64


In [7]:
# Keep only the first occurrence of each fully identical row (if any repeat).
data = data.loc[~data.duplicated(keep='first')]


In [8]:
# Normalise column types:
#   - 'last_review' -> datetime (unparseable values become NaT)
#   - 'price', 'reviews_per_month' -> numeric (unparseable values become NaN)
#   - 'neighbourhood_group', 'room_type' -> str
data = data.assign(
    last_review=pd.to_datetime(data['last_review'], errors='coerce'),
    price=pd.to_numeric(data['price'], errors='coerce'),
    reviews_per_month=pd.to_numeric(data['reviews_per_month'], errors='coerce'),
).astype({'neighbourhood_group': str, 'room_type': str})


In [9]:
# Defining a function to clean the 'name' column which has special characters
def clean_name(name):
    """Return `name` reduced to ASCII letters separated by single spaces.

    Every character that is not an ASCII letter (A-Z, a-z) or whitespace is
    removed. The whitespace left behind is then collapsed to single spaces and
    trimmed, so 'Clean & quiet' becomes 'Clean quiet' instead of
    'Clean  quiet'.

    Parameters
    ----------
    name : object
        Value to clean. Non-string values are converted with ``str()``.

    Returns
    -------
    str
        The cleaned text. A missing scalar (None, NaN, NaT, pd.NA) is returned
        unchanged rather than being turned into the literal text 'None'/'nan'.
    """
    # Keep missing values missing; str(None) / str(nan) would otherwise be
    # stored as real-looking names.
    if pd.api.types.is_scalar(name) and pd.isna(name):
        return name
    letters_and_spaces = re.sub(r'[^A-Za-z\s]', '', str(name))
    # split() with no arguments drops leading/trailing whitespace and treats
    # any whitespace run as one separator.
    return ' '.join(letters_and_spaces.split())

# Run every listing name through clean_name, element by element.
data['name'] = data['name'].map(clean_name)

In [10]:
# Keep only two decimal places of 'longitude'.
# NOTE(review): 'latitude' is left at full precision — confirm that rounding
# only one of the two coordinates is intended.
data['longitude'] = data['longitude'].round(decimals=2)

In [11]:
# Keep only the rows whose 'last_review' is present.
# NOTE(review): missing dates were already replaced with the 2000-01-01
# placeholder earlier in this notebook, so this filter is expected to keep
# every row — confirm whether the placeholder rows were meant to be dropped.
data = data.loc[data['last_review'].notna()]

In [12]:
def clean_host_name(host_name):
    """Return `host_name` reduced to ASCII letters separated by single spaces.

    Every character that is not an ASCII letter (A-Z, a-z) or whitespace is
    removed. The whitespace left behind is then collapsed to single spaces and
    trimmed, so no doubled or trailing spaces remain after the removal.

    Parameters
    ----------
    host_name : object
        Value to clean. Non-string values are converted with ``str()``.

    Returns
    -------
    str
        The cleaned text. A missing scalar (None, NaN, NaT, pd.NA) is returned
        unchanged rather than being turned into the literal text 'None'/'nan'.
    """
    # Keep missing values missing; str(None) / str(nan) would otherwise be
    # stored as real-looking host names.
    if pd.api.types.is_scalar(host_name) and pd.isna(host_name):
        return host_name
    letters_and_spaces = re.sub(r'[^A-Za-z\s]', '', str(host_name))
    # split() with no arguments drops leading/trailing whitespace and treats
    # any whitespace run as one separator.
    return ' '.join(letters_and_spaces.split())

# Run every host name through clean_host_name, element by element.
data['host_name'] = data['host_name'].map(clean_host_name)

In [13]:
# Round 'reviews_per_month' to zero decimal places, i.e. to whole numbers
# (for example 0.21 becomes 0.0 and 4.64 becomes 5.0).
# NOTE(review): the original comment spoke of rounding "with decimal places";
# confirm that discarding the fractional part is really intended.
data['reviews_per_month'] = data['reviews_per_month'].round(decimals=0)

In [14]:
 #Saving the cleaned dataset 
# Write the cleaned frame to CSV without the DataFrame index column.
# NOTE(review): absolute, machine-specific output path.
cleaned_file_path = 'C:/Users/Reshma/OneDrive/Desktop/oasis/AB_NYC_2019_cleaned1.csv'
data.to_csv(path_or_buf=cleaned_file_path, index=False)