IMPORT LIBRARIES

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


LOADING DATA

In [2]:
import pandas as pd

print("Trying to load file...")

try:
    df = pd.read_csv("AB_NYC_2019.csv")
    print("SUCCESS: df created")
    print(df.head())
except Exception as e:
    print("FAILED:")
    print(e)



Trying to load file...
SUCCESS: df created
     id                                              name  host_id  \
0  2539                Clean & quiet apt home by the park     2787   
1  2595                             Skylit Midtown Castle     2845   
2  3647               THE VILLAGE OF HARLEM....NEW YORK !     4632   
3  3831                   Cozy Entire Floor of Brownstone     4869   
4  5022  Entire Apt: Spacious Studio/Loft by central park     7192   

     host_name neighbourhood_group neighbourhood  latitude  longitude  \
0         John            Brooklyn    Kensington  40.64749  -73.97237   
1     Jennifer           Manhattan       Midtown  40.75362  -73.98377   
2    Elisabeth           Manhattan        Harlem  40.80902  -73.94190   
3  LisaRoxanne            Brooklyn  Clinton Hill  40.68514  -73.95976   
4        Laura           Manhattan   East Harlem  40.79851  -73.94399   

         room_type  price  minimum_nights  number_of_reviews last_review  \
0     Private room   

In [None]:
LOADING DATASET

In [3]:
import pandas as pd

df = pd.read_csv("AB_NYC_2019.csv")
print("Dataset loaded successfully")


Dataset loaded successfully


DATA QUALITY REPORT

In [4]:
print("\n--- DATA QUALITY REPORT ---")
print("Total rows:", df.shape[0])
print("Total columns:", df.shape[1])
print("Duplicate rows:", df.duplicated().sum())
print("\nData types:")
print(df.dtypes)




--- DATA QUALITY REPORT ---
Total rows: 48895
Total columns: 16
Duplicate rows: 0

Data types:
id                                  int64
name                               object
host_id                             int64
host_name                          object
neighbourhood_group                object
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
dtype: object


In [None]:
HANDLES MISSING VALUES

In [6]:
print("\nMissing values per column:")
print(df.isnull().sum())



Missing values per column:
id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64


In [None]:
STORE ORIGINAL ROW COUNT

In [7]:
original_rows = df.shape[0]
print("Rows before cleaning:", original_rows)


Rows before cleaning: 48895


REMOVE DUPLICATE VALUES

In [9]:
df = df.drop_duplicates()
print("Duplicate rows removed!")


Duplicate rows removed!


OUTLINE DETECTION AND REMOVAL(IQR METHOD)

In [10]:
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns

for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1

    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR

    df = df[(df[col] >= lower) & (df[col] <= upper)]

print("Outliers removed successfully!")


Outliers removed successfully!


STANDARDIZATION

In [11]:
df.columns = df.columns.str.lower().str.replace(" ", "_")
print("Column names standardized!")


Column names standardized!


AFTER CLEANING SUMMARY

In [12]:
final_rows = df.shape[0]

print("\n--- CLEANING SUMMARY ---")
print("Rows before cleaning:", original_rows)
print("Rows after cleaning:", final_rows)
print("Rows removed:", original_rows - final_rows)



--- CLEANING SUMMARY ---
Rows before cleaning: 48895
Rows after cleaning: 20193
Rows removed: 28702


SAVE CLEANED DATASET

In [13]:
df.to_csv("cleaned_dataset.csv", index=False)
print("Cleaned dataset saved as cleaned_dataset.csv")


Cleaned dataset saved as cleaned_dataset.csv


FINAL PREVIEW

In [14]:
df.head()


Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
5,5099,Large Cozy 1 BR Apartment In Midtown East,7322,Chris,Manhattan,Murray Hill,40.74767,-73.975,Entire home/apt,200,3,74,2019-06-22,0.59,1,129
10,5295,Beautiful 1br on Upper West Side,7702,Lena,Manhattan,Upper West Side,40.80316,-73.96545,Entire home/apt,135,5,53,2019-06-22,0.43,1,6
20,7801,Sweet and Spacious Brooklyn Loft,21207,Chaya,Brooklyn,Williamsburg,40.71842,-73.95718,Entire home/apt,299,3,9,2011-12-28,0.07,1,0
24,8490,"MAISON DES SIRENES1,bohemian apartment",25183,Nathalie,Brooklyn,Bedford-Stuyvesant,40.68371,-73.94028,Entire home/apt,120,2,88,2019-06-19,0.73,2,233
25,8505,Sunny Bedroom Across Prospect Park,25326,Gregory,Brooklyn,Windsor Terrace,40.65599,-73.97519,Private room,60,1,19,2019-06-23,1.37,2,85
