In [1]:
import pandas as pd
import numpy as np
from scipy import stats


# Load the dataset

In [2]:
df = pd.read_csv('AB_NYC_2019.csv')

# Initial inspection

In [3]:
print("Initial Inspection:")
print(df.head())
print(df.info())
print(df.describe())

Initial Inspection:
     id                                              name  host_id  \
0  2539                Clean & quiet apt home by the park     2787   
1  2595                             Skylit Midtown Castle     2845   
2  3647               THE VILLAGE OF HARLEM....NEW YORK !     4632   
3  3831                   Cozy Entire Floor of Brownstone     4869   
4  5022  Entire Apt: Spacious Studio/Loft by central park     7192   

     host_name neighbourhood_group neighbourhood  latitude  longitude  \
0         John            Brooklyn    Kensington  40.64749  -73.97237   
1     Jennifer           Manhattan       Midtown  40.75362  -73.98377   
2    Elisabeth           Manhattan        Harlem  40.80902  -73.94190   
3  LisaRoxanne            Brooklyn  Clinton Hill  40.68514  -73.95976   
4        Laura           Manhattan   East Harlem  40.79851  -73.94399   

         room_type  price  minimum_nights  number_of_reviews last_review  \
0     Private room    149               1   

# Data Integrity: Check for data type inconsistencies

In [4]:
print("\nData Types:")
print(df.dtypes)


Data Types:
id                                  int64
name                               object
host_id                             int64
host_name                          object
neighbourhood_group                object
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
dtype: object


# Missing Data Handling

In [5]:
print("\nMissing Values:")
print(df.isnull().sum())


Missing Values:
id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64


# Decide on the strategy for missing values

For simplicity, we will fill numerical columns with median and categorical columns with mode

In [6]:
for column in df.columns:
    if df[column].dtype == 'object':
        df[column].fillna(df[column].mode()[0], inplace=True)
    else:
        df[column].fillna(df[column].median(), inplace=True)


In [7]:
print("\nMissing Values After Handling:")
print(df.isnull().sum())


Missing Values After Handling:
id                                0
name                              0
host_id                           0
host_name                         0
neighbourhood_group               0
neighbourhood                     0
latitude                          0
longitude                         0
room_type                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
last_review                       0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
dtype: int64


# Duplicate Removal

In [8]:
print("\nDuplicates:")
print(df.duplicated().sum())


Duplicates:
0


# Remove duplicates

In [9]:
df.drop_duplicates(inplace=True)
print("\nDuplicates After Removal:")
print(df.duplicated().sum())



Duplicates After Removal:
0


# Standardize string formats

In [10]:
string_columns = ['name', 'host_name', 'neighbourhood_group', 'neighbourhood', 'room_type']
for column in string_columns:
    df[column] = df[column].str.lower().str.strip()


# Outlier Detection

In [11]:
numeric_columns = df.select_dtypes(include=[np.number]).columns
z_scores = np.abs(stats.zscore(df[numeric_columns]))


In [12]:
outliers = (z_scores > 3).all(axis=1)
print(f'\nOutliers Detected: {outliers.sum()}')



Outliers Detected: 0


# Remove outliers

In [13]:
df = df[~outliers]

# Final Review

In [14]:
print("\nFinal Review:")
print(df.info())
print(df.describe())



Final Review:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48895 non-null  int64  
 1   name                            48895 non-null  object 
 2   host_id                         48895 non-null  int64  
 3   host_name                       48895 non-null  object 
 4   neighbourhood_group             48895 non-null  object 
 5   neighbourhood                   48895 non-null  object 
 6   latitude                        48895 non-null  float64
 7   longitude                       48895 non-null  float64
 8   room_type                       48895 non-null  object 
 9   price                           48895 non-null  int64  
 10  minimum_nights                  48895 non-null  int64  
 11  number_of_reviews               48895 non-null  int64  
 12  last_review      