# Data Cleaning for Airbnb New York City

In [2]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from scipy import stats

# Load Airbnb NYC dataset into a pandas DataFrame.
# A raw string is used so the backslashes in the Windows path are taken
# literally; the plain literal only worked because "\O" and "\A" are not
# recognised escape sequences, which newer Python versions warn about.
# NOTE(review): absolute, machine-specific path -- consider making the data
# directory configurable.
df = pd.read_csv(r"D:\Oasis-infobyte\AB_NYC_2019.csv")

# Display basic information about the original dataset.
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print() (that would emit an extra "None" line).
print("Original Dataset Info:")
df.info()

# Data Integrity: Ensure accuracy, consistency, and reliability
# (Additional checks or corrections specific to your Airbnb data may be required)
# Example: Convert date columns to datetime format
df['last_review'] = pd.to_datetime(df['last_review'])

# Missing Data Handling: Impute or make informed decisions
# Example: Impute missing values in numeric columns with mean
# NOTE(review): the numeric selection also contains the identifier columns
# `id` and `host_id`, so they are imputed, standardized and outlier-checked
# below like any measurement -- confirm that this is intended.
numeric_columns = df.select_dtypes(include='number').columns
imputer = SimpleImputer(strategy='mean')
df[numeric_columns] = imputer.fit_transform(df[numeric_columns])

# Duplicate Removal
df.drop_duplicates(inplace=True)

# Standardization: Consistent formatting and units
# Example: Standardize numeric columns using Z-score normalization
scaler = StandardScaler()
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

# Outlier Detection: Identify and address outliers using Z-score.
# A row is an outlier when AT LEAST ONE numeric column lies more than
# Z_SCORE_THRESHOLD standard deviations from its mean, hence any(axis=1).
# With all(axis=1) a row was dropped only if every numeric column exceeded the
# threshold at the same time, which made the filter practically a no-op.
Z_SCORE_THRESHOLD = 3
z_scores = stats.zscore(df[numeric_columns])
outliers = (abs(z_scores) > Z_SCORE_THRESHOLD).any(axis=1)
df = df[~outliers]

# Save the cleaned dataset
df.to_csv('cleaned_airbnb_nyc_dataset.csv', index=False)

# Display information about the cleaned dataset
print("\nCleaned Dataset Info:")
df.info()


Original Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48895 non-null  int64  
 1   name                            48879 non-null  object 
 2   host_id                         48895 non-null  int64  
 3   host_name                       48874 non-null  object 
 4   neighbourhood_group             48895 non-null  object 
 5   neighbourhood                   48895 non-null  object 
 6   latitude                        48895 non-null  float64
 7   longitude                       48895 non-null  float64
 8   room_type                       48895 non-null  object 
 9   price                           48895 non-null  int64  
 10  minimum_nights                  48895 non-null  int64  
 11  number_of_reviews               48895 non-null  int64  
 12  last_revi