In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Load the dataset
data = pd.read_csv('DataSets/dataset2.csv')

# 1. Handle missing values in the 'Price' column by filling them with the median value
data['Price'].fillna(data['Price'].median(), inplace=True)

# 2. Convert the 'Date' column to datetime format
data['Date'] = pd.to_datetime(data['Date'], format='%d/%m/%Y', errors='coerce')

# 3. Handle missing values in the 'Distance' column (if any) - fill with median
if data['Distance'].isnull().sum() > 0:
    data['Distance'].fillna(data['Distance'].median(), inplace=True)

# 4. Handle missing values in other columns (if any) by filling them with the mode (for categorical columns)
for column in data.select_dtypes(include=['object']).columns:
    data[column].fillna(data[column].mode()[0], inplace=True)

# 5. Remove outliers in 'Price' using the Interquartile Range (IQR) method
Q1 = data['Price'].quantile(0.25)
Q3 = data['Price'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter the dataset to remove outliers
data = data[(data['Price'] >= lower_bound) & (data['Price'] <= upper_bound)]

# 6. Remove any remaining missing data
data.dropna(inplace=True)

# 7. Save the cleaned dataset to a new CSV file
data.to_csv('cleaned_housing_data2.csv', index=False)

# Display a message
print("Dataset cleaned and saved as 'cleaned_housing_data2.csv'.")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Price'].fillna(data['Price'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[column].fillna(data[column].mode()[0], inplace=True)


Dataset cleaned and saved as 'cleaned_housing_data.csv'.
