In [1]:
import pandas as pd
import numpy as np

class RealEstateDataset:

    def __init__(self):
        self.__data = None
        pass

    def load_data(self, file_path):
        """
        This function loads the data from the csv file and prints the first 5 rows.
        """
        try: 
            self.__data = pd.read_csv(file_path, parse_dates=['Date_Added'])
            print("File loaded successfully!")
            print("DataFrame shape:", self.__data.shape)
            print("First 5 rows:")
            print(self.__data.head(5))
        except Exception as e:
            print(f"Error loading data: {e}")
            self.__data = None

    def clean_data(self):
        """
        This function cleans the data by filling missing values with the mean, 
        median or custom values and handless negative values with median.
        """
        if self.__data is None:
            print("No data to clean")
            return
        
        try:
            # handling missing values for numerical columns
            numerical_cols = self.__data.select_dtypes(include = ['int64', 'float64']).columns
            self.__data[numerical_cols] = self.__data[numerical_cols].fillna(self.__data[numerical_cols].mean())
            print("Missing values for numerical columns filled with mean.")

            # handling negative values for numerical columns
            for col in numerical_cols:
                median_val = self.__data[col].median()
                self.__data.loc[self.__data[col] < 0, col] = median_val
            print("Negative values for numerical columns filled with median.")

            # handling missing values for categorical columns
            categorical_cols = self.__data.select_dtypes(include = ['object']).columns
            self.__data[categorical_cols] = self.__data[categorical_cols].fillna('Unknown')
            print("Missing values for categorical columns filled with 'Unknown'.")

            # handling missing values for date columns
            date_cols = self.__data.select_dtypes(include = ['datetime64[ns]']).columns
            self.__data[date_cols] = self.__data[date_cols].fillna('Unknown')
            print("Missing values for date columns filled with 'Unknown'.")

            print("Data cleaned successfully!")
        except Exception as e:
            print(f"Error cleaning data: {e}") 
            self.__data = None
        
        
    def describe_data(self):
        '''
        This function describes the data by calculating the mean, median, mode and distribution of property types.
        '''
        if self.__data is None:
            print("No data to describe.")
            return
        
        print("\nSummary Statistics:")

        print("\nMean Values:")
        print(self.__data.mean(numeric_only=True))
        
        print("\nMedian Values:")
        print(self.__data.median(numeric_only=True))

        print("\nMode Values:")
        print(self.__data.mode().iloc[0])

        print("\nDistribution of Property Types:")
        print(self.__data['Type'].value_counts())

        print("\nAverage Price by Property Type:")
        print(self.__data.groupby('Type')['Price'].mean())

        print("\nAverage Size (sqft) by Location:")
        print(self.__data.groupby('Location')['Size_sqft'].mean())


    def get_data(self):
        """
        Returns the internal DataFrame.
        """
        return self.__data
        
        

In [2]:
# Creating a instance of the RealEstateDataset class
file_path = 'data/housing_data.csv'
r1 = RealEstateDataset()


In [3]:
r1.load_data(file_path)

File loaded successfully!
DataFrame shape: (10000, 18)
First 5 rows:
  Property_ID     Location       Type        Price  Bedrooms  Bathrooms  \
0   PROP00001      Chicago      House   327539.000       2.0        3.0   
1   PROP00002        Miami      House   391371.000       5.0        3.0   
2   PROP00003  Los Angeles      Condo   318609.000       3.0        3.0   
3   PROP00004        Miami  Apartment   429625.000       1.0        3.0   
4   PROP00005        Miami      Condo  3016324.057       1.0        0.0   

     Size_sqft  Year_Built  Sold Date_Added Agent_ID Listing_Type  HOA_Fee  \
0  1572.000000        1959     0 2024-05-13  AGT0007     For Rent      0.0   
1  2197.000000        1970     0 2025-05-14  AGT0053     For Rent    300.0   
2  1630.000000        2019     1 2025-04-01  AGT0075     For Rent      NaN   
3  1233.000000        1983     0 2023-11-17  AGT0114     For Rent      NaN   
4  7467.705263        1961     0 2025-04-24  AGT0063     For Sale    300.0   

   Parking_

In [4]:
r1.clean_data()

Missing values for numerical columns filled with mean.
Negative values for numerical columns filled with median.
Missing values for categorical columns filled with 'Unknown'.
Missing values for date columns filled with 'Unknown'.
Data cleaned successfully!


In [5]:
r1.describe_data()


Summary Statistics:

Mean Values:
Price             316369.718452
Bedrooms               2.990842
Bathrooms              1.996421
Size_sqft           1528.793523
Year_Built          1985.707700
Sold                   0.193900
HOA_Fee              149.141801
Parking_Spaces         1.505400
Garage                 0.511400
Pool                   0.498600
Lot_Size_acres         0.327103
Days_on_Market       151.092700
dtype: float64

Median Values:
Price             308027.000000
Bedrooms               3.000000
Bathrooms              2.000000
Size_sqft           1525.026964
Year_Built          1986.000000
Sold                   0.000000
HOA_Fee              149.141801
Parking_Spaces         1.000000
Garage                 1.000000
Pool                   0.000000
Lot_Size_acres         0.300000
Days_on_Market       153.000000
dtype: float64

Mode Values:
Property_ID                 PROP00001
Location                     New York
Type                            House
Price                  

In [6]:
# Save the cleaned data to a new csv file
try: 
    r1.get_data().to_csv('data/cleaned_housing_data.csv', index=False)
    print("Data saved successfully!")
except Exception as e:
    print(f"Error saving data: {e}")

Data saved successfully!
