In [2]:
# Import necessary libraries
import pandas as pd  # Pandas is used for data manipulation and analysis
import numpy as np   # NumPy is used for numerical computations

# Read melb_data.csv (looked up relative to the working directory) into a
# DataFrame and preview it.
dataset = pd.read_csv('melb_data.csv')
# A single print with a newline separator emits the heading followed by the
# first 5 rows, exactly as two consecutive print calls would.
print("Original Dataset:", dataset.head(), sep="\n")

Original Dataset:
       Suburb           Address  Rooms Type      Price Method SellerG  \
0  Abbotsford      85 Turner St      2    h  1480000.0      S  Biggin   
1  Abbotsford   25 Bloomburg St      2    h  1035000.0      S  Biggin   
2  Abbotsford      5 Charles St      3    h  1465000.0     SP  Biggin   
3  Abbotsford  40 Federation La      3    h   850000.0     PI  Biggin   
4  Abbotsford       55a Park St      4    h  1600000.0     VB  Nelson   

        Date  Distance  Postcode  ...  Bathroom  Car  Landsize  BuildingArea  \
0  3/12/2016       2.5    3067.0  ...       1.0  1.0     202.0           NaN   
1  4/02/2016       2.5    3067.0  ...       1.0  0.0     156.0          79.0   
2  4/03/2017       2.5    3067.0  ...       2.0  0.0     134.0         150.0   
3  4/03/2017       2.5    3067.0  ...       2.0  1.0      94.0           NaN   
4  4/06/2016       2.5    3067.0  ...       1.0  2.0     120.0         142.0   

   YearBuilt  CouncilArea Lattitude  Longtitude             Re

In [3]:
# Count the null entries in each column before any cleaning is applied.
null_counts = dataset.isna().sum()
print(f"\nMissing values before cleaning:\n{null_counts}")


Missing values before cleaning:
Suburb              0
Address             0
Rooms               0
Type                0
Price               0
Method              0
SellerG             0
Date                0
Distance            0
Postcode            0
Bedroom2            0
Bathroom            0
Car                62
Landsize            0
BuildingArea     6450
YearBuilt        5375
CouncilArea      1369
Lattitude           0
Longtitude          0
Regionname          0
Propertycount       0
dtype: int64


In [4]:
# Discard every row that has at least one missing value (the dropna defaults,
# written out explicitly), then confirm that no nulls remain in any column.
dataset = dataset.dropna(axis=0, how='any')
print("\nDataset after dropping rows with missing values:", dataset.isna().sum(), sep="\n")


Dataset after dropping rows with missing values:
Suburb           0
Address          0
Rooms            0
Type             0
Price            0
Method           0
SellerG          0
Date             0
Distance         0
Postcode         0
Bedroom2         0
Bathroom         0
Car              0
Landsize         0
BuildingArea     0
YearBuilt        0
CouncilArea      0
Lattitude        0
Longtitude       0
Regionname       0
Propertycount    0
dtype: int64


In [7]:
# Keep only the first occurrence of each fully identical row.
dataset = dataset.drop_duplicates(keep='first')
print("\nDataset after removing duplicates:")
print(dataset.head(5))


Dataset after removing duplicates:
   Suburb  Address  Rooms  Type      Price  Method  SellerG  Date  Distance  \
1       0     2790      2     0  1035000.0       1       18    41       2.5   
2       0     4520      3     0  1465000.0       3       18    42       2.5   
4       0     4882      4     0  1600000.0       4      112    43       2.5   
6       0     1027      3     0  1876000.0       1      112    46       2.5   
7       0     6101      2     0  1636000.0       1      112    50       2.5   

   Postcode  ...  Bathroom  Car  Landsize  BuildingArea  YearBuilt  \
1    3067.0  ...       1.0  0.0     156.0          79.0     1900.0   
2    3067.0  ...       2.0  0.0     134.0         150.0     1900.0   
4    3067.0  ...       1.0  2.0     120.0         142.0     2014.0   
6    3067.0  ...       2.0  0.0     245.0         210.0     1910.0   
7    3067.0  ...       1.0  2.0     256.0         107.0     1890.0   

   CouncilArea  Lattitude  Longtitude  Regionname  Propertycount  
1

In [8]:
# Report the dtype pandas currently holds for every column.
column_dtypes = dataset.dtypes
print("\nData types of each column:", column_dtypes, sep="\n")


Data types of each column:
Suburb             int16
Address            int16
Rooms              int64
Type                int8
Price            float64
Method              int8
SellerG            int16
Date                int8
Distance         float64
Postcode         float64
Bedroom2         float64
Bathroom         float64
Car              float64
Landsize         float64
BuildingArea     float64
YearBuilt        float64
CouncilArea         int8
Lattitude        float64
Longtitude       float64
Regionname          int8
Propertycount    float64
dtype: object

Dataset after converting categorical variables to numerical:
   Suburb  Address  Rooms  Type      Price  Method  SellerG  Date  Distance  \
1       0     2790      2     0  1035000.0       1       18    41       2.5   
2       0     4520      3     0  1465000.0       3       18    42       2.5   
4       0     4882      4     0  1600000.0       4      112    43       2.5   
6       0     1027      3     0  1876000.0       1     

In [9]:
# Replace every object-dtype column with integer category codes.
# Codes are positions in the sorted list of the column's distinct values, so
# they carry no numeric meaning beyond identifying the category.
categorical_columns = dataset.select_dtypes(include=['object']).columns
for col in categorical_columns:
    # pd.Categorical infers the same categories as astype('category'); .codes
    # is the matching integer array, assigned back positionally.
    dataset[col] = pd.Categorical(dataset[col]).codes
print("\nDataset after converting categorical variables to numerical:", dataset.head(), sep="\n")


Dataset after converting categorical variables to numerical:
   Suburb  Address  Rooms  Type      Price  Method  SellerG  Date  Distance  \
1       0     2790      2     0  1035000.0       1       18    41       2.5   
2       0     4520      3     0  1465000.0       3       18    42       2.5   
4       0     4882      4     0  1600000.0       4      112    43       2.5   
6       0     1027      3     0  1876000.0       1      112    46       2.5   
7       0     6101      2     0  1636000.0       1      112    50       2.5   

   Postcode  ...  Bathroom  Car  Landsize  BuildingArea  YearBuilt  \
1    3067.0  ...       1.0  0.0     156.0          79.0     1900.0   
2    3067.0  ...       2.0  0.0     134.0         150.0     1900.0   
4    3067.0  ...       1.0  2.0     120.0         142.0     2014.0   
6    3067.0  ...       2.0  0.0     245.0         210.0     1910.0   
7    3067.0  ...       1.0  2.0     256.0         107.0     1890.0   

   CouncilArea  Lattitude  Longtitude  Reg

In [12]:
# Standardize every numeric column to zero mean and unit (sample) standard
# deviation, then write the result to disk.
# NOTE(review): this selects *all* numeric columns, which at this point also
# includes the integer category codes and the Price column -- confirm that
# scaling those is intended.
numerical_columns = dataset.select_dtypes(include=[np.number]).columns
for col in numerical_columns:
    col_mean = dataset[col].mean()
    col_std = dataset[col].std()
    # A constant column has a standard deviation of 0; dividing by it would
    # turn the whole column into NaN (0/0). Divide by 1 instead so the column
    # is simply centered at 0.
    if col_std == 0:
        col_std = 1.0
    dataset[col] = (dataset[col] - col_mean) / col_std
print("\nDataset after normalizing numerical columns:")
print(dataset.head())

# Keep the file name in one place so the confirmation message always matches
# the file that was actually written (it previously named a different file).
output_path = 'cleaned_melb_data.csv'
dataset.to_csv(output_path, index=False)
print(f"\nCleaned dataset saved to '{output_path}'.")


Dataset after normalizing numerical columns:
     Suburb   Address     Rooms      Type     Price    Method   SellerG  \
1 -1.703481 -0.152273 -0.959147 -0.683004 -0.050104 -0.382538 -1.382474   
2 -1.703481  0.827068  0.070636 -0.683004  0.586785  1.367338 -1.382474   
4 -1.703481  1.031994  1.100418 -0.683004  0.786739  2.242275  0.253089   
6 -1.703481 -1.150295  0.070636 -0.683004  1.195533 -0.382538  0.253089   
7 -1.703481  1.722061 -0.959147 -0.683004  0.840060 -0.382538  0.253089   

       Date  Distance  Postcode  ...  Bathroom       Car  Landsize  \
1  1.045725 -1.292055 -0.404386  ... -0.810192 -1.692136 -0.351002   
2  1.115150 -1.292055 -0.404386  ...  0.595562 -1.692136 -0.375516   
4  1.184575 -1.292055 -0.404386  ... -0.810192  0.458525 -0.391116   
6  1.392850 -1.292055 -0.404386  ...  0.595562 -1.692136 -0.251832   
7  1.670551 -1.292055 -0.404386  ... -0.810192  0.458525 -0.239575   

   BuildingArea  YearBuilt  CouncilArea  Lattitude  Longtitude  Regionname  \
1   