## Load the Dataset

In [1]:
import pandas as pd

# Load the dataset
ucl_data_path = '../Data/UCI_Real_Estate_Valuation.xlsx'
ucl_data = pd.read_excel(ucl_data_path)


## Data cleaning

In [2]:
# Check for missing values
print("Missing values in each column:\n", ucl_data.isnull().sum())

# Check for any obvious data inconsistencies
print("\nData description:\n", ucl_data.describe())


Missing values in each column:
 No                                        0
X1 transaction date                       0
X2 house age                              0
X3 distance to the nearest MRT station    0
X4 number of convenience stores           0
X5 latitude                               0
X6 longitude                              0
Y house price of unit area                0
dtype: int64

Data description:
                No  X1 transaction date  X2 house age  \
count  414.000000           414.000000    414.000000   
mean   207.500000          2013.148953     17.712560   
std    119.655756             0.281995     11.392485   
min      1.000000          2012.666667      0.000000   
25%    104.250000          2012.916667      9.025000   
50%    207.500000          2013.166667     16.100000   
75%    310.750000          2013.416667     28.150000   
max    414.000000          2013.583333     43.800000   

       X3 distance to the nearest MRT station  \
count                        

## Normalize the data

In [3]:
from sklearn.preprocessing import MinMaxScaler

# initialize the MinMaxScaler
min_max_scaler = MinMaxScaler()

# assuming the first column 'No' is just an identifier and should be excluded
features = ['X1 transaction date', 'X2 house age', 'X3 distance to the nearest MRT station', 'X4 number of convenience stores', 'X5 latitude', 'X6 longitude']
X = ucl_data[features]

# normalize the features
ucl_data_normalized = pd.DataFrame(min_max_scaler.fit_transform(ucl_data), columns=ucl_data.columns)



## Save the normalization 

In [4]:
# Save the normalized dataset to a new CSV file
ucl_data_normalized.to_csv('UCI_Real_Estate_normalized.csv', index=False)
print("Normalized dataset saved as 'UCI_Real_Estate_normalized.csv'.")

Normalized dataset saved as 'UCI_Real_Estate_normalized.csv'.


## Verify the files

In [5]:
# load and display the first few rows of the normalized data file to verify
normalized_data_loaded = pd.read_csv('UCI_Real_Estate_normalized.csv')
print("Normalized Data:")
print(normalized_data_loaded.head())

Normalized Data:
         No  X1 transaction date  X2 house age  \
0  0.000000             0.272727      0.730594   
1  0.002421             0.272727      0.445205   
2  0.004843             1.000000      0.303653   
3  0.007264             0.909091      0.303653   
4  0.009685             0.181818      0.114155   

   X3 distance to the nearest MRT station  X4 number of convenience stores  \
0                                0.009513                              1.0   
1                                0.043809                              0.9   
2                                0.083315                              0.5   
3                                0.083315                              0.5   
4                                0.056799                              0.5   

   X5 latitude  X6 longitude  Y house price of unit area  
0     0.616941      0.719323                    0.275705  
1     0.584949      0.711451                    0.314832  
2     0.671231      0.758896        