In [1]:
import pandas as pd 
from sklearn.preprocessing import LabelEncoder 
import numpy as np 
import re

def remove_KualaLumpur(data, column_location): 
    data[column_location] = data[column_location].str.rstrip(', Kuala Lumpur') 
    return data 

def convert_to_numeric(data, column_price): 
    data[column_price] = data[column_price].str.replace('RM', '', regex=False)  # Remove 'RM'
    data[column_price] = data[column_price].str.replace(',', '', regex=False)  # Remove commas
    data[column_price] = data[column_price].astype(float)  # Convert to float
    return data 

def count_room(data, column_room):
    # Replace "+" with empty string and convert the string to a number
    data[column_room] = data[column_room].str.replace('+', '', regex=False)
    # Convert to numeric values (invalid values will become NaN)
    data[column_room] = pd.to_numeric(data[column_room], errors='coerce')
    # Fill NaN values with 0 and convert to integer
    data[column_room] = data[column_room].fillna(0).astype(int)
    return data

def assign_numeric_to_bathrrom(data, column_bathroom):
    data[column_bathroom] = data[column_bathroom].fillna(data[column_bathroom].mean())
    data[column_bathroom] = data[column_bathroom].astype(int)
    return data

def assign_numeric_to_car(data, column_car):
    data[column_car] = data[column_car].fillna(data[column_car].mean())
    data[column_car] = data[column_car].astype(int)
    return data

def extract_size_numeric(data, size_column):
    def extract_number(text):
        if pd.isna(text):
            return None
        numbers = re.findall(r'[\d,]+', str(text))
        if numbers:
            return int(numbers[0].replace(',', ''))
        return None

    data[size_column] = data[size_column].apply(extract_number)
    # Calculate the mean of non-null values
    size_mean = data[size_column].mean()
    # Fill NaN values with the mean
    data[size_column] = data[size_column].fillna(size_mean)
    return data

def encode_furnishing(data, column_furnishing):
    # Create mapping dictionary
    furnishing_map = {
        'fully furnished': 3,
        'partly furnished': 2,
        'unfurnished': 1,
        'unknown': 0,
        np.nan: 0  # This will handle NaN values
    }
    
    # Convert to lowercase to handle any case variations
    data[column_furnishing] = data[column_furnishing].str.lower()
    
    # Apply mapping
    data[column_furnishing] = data[column_furnishing].map(furnishing_map)
    
    return data

def remove_missing_values(data): 
    # Find missing values 
    missing_values = data.isnull().sum() 
    # Remove rows with missing values 
    data = data.dropna() 
    # Print number of missing values removed 
    print("Removed {} missing values".format(missing_values.sum())) 
    return data 

def data_cleaning_pipeline(data_path, 
                         column_location, 
                         column_price, 
                         column_room, 
                         column_bathroom,
                         column_car,
                         size_column,
                         column_furnishing): 
    data = pd.read_csv(data_path) 
    data_clean_location = remove_KualaLumpur(data, column_location)
    data_clean_price = convert_to_numeric(data_clean_location, column_price)
    data_clean_room = count_room(data_clean_price, column_room)
    data_clean_bathroom = assign_numeric_to_bathrrom(data_clean_room, column_bathroom)
    data_clean_car = assign_numeric_to_car(data_clean_bathroom, column_car)
    data_clean_size = extract_size_numeric(data_clean_car, size_column)
    data_clean_furnishing = encode_furnishing(data_clean_size, column_furnishing)
    data_no_nulls = remove_missing_values(data_clean_furnishing) 
    return data_no_nulls 

clean_data = data_cleaning_pipeline("C:/Users/YOYYP/Downloads/Compressed/data_kaggle.csv/data_kaggle.csv", 
                                  'Location', 
                                  'Price', 
                                  'Rooms', 
                                  'Bathrooms',
                                  'Car Parks',
                                  'Size',
                                  'Furnishing') 

print('\nAfter preprocessing') 
clean_data.head()

Removed 273 missing values

After preprocessing


Unnamed: 0,Location,Price,Rooms,Bathrooms,Car Parks,Property Type,Size,Furnishing
0,KLCC,1250000.0,21,3,2,Serviced Residence,1335.0,3
1,Damansara Heights,6800000.0,6,7,2,Bungalow,6900.0,2
2,Dutamas,1030000.0,3,4,2,Condominium (Corner),1875.0,2
4,Bukit Jali,900000.0,41,3,2,Condominium (Corner),1513.0,2
5,Taman Tun Dr Ismai,5350000.0,42,5,4,Bungalow,7200.0,2
