In [14]:
import pandas as pd
import numpy as np

In [15]:
# 1. Load Data
df = pd.read_csv("Dataset/results_cleaned.csv")
print("Data awal:")
print(df.head())

Data awal:
                                          house_name        location  \
0  Rumah 2 Lantai Bagus Sertifikat Hak Milik di M...  Andir, Bandung   
1                    Rumah Cantik @ Andir (Sudirman)  Andir, Bandung   
2  Jual Cepat Rumah di Maleber Rajawali Bisa Untu...  Andir, Bandung   
3  Rumah minimalis 3 lantai di andir Bandung coco...  Andir, Bandung   
4  Jual Rumah Murah Bisa Dijadikan Kos”an Di Jala...  Andir, Bandung   

   bedroom_count  bathroom_count  carport_count       price  land_area  \
0              3               2              2  2100000000        137   
1              3               2              3  4100000000        202   
2              5               2              1  3300000000        350   
3              2               2              1   580000000         30   
4             11               3              0  1300000000        176   

   building_area (m2)  
0                 170  
1                 300  
2                 258  
3              

In [16]:
# 2. Tampilkan Jumlah Data Hilang
print("\nJumlah missing value sebelum ditangani:")
print(df.isnull().sum())


Jumlah missing value sebelum ditangani:
house_name            0
location              0
bedroom_count         0
bathroom_count        0
carport_count         0
price                 0
land_area             0
building_area (m2)    0
dtype: int64


In [17]:
# 3. Hapus Duplikat
df.drop_duplicates(inplace=True)
print("\nJumlah data setelah menghapus duplikat:", df.shape)



Jumlah data setelah menghapus duplikat: (6992, 8)


In [18]:
for col in df.columns:
    if df[col].dtype in ['float64', 'int64']:
        df[col] = df[col].fillna(df[col].median())
    else:
        df[col] = df[col].fillna(df[col].mode()[0])


In [19]:
print("\nJumlah missing value setelah ditangani:")
print(df.isnull().sum())


Jumlah missing value setelah ditangani:
house_name            0
location              0
bedroom_count         0
bathroom_count        0
carport_count         0
price                 0
land_area             0
building_area (m2)    0
dtype: int64


In [20]:
# 5. Perbaiki Kesalahan Struktural (nama kolom)
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

In [21]:
# 6. Saring Outlier dengan metode IQR
numerik = df.select_dtypes(include=[np.number]).columns
for col in numerik:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]

In [22]:
print("\nJumlah data setelah menghapus outlier:", df.shape)


Jumlah data setelah menghapus outlier: (4780, 8)


In [23]:
# 7. Validasi Tipe Data
print("\nTipe data kolom:")
print(df.dtypes)


Tipe data kolom:
house_name            object
location              object
bedroom_count          int64
bathroom_count         int64
carport_count          int64
price                  int64
land_area              int64
building_area_(m2)     int64
dtype: object


In [24]:
# 8. Validasi Nilai Negatif (contoh untuk kolom 'bedroom_count')
if 'bedroom_count' in df.columns:
    df = df[df['bedroom_count'] >= 0]

In [25]:
# 9. Simpan hasil data cleaning
df.to_csv("Dataset/results_cleaned_final.csv", index=False)
print("\nDataset telah disimpan ke 'results_cleaned_final.csv'")


Dataset telah disimpan ke 'results_cleaned_final.csv'
