In [1]:
import numpy as np
import pandas as pd
import re
import warnings

# Silence library warnings so cell outputs stay readable.
# NOTE(review): this also hides pandas' chained-assignment / dtype warnings — consider
# narrowing the filter to specific categories.
warnings.filterwarnings('ignore')
# None is the documented "no limit" value for max_columns; the previous False was
# silently coerced to the integer 0.
pd.set_option('display.max_columns', None)
# Use the fully qualified key: the bare 'precision' pattern can match more than one
# option in recent pandas versions, which makes set_option raise OptionError.
pd.set_option('display.precision', 2)

In [2]:
# Load the raw house-price listings into `df`.
# NOTE(review): the file contains an 'Unnamed: 0' column (see the info() output below),
# presumably an index saved by an earlier to_csv call — confirm; it is discarded later by
# the explicit column selection.
df = pd.read_csv('jakarta_houseprice.csv')

In [3]:
# Overview of the raw frame: column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2367 entries, 0 to 2366
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       2367 non-null   int64  
 1   desc             2152 non-null   object 
 2   fasilitas_total  1700 non-null   object 
 3   kamar            2152 non-null   object 
 4   km               2152 non-null   object 
 5   lantai           1974 non-null   float64
 6   lb               2152 non-null   float64
 7   lokasi           2152 non-null   object 
 8   lt               2152 non-null   float64
 9   price            2152 non-null   object 
 10  sertifikat       2062 non-null   object 
 11  tipe             2152 non-null   object 
dtypes: float64(3), int64(1), object(8)
memory usage: 222.0+ KB


In [4]:
# Keep only listings with a usable numeric price.
# Rows with a missing price are dropped, and so are rows whose price is the placeholder
# string 'agen'. Previously the 'agen' filter was applied only to the right-hand side of
# the assignment, so those rows stayed in the frame with a NaN price and forced the
# column to float.
df = df.dropna(subset=['price'])
df = df[df['price'] != 'agen'].copy()
# Prices arrive as strings; '.' is stripped before the integer conversion (presumably a
# thousands separator). int64 is explicit because the values exceed the 32-bit range.
df['price'] = df['price'].str.replace('.', '', regex=False).astype('int64')

In [5]:
# Listings sharing an identical description are treated as the same listing;
# only the first occurrence is kept.
df = df.drop_duplicates(subset=['desc'], keep='first')

In [6]:
# Drop rows in which every column is NaN.
# NOTE(review): likely a no-op at this point — rows without a price were already removed
# and 'Unnamed: 0' is non-null for every row in the first info() output; confirm.
df = df.dropna(how = 'all')

In [7]:
# Re-check dtypes and non-null counts after the price clean-up and de-duplication.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1961 entries, 0 to 2366
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       1961 non-null   int64  
 1   desc             1961 non-null   object 
 2   fasilitas_total  1512 non-null   object 
 3   kamar            1961 non-null   object 
 4   km               1961 non-null   object 
 5   lantai           1786 non-null   float64
 6   lb               1961 non-null   float64
 7   lokasi           1961 non-null   object 
 8   lt               1961 non-null   float64
 9   price            1961 non-null   int64  
 10  sertifikat       1875 non-null   object 
 11  tipe             1961 non-null   object 
dtypes: float64(3), int64(2), object(7)
memory usage: 199.2+ KB


In [8]:
def garasi(df):
    """Add a binary 'ada_garasi' (has garage/carport) column.

    The flag is 1 when either source mentions parking:

    * 'desc' contains 'garasi' or 'carport' and does NOT contain the phrase
      'tidak ada garasi' ("no garage"), or
    * 'fasilitas_total' contains 'garasi' or 'carport'.

    Both fields are converted with ``str()`` and lowercased before matching, so
    missing values simply do not match.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'desc' and 'fasilitas_total'. Not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` whose 'desc' column is replaced by its lowercased string
        form, with the integer column 'ada_garasi' appended.
    """
    def _mentions_parking(text):
        return ('garasi' in text) or ('carport' in text)

    df_copy = df.copy()
    df_copy['desc'] = df_copy['desc'].map(lambda x: str(x).lower())

    # An explicit "tidak ada garasi" in the description vetoes the description
    # signal only; the facilities field can still set the flag.
    in_desc = df_copy['desc'].map(
        lambda text: ('tidak ada garasi' not in text) and _mentions_parking(text)
    ).astype(bool)
    in_fasilitas = df_copy['fasilitas_total'].map(
        lambda value: _mentions_parking(str(value).lower())
    ).astype(bool)

    # Whole-column assignment instead of the former per-row `df[col].iloc[i] = 1`
    # writes: chained assignment does not reach the frame under pandas copy-on-write.
    df_copy['ada_garasi'] = (in_desc | in_fasilitas).astype(int)
    return df_copy

In [9]:
def tipe_property(df):
    """Classify each listing's property type from keywords in its description.

    Works on a copy of ``df``: 'desc' is replaced by its lowercased string form
    and a 'tipe_property' column is appended. Rules are tried in order and the
    first hit wins: 'rumah' -> 'Rumah', 'ruko' -> 'Ruko',
    'apartemen' or 'apt' -> 'Apartemen', anything else -> 'lainnya'.
    """
    rules = (
        ('Rumah', ('rumah',)),
        ('Ruko', ('ruko',)),
        ('Apartemen', ('apartemen', 'apt')),
    )

    def classify(text):
        # Substring matching, in priority order.
        for label, keywords in rules:
            if any(word in text for word in keywords):
                return label
        return "lainnya"

    result = df.copy()
    result['desc'] = result['desc'].map(lambda raw: str(raw).lower())
    result['tipe_property'] = result['desc'].map(classify)
    return result

In [10]:
def taman(df):
    """Flag listings whose description mentions a garden.

    Returns a copy of ``df`` in which 'desc' is the lowercased string form of the
    original and 'taman' is 1 when the text contains 'taman', 'garden' or
    'pekarangan', otherwise 0.
    """
    garden_words = ('taman', 'garden', 'pekarangan')

    result = df.copy()
    lowered = result['desc'].map(lambda raw: str(raw).lower())
    result['desc'] = lowered
    result['taman'] = lowered.map(
        lambda text: int(any(word in text for word in garden_words))
    )
    return result

In [11]:
def pool(df):
    """Flag listings whose description mentions a swimming pool.

    Returns a copy of ``df`` with 'desc' lowercased (after ``str()`` conversion)
    and a 'pool' column that is 1 when the text contains 'pool' or
    'kolam renang', otherwise 0.
    """
    def has_pool(text):
        if 'pool' in text:
            return 1
        if 'kolam renang' in text:
            return 1
        return 0

    result = df.copy()
    result['desc'] = result['desc'].map(lambda raw: str(raw).lower())
    result['pool'] = result['desc'].map(has_pool)
    return result

In [12]:
def listrik(df):
    """Extract the number that follows the word 'listrik' in each description.

    For every row the description is lowercased, the first occurrence of
    'listrik' is located, and all digits found in the 10 characters that follow
    it are concatenated (so '2.200' becomes '2200').

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'desc' column. Not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` where missing 'desc' values are replaced by 'aaaa' and an
        'electricity' column is appended. Its values keep the historical mixed
        representation: the float 0.0 when 'listrik' does not occur, otherwise the
        digit string (possibly '' when no digit follows the keyword).
        NOTE(review): downstream code may want this converted to a numeric dtype.
    """
    def _digits_after_keyword(raw):
        text = str(raw).lower()
        found = re.search('listrik', text)
        if found is None:
            # Keyword absent: keep the original default of 0.0.
            return 0.0
        window = text[found.end():found.end() + 10]
        return ''.join(re.findall(r'\d+', window))

    df_copy = df.copy()
    df_copy['desc'] = df_copy['desc'].fillna('aaaa')
    # One whole-column assignment replaces the former row loop, which wrote through
    # chained `.iloc` indexing inside a bare `except: pass` and could therefore
    # fail without any visible error.
    df_copy['electricity'] = df_copy['desc'].map(_digits_after_keyword)
    return df_copy

In [13]:
def gym(df):
    """Flag listings whose description mentions a gym.

    Returns a copy of ``df`` with 'desc' converted to a lowercased string and a
    'gym' column holding 1 when the substring 'gym' occurs in it, else 0.
    """
    result = df.copy()
    descriptions = result['desc'].map(lambda raw: str(raw).lower())
    result['desc'] = descriptions
    result['gym'] = descriptions.map(lambda text: int('gym' in text))
    return result

In [14]:
# Derive the text-based feature columns in a single chain. The call order fixes the
# order of the appended columns: ada_garasi, tipe_property, taman, pool, gym, electricity.
df = (
    df
    .pipe(garasi)
    .pipe(tipe_property)
    .pipe(taman)
    .pipe(pool)
    .pipe(gym)
    .pipe(listrik)
)

In [20]:
# Turn the certificate field into a binary flag: 1 when a value is present, 0 when missing.
df['sertifikat'] = df['sertifikat'].notna().astype(int)

In [21]:
# Lookup table used to normalise the 'lokasi' column.
#   key   -> the label written back into 'lokasi' (presumably kecamatan / district names)
#   value -> every raw 'lokasi' string that is folded into that key; the lists include
#            spelling variants (e.g. 'Rawasari' / 'Rawa Sari', 'Setiabudi' / 'SetiaBudi').
# The consuming loop tests membership with `in` on these lists, so matching is exact and
# case-sensitive, and it stops at the first key whose list contains the value — the
# order of the entries therefore matters.
# NOTE(review): 'Jatinegara' is listed under 'Cakung' and is also a key of its own; a
# first-match scan over this table resolves it to 'Cakung' — confirm that is intended.
kecamatan={
'Cempaka Putih' : ['Cempaka Putih Barat','Cempaka Putih Timur', 'Rawasari','Rawa Sari'],
'Gambir' : ['Cideng','Duri Pulo', 'Gambir', 'Kebon Kelapa', 'Petojo Selatan', 'Petojo Utara',
           'Roxy','Petojo'],
'Johar Baru' : ['Galur', 'Johar Baru', 'Kampung Rawa', 'Tanah Tinggi'],
'Kemayoran' : ['Cempaka Baru', 'Gunung Sahari Selatan', 'Harapan Mulya', 'Kebon Kosong',
               'Kemayoran', 'Serdang', 'Sumur Batu', 'Utan Panjang'],
'Menteng' : ['Cikini', 'Gondangdia', 'Kebon Sirih', 'Menteng', 'Pegangsaan'],
'Sawah Besar': ['Gunung Sahari Utara', 'Karang Anyar', 'Kartini', 'Kampung Anyar',
                'Mangga Dua Selatan', 'Pasar Baru', 'Gunung Sahari'],
'Senen' : ['Bungur', 'Kenari', 'Kramat', 'Kwitang', 'Paseban', 'Senen'],
'Tanah Abang' : ['Bendungan Hilir', 'Gelora', 'Kampung Bali', 'Karet Tengsin', 
                 'Kebon Kacang', 'Kebon Melati', 'Petamburan', 'KH Mas Mansyur',
                'Kebon Kacang Lama'],
'Cilincing' : ['Cilincing', 'Kalibaru', 'Marunda', 'Rorotan','Suka Pura', 'Semper',
                'Semper Barat', 'Semper Timur', 'Sukapura'],
'Kelapa Gading' : ['Kelapa Gading Barat', 'Kelapa Gading Timur', 'Pegangsaan Dua'],
'Koja' : ['Koja', 'Lagoa', 'Rawa Badak Selatan', 'Rawa Badak Utara','Tugu',
          'Tugu Selatan', 'Tugu Utara','Rawabadak Selatan'],
'Pademangan' : ['Ancol', 'Pademangan Barat', 'Pademangan Timur'],
'Penjaringan' : ['Kamal Muara', 'Kapuk Muara', 'Pejagalan', 'Penjaringan', 'Pluit',
                'Pantai Indah Kapuk','Bandengan','Teluk Gong','Muara Karang'],
'Tanjung Priok' : ['Kebon Bawang', 'Papanggo', 'Sungai Bambu', 'Sunter Agung', 'Sunter',
                    'Sunter Jaya', 'Tanjung Priok', 'Warakas'],
'Cakung' : ['Cakung Barat', 'Cakung Timur', 'Jatinegara', 'Penggilingan', 
            'Pulo Gebang', 'Rawa Terate', 'Ujung Menteng'],
'Cipayung' : ['Bambu Apus', 'Ceger', 'Cilangkap', 'Cipayung', 'Lubang Buaya',
              'Munjul', 'Pondok Ranggon', 'Setu'],
'Ciracas' : ['Cibubur', 'Ciracas', 'Kelapa Dua Wetan', 'Rambutan Susukan','Kampung Rambutan'],
'Duren Sawit' : ['Duren Sawit', 'Klender', 'Malaka Jaya', 'Malaka Sari', 
                 'Pondok Bambu', 'Pondok Kelapa', 'Pondok Kopi'],
'Jatinegara' :['Bali Mester', 'Bidara Cina', 'Cipinang Besar Selatan', 'Cipinang Besar Utara', 
                'Cipinang Cempedak', 'Cipinang Muara', 'Kampung Melayu', 'Rawa Bunga'],
'Kramat Jati': ['Balekambang', 'Batu Ampar', 'Cawang', 'Cililitan',
                'Dukuh', 'Kramat Jati', 'Tengah', 'Condet'],
'Makasar' : ['Cipinang Melayu', 'Halim Perdana Kusuma', 'Kebon Pala', 'Makasar', 
             'Pinang Ranti','Halim Perdana Kusumah'],
'Matraman' : ['Kayu Manis', 'Kebon Manggis', 'Pal Meriam', 'Pisangan Baru', 
              'Utan Kayu Selatan', 'Utan Kayu Utara','Utan Kayu'],
'Pasar Rebo' : ['Baru', 'Cijantung', 'Gedong', 'Kalisari', 'Pekayon'],
'Pulo Gadung' : ['Cipinang', 'Jati', 'Jatinegara Kaum', 'Kayu Putih', 
                  'Pisangan Timur', 'Pulo Gadung', 'Rawamangun'],
'Cilandak' : ['Cilandak Barat', 'Cipete Selatan', 'Gandaria Selatan', 'TB Simatupang', 'Cipete',
              'Lebak Bulus', 'Pondok Labu', 'Fatmawati'],
'Jagakarsa' :  ['Ciganjur', 'Cipedak', 'Jagakarsa', 'Lenteng Agung', 
                'Srengseng Sawah', 'Tanjung Barat'],
'Kebayoran Baru' : ['Cipete Utara', 'Gandaria Utara', 'Gunung', 'Kramat Pela', 'Radio Dalam',
                    'Melawai', 'Petogogan', 'Pulo', 'Rawa Barat', 'Selong', 'Senayan',
                   'Antasari','Panglima Polim','Gandaria'],
'Kebayoran Lama' : ['Cipulir', 'Grogol Selatan', 'Grogol Utara', 'Tanah Kusir', 'Permata Hijau',
                    'Kebayoran Lama Selatan', 'Kebayoran Lama Utara', 'Pondok Pinang',
                   'Pondok Indah'],
'Mampang Prapatan' : ['Bangka', 'Kuningan Barat', 'Mampang Prapatan','Kemang',
                      'Pela Mampang', 'Tegal Parang','Warung Buncit'],
'Pancoran' : ['Cikoko', 'Duren Tiga', 'Kalibata', 'Pancoran', 'Pengadegan', 'Rawajati'],
'Pasar Minggu' : ['Cilandak Timur', 'Jati Padang', 'Kebagusan', 'Pejaten', 'Ampera',
                  'Pasar Minggu', 'Pejaten Barat', 'Pejaten Timur', 'Ragunan'],
'Pesanggrahan' : ['Bintaro', 'Pesanggrahan', 'Petukangan Selatan', 'Petukangan',
                  'Petukangan Utara', 'Ulujami'],
'Setiabudi' : ['Guntur', 'Karet Kuningan', 'Karet Semanggi', 'Karet', 'Kuningan Timur',
               'Menteng Atas', 'Pasar Manggis', 'Setiabudi','Kuningan','SetiaBudi',
               'Setia Budi'],
'Tebet' : ['Bukit Duri', 'Kebon Baru', 'Manggarai Selatan', 'Manggarai', 'MT Haryono',
           'Menteng Dalam', 'Tebet Barat', 'Tebet Timur','Casablanca'],
'Cengkareng' : ['Cengkareng Barat', 'Cengkareng Timur', 'Duri Kosambi', 
                'Kapuk', 'Kedaung Kali Angke', 'Rawa Buaya','Daan Mogot'],
'Grogol Petamburan' : ['Grogol', 'Jelambar Baru', 'Jelambar', 'Tanjung Duren Selatan', 
                       'Tanjung Duren Utara', 'Tomang', 'Wijaya Kusuma', 'Tanjung Duren',
                      'Wijaya Kesuma'],
'Taman Sari' : ['Glodok', 'Keagungan', 'Krukut', 'Mangga Besar', 'Maphar','Kota', 
                'Pinangsia', 'Taman Sari', 'Tangki','Hayam Wuruk','Gajah Mada'],
'Tambora' : ['Angke', 'Duri Selatan', 'Duri Utara', 'Jembatan Besi', 'Jembatan Lima',
             'Kali Anyar', 'Krendang', 'Pekojan', 'Roa Malaka', 'Tambora', 'Tanah Sereal'],
'Kebon Jeruk' : ['Duri Kepa', 'Kebon Jeruk', 'Kedoya Selatan', 'Kedoya Utara','Sunrise Garden',
                 'Kelapa Dua', 'Sukabumi Selatan', 'Sukabumi Utara','Pos Pengumben',
                'Kedoya'],
'Kalideres' : ['Kalideres', 'Kamal', 'Pegadungan', 'Semanan', 'Tegal Alur'],
'Palmerah' : ['Jatipulo', 'Kemanggisan', 'Kota Bambu Selatan', 'Kota Bambu','Jati Pulo',
              'Kota Bambu Utara', 'Palmerah', 'Slipi'],
'Kembangan' : ['Joglo', 'Kembangan Selatan', 'Kembangan Utara', 'Puri Indah',
               'Meruya Selatan', 'Meruya Utara', 'Srengseng','Meruya']
}

In [22]:
# Collapse fine-grained 'lokasi' labels into their kecamatan.
# Invert the table once (alias -> kecamatan). setdefault keeps the first kecamatan that
# lists an alias, which reproduces the first-match-wins order of the original scan.
alias_to_kecamatan = {}
for nama_kecamatan, aliases in kecamatan.items():
    for alias in aliases:
        alias_to_kecamatan.setdefault(alias, nama_kecamatan)

mapped = df['lokasi'].map(alias_to_kecamatan)
# A value is left untouched when no alias matches, or when it already IS a kecamatan
# name. The second rule is a fix: 'Jatinegara' is a kecamatan key but also appears in the
# 'Cakung' alias list, so it used to be rewritten to 'Cakung'.
keep_as_is = df['lokasi'].isin(list(kecamatan)) | mapped.isna()
# Whole-column assignment replaces the per-row `df['lokasi'].iloc[i] = j` writes, which
# are chained assignments and do not update the frame under pandas copy-on-write.
df['lokasi'] = df['lokasi'].where(keep_as_is, mapped)
df['lokasi'].nunique()

51

In [23]:
# Print every distinct 'lokasi' value that is not one of the kecamatan keys,
# i.e. the labels the normalisation step could not resolve.
temp = df['lokasi'].unique().tolist()
for lokasi in temp:
    if lokasi in kecamatan:
        continue
    print(lokasi)

Bekasi Barat
Pondok Melati
Karang Tengah
Pinang (Penang)
Pondok Gede
Cipondoh
Larangan
Ciledug
Medan Satria
Benda


In [24]:
# Pondok Melati, Ciledug, Bekasi Barat, Karang Tengah, Larangan, Pondok Gede, Cipondoh,
# Medan Satria and Benda are not in Jakarta; they are excluded further down, together
# with 'Pinang (Penang)'.

In [25]:
# Check that the derived feature columns were appended and inspect their dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1961 entries, 0 to 2366
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       1961 non-null   int64  
 1   desc             1961 non-null   object 
 2   fasilitas_total  1512 non-null   object 
 3   kamar            1961 non-null   object 
 4   km               1961 non-null   object 
 5   lantai           1786 non-null   float64
 6   lb               1961 non-null   float64
 7   lokasi           1961 non-null   object 
 8   lt               1961 non-null   float64
 9   price            1961 non-null   int64  
 10  sertifikat       1961 non-null   int64  
 11  tipe             1961 non-null   object 
 12  ada_garasi       1961 non-null   int64  
 13  tipe_property    1961 non-null   object 
 14  taman            1961 non-null   int64  
 15  pool             1961 non-null   int64  
 16  gym              1961 non-null   int64  
 17  electricity   

In [26]:
# Keep only the columns used downstream, in this order, and rename three of them by
# name (km -> kamar_mandi, desc -> deskripsi, lantai -> floors_total).
df = df[['km', 'kamar', 'lb', 'lt', 'tipe_property', 'price', 'sertifikat', 'desc',
         'lokasi', 'ada_garasi', 'pool', 'taman', 'electricity', 'lantai', 'gym']].rename(
    columns={'km': 'kamar_mandi', 'desc': 'deskripsi', 'lantai': 'floors_total'}
)

In [28]:
# 'lokasi' values that did not resolve to a kecamatan key; the note above the column
# selection says these places are not in Jakarta, so their rows are removed.
not_include = [
    'Pondok Melati',
    'Ciledug',
    'Bekasi Barat',
    'Karang Tengah',
    'Larangan',
    'Pondok Gede',
    'Cipondoh',
    'Medan Satria',
    'Benda',
    'Pinang (Penang)',
]
outside_jakarta = df['lokasi'].isin(not_include)
df = df.loc[~outside_jakarta]

In [30]:
# Display the cleaned frame (pandas truncates the middle rows in the rendered output).
df

Unnamed: 0,kamar_mandi,kamar,lb,lt,tipe_property,price,sertifikat,deskripsi,lokasi,ada_garasi,pool,taman,electricity,floors_total,gym
0,3,7,220.0,120.0,Rumah,2800000000,1,"['dijual rumah 2.5 lantai', 'kamar tidur 7', '...",Kelapa Gading,1,0,0,0,3.0,0
1,1,1,40.0,21.0,lainnya,695000000,1,"['spesifikasi :', 'ukuran 3.5 x 6', 'listrik 2...",Cempaka Putih,0,0,0,2200,2.0,0
2,7,>10,387.0,299.0,Rumah,5900000000,0,"['nego sampe deal', 'usaha kost', 'luas tanah ...",Pancoran,1,0,0,5000,3.0,0
3,4,4,209.0,209.0,Apartemen,4300000000,1,"['kode listingan : rj3133at20\t', '( mohon seb...",Kebon Jeruk,0,0,0,0,2.0,0
4,2,3,42.0,42.0,Apartemen,850000001,1,['dijual apartemen 3 br bassura city jakarta t...,Cakung,0,0,1,0,16.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2360,3,3,215.0,324.0,Rumah,3099000000,1,"['kode listing : iqb 010 rj', none, 'di jual r...",Pesanggrahan,1,0,0,6600,2.0,0
2361,2,3,48.0,60.0,lainnya,600000088,1,"['- boston residence bintaro (739 juta nego)',...",Cilandak,1,0,0,0,2.0,0
2362,2,3,110.0,65.0,Rumah,1500000000,1,"['vine property ', none, 'for sale!!', 'rumah ...",Pasar Minggu,1,0,0,0,2.0,0
2363,2,3,91.0,61.0,Rumah,1280000000,1,"['deskripsi', none, 'miliki hunian rumah denga...",Pancoran,1,0,0,0,2.0,0


In [31]:
# Final schema check before export: dtypes and remaining nulls per column.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1923 entries, 0 to 2366
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   kamar_mandi    1923 non-null   object 
 1   kamar          1923 non-null   object 
 2   lb             1923 non-null   float64
 3   lt             1923 non-null   float64
 4   tipe_property  1923 non-null   object 
 5   price          1923 non-null   int64  
 6   sertifikat     1923 non-null   int64  
 7   deskripsi      1923 non-null   object 
 8   lokasi         1923 non-null   object 
 9   ada_garasi     1923 non-null   int64  
 10  pool           1923 non-null   int64  
 11  taman          1923 non-null   int64  
 12  electricity    1923 non-null   object 
 13  floors_total   1750 non-null   float64
 14  gym            1923 non-null   int64  
dtypes: float64(3), int64(6), object(6)
memory usage: 240.4+ KB


In [32]:
# Write the cleaned frame to disk. `index=False` is not passed, so the DataFrame index
# is written as an extra leading column.
df.to_csv('csv_(agak)bersih.csv')