In [28]:
import json
import pandas as pd
import numpy as np
import re

In [13]:
# Read the property listings; the top-level JSON value is a mapping of
# listing id -> listing fields.
with open('buy_property_data.json', 'r') as f:
    buy_data = json.load(f)

# One flat record per listing with the mapping key carried along as 'id'.
# The listing is unpacked after 'id', so a listing's own 'id' field (if any)
# takes precedence over the key.
raw_data = [
    {'id': listing_id, **listing}
    for listing_id, listing in buy_data.items()
]

In [243]:
# Build the working DataFrame from the listing records. json_normalize
# flattens nested dicts into dot-separated column names, which is where
# columns such as 'property_details.land area' come from.
df = pd.json_normalize(raw_data)

In [233]:
# cleaning price

def clean_price(x: str):
    """Normalise a raw price string ahead of numeric parsing.

    The text is lower-cased, everything up to and including the last
    currency marker ("rs" or "npr") is dropped, a trailing "/-" is removed,
    and the first crore/lakh word found is rewritten as an explicit
    " * <multiplier>" so a later step can evaluate the product.

    Args:
        x: Raw price text, e.g. "Rs. 25 Lakhs per aana".

    Returns:
        The cleaned string, e.g. " 25  * 100000 per aana". No numeric
        conversion happens here.
    """
    cleaned_string = x.lower()
    # Keep only what follows the last currency marker. "rs" immediately
    # followed by a digit is matched without consuming that digit
    # ("rs5000" -> "5000"); otherwise the single character after "rs"
    # (".", " ", ":" ...) is dropped together with it.
    cleaned_string = re.split(r"rs(?=[0-9])|rs.|npr", cleaned_string)[-1]
    # Remove exactly the two characters of the "/-" suffix so the last digit
    # of the amount is preserved.
    if cleaned_string.endswith('/-'):
        cleaned_string = cleaned_string[:-2]
    # Looked up in insertion order, so the plural spellings are tried before
    # the singular ones they contain.
    text_to_num = {'crores': '10000000', 'lakhs' : '100000', 'crore': '10000000', 'lakh' : '100000' }
    unit = next((word for word in text_to_num if word in cleaned_string), None)
    if unit is not None:
        pieces = cleaned_string.split(unit)
        cleaned_string = pieces[0] + " * " + text_to_num[unit] + pieces[1]
    return cleaned_string

def clean_str_to_int(raw: str):
    """Return the ASCII digits of ``raw`` concatenated, or "0" if it has none.

    Every non-digit character (spaces, commas, dots, letters) is discarded,
    so "1.5" becomes "15". The result is a string of digits, not an int.
    """
    digits_only = re.sub(r"[^0-9]", "", raw)
    return digits_only if digits_only else "0"

def text_num_mul(raw: str):
    """Evaluate an "<amount> * <multiplier>" string as a number.

    Args:
        raw: Text containing a "*", such as the " 1.5  * 10000000" that
            ``clean_price`` produces for "1.5 crore". Only the text before
            the first "*" (amount) and between the first and second "*"
            (multiplier) is used.

    Returns:
        The product ``amount * multiplier`` as a float.

    Raises:
        IndexError: If ``raw`` contains no "*".
    """
    parts = raw.split("*")
    amount_text = parts[0].strip()
    multiplier_text = parts[1].strip()
    # A range such as "1-2" is valued at its lower bound.
    if '-' in amount_text:
        amount_text = amount_text.split('-')[0]

    # Honour a decimal amount ("1.5" must not become 15). Only a clean
    # "<digits>.<digits>" shape is read as a decimal; anything else, such as
    # comma-grouped or multi-dot text, falls back to plain digit extraction.
    decimal_candidate = re.sub(r"[^0-9.]", "", amount_text).strip(".")
    if re.fullmatch(r"[0-9]+\.[0-9]+", decimal_candidate):
        amount = float(decimal_candidate)
    else:
        amount = float(clean_str_to_int(amount_text))

    multiplier = float(clean_str_to_int(multiplier_text))
    return amount * multiplier
  
def _price_to_number(price: str) -> float:
    """Convert cleaned price text to a float, evaluating an embedded " * <multiplier>"."""
    # clean_price writes units as "* <digits>". Requiring a digit after the
    # asterisk keeps a stray "*" in the listing text from being evaluated as
    # a multiplication by zero.
    if re.search(r'\*\s*[0-9]', price):
        return text_num_mul(price)
    return float(clean_str_to_int(price))


def multiply_price(price: str, per_size: str):
    """Convert a cleaned price string into a numeric price.

    When the price text quotes a per-anna rate (it contains "aana" or
    " anna") and the land area is also expressed in anna, the rate is
    multiplied by the land area's anna count. In every other case the price
    text itself is converted to a number and returned unscaled.

    Args:
        price: Price text as returned by ``clean_price``; may contain an
            embedded " * <multiplier>" for crore/lakh amounts.
        per_size: Land-area text such as "4 Anna". Any value is accepted and
            stringified first (so a missing value becomes "nan").

    Returns:
        The price as a float.

    Raises:
        ValueError: If the text before the anna word in ``per_size`` cannot
            be parsed by ``float``.
    """
    per_size = str(per_size).lower()

    # Price is not quoted per anna: it is already a total.
    if not re.findall(r'aana| anna', price):
        return _price_to_number(price)

    # Keep only the amount in front of the "/-", "/" or "per" wording. With
    # no such wording there is nothing to cut (and str.split("") would raise).
    separators = re.findall(r'/-|/|per', price)
    if separators:
        price = price.split("".join(separators))[0]

    # Parse the rate straight to a float. Round-tripping a float through
    # str() and digit extraction would turn 2500000.0 into 25000000.
    rate = _price_to_number(price)

    # Land area not given in anna: the rate cannot be scaled here.
    if 'anna' not in per_size and 'aana' not in per_size:
        return rate

    anna_separator = re.findall(r'aana| anna|anna', per_size)[0]
    anna_word = re.findall(r'anna|aana', per_size)[0]
    # Detect a range ("4-5") or a multi-part/decimal figure ("3.2") in the
    # text before the anna word, and keep only its leading number.
    markers = re.findall(r'-|.[0-9].', per_size.split(anna_word)[0])
    if markers:
        if '-' in markers:
            per_size = per_size.split('-')[0]
        else:
            per_size = per_size.split('.')[0]

    return rate * float(per_size.split(anna_separator)[0])

# Stage 1: normalise the raw price text (currency prefix, "/-" suffix,
# crore/lakh words).
df['price'] = df['price'].apply(clean_price)

# Stage 2: convert each cleaned string to a number, using the row's land
# area to scale per-anna rates, and store the column as float64.
df['price'] = (
    df.apply(
        lambda row: multiply_price(row['price'], row['property_details.land area']),
        axis=1,
    )
    .astype(np.float64)
)

In [260]:
# cleaning location
uniqueness = set()


def clean_location(location: str) -> str:
    """Extract the leading place name from a location string.

    Any text up to and including a "Location:" label is dropped, then the
    part before the first "," or "/" is taken and stripped of whitespace.
    When a separator is present the place name is also added to the
    module-level ``uniqueness`` set.

    Args:
        location: Location text, e.g. "Location: Imadol, Lalitpur".

    Returns:
        The stripped leading place name, or the whole stripped string when
        it contains no "," or "/".
    """
    if 'Location' in location:
        location = location.split('Location:')[-1]
    first_separator = re.search(',|/', location)
    if first_separator is None:
        # Strings without a separator are returned but deliberately not
        # recorded, so the contents of ``uniqueness`` stay as before.
        return location.strip()
    place = location[:first_separator.start()].strip()
    uniqueness.add(place)
    return place
    
# Title-case the location text so spelling variants differ only by letters,
# then collect the leading place name of every listing into `uniqueness`.
df['location'] = df['location'].str.title()
for listed_location in df['location']:
    clean_location(listed_location)
print(uniqueness)

{'Karhmandu', 'Biratnagar Bajar', 'Sukedhara', 'Ramkot', 'Kalanki', 'Kathmndu', 'Narayanthan', 'Imadol', 'Rumba Chowk', 'Saranpur', 'Lalitpur', 'Satungal', 'Kathmadnu', 'Swayambhu', 'Bhaktapur', 'Nobel Hospital Road', 'Kathmandhu', 'Biratnagar', 'Kathmandu', 'Itahari', 'Budhanilkantha', 'Naicap'}


In [234]:
# Spot-check the cleaned numeric prices next to the land-area text that was
# used to scale per-anna rates.
print(df[['price','property_details.land area']].head())

         price property_details.land area
0   10000000.0                     4 Anna
1   70000000.0                   1 Ropani
2  200000000.0                     5 Anna
3    5000000.0                    0.7.2.0
4    3600000.0                   3.2 Anna


In [76]:
# Inspect the house-area column: the displayed values are free text with
# inconsistent unit spelling ("Sq.Ft", "Sq.ft", "sq.ft.") and include NaN.
df['property_details.house area']

0              NaN
1              NaN
2              NaN
3              NaN
4       3050 Sq.Ft
          ...     
367            NaN
368            NaN
369     2800 Sq.ft
370    3528 sq.ft.
371            NaN
Name: property_details.house area, Length: 372, dtype: object

In [104]:
# Sanity check: the pattern multiply_price uses to recognise a per-anna rate
# should fire on a representative raw price string.
dummy = 'Rs Rs 2500000/-per aana'
print(re.compile(r'aana| anna').findall(dummy))

['aana']
