In [1]:
import pandas as pd
import numpy as np

In [2]:
file_path = r"--------"

# Read the file
df = pd.read_csv(file_path)

df.head(4)

Unnamed: 0,Link,Area,Year of Building,Number of Rooms,Elevator,Parking,Storage,Bale,Price,Tabaghe,Subtitle
0,https://divar.ir/v/%DB%B9%DB%B2%D9%85%D8%AA%D8...,۹۲,قبل از ۱۳۷۰,۲,آسانسور ندارد,پارکینگ,انباری ندارد,بله,۱۰٬۵۰۰٬۰۰۰٬۰۰۰ تومان,۲ از ۳,۴ روز پیش در تهران، گیشا (کوی نصر)
1,https://divar.ir/v/%D8%A7%D8%B3%D8%AA%D8%AB%D9...,۲۵۰,۱۳۹۷,۴,آسانسور,پارکینگ,انباری,بله,۵۰٬۰۰۰٬۰۰۰٬۰۰۰ تومان,۴,۲۲ ساعت پیش در تهران، سعادت‌آباد
2,https://divar.ir/v/%D8%B4%D9%87%D8%B1%DA%A9-%D...,۱۳۱,۱۳۹۶,۳,آسانسور,پارکینگ,انباری,بله,۲۸٬۰۰۰٬۰۰۰٬۰۰۰ تومان,۱,لحظاتی پیش در تهران، شهرک غرب
3,https://divar.ir/v/%D9%81%D8%B1%D9%88%D8%B4-%D...,۱۰۰,۱۳۹۹,۲,آسانسور,پارکینگ,انباری,بله,۱۲٬۰۰۰٬۰۰۰٬۰۰۰ تومان,۵,لحظاتی پیش در تهران، باغ فیض


# Real Estate Data Cleaning and Preprocessing

## Preparing the Dataset for Analysis and Modeling

This notebook documents the steps taken to clean and preprocess a real estate dataset. The dataset contains information about properties in Tehran, including area, price, number of rooms, and various amenities. The goal is to standardize the data, handle missing values, and prepare it for further analysis or machine learning modeling.

### Rename Columns, We also drop unnecessary columns like `Link` and `Bale`

In [3]:
# Drop unnecessary columns
df = df.drop(columns=['Link', 'Bale'])


In [4]:
# Rename columns
df.rename(columns={
    'Area': 'area_sqm',
    'Year of Building': 'building_year',
    'Number of Rooms': 'num_rooms',
    'Elevator': 'has_elevator',
    'Parking': 'has_parking',
    'Storage': 'has_storage',
    'Price': 'price(in_toman)',
    'Subtitle': 'district',
    'Tabaghe': 'floor_number'
}, inplace=True)

### Extract District Name

In [5]:
# Extract district name
df['district'] = df['district'].str.extract(r'در تهران، (.*)')


### Clean price column

In [6]:
# Clean price column
df['price(in_toman)'] = df['price(in_toman)'].str.replace(' تومان', '')

### Convert Persian/Arabic Numerals

In [7]:
# Mapping Persian/Arabic numerals to Western numerals
persian_to_western = {
    '۰': '0', '۱': '1', '۲': '2', '۳': '3', '۴': '4',
    '۵': '5', '۶': '6', '۷': '7', '۸': '8', '۹': '9',
    '٬': '',  # Remove thousands separators
    ' ': ''   # Remove spaces
}

# Function to convert Persian/Arabic numerals to Western numerals
def convert_to_western(text):
    if pd.isna(text):
        return np.nan
    return ''.join([persian_to_western[char] for char in text if char in persian_to_western])

# Apply conversion to relevant columns
df['price(in_toman)'] = df['price(in_toman)'].apply(convert_to_western)
df['area_sqm'] = df['area_sqm'].apply(convert_to_western)
df['num_rooms'] = df['num_rooms'].apply(convert_to_western)
df['building_year'] = df['building_year'].apply(convert_to_western)
df['floor_number'] = df['floor_number'].apply(convert_to_western)


### Convert Columns to Numeric

In [8]:
# Convert columns to numeric
df['price(in_toman)'] = pd.to_numeric(df['price(in_toman)'])
df['area_sqm'] = pd.to_numeric(df['area_sqm'])
df['num_rooms'] = pd.to_numeric(df['num_rooms'])
df['building_year'] = pd.to_numeric(df['building_year'])
df['floor_number'] = pd.to_numeric(df['floor_number'], errors='coerce')

### Handle Missing Values

In [9]:
# Fill missing values in num_rooms with the mean
mean_num_rooms = df['num_rooms'].mean()
df['num_rooms'] = df['num_rooms'].fillna(mean_num_rooms).astype(int)

### Standardize Categorical Columns

In [10]:
# Map categorical columns to standardized values
storage_mapping = {'انباری': 'Yes', 'انباری ندارد': 'No'}
elevator_mapping = {'آسانسور': 'Yes', 'آسانسور ندارد': 'No'}
parking_mapping = {'پارکینگ': 'Yes', 'پارکینگ ندارد': 'No'}

df['has_storage'] = df['has_storage'].map(storage_mapping)
df['has_elevator'] = df['has_elevator'].map(elevator_mapping)
df['has_parking'] = df['has_parking'].map(parking_mapping)

### Clean Floor Number Column

In [11]:
# Function to clean and extract floor number
def extract_floor_number(floor_str):
    # Check if the input is NaN or None
    if pd.isna(floor_str):
        return np.nan
    
    # Convert to string in case it's a float or other type
    floor_str = str(floor_str)
    
    # Handle specific cases
    if floor_str == 'همکف':
        return 0  # Ground floor
    if floor_str == 'زیرهمکف':
        return -1  # Basement
    
    # Clean the string and extract the floor number
    floor_str = floor_str.replace('+', '').replace('از بیشتر از', '')
    parts = floor_str.split('از')
    return parts[0].strip() if len(parts) > 1 else floor_str.strip()

# Apply floor number cleaning
df['floor_number'] = df['floor_number'].apply(extract_floor_number)
df['floor_number'] = pd.to_numeric(df['floor_number'], errors='coerce')

### Final DataFrame


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3233 entries, 0 to 3232
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   area_sqm         3233 non-null   int64  
 1   building_year    3233 non-null   int64  
 2   num_rooms        3233 non-null   int64  
 3   has_elevator     3233 non-null   object 
 4   has_parking      3233 non-null   object 
 5   has_storage      3233 non-null   object 
 6   price(in_toman)  3233 non-null   int64  
 7   floor_number     2513 non-null   float64
 8   district         3233 non-null   object 
dtypes: float64(1), int64(4), object(4)
memory usage: 227.4+ KB


In [13]:
df.head()

Unnamed: 0,area_sqm,building_year,num_rooms,has_elevator,has_parking,has_storage,price(in_toman),floor_number,district
0,92,1370,2,No,Yes,No,10500000000,23.0,گیشا (کوی نصر)
1,250,1397,4,Yes,Yes,Yes,50000000000,4.0,سعادت‌آباد
2,131,1396,3,Yes,Yes,Yes,28000000000,1.0,شهرک غرب
3,100,1399,2,Yes,Yes,Yes,12000000000,5.0,باغ فیض
4,90,1402,2,Yes,Yes,Yes,5500000000,4.0,جی
