# 3- Marg the two databases before cleaning:

In [2]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import re
import time

In [3]:
df_bayut = pd.read_csv('bayut_for_rent.csv')
df_opensooq = pd.read_csv('opensooq_for_rent.csv')


print("Bayut columns:", df_bayut.columns.tolist())
print("OpenSooq columns:", df_opensooq.columns.tolist())


df_merged = pd.concat([df_bayut, df_opensooq], ignore_index=True)


df_merged.to_csv('properties_combined_raw2.csv', index=False, encoding='utf-8-sig')

print(f"Combined data shape: {df_merged.shape}")
print("Combined file saved as: properties_combined_raw.csv")

Bayut columns: ['Title', 'Location', 'Price', 'Size', 'Listing_Type', 'Link']
OpenSooq columns: ['Title', 'Location', 'Price', 'Size', 'Listing_Type', 'Link']
Combined data shape: (6783, 6)
Combined file saved as: properties_combined_raw.csv


# 4- Now we can start cleaning the merged data.

In [31]:
df = pd.read_csv('properties_combined_raw2.csv')

In [32]:
df.head()

Unnamed: 0,Title,Location,Price,Size,Listing_Type,Link
0,"1 Bedroom Apartment For Rent Ruwi, Muscat","Ruwi, Muscat",150,70 Sq. M.,For Rent,https://www.bayut.om/en/property/details-13012...
1,"1 Bedroom Apartment For Rent Al Hail, Muscat","Al Hail, Muscat",300,100 Sq. M.,For Rent,https://www.bayut.om/en/property/details-13027...
2,"3 Bedrooms Villa For Rent Qurum, Muscat","Qurum, Muscat",750,300 Sq. M.,For Rent,https://www.bayut.om/en/property/details-12994...
3,4 Bedrooms Villa For Rent Madinat As Sultan Qa...,"Madinat As Sultan Qaboos, Muscat",950,300 Sq. M.,For Rent,https://www.bayut.om/en/property/details-13018...
4,"2 Bedrooms Apartment For Rent in Al Hamriyah, ...","Al Hamriyah, Muscat",250,100 Sq. M.,For Rent,https://www.bayut.om/en/property/details-12994...


In [33]:
df.columns


Index(['Title', 'Location', 'Price', 'Size', 'Listing_Type', 'Link'], dtype='object')

In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6783 entries, 0 to 6782
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Title         6783 non-null   object
 1   Location      6729 non-null   object
 2   Price         6783 non-null   object
 3   Size          5634 non-null   object
 4   Listing_Type  6783 non-null   object
 5   Link          6783 non-null   object
dtypes: object(6)
memory usage: 318.1+ KB


In [35]:
df.dtypes

Title           object
Location        object
Price           object
Size            object
Listing_Type    object
Link            object
dtype: object

In [36]:
df

Unnamed: 0,Title,Location,Price,Size,Listing_Type,Link
0,"1 Bedroom Apartment For Rent Ruwi, Muscat","Ruwi, Muscat",150,70 Sq. M.,For Rent,https://www.bayut.om/en/property/details-13012...
1,"1 Bedroom Apartment For Rent Al Hail, Muscat","Al Hail, Muscat",300,100 Sq. M.,For Rent,https://www.bayut.om/en/property/details-13027...
2,"3 Bedrooms Villa For Rent Qurum, Muscat","Qurum, Muscat",750,300 Sq. M.,For Rent,https://www.bayut.om/en/property/details-12994...
3,4 Bedrooms Villa For Rent Madinat As Sultan Qa...,"Madinat As Sultan Qaboos, Muscat",950,300 Sq. M.,For Rent,https://www.bayut.om/en/property/details-13018...
4,"2 Bedrooms Apartment For Rent in Al Hamriyah, ...","Al Hamriyah, Muscat",250,100 Sq. M.,For Rent,https://www.bayut.om/en/property/details-12994...
...,...,...,...,...,...,...
6778,500 m2 Studio Apartments for Rent in Muscat Al...,", Al Khuwair, 760832XX",180 OMR,500.0,For Rent,https://om.opensooq.com/en/search/266994539
6779,Fully Furnished 1BHK Flat for Rent – Bareeq Al...,", Qurm, 905099XX",440 OMR,90.0,For Rent,https://om.opensooq.com/en/search/266949055
6780,125 m2 2 Bedrooms Apartments for Rent in Musca...,", Ghala, 994886XX",230 OMR,125.0,For Rent,https://om.opensooq.com/en/search/265445187
6781,2BHK Apartment for rent - Bousher,", Bosher, 992220XX",270 OMR,80.0,For Rent,https://om.opensooq.com/en/search/267036337


In [37]:
print(df.isnull().sum())

Title              0
Location          54
Price              0
Size            1149
Listing_Type       0
Link               0
dtype: int64


In [38]:
df[df['Location'].isna()]

Unnamed: 0,Title,Location,Price,Size,Listing_Type,Link
1614,Unfurnished Monthly in Muscat Ansab,,110 OMR,,For Rent,https://om.opensooq.com/en/search/266950819
1621,Unfurnished Monthly in Muscat Al Mawaleh,,95 OMR,,For Rent,https://om.opensooq.com/en/search/264139289
1625,Semi Furnished Monthly in Muscat Al Mawaleh,,130 OMR,,For Rent,https://om.opensooq.com/en/search/264995631
1640,2 Bedroom Apartment in Al Ghubra - Bait Abdullah,,225 OMR,78.0,For Rent,https://om.opensooq.com/en/search/266842689
1641,3 Bedroom Apartment for Rent at Ruwi - Mudhaireb,,250 OMR,92.0,For Rent,https://om.opensooq.com/en/search/266842323
1642,Semi Furnished Monthly in Muscat Bosher,,110 OMR,,For Rent,https://om.opensooq.com/en/search/265881111
1643,Furnished 2 Bedroom Apartment with Maid Room a...,,340 OMR,92.0,For Rent,https://om.opensooq.com/en/search/266841347
1644,Apartment for Rent with 2 Bedrooms at Ruwi MBD...,,200 OMR,72.0,For Rent,https://om.opensooq.com/en/search/266841699
1648,Semi Furnished Monthly in Muscat Al-Hail,,90 OMR,,For Rent,https://om.opensooq.com/en/search/265880191
1649,Apartment for Rent with 1 Bedroom at Ruwi - REX,,140 OMR,63.0,For Rent,https://om.opensooq.com/en/search/266841753


In [39]:
def extract_location_from_title(title):
    
    import re
    match = re.search(r'(?:in|at)\s+(.+)$', str(title))
    if match:
        return match.group(1).strip()
    return np.nan


df['Location'] = df.apply(
    lambda row: extract_location_from_title(row['Title']) if pd.isna(row['Location']) else row['Location'],
    axis=1
)

In [40]:
df[df['Location'].isna()]

Unnamed: 0,Title,Location,Price,Size,Listing_Type,Link
4243,مبنى للايجار قريب من الميناء مواقف الى شارع,,"1,000 OMR",1.0,For Rent,https://om.opensooq.com/en/search/265986745


In [41]:
df = df.dropna(subset=['Location'])

In [42]:
print(df.isnull().sum())

Title              0
Location           0
Price              0
Size            1149
Listing_Type       0
Link               0
dtype: int64


In [43]:
df['Size_unit'] = df['Size'].str.extract(r'([^\d.]+)$')[0].str.strip()


cols = list(df.columns)
cols.insert(cols.index('Size') + 1, cols.pop(cols.index('Size_unit')))
df = df[cols]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Size_unit'] = df['Size'].str.extract(r'([^\d.]+)$')[0].str.strip()


In [44]:
print(df.isnull().sum())

Title              0
Location           0
Price              0
Size            1149
Size_unit       6782
Listing_Type       0
Link               0
dtype: int64


In [45]:
df['Size_unit'].fillna('Sq. M.', inplace=True) 


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Size_unit'].fillna('Sq. M.', inplace=True)


In [46]:
print(df.isnull().sum())

Title              0
Location           0
Price              0
Size            1149
Size_unit          0
Listing_Type       0
Link               0
dtype: int64


In [47]:
df['Size'].str.extract(r'([\d.]+)').astype(float)

Unnamed: 0,0
0,70.0
1,100.0
2,300.0
3,300.0
4,100.0
...,...
6778,500.0
6779,90.0
6780,125.0
6781,80.0


In [48]:
df

Unnamed: 0,Title,Location,Price,Size,Size_unit,Listing_Type,Link
0,"1 Bedroom Apartment For Rent Ruwi, Muscat","Ruwi, Muscat",150,70 Sq. M.,Sq. M.,For Rent,https://www.bayut.om/en/property/details-13012...
1,"1 Bedroom Apartment For Rent Al Hail, Muscat","Al Hail, Muscat",300,100 Sq. M.,Sq. M.,For Rent,https://www.bayut.om/en/property/details-13027...
2,"3 Bedrooms Villa For Rent Qurum, Muscat","Qurum, Muscat",750,300 Sq. M.,Sq. M.,For Rent,https://www.bayut.om/en/property/details-12994...
3,4 Bedrooms Villa For Rent Madinat As Sultan Qa...,"Madinat As Sultan Qaboos, Muscat",950,300 Sq. M.,Sq. M.,For Rent,https://www.bayut.om/en/property/details-13018...
4,"2 Bedrooms Apartment For Rent in Al Hamriyah, ...","Al Hamriyah, Muscat",250,100 Sq. M.,Sq. M.,For Rent,https://www.bayut.om/en/property/details-12994...
...,...,...,...,...,...,...,...
6778,500 m2 Studio Apartments for Rent in Muscat Al...,", Al Khuwair, 760832XX",180 OMR,500.0,Sq. M.,For Rent,https://om.opensooq.com/en/search/266994539
6779,Fully Furnished 1BHK Flat for Rent – Bareeq Al...,", Qurm, 905099XX",440 OMR,90.0,Sq. M.,For Rent,https://om.opensooq.com/en/search/266949055
6780,125 m2 2 Bedrooms Apartments for Rent in Musca...,", Ghala, 994886XX",230 OMR,125.0,Sq. M.,For Rent,https://om.opensooq.com/en/search/265445187
6781,2BHK Apartment for rent - Bousher,", Bosher, 992220XX",270 OMR,80.0,Sq. M.,For Rent,https://om.opensooq.com/en/search/267036337


In [49]:
df.dtypes

Title           object
Location        object
Price           object
Size            object
Size_unit       object
Listing_Type    object
Link            object
dtype: object

In [50]:

df['Size'] = df['Size'].str.extract(r'([\d.]+)') 
df['Size'] = df['Size'].astype(float)            


In [51]:
df.dtypes

Title            object
Location         object
Price            object
Size            float64
Size_unit        object
Listing_Type     object
Link             object
dtype: object

In [53]:
df


Unnamed: 0,Title,Location,Price,Size,Size_unit,Listing_Type,Link
0,"1 Bedroom Apartment For Rent Ruwi, Muscat","Ruwi, Muscat",150,70.0,Sq. M.,For Rent,https://www.bayut.om/en/property/details-13012...
1,"1 Bedroom Apartment For Rent Al Hail, Muscat","Al Hail, Muscat",300,100.0,Sq. M.,For Rent,https://www.bayut.om/en/property/details-13027...
2,"3 Bedrooms Villa For Rent Qurum, Muscat","Qurum, Muscat",750,300.0,Sq. M.,For Rent,https://www.bayut.om/en/property/details-12994...
3,4 Bedrooms Villa For Rent Madinat As Sultan Qa...,"Madinat As Sultan Qaboos, Muscat",950,300.0,Sq. M.,For Rent,https://www.bayut.om/en/property/details-13018...
4,"2 Bedrooms Apartment For Rent in Al Hamriyah, ...","Al Hamriyah, Muscat",250,100.0,Sq. M.,For Rent,https://www.bayut.om/en/property/details-12994...
...,...,...,...,...,...,...,...
6778,500 m2 Studio Apartments for Rent in Muscat Al...,", Al Khuwair, 760832XX",180 OMR,500.0,Sq. M.,For Rent,https://om.opensooq.com/en/search/266994539
6779,Fully Furnished 1BHK Flat for Rent – Bareeq Al...,", Qurm, 905099XX",440 OMR,90.0,Sq. M.,For Rent,https://om.opensooq.com/en/search/266949055
6780,125 m2 2 Bedrooms Apartments for Rent in Musca...,", Ghala, 994886XX",230 OMR,125.0,Sq. M.,For Rent,https://om.opensooq.com/en/search/265445187
6781,2BHK Apartment for rent - Bousher,", Bosher, 992220XX",270 OMR,80.0,Sq. M.,For Rent,https://om.opensooq.com/en/search/267036337


In [54]:
df.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
6778     True
6779     True
6780     True
6781    False
6782     True
Length: 6782, dtype: bool

In [55]:
duplicates = df[df.duplicated()]
display(duplicates)

Unnamed: 0,Title,Location,Price,Size,Size_unit,Listing_Type,Link
264,"2 Bedrooms Apartment For Rent in Qurum, Muscat","Qurum, Muscat",425,120.0,Sq. M.,For Rent,https://www.bayut.om/en/property/details-13022...
1969,100 m2 1 Bedroom Apartments for Rent in Dhofar...,", Salala, 943126XX",50 OMR,100.0,Sq. M.,For Rent,https://om.opensooq.com/en/search/266016495
1970,Residential Land for Rent in Dhofar Salala,", Salala, 990016XX",35 OMR,,Sq. M.,For Rent,https://om.opensooq.com/en/search/266873893
1971,عقود ايجار مكاتب مؤقتة وترخيص الانشطة Temporar...,", Amerat, 955469XX",75 OMR,20.0,Sq. M.,For Rent,https://om.opensooq.com/en/search/263485983
1973,2 Bedrooms Chalet for Rent in Al Dakhiliya Bidbid,", Bidbid, 966934XX",40 OMR,220.0,Sq. M.,For Rent,https://om.opensooq.com/en/search/259878571
...,...,...,...,...,...,...,...
6777,Semi Furnished room in Al khuwair,", Al Khuwair, 917923XX",120 OMR,,Sq. M.,For Rent,https://om.opensooq.com/en/search/266932071
6778,500 m2 Studio Apartments for Rent in Muscat Al...,", Al Khuwair, 760832XX",180 OMR,500.0,Sq. M.,For Rent,https://om.opensooq.com/en/search/266994539
6779,Fully Furnished 1BHK Flat for Rent – Bareeq Al...,", Qurm, 905099XX",440 OMR,90.0,Sq. M.,For Rent,https://om.opensooq.com/en/search/266949055
6780,125 m2 2 Bedrooms Apartments for Rent in Musca...,", Ghala, 994886XX",230 OMR,125.0,Sq. M.,For Rent,https://om.opensooq.com/en/search/265445187


In [56]:
df.drop_duplicates(inplace=True)

In [57]:
df.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
6768    False
6769    False
6771    False
6772    False
6781    False
Length: 6105, dtype: bool

In [58]:
num_duplicates = df.duplicated().sum()
print("Number of duplicate rows:", num_duplicates)

Number of duplicate rows: 0


In [59]:
print(df.isnull().sum())

Title             0
Location          0
Price             0
Size            966
Size_unit         0
Listing_Type      0
Link              0
dtype: int64


In [60]:
df[df['Size'].isna()]

Unnamed: 0,Title,Location,Price,Size,Size_unit,Listing_Type,Link
1307,Residential Land for Rent in Dhofar Salala,", Salala, 990016XX",35 OMR,,Sq. M.,For Rent,https://om.opensooq.com/en/search/266873893
1311,Furnished Daily in Dhofar Salala,", Salala, 990838XX",50 OMR,,Sq. M.,For Rent,https://om.opensooq.com/en/search/266121233
1324,Furnished Daily in Al Dakhiliya Nizwa,", Nizwa, 994282XX",12 OMR,,Sq. M.,For Rent,https://om.opensooq.com/en/search/256817323
1328,Daily room rent. إيجار غرفة يومية في المعبيلة,", Al Maabilah, 780331XX",6 OMR,,Sq. M.,For Rent,https://om.opensooq.com/en/search/266009649
1341,Furnished Daily in Dhofar Salala,", Salala, 762200XX",15 OMR,,Sq. M.,For Rent,https://om.opensooq.com/en/search/265805595
...,...,...,...,...,...,...,...
6752,Unfurnished Monthly in Muscat Al Mawaleh,", Al Mawaleh, 929192XX",115 OMR,,Sq. M.,For Rent,https://om.opensooq.com/en/search/263335749
6755,Semi Furnished Monthly in Al Dakhiliya Manah,", Manah, 989927XX","1,500 OMR",,Sq. M.,For Rent,https://om.opensooq.com/en/search/264646661
6758,Unfurnished Monthly in Muscat Ghubrah,", Ghubrah, 920335XX",120 OMR,,Sq. M.,For Rent,https://om.opensooq.com/en/search/264361347
6765,Furnished Daily in Muscat Ghubrah,", Ghubrah, 772737XX",27 OMR,,Sq. M.,For Rent,https://om.opensooq.com/en/search/262616373


In [61]:

median_value = df['Size'].median()


df['Size'].fillna(median_value, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Size'].fillna(median_value, inplace=True)


In [81]:
print(df.isnull().sum())

Title           0
Location        0
Price           0
Size            0
Size_unit       0
Listing_Type    0
dtype: int64


In [67]:

df['Location'] = df['Location'].str.replace(r'[\dxX]+', '', regex=True)
df['Location'] = df['Location'].str.replace(r'[,\s]+$', '', regex=True)    
df['Location'] = df['Location'].str.strip(', ')


In [78]:
 df= df.drop('Link', axis=1)


In [79]:
df

Unnamed: 0,Title,Location,Price,Size,Size_unit,Listing_Type
0,1 Bedroom Apartment For Rent,"Ruwi, Muscat",150,70.0,Sq. M.,For Rent
1,1 Bedroom Apartment For Rent,"Al Hail, Muscat",300,100.0,Sq. M.,For Rent
2,3 Bedrooms Villa For Rent,"Qurum, Muscat",750,300.0,Sq. M.,For Rent
3,4 Bedrooms Villa For Rent,"Madinat As Sultan Qaboos, Muscat",950,300.0,Sq. M.,For Rent
4,2 Bedrooms Apartment For Rent in,"Al Hamriyah, Muscat",250,100.0,Sq. M.,For Rent
...,...,...,...,...,...,...
6768,3-BEDROOM TOWNHOUSE [WA-155],Al Mouj,"1,100 OMR",1.0,Sq. M.,For Rent
6769,Exquisite 3-Bedroom Luxury Villa in the Heart ...,Al Mouj,"1,250 OMR",270.0,Sq. M.,For Rent
6771,Luxury furnished apartment for rent - primer L...,Azaiba,500 OMR,145.0,Sq. M.,For Rent
6772,100 m2 2 Bedrooms Apartments for Rent in Musca...,Al Maabilah,160 OMR,100.0,Sq. M.,For Rent


In [82]:
df.to_csv('cleaned_properties.csv', index=False, encoding='utf-8-sig')
