In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import os

In [2]:
class CarFeature:
    """Plain record holding the attributes scraped for one car listing.

    Every attribute starts as None and is filled in later by whoever parses
    the listing. `features_list()` flattens the record into a row.
    """

    def __init__(self) -> None:
        # Create every attribute up front so a partially filled record still
        # exposes all fields (as None).
        for field in (
            "name", "year", "sunroof", "fuel", "meter", "glass", "gear",
            "payment", "engine", "license", "passengers", "prev_owners",
            "wheel_drive", "original_use", "for_what", "price",
        ):
            setattr(self, field, None)

    def features_list(self):
        """Return the attribute values as a list in a fixed column order.

        The order is: name, year, sunroof, fuel, original_use, license, gear,
        glass, engine, meter, wheel_drive, payment, for_what, prev_owners,
        passengers, price.
        """
        row_order = (
            "name", "year", "sunroof", "fuel", "original_use", "license",
            "gear", "glass", "engine", "meter", "wheel_drive", "payment",
            "for_what", "prev_owners", "passengers", "price",
        )
        return [getattr(self, field) for field in row_order]


In [4]:
# Directory that is listed for input files; each file is parsed into one car.
root_path = 'data/'

# Equals the number of entries in feature_mapping below; not referenced by
# the cells shown in this notebook.
NUM_FEATURE_TABLE = 12

# Accumulates one feature row (a list) per successfully parsed file.
data_list = []

# Label text found in a listing's feature table -> CarFeature attribute name.
feature_mapping = dict([
    ("نوع الوقود", "fuel"),
    ("أصل السيارة", "original_use"),
    ("رخصة السيارة", "license"),
    ("عدد الركاب", "passengers"),
    ("نوع الجير", "gear"),
    ("الزجاج", "glass"),
    ("قوة الماتور", "engine"),
    ("عداد السيارة", "meter"),
    ("الدفع", "wheel_drive"),
    ("وسيلة الدفع", "payment"),
    ("معروضة", "for_what"),
    ("أصحاب سابقون", "prev_owners"),
])

In [5]:
def extract_feature(path: str) -> CarFeature:
    """
    Extracts car features from an HTML file and returns a CarFeature object.

    The name, model year and price are read from the page header, the sunroof
    flag from the presence of a matching list item, and the remaining
    attributes from the rows of the feature table via `feature_mapping`.

    Parameters:
    - path (str): The path to the HTML file containing car information.

    Returns:
    - CarFeature: An object containing extracted car features. Attributes
      whose label is not present in the page stay None. Values taken from the
      feature table and the price are stored as strings; the year is an int
      and sunroof is 0 or 1.

    Raises:
    - OSError: If the file cannot be opened or read.
    - AttributeError: If the page lacks the 'driving-table' table, its
      h3/h5 headings, or the 'post-price' heading.
    - IndexError / ValueError: If the h5 heading text does not have an
      integer as its third space-separated token.
    """

    # Initialize a CarFeature object
    car = CarFeature()

    # Read the HTML file inside a context manager so the handle is closed
    # deterministically, then parse it with BeautifulSoup.
    with open(path, 'r', encoding='utf8') as html_file:
        soup = BeautifulSoup(html_file.read(), 'html.parser')

    # Extract basic information (look the header table up once and reuse it)
    header_table = soup.find('table', 'driving-table')
    car.name = header_table.find('h3').get_text()
    car.year = int(header_table.find('h5').get_text().split(" ")[2])
    car.price = soup.find('h5', "post-price").get_text().split(" ")[0]

    # Sunroof is a flag: 1 when the list item is present, 0 otherwise
    car.sunroof = 0 if soup.find('li', string='فتحة سقف') is None else 1

    # Extract additional features from the table
    for row in soup.find_all('tr', 'list-row'):
        # find() returns None when the row has no <td>; skip such rows
        # instead of failing on .get_text() and losing the whole listing.
        label_cell = row.find('td')
        if label_cell is None:
            continue

        # The value lives in the element following the label cell
        value_cell = label_cell.find_next_sibling()
        if value_cell is None:
            continue

        # Map the label to the corresponding attribute and set the value;
        # labels that are not in feature_mapping are ignored.
        attribute = feature_mapping.get(label_cell.get_text())
        if attribute is not None:
            setattr(car, attribute, value_cell.get_text())

    return car


In [6]:
# Start from an empty list so that re-running this cell rebuilds the rows
# instead of appending a second copy of every listing.
data_list = []

# (file name, error) for every page that could not be parsed
failed_files = []

# os.listdir() returns names in arbitrary order; sort them so the row order
# is the same on every run.
for file_path in sorted(os.listdir(root_path)):
    try:
        car = extract_feature(os.path.join(root_path, file_path))
    except Exception as e:
        # Best-effort scrape: keep going, but record which page failed and
        # why rather than dropping it silently.
        failed_files.append((file_path, repr(e)))
        continue

    data_list.append(car.features_list())

if failed_files:
    print(f"Skipped {len(failed_files)} of {len(failed_files) + len(data_list)} "
          f"files that could not be parsed; see failed_files for details")

In [7]:
# Number of files that were parsed successfully (one row each in data_list)
len(data_list)

6864

In [8]:
# Assemble the scraped rows into a DataFrame. The column names are listed in
# the same order in which CarFeature.features_list() emits its values.
data_df = pd.DataFrame(
    data=data_list,
    columns=[
        "name",
        "year",
        "sunroof",
        "fuel",
        "original_use",
        "license",
        "gear",
        "glass",
        "engine",
        "meter",
        "wheel_drive",
        "payment",
        "for_what",
        "prev_owners",
        "passengers",
        "price",
    ],
)


In [9]:
# Display the assembled DataFrame as the cell's output
data_df


Unnamed: 0,name,year,sunroof,fuel,original_use,license,gear,glass,engine,meter,wheel_drive,payment,for_what,prev_owners,passengers,price
0,كيا اوبتيما,2014,1,بنزين,خصوصي,فلسطينية,اوتوماتيك,الكتروني,2000,75000,,نقدا فقط,للبيع فقط,يد اولى,4+1,100000
1,كيا سورينتو,2007,1,ديزل,خصوصي,فلسطينية,نصف اوتوماتيك,الكتروني,2500,130000,,إمكانية التقسيط,للبيع أو التبديل,2,7+1,60000
2,هونداي افانتي,2006,0,بنزين,خصوصي,فلسطينية,اوتوماتيك,الكتروني,1600,,,نقدا فقط,للبيع فقط,,,43500
3,فيات 127,1982,0,بنزين,خصوصي,فلسطينية,عادي,يدوي,906,شغال,,إمكانية التقسيط,للبيع فقط,00,4+1,5500
4,بيجو 208,2014,0,بنزين,خصوصي,فلسطينية,اوتوماتيك,الكتروني,1200,38000,,نقدا فقط,للبيع فقط,,4+1,54000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6859,كيا مورننغ,2015,0,كهرباء,خصوصي,فلسطينية,اوتوماتيك,الكتروني,1000,130000,,إمكانية التقسيط,للبيع فقط,2,4+1,45000
6860,فورد ترانزيت,2002,0,ديزل,عمومي,فلسطينية,عادي,الكتروني,2400,00000,دفع خلفي,نقدا فقط,للبيع فقط,5,7+1,48000
6861,بيجو بارتنر,2018,0,ديزل,خصوصي,فلسطينية,عادي,الكتروني,1600,50000,دفع أمامي,نقدا فقط,للبيع فقط,يد صفر,4+1,87000
6862,كيا سورينتو,2017,1,ديزل,خصوصي,فلسطينية,اوتوماتيك,الكتروني,2200,100,,نقدا فقط,للبيع فقط,ثانيه,6+1,126000


In [10]:
# Save the scraped table as cars_dataset.csv (path relative to the working directory)
data_df.to_csv("cars_dataset.csv")

In [11]:
# Get concise summary information about the dataset.
# DataFrame.info() prints the summary itself and returns None, so wrapping it
# in print() only adds a stray "None" line after the summary.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6864 entries, 0 to 6863
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   name          6864 non-null   object
 1   year          6864 non-null   int64 
 2   sunroof       6864 non-null   int64 
 3   fuel          6864 non-null   object
 4   original_use  6864 non-null   object
 5   license       6864 non-null   object
 6   gear          6864 non-null   object
 7   glass         6864 non-null   object
 8   engine        6864 non-null   object
 9   meter         4923 non-null   object
 10  wheel_drive   1431 non-null   object
 11  payment       6864 non-null   object
 12  for_what      6864 non-null   object
 13  prev_owners   5214 non-null   object
 14  passengers    6087 non-null   object
 15  price         6864 non-null   object
dtypes: int64(2), object(14)
memory usage: 858.1+ KB
None
