**Data Extraction and Cleaning**

In [None]:
import pandas as pd
import re

Importing dataset

In [None]:
# Load the raw eBay laptop listings into a DataFrame.
df = pd.read_csv(filepath_or_buffer='data_laptop_ebay_9_12.csv')

Price column cleaning

In [None]:
# Keep only the rows whose price starts with 'US $' (rows with a missing price
# are dropped too, because na=False). The explicit copy makes `df` an
# independent frame, so the column assignments in the following cells do not
# trigger SettingWithCopyWarning.
df = df[df['current_price'].str.startswith('US $', na=False)].copy()

In [None]:
# Strip the 'US' prefix, the '/ea' suffix, the dollar sign and the thousands
# separators, then convert the remaining text to a number.
# regex=False treats every pattern as a literal: '$' is a regex metacharacter
# and the default of `regex` differs between pandas versions.
# `assign` builds a new frame instead of writing into a possibly filtered
# slice, which avoids SettingWithCopyWarning.
df = df.assign(
    current_price=pd.to_numeric(
        df['current_price']
        .str.replace('US', '', regex=False)
        .str.replace('/ea', '', regex=False)
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
    )
)

Condition cleaning

In [None]:
# Keep only the first half of each condition label (presumably the scraped
# text contains the label twice back-to-back -- verify against the raw data).
# na_action='ignore' lets missing values pass through instead of raising a
# TypeError on len(); they are removed by the later dropna().
df['etat'] = df['etat'].map(lambda label: label[:len(label) // 2], na_action='ignore')

CPU Speed cleaning

In [None]:
# Remove the 'GHz' unit and convert the clock speed to a number.
df['cpu_speed'] = pd.to_numeric(df['cpu_speed'].str.replace('GHz', ''))

Storage Type cleaning

In [None]:
def storage_type_clean(item):
    """Normalise a raw storage-type description to a short label.

    The text is cut at the first ')' and a ')' is re-appended (this happens
    even when the input contained no ')', so 'm' becomes 'm)'; a later cell
    filters on that exact value). If the truncated text contains one of the
    known storage terms, case-insensitively, that term is returned in lower
    case; the terms are checked in the order ssd, hdd, emmc, nvme, ufs.

    Parameters
    ----------
    item : str or missing value
        Raw storage-type text.

    Returns
    -------
    The matched term, otherwise the truncated text. Non-string input
    (NaN/None) is returned unchanged instead of raising, so it can be removed
    by a later dropna().
    """
    if not isinstance(item, str):
        return item
    truncated = item.split(')')[0] + ')'
    lowered = truncated.lower()
    for term in ('ssd', 'hdd', 'emmc', 'nvme', 'ufs'):
        if term in lowered:
            return term
    return truncated
# Normalise every storage-type entry with the helper defined above.
df['storage_type'] = df['storage_type'].apply(storage_type_clean)

Operating System cleaning

In [None]:
def extract_os(desc):
    """Extract the operating system from a free-text description.

    Recognises 'windows <number>', 'chrome os' and 'ios'. Matching is
    case-insensitive and the result is lower-cased, consistent with the other
    extractors in this notebook.

    Parameters
    ----------
    desc : str or missing value
        Raw operating-system text.

    Returns
    -------
    str or None
        The lower-cased match, or None when nothing is recognised or the
        input is not a string (e.g. NaN).
    """
    if not isinstance(desc, str):
        return None
    os_pattern = r'(windows \d+|chrome os|ios)'
    match = re.search(os_pattern, desc, re.IGNORECASE)
    return match.group(1).lower() if match else None
# Replace each raw OS description with the recognised OS (None when unknown).
df['os'] = df['os'].apply(extract_os)

Extracting brand, model and generation from the Processor column

In [None]:
def extract_processor(desc):
    """Split a processor description into brand, model and generation.

    The description is searched case-insensitively for a vendor name
    (intel, amd, qualcomm, mediatek), optionally followed by a family word
    (core/ryzen), a model token (i3-i9, core, athlon, snapdragon) and a
    generation ('<n>th gen' style or a four-digit number). The pattern's
    trailing group (1-2 digits followed by 1-2 letters) is matched but not
    returned.

    Parameters
    ----------
    desc : str or missing value
        Raw processor text.

    Returns
    -------
    tuple
        (brand, model, generation), all lower-cased; model and generation are
        '' when absent. (None, None, None) when no vendor is found or the
        input is not a string (e.g. NaN).
    """
    if not isinstance(desc, str):
        return None, None, None
    pattern = r'(intel|amd|qualcomm|mediatek)\s*(core|ryzen)?\s*(i[3-9]|core|athlon|snapdragon)?\s*(\d+th?\s*gen|\d{4})?(\s*\d{1,2}[a-zA-Z]{1,2})?'
    match = re.search(pattern, desc, re.IGNORECASE)
    if match is None:
        return None, None, None
    # Group 1 (the vendor) is mandatory, so it is always present on a match;
    # groups 3 and 4 are optional and come back as None when missing.
    brand = match.group(1).lower()
    model = (match.group(3) or '').lower()
    generation = (match.group(4) or '').lower()
    return brand, model, generation
# Parse every processor description once and spread the resulting
# (brand, model, generation) tuples over three new columns. Building a single
# DataFrame from the list of tuples avoids constructing one pd.Series per row
# and also works when `df` is empty.
df[['cpu_brand', 'cpu_model', 'cpu_gen']] = pd.DataFrame(
    df['processor'].map(extract_processor).tolist(),
    index=df.index,
    columns=['cpu_brand', 'cpu_model', 'cpu_gen'],
)

GPU cleaning

In [None]:
def extract_gpu(desc):
    """Return the GPU vendor mentioned in a description.

    The text is searched case-insensitively for the first occurrence of one
    of: nvidia, amd, intel, mediatek, qualcomm, powervr, ati.

    Parameters
    ----------
    desc : str or missing value
        Raw GPU text.

    Returns
    -------
    str or None
        The lower-cased vendor name, or None when no vendor is found or the
        input is not a string (e.g. NaN).
    """
    if not isinstance(desc, str):
        return None
    pattern = r'(nvidia|amd|intel|mediatek|qualcomm|powervr|ati)'
    match = re.search(pattern, desc, re.IGNORECASE)
    # The single group is mandatory and contains no whitespace, so a match
    # only needs lower-casing.
    return match.group(1).lower() if match else None
# Reduce each GPU description to its vendor name (None when unrecognised).
df['gpu'] = df['gpu'].apply(extract_gpu)

Converting HDD and SSD sizes to GB

In [None]:
def change_ssd(n):
    """Convert an SSD size given in terabytes to gigabytes.

    A value of the exact form '<integer> tb' (lower case, single space) is
    replaced by the size in GB as a string, using 1 TB = 1024 GB
    ('1 tb' -> '1024', '2 tb' -> '2048', '4 tb' -> '4096', ...). Any other
    value, including non-strings such as NaN, is returned unchanged.
    """
    if isinstance(n, str) and n.endswith(' tb'):
        amount = n[:-len(' tb')]
        if amount.isdecimal():
            return str(int(amount) * 1024)
    return n
# Express terabyte SSD sizes in gigabytes.
df['storage_ssd'] = df['storage_ssd'].apply(change_ssd)

In [None]:
def change_hdd(n):
    """Convert an HDD size given in terabytes to gigabytes.

    A value of the exact form '<integer> tb' (lower case, single space) is
    replaced by the size in GB as a string, using 1 TB = 1024 GB
    ('1 tb' -> '1024', '2 tb' -> '2048', '4 tb' -> '4096', ...). Any other
    value, including non-strings such as NaN, is returned unchanged.
    """
    if isinstance(n, str) and n.endswith(' tb'):
        amount = n[:-len(' tb')]
        if amount.isdecimal():
            return str(int(amount) * 1024)
    return n
# Express terabyte HDD sizes in gigabytes.
df['storage_hdd'] = df['storage_hdd'].apply(change_hdd)

In [None]:
# Drop the 'gb' unit from both storage columns and convert them to numbers.
df['storage_hdd'] = pd.to_numeric(df['storage_hdd'].str.replace('gb', ''))
df['storage_ssd'] = pd.to_numeric(df['storage_ssd'].str.replace('gb', ''))

In [None]:
# Discard every row that still has at least one missing value.
df = df.dropna(axis=0, how='any')

Keeping necessary columns

In [None]:
# Keep only the columns used downstream. The explicit copy makes `data`
# independent of `df`, so the in-place corrections in the following cells do
# not trigger SettingWithCopyWarning.
data = df[[
    'brand', 'ram', 'storage_ssd', 'storage_hdd', 'storage_type',
    'cpu_brand', 'cpu_speed', 'etat', 'os', 'screen_size', 'gpu',
    'current_price',
]].copy()

After reviewing the dataset, I found that some laptops have their RAM size listed in place of the HDD size, so the HDD size is set to 0 for those rows.

In [None]:
# Where the HDD size equals the RAM size, reset the HDD size to 0.
data.loc[data['ram'].eq(data['storage_hdd']), 'storage_hdd'] = 0

After consulting the website, I found that the laptops listed with 512 GB of RAM actually have 16 GB, so those values are corrected to 16.

In [None]:
# Replace a RAM value of 512 with 16.
data.loc[data['ram'].eq(512), 'ram'] = 16

In [None]:
# Drop the rows whose cleaned storage type is the literal 'm)'. The explicit
# copy keeps the later assignments on `data` from triggering
# SettingWithCopyWarning on a filtered slice.
data = data[data['storage_type'] != 'm)'].copy()

I set the 'ram' value to 4 for all rows where the 'storage_type' is 'emmc' because, after consulting the site, I found that most Chromebooks with eMMC storage typically have 4GB of RAM.

In [None]:
# Force RAM to 4 on every row whose storage type is 'emmc'.
data.loc[data['storage_type'].eq('emmc'), 'ram'] = 4

Add a new column with the total storage (HDD + SSD)

In [None]:
# Total storage is the sum of the HDD and SSD sizes.
data['storage'] = data['storage_hdd'].add(data['storage_ssd'])

Saving dataset

In [None]:
# Write the cleaned dataset to disk (to_csv also writes the index by default).
data.to_csv(path_or_buf='dataset_20_12_no_encoder.csv')