**Model Training and Evaluation**


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

# Load the dataset
data = pd.read_csv("Datasets/laptop_price.csv", encoding="cp1252")
data

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.00
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1298,1316,Lenovo,Yoga 500-14ISK,2 in 1 Convertible,14.0,IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 6500U 2.5GHz,4GB,128GB SSD,Intel HD Graphics 520,Windows 10,1.8kg,638.00
1299,1317,Lenovo,Yoga 900-13ISK,2 in 1 Convertible,13.3,IPS Panel Quad HD+ / Touchscreen 3200x1800,Intel Core i7 6500U 2.5GHz,16GB,512GB SSD,Intel HD Graphics 520,Windows 10,1.3kg,1499.00
1300,1318,Lenovo,IdeaPad 100S-14IBR,Notebook,14.0,1366x768,Intel Celeron Dual Core N3050 1.6GHz,2GB,64GB Flash Storage,Intel HD Graphics,Windows 10,1.5kg,229.00
1301,1319,HP,15-AC110nv (i7-6500U/6GB/1TB/Radeon,Notebook,15.6,1366x768,Intel Core i7 6500U 2.5GHz,6GB,1TB HDD,AMD Radeon R5 M330,Windows 10,2.19kg,764.00


In [2]:
# Remove unnecessary columns if they exist
cols_to_drop = ["Unnamed: 0", "laptop_id", "product"]
data = data.drop(columns=[col for col in cols_to_drop if col in data.columns])
cols_to_drop

['Unnamed: 0', 'laptop_id', 'product']

In [3]:
# Display column names
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   laptop_ID         1303 non-null   int64  
 1   Company           1303 non-null   object 
 2   Product           1303 non-null   object 
 3   TypeName          1303 non-null   object 
 4   Inches            1303 non-null   float64
 5   ScreenResolution  1303 non-null   object 
 6   Cpu               1303 non-null   object 
 7   Ram               1303 non-null   object 
 8   Memory            1303 non-null   object 
 9   Gpu               1303 non-null   object 
 10  OpSys             1303 non-null   object 
 11  Weight            1303 non-null   object 
 12  Price_euros       1303 non-null   float64
dtypes: float64(2), int64(1), object(10)
memory usage: 132.5+ KB


In [4]:
# Standardise column names
data.columns = data.columns.str.strip().str.lower()
data.columns

Index(['laptop_id', 'company', 'product', 'typename', 'inches',
       'screenresolution', 'cpu', 'ram', 'memory', 'gpu', 'opsys', 'weight',
       'price_euros'],
      dtype='object')

In [5]:
# Replace inconsistent column names
data = data.rename(columns={"screenresolution": "screen_resolution", "opsys": "operating_system"})
data.columns

Index(['laptop_id', 'company', 'product', 'typename', 'inches',
       'screen_resolution', 'cpu', 'ram', 'memory', 'gpu', 'operating_system',
       'weight', 'price_euros'],
      dtype='object')

In [6]:
# Identify categorical columns
categorical_col = data.select_dtypes(include="object")
categorical_col

Unnamed: 0,company,product,typename,screen_resolution,cpu,ram,memory,gpu,operating_system,weight
0,Apple,MacBook Pro,Ultrabook,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg
1,Apple,Macbook Air,Ultrabook,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg
2,HP,250 G6,Notebook,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg
3,Apple,MacBook Pro,Ultrabook,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg
4,Apple,MacBook Pro,Ultrabook,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg
...,...,...,...,...,...,...,...,...,...,...
1298,Lenovo,Yoga 500-14ISK,2 in 1 Convertible,IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 6500U 2.5GHz,4GB,128GB SSD,Intel HD Graphics 520,Windows 10,1.8kg
1299,Lenovo,Yoga 900-13ISK,2 in 1 Convertible,IPS Panel Quad HD+ / Touchscreen 3200x1800,Intel Core i7 6500U 2.5GHz,16GB,512GB SSD,Intel HD Graphics 520,Windows 10,1.3kg
1300,Lenovo,IdeaPad 100S-14IBR,Notebook,1366x768,Intel Celeron Dual Core N3050 1.6GHz,2GB,64GB Flash Storage,Intel HD Graphics,Windows 10,1.5kg
1301,HP,15-AC110nv (i7-6500U/6GB/1TB/Radeon,Notebook,1366x768,Intel Core i7 6500U 2.5GHz,6GB,1TB HDD,AMD Radeon R5 M330,Windows 10,2.19kg


In [7]:
# Identify numerical columns
numerical_col = data.select_dtypes(include=["int64", "float64"])
numerical_col

Unnamed: 0,laptop_id,inches,price_euros
0,1,13.3,1339.69
1,2,13.3,898.94
2,3,15.6,575.00
3,4,15.4,2537.45
4,5,13.3,1803.60
...,...,...,...
1298,1316,14.0,638.00
1299,1317,13.3,1499.00
1300,1318,14.0,229.00
1301,1319,15.6,764.00


In [None]:

# Remove "GB" from ram and convert to int
data["ram"] = data["ram"].astype(str).str.replace("GB", "").astype(int)

# Remove "kg" from weight and convert to float
data["weight"] = data["weight"].astype(str).str.replace("kg", "").astype(float)



In [9]:
# Flag special keywords
data["Touchscreen"] = data["screen_resolution"].str.contains("Touchscreen", case=False, na=False).astype(int)
data["IPS"] = data["screen_resolution"].str.contains("IPS", case=False, na=False).astype(int)
data["Retina"] = data["screen_resolution"].str.contains("Retina", case=False, na=False).astype(int)

In [10]:
# Clean up memory
data["memory"] = data["memory"].str.replace("TB", "1000GB")
data["memory"] = data["memory"].str.replace("GB", "")


# Initialize new columns
data["HDD"] = 0
data["SSD"] = 0
data["Hybrid"] = 0
data["Flash_Storage"] = 0

for i, row in data.iterrows():
    mem_parts = row["memory"].split("+")
    for parts in mem_parts:
        if "HDD" in parts:
            value = ''.join(filter(str.isdigit, parts.replace("HDD", "")))
            data.at[i, "HDD"] = int(value) if value else 0

        elif "SSD" in parts:
            value = ''.join(filter(str.isdigit, parts.replace("SSD", "")))
            data.at[i, "SSD"] = int(value) if value else 0

        elif "Hybrid" in parts:
            value = ''.join(filter(str.isdigit, parts.replace("Hybrid", "")))
            data.at[i, "Hybrid"] = int(value) if value else 0

        elif "Flash" in parts:
            value = ''.join(filter(str.isdigit, parts.replace("Flash", "")))
            data.at[i, "Flash_Storage"] = int(value) if value else 0

data = data.drop("memory", axis=1)


In [11]:
# 1. Extract resolution width and height (if present)
data["Resolution_Width"] = data["screen_resolution"].str.extract(r'(\d+)x')[0]
data["Resolution_Height"] = data["screen_resolution"].str.extract(r'x(\d+)')[0]

# Convert to numeric
data["Resolution_Width"] = pd.to_numeric(data["Resolution_Width"], errors="coerce")
data["Resolution_Height"] = pd.to_numeric(data["Resolution_Height"], errors="coerce")

# 2. Handle descriptive terms (map them to common resolutions)
def map_resolution(text):
    text = text.lower()
    if "full hd" in text:
        return (1920, 1080)
    elif "retina" in text:  # Apple Retina HD usually ~ 2560x1600
        return (2560, 1600)
    elif "quad hd" in text or "qhd" in text:
        return (2560, 1440)
    elif "4k" in text or "ultra hd" in text:
        return (3840, 2160)
    elif "hd" in text and pd.isna(width):
        return (1366, 768)  # fallback HD
    else:
        return (np.nan, np.nan)

# Fill missing numeric resolutions using text mapping
for i, row in data.iterrows():
    if pd.isna(row["Resolution_Width"]) or pd.isna(row["Resolution_Height"]):
        w, h = map_resolution(row["creen_resolution"], row["Resolution_Width"])
        data.at[i, "resolution_Width"] = w
        data.at[i, "resolution_Height"] = h

# 3. Flags for extra features
data["Touchscreen"] = data["screen_resolution"].str.contains("touch", case=False, na=False).astype(int)
data["IPS"] = data["screen_resolution"].str.contains("ips", case=False, na=False).astype(int)
data["Retina"] = data["screen_resolution"].str.contains("retina", case=False, na=False).astype(int)
data["FullHD_Flag"] = data["screen_resolution"].str.contains("full hd", case=False, na=False).astype(int)

# 4. (Optional) Drop original column
data = data.drop("screen_resolution", axis=1)


In [12]:
data

Unnamed: 0,laptop_id,company,product,typename,inches,cpu,ram,gpu,operating_system,weight,...,Touchscreen,IPS,Retina,HDD,SSD,Hybrid,Flash_Storage,Resolution_Width,Resolution_Height,FullHD_Flag
0,1,Apple,MacBook Pro,Ultrabook,13.3,Intel Core i5 2.3GHz,8,Intel Iris Plus Graphics 640,macOS,1.37,...,0,1,1,0,128,0,0,2560,1600,0
1,2,Apple,Macbook Air,Ultrabook,13.3,Intel Core i5 1.8GHz,8,Intel HD Graphics 6000,macOS,1.34,...,0,0,0,0,0,0,128,1440,900,0
2,3,HP,250 G6,Notebook,15.6,Intel Core i5 7200U 2.5GHz,8,Intel HD Graphics 620,No OS,1.86,...,0,0,0,0,256,0,0,1920,1080,1
3,4,Apple,MacBook Pro,Ultrabook,15.4,Intel Core i7 2.7GHz,16,AMD Radeon Pro 455,macOS,1.83,...,0,1,1,0,512,0,0,2880,1800,0
4,5,Apple,MacBook Pro,Ultrabook,13.3,Intel Core i5 3.1GHz,8,Intel Iris Plus Graphics 650,macOS,1.37,...,0,1,1,0,256,0,0,2560,1600,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1298,1316,Lenovo,Yoga 500-14ISK,2 in 1 Convertible,14.0,Intel Core i7 6500U 2.5GHz,4,Intel HD Graphics 520,Windows 10,1.80,...,1,1,0,0,128,0,0,1920,1080,1
1299,1317,Lenovo,Yoga 900-13ISK,2 in 1 Convertible,13.3,Intel Core i7 6500U 2.5GHz,16,Intel HD Graphics 520,Windows 10,1.30,...,1,1,0,0,512,0,0,3200,1800,0
1300,1318,Lenovo,IdeaPad 100S-14IBR,Notebook,14.0,Intel Celeron Dual Core N3050 1.6GHz,2,Intel HD Graphics,Windows 10,1.50,...,0,0,0,0,0,0,64,1366,768,0
1301,1319,HP,15-AC110nv (i7-6500U/6GB/1TB/Radeon,Notebook,15.6,Intel Core i7 6500U 2.5GHz,6,AMD Radeon R5 M330,Windows 10,2.19,...,0,0,0,11000,0,0,0,1366,768,0


In [13]:

# CPU cleaning
data["cpu_brand"] = data["cpu"].apply(lambda x: x.split()[0])

def extract_cpu_family(cpu_name):
    if "Intel Core i3" in cpu_name:
        return "Intel Core i3"
    elif "Intel Core i5" in cpu_name:
        return "Intel Core i5"
    elif "Intel Core i7" in cpu_name:
        return "Intel Core i7"
    elif "Intel Core i9" in cpu_name:
        return "Intel Core i9"
    elif "AMD Ryzen 3" in cpu_name:
        return "AMD Ryzen 3"
    elif "AMD Ryzen 5" in cpu_name:
        return "AMD Ryzen 5"
    elif "AMD Ryzen 7" in cpu_name:
        return "AMD Ryzen 7"
    elif "AMD Ryzen 9" in cpu_name:
        return "AMD Ryzen 9"
    else:
        return "Other"

data["CPU_Family"] = data["cpu"].apply(extract_cpu_family)

def extract_intel_gen(cpu_name):
    if "Intel" in cpu_name:
        match = re.search(r"\d{4}", cpu_name)
        if match:
            return int(str(match.group(0))[0])
    return np.nan

data["CPU_Gen"] = data["cpu"].apply(extract_intel_gen)

# Drop original CPU if desired
data = data.drop("cpu", axis=1)
# GPU Cleaning
data["GPU_Brand"] = data["gpu"].apply(lambda x: x.split()[0])
data["GPU_Integrated"] = data["gpu"].apply(lambda x: 1 if "Intel" in x else 0)

# Drop original GPU if desired
data = data.drop("gpu", axis=1)

# ===============================
# 7. Operating System
# ===============================
def simplify_os(os):
    os = str(os).lower()
    if "mac" in os:
        return "MacOS"
    elif "windows" in os:
        return "Windows"
    elif "linux" in os:
        return "Linux"
    else:
        return "Other"

data["operating_system"] = data["operating_system"].apply(simplify_os)
data["CPU_Gen"] = data["CPU_Gen"].fillna(0).astype(int)



In [14]:
data

Unnamed: 0,laptop_id,company,product,typename,inches,ram,operating_system,weight,price_euros,Touchscreen,...,Hybrid,Flash_Storage,Resolution_Width,Resolution_Height,FullHD_Flag,cpu_brand,CPU_Family,CPU_Gen,GPU_Brand,GPU_Integrated
0,1,Apple,MacBook Pro,Ultrabook,13.3,8,MacOS,1.37,1339.69,0,...,0,0,2560,1600,0,Intel,Intel Core i5,0,Intel,1
1,2,Apple,Macbook Air,Ultrabook,13.3,8,MacOS,1.34,898.94,0,...,0,128,1440,900,0,Intel,Intel Core i5,0,Intel,1
2,3,HP,250 G6,Notebook,15.6,8,Other,1.86,575.00,0,...,0,0,1920,1080,1,Intel,Intel Core i5,7,Intel,1
3,4,Apple,MacBook Pro,Ultrabook,15.4,16,MacOS,1.83,2537.45,0,...,0,0,2880,1800,0,Intel,Intel Core i7,0,AMD,0
4,5,Apple,MacBook Pro,Ultrabook,13.3,8,MacOS,1.37,1803.60,0,...,0,0,2560,1600,0,Intel,Intel Core i5,0,Intel,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1298,1316,Lenovo,Yoga 500-14ISK,2 in 1 Convertible,14.0,4,Windows,1.80,638.00,1,...,0,0,1920,1080,1,Intel,Intel Core i7,6,Intel,1
1299,1317,Lenovo,Yoga 900-13ISK,2 in 1 Convertible,13.3,16,Windows,1.30,1499.00,1,...,0,0,3200,1800,0,Intel,Intel Core i7,6,Intel,1
1300,1318,Lenovo,IdeaPad 100S-14IBR,Notebook,14.0,2,Windows,1.50,229.00,0,...,0,64,1366,768,0,Intel,Other,3,Intel,1
1301,1319,HP,15-AC110nv (i7-6500U/6GB/1TB/Radeon,Notebook,15.6,6,Windows,2.19,764.00,0,...,0,0,1366,768,0,Intel,Intel Core i7,6,AMD,0


In [25]:
# Split data into training and test.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=30
)

In [26]:
print(x_train.shape, y_train.shape)

(912, 22) (912,)


In [27]:
# Model Pipline
model = Pipeline([
    ("preprocessor", preprocessor),
    ("regression", RandomForestRegressor(random_state=30))
])

# Train
model.fit(x_train, y_train)


ValueError: could not convert string to float: 'Windows'