<a href="https://colab.research.google.com/github/scientistEgong/Laptop-Price-Prediction-Model/blob/main/Features_Engineering_Robert_William_23EGCO023.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Feature Engineering**
Robert Williams



In [60]:
# Import necessary libraries for data manipulation, visualization, and machine learning.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk

In [61]:
# Load the dataset into a pandas DataFrame. The 'cp1252' encoding is specified
# to handle potential character errors in the CSV file.
df = pd.read_csv("Datasets/cleaned_laptop_price_data.csv" , encoding= 'cp1252')

#  Display the first few rows to get an initial look at the data structure.
df.head()

Unnamed: 0,laptop_id,manufacturers_info,product,laptop_type,inches,screen_resolution,device_processor (CPU),device_memory (RAM),memory_capacity,gpu,operating_system,weight,price_euros
0,1,1,300,4,13.3,23,65,8,4,58,8,38,1339.69
1,2,1,301,4,13.3,1,63,8,2,51,8,35,898.94
2,3,7,50,3,15.6,8,74,8,16,53,4,74,575.0
3,4,1,300,4,15.4,25,85,1,29,9,8,71,2537.45
4,5,1,300,4,13.3,23,67,8,16,59,8,38,1803.6


In [62]:
df.columns

Index(['laptop_id', 'manufacturers_info', 'product', 'laptop_type', 'inches',
       'screen_resolution', 'device_processor (CPU)', 'device_memory (RAM)',
       'memory_capacity', 'gpu', 'operating_system', 'weight', 'price_euros'],
      dtype='object')

In [63]:
# Feature engineering on the 'ScreenResolution' column. This block extracts
# boolean information about 'Retina' and 'HD' displays, creating new features
# from a text-based column. This is a great way to add predictive power.
# One-hot encode based on specific screen resolution features
df['screen_resolution'] = df['screen_resolution'].astype(str)

df['ScreenResolution_Retina'] = df['screen_resolution'].apply(lambda x: 1 if 'IPS Panel Retina Display' in x else 0)
df['ScreenResolution_HD'] = df['screen_resolution'].apply(lambda x: 1 if 'HD' in x else 0)
df['ScreenResolution_Other'] = df['screen_resolution'].apply(lambda x: 1 if 'IPS Panel Retina Display' not in x and 'HD' not in x else 0)


display(df.head())

Unnamed: 0,laptop_id,manufacturers_info,product,laptop_type,inches,screen_resolution,device_processor (CPU),device_memory (RAM),memory_capacity,gpu,operating_system,weight,price_euros,ScreenResolution_Retina,ScreenResolution_HD,ScreenResolution_Other
0,1,1,300,4,13.3,23,65,8,4,58,8,38,1339.69,0,0,1
1,2,1,301,4,13.3,1,63,8,2,51,8,35,898.94,0,0,1
2,3,7,50,3,15.6,8,74,8,16,53,4,74,575.0,0,0,1
3,4,1,300,4,15.4,25,85,1,29,9,8,71,2537.45,0,0,1
4,5,1,300,4,13.3,23,67,8,16,59,8,38,1803.6,0,0,1


In [64]:
# Drop the original 'ScreenResolution' column after extracting useful features.
df.drop('screen_resolution', axis=1, inplace=True)
display(df.head())

Unnamed: 0,laptop_id,manufacturers_info,product,laptop_type,inches,device_processor (CPU),device_memory (RAM),memory_capacity,gpu,operating_system,weight,price_euros,ScreenResolution_Retina,ScreenResolution_HD,ScreenResolution_Other
0,1,1,300,4,13.3,65,8,4,58,8,38,1339.69,0,0,1
1,2,1,301,4,13.3,63,8,2,51,8,35,898.94,0,0,1
2,3,7,50,3,15.6,74,8,16,53,4,74,575.0,0,0,1
3,4,1,300,4,15.4,85,1,29,9,8,71,2537.45,0,0,1
4,5,1,300,4,13.3,67,8,16,59,8,38,1803.6,0,0,1


In [65]:
# Import the regular expressions library.
import re

# Define a function to extract the CPU speed using a regular expression.
# The regex `(\d+\.?\d*)GHz` looks for a number (with or without a decimal)
# followed by 'GHz'.
def extract_cpu_speed(cpu_string):
    cpu_string = str(cpu_string)
    speed_match = re.search(r'(\d+\.?\d*)GHz', cpu_string)
    if speed_match:
        return speed_match.group(1)
    return None

# Apply the function to the 'Cpu' column and convert the resulting values to numeric.
df['cpu_speed'] = df['device_processor (CPU)'].apply(extract_cpu_speed)

# Convert to numeric
df['cpu_speed'] = pd.to_numeric(df['cpu_speed'], errors='coerce')

display(df[['device_processor (CPU)', 'cpu_speed']].head())


Unnamed: 0,device_processor (CPU),cpu_speed
0,65,
1,63,
2,74,
3,85,
4,67,


In [66]:
# Improved Memory Extraction
# This block demonstrates a cleaner, more direct approach to handling multiple
# replacements and extractions on the 'Memory' column.
# The old 'extract_memory_info' function is not needed if you use this approach.
# Make sure the column is string before applying string methods
df['memory_capacity'] = df['memory_capacity'].astype(str)

# Clean memory column
df['memory_capacity'] = (
    df['memory_capacity']
    .str.replace('GB', '', regex=False)
    .str.replace('TB', '000', regex=False)
    .str.replace('Flash Storage', 'SSD', regex=False)
    .str.replace('Hybrid', 'HDD', regex=False)
    .str.replace(' ', '', regex=False)
)

# Create binary features
df['Memory_ssd'] = df['memory_capacity'].str.contains('SSD', case=False, na=False).astype(int)
df['Memory_hdd'] = df['memory_capacity'].str.contains('HDD', case=False, na=False).astype(int)

# Extract numeric memory size
df['Memory_size'] = df['memory_capacity'].str.extract(r'(\d+)').astype(float)

# You can then clean and aggregate these new features as needed.

In [67]:
def extract_memory_info(memory_string):
    size = 0
    memory_type = None

    if 'GB' in memory_string:
        size_match = re.search(r'(\d+)GB', memory_string)
        if size_match:
            size = int(size_match.group(1))
    elif 'TB' in memory_string:
        size_match = re.search(r'(\d+)TB', memory_string)
        if size_match:
            size = int(size_match.group(1)) * 1024 # Convert TB to GB

    if 'SSD' in memory_string:
        memory_type = 'SSD'
    elif 'HDD' in memory_string:
        memory_type = 'HDD'
    elif 'Flash Storage' in memory_string:
        memory_type = 'Flash Storage'
    elif 'Hybrid' in memory_string:
        memory_type = 'Hybrid'

    return size, memory_type

df[['memory_size_gb', 'memory_type']] = df['memory_capacity'].apply(lambda x: pd.Series(extract_memory_info(x)))
df['memory_size_gb'] = pd.to_numeric(df['memory_size_gb'])

display(df.head())

Unnamed: 0,laptop_id,manufacturers_info,product,laptop_type,inches,device_processor (CPU),device_memory (RAM),memory_capacity,gpu,operating_system,...,price_euros,ScreenResolution_Retina,ScreenResolution_HD,ScreenResolution_Other,cpu_speed,Memory_ssd,Memory_hdd,Memory_size,memory_size_gb,memory_type
0,1,1,300,4,13.3,65,8,4,58,8,...,1339.69,0,0,1,,0,0,4.0,0.0,
1,2,1,301,4,13.3,63,8,2,51,8,...,898.94,0,0,1,,0,0,2.0,0.0,
2,3,7,50,3,15.6,74,8,16,53,4,...,575.0,0,0,1,,0,0,16.0,0.0,
3,4,1,300,4,15.4,85,1,29,9,8,...,2537.45,0,0,1,,0,0,29.0,0.0,
4,5,1,300,4,13.3,67,8,16,59,8,...,1803.6,0,0,1,,0,0,16.0,0.0,


In [68]:
df['processing_speed'] = df['cpu_speed'] * df['device_memory (RAM)'] * df['memory_size_gb']
display(df[['cpu_speed', 'device_memory (RAM)', 'memory_size_gb', 'processing_speed']].head())

Unnamed: 0,cpu_speed,device_memory (RAM),memory_size_gb,processing_speed
0,,8,0.0,
1,,8,0.0,
2,,8,0.0,
3,,1,0.0,
4,,8,0.0,


In [69]:
df.drop(['device_processor (CPU)', 'memory_capacity', 'device_memory (RAM)'], axis=1, inplace=True)
display(df.head())

Unnamed: 0,laptop_id,manufacturers_info,product,laptop_type,inches,gpu,operating_system,weight,price_euros,ScreenResolution_Retina,ScreenResolution_HD,ScreenResolution_Other,cpu_speed,Memory_ssd,Memory_hdd,Memory_size,memory_size_gb,memory_type,processing_speed
0,1,1,300,4,13.3,58,8,38,1339.69,0,0,1,,0,0,4.0,0.0,,
1,2,1,301,4,13.3,51,8,35,898.94,0,0,1,,0,0,2.0,0.0,,
2,3,7,50,3,15.6,53,4,74,575.0,0,0,1,,0,0,16.0,0.0,,
3,4,1,300,4,15.4,9,8,71,2537.45,0,0,1,,0,0,29.0,0.0,,
4,5,1,300,4,13.3,59,8,38,1803.6,0,0,1,,0,0,16.0,0.0,,


In [70]:
memory_type_dummies = pd.get_dummies(df['memory_type'], prefix='MemoryType', dtype=int)
df = pd.concat([df, memory_type_dummies], axis=1)
df.drop('memory_type', axis=1, inplace=True)
display(df.head())

Unnamed: 0,laptop_id,manufacturers_info,product,laptop_type,inches,gpu,operating_system,weight,price_euros,ScreenResolution_Retina,ScreenResolution_HD,ScreenResolution_Other,cpu_speed,Memory_ssd,Memory_hdd,Memory_size,memory_size_gb,processing_speed
0,1,1,300,4,13.3,58,8,38,1339.69,0,0,1,,0,0,4.0,0.0,
1,2,1,301,4,13.3,51,8,35,898.94,0,0,1,,0,0,2.0,0.0,
2,3,7,50,3,15.6,53,4,74,575.0,0,0,1,,0,0,16.0,0.0,
3,4,1,300,4,15.4,9,8,71,2537.45,0,0,1,,0,0,29.0,0.0,
4,5,1,300,4,13.3,59,8,38,1803.6,0,0,1,,0,0,16.0,0.0,


In [72]:
df.columns

Index(['laptop_id', 'manufacturers_info', 'product', 'laptop_type', 'inches',
       'gpu', 'operating_system', 'weight', 'price_euros',
       'ScreenResolution_Retina', 'ScreenResolution_HD',
       'ScreenResolution_Other', 'cpu_speed', 'Memory_ssd', 'Memory_hdd',
       'Memory_size', 'memory_size_gb', 'processing_speed'],
      dtype='object')

In [74]:
df['gpu_company'] = df['gpu'].astype(str).apply(lambda x: x.split()[0] if pd.notnull(x) else None)
display(df['gpu_company'].value_counts())


gpu_company
53     281
47     185
61      68
75      66
79      48
      ... 
106      1
27       1
105      1
73       1
38       1
Name: count, Length: 110, dtype: int64

In [None]:
def extract_gpu_company(gpu_string):
    company_match = re.match(r'(\w+)', gpu_string)
    if company_match:
        return company_match.group(1)
    return None

df['gpu_company'] = df['Gpu'].apply(extract_gpu_company)
display(df.head())

Unnamed: 0,laptop_ID,Inches,Gpu,OpSys,Weight,Price_euros,Company_Acer,Company_Apple,Company_Asus,Company_Chuwi,...,ScreenResolution_Other,cpu_speed,Memory_ssd,Memory_hdd,Memory_size,memory_size_gb,processing_speed,MemoryType_HDD,MemoryType_SSD,gpu_company
0,1,13.3,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69,0,1,0,0,...,0,2.3,1,0,128.0,0,0.0,0,1,Intel
1,2,13.3,Intel HD Graphics 6000,macOS,1.34kg,898.94,0,1,0,0,...,1,1.8,1,0,128.0,0,0.0,0,1,Intel
2,3,15.6,Intel HD Graphics 620,No OS,1.86kg,575.0,0,0,0,0,...,0,2.5,1,0,256.0,0,0.0,0,1,Intel
3,4,15.4,AMD Radeon Pro 455,macOS,1.83kg,2537.45,0,1,0,0,...,0,2.7,1,0,512.0,0,0.0,0,1,AMD
4,5,13.3,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6,0,1,0,0,...,0,3.1,1,0,256.0,0,0.0,0,1,Intel


In [77]:
def extract_gpu_type(x):
    x = str(x)
    type_match = re.search(r'(Integrated|Dedicated)', x, re.IGNORECASE)
    if type_match:
        return type_match.group(1).capitalize()
    return None

df['gpu_type'] = df['gpu'].apply(extract_gpu_type)
df.head()

Unnamed: 0,laptop_id,manufacturers_info,product,laptop_type,inches,gpu,operating_system,weight,price_euros,ScreenResolution_Retina,ScreenResolution_HD,ScreenResolution_Other,cpu_speed,Memory_ssd,Memory_hdd,Memory_size,memory_size_gb,processing_speed,gpu_company,gpu_type
0,1,1,300,4,13.3,58,8,38,1339.69,0,0,1,,0,0,4.0,0.0,,58,
1,2,1,301,4,13.3,51,8,35,898.94,0,0,1,,0,0,2.0,0.0,,51,
2,3,7,50,3,15.6,53,4,74,575.0,0,0,1,,0,0,16.0,0.0,,53,
3,4,1,300,4,15.4,9,8,71,2537.45,0,0,1,,0,0,29.0,0.0,,9,
4,5,1,300,4,13.3,59,8,38,1803.6,0,0,1,,0,0,16.0,0.0,,59,


In [None]:
def extract_gpu_details(gpu_string):
    # This regex looks for numbers or alphanumeric strings that are likely model details,
    # excluding the company and general type already extracted.
    details_match = re.search(r'(?:\d+|\w+-\w+)(?:\s\d+)?$', gpu_string)
    if details_match:
        return details_match.group(0)
    return None

df['gpu_details'] = df['gpu'].apply(extract_gpu_details)
df.head()

Unnamed: 0,laptop_ID,Inches,Gpu,OpSys,Weight,Price_euros,Company_Acer,Company_Apple,Company_Asus,Company_Chuwi,...,Memory_ssd,Memory_hdd,Memory_size,memory_size_gb,processing_speed,MemoryType_HDD,MemoryType_SSD,gpu_company,gpu_type,gpu_details
0,1,13.3,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69,0,1,0,0,...,1,0,128.0,0,0.0,0,1,Intel,Iris Plus,640
1,2,13.3,Intel HD Graphics 6000,macOS,1.34kg,898.94,0,1,0,0,...,1,0,128.0,0,0.0,0,1,Intel,HD Graphics,6000
2,3,15.6,Intel HD Graphics 620,No OS,1.86kg,575.0,0,0,0,0,...,1,0,256.0,0,0.0,0,1,Intel,HD Graphics,620
3,4,15.4,AMD Radeon Pro 455,macOS,1.83kg,2537.45,0,1,0,0,...,1,0,512.0,0,0.0,0,1,AMD,Radeon Pro,455
4,5,13.3,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6,0,1,0,0,...,1,0,256.0,0,0.0,0,1,Intel,Iris Plus,650


In [78]:
gpu_company_dummies = pd.get_dummies(df['gpu_company'], prefix='GPUCompany', dtype=int)
df = pd.concat([df, gpu_company_dummies], axis=1)
df.drop('gpu_company', axis=1, inplace=True)

gpu_type_dummies = pd.get_dummies(df['gpu_type'], prefix='GPUType', dtype=int)
df = pd.concat([df, gpu_type_dummies], axis=1)
df.drop('gpu_type', axis=1, inplace=True)

df.head()

Unnamed: 0,laptop_id,manufacturers_info,product,laptop_type,inches,gpu,operating_system,weight,price_euros,ScreenResolution_Retina,...,GPUCompany_90,GPUCompany_91,GPUCompany_92,GPUCompany_93,GPUCompany_94,GPUCompany_95,GPUCompany_96,GPUCompany_97,GPUCompany_98,GPUCompany_99
0,1,1,300,4,13.3,58,8,38,1339.69,0,...,0,0,0,0,0,0,0,0,0,0
1,2,1,301,4,13.3,51,8,35,898.94,0,...,0,0,0,0,0,0,0,0,0,0
2,3,7,50,3,15.6,53,4,74,575.0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,1,300,4,15.4,9,8,71,2537.45,0,...,0,0,0,0,0,0,0,0,0,0
4,5,1,300,4,13.3,59,8,38,1803.6,0,...,0,0,0,0,0,0,0,0,0,0


In [79]:
df.drop('gpu', axis=1, inplace=True)
display(df.head())

Unnamed: 0,laptop_id,manufacturers_info,product,laptop_type,inches,operating_system,weight,price_euros,ScreenResolution_Retina,ScreenResolution_HD,...,GPUCompany_90,GPUCompany_91,GPUCompany_92,GPUCompany_93,GPUCompany_94,GPUCompany_95,GPUCompany_96,GPUCompany_97,GPUCompany_98,GPUCompany_99
0,1,1,300,4,13.3,8,38,1339.69,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,1,301,4,13.3,8,35,898.94,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,7,50,3,15.6,4,74,575.0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,1,300,4,15.4,8,71,2537.45,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,1,300,4,13.3,8,38,1803.6,0,0,...,0,0,0,0,0,0,0,0,0,0


In [81]:
df.columns

Index(['laptop_id', 'manufacturers_info', 'product', 'laptop_type', 'inches',
       'operating_system', 'weight', 'price_euros', 'ScreenResolution_Retina',
       'ScreenResolution_HD',
       ...
       'GPUCompany_90', 'GPUCompany_91', 'GPUCompany_92', 'GPUCompany_93',
       'GPUCompany_94', 'GPUCompany_95', 'GPUCompany_96', 'GPUCompany_97',
       'GPUCompany_98', 'GPUCompany_99'],
      dtype='object', length=127)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 81 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   laptop_ID                    1303 non-null   int64  
 1   Inches                       1303 non-null   float64
 2   Weight                       1303 non-null   object 
 3   Price_euros                  1303 non-null   float64
 4   Company_Acer                 1303 non-null   int64  
 5   Company_Apple                1303 non-null   int64  
 6   Company_Asus                 1303 non-null   int64  
 7   Company_Chuwi                1303 non-null   int64  
 8   Company_Dell                 1303 non-null   int64  
 9   Company_Fujitsu              1303 non-null   int64  
 10  Company_Google               1303 non-null   int64  
 11  Company_HP                   1303 non-null   int64  
 12  Company_Huawei               1303 non-null   int64  
 13  Company_LG        

In [None]:
print(df['gpu_details'].value_counts())

gpu_details
620      355
520      202
1050      66
1060      49
530       42
500       39
400       37
1070      30
430       22
150       15
515       15
445       14
615       14
505       12
5         11
1200       9
405        9
640        8
420        8
330        7
1080       7
540        6
130        6
6000       5
2          5
550        5
440        5
580        5
510        4
630        4
4          4
560        2
650        2
5300       2
460        2
960        2
2200       2
455        1
555        1
5 430      1
M1-70      1
7          1
5 520      1
465        1
920        1
3          1
385        1
315        1
360        1
Name: count, dtype: int64


In [83]:
# Save engineered features
df.to_csv("Datasets/engineered_laptop_features.csv", index=False)