<a href="https://colab.research.google.com/github/scientistEgong/Laptop-Price-Prediction-Model/blob/main/Features_Engineering_Robert_William_23EGCO023.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Feature Engineering**
Robert Williams



In [None]:
# Import necessary libraries for data manipulation, visualization, and machine learning.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk

In [None]:
# Read the raw laptop listings into a DataFrame. The file is decoded with the
# 'cp1252' codec, passed explicitly instead of relying on the UTF-8 default.
df = pd.read_csv(
    "Datasets/laptop_price.csv",
    encoding='cp1252',
)

# Preview the first five rows to check that the columns were parsed as expected.
df.head()

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.0
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6


In [None]:
# Print a concise summary of the DataFrame: the dtype of every column and its
# non-null count. A non-null count below the number of rows reveals missing
# values, and object-dtype columns are the ones that still need to be parsed
# or encoded before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   laptop_ID         1303 non-null   int64  
 1   Company           1303 non-null   object 
 2   Product           1303 non-null   object 
 3   TypeName          1303 non-null   object 
 4   Inches            1303 non-null   float64
 5   ScreenResolution  1303 non-null   object 
 6   Cpu               1303 non-null   object 
 7   Ram               1303 non-null   object 
 8   Memory            1303 non-null   object 
 9   Gpu               1303 non-null   object 
 10  OpSys             1303 non-null   object 
 11  Weight            1303 non-null   object 
 12  Price_euros       1303 non-null   float64
dtypes: float64(2), int64(1), object(10)
memory usage: 132.5+ KB


In [None]:
# Expand 'Company' into one 0/1 indicator column per manufacturer
# ('Company_<name>') so the brand can be consumed by a numeric model.
company_dummies = pd.get_dummies(df['Company'], prefix='Company', dtype=int)

# Swap the text column for its indicators: the original column is removed
# because it is now redundant, and the indicators are appended on the right.
# NOTE(review): every level is kept (no drop_first), so the indicators sum to 1
# on each row -- confirm that is acceptable for the model that consumes them.
df = df.drop(columns='Company').join(company_dummies)
display(df.head())

Unnamed: 0,laptop_ID,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,...,Company_LG,Company_Lenovo,Company_MSI,Company_Mediacom,Company_Microsoft,Company_Razer,Company_Samsung,Company_Toshiba,Company_Vero,Company_Xiaomi
0,1,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,...,0,0,0,0,0,0,0,0,0,0
1,2,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,...,0,0,0,0,0,0,0,0,0,0
2,3,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,...,0,0,0,0,0,0,0,0,0,0
3,4,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,...,0,0,0,0,0,0,0,0,0,0
4,5,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Expand 'TypeName' (the laptop form factor) into 0/1 indicator columns named
# 'TypeName_<value>', then replace the text column with those indicators.
typename_dummies = pd.get_dummies(df['TypeName'], prefix='TypeName', dtype=int)
df = pd.concat(
    [df.drop(columns='TypeName'), typename_dummies],
    axis=1,
)
display(df.head())

Unnamed: 0,laptop_ID,Product,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,...,Company_Samsung,Company_Toshiba,Company_Vero,Company_Xiaomi,TypeName_2 in 1 Convertible,TypeName_Gaming,TypeName_Netbook,TypeName_Notebook,TypeName_Ultrabook,TypeName_Workstation
0,1,MacBook Pro,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,...,0,0,0,0,0,0,0,0,1,0
1,2,Macbook Air,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,...,0,0,0,0,0,0,0,0,1,0
2,3,250 G6,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,...,0,0,0,0,0,0,0,1,0,0
3,4,MacBook Pro,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,...,0,0,0,0,0,0,0,0,1,0
4,5,MacBook Pro,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,...,0,0,0,0,0,0,0,0,1,0


In [None]:
# Remove the free-text 'Product' (model name) column instead of encoding it.
# NOTE(review): presumably it has too many distinct values for one-hot encoding
# to generalise well -- confirm with df['Product'].nunique() before relying on
# that. Model names do repeat across rows (e.g. "MacBook Pro").
df = df.drop(columns='Product')

In [None]:
# Turn the free-text 'ScreenResolution' description into three 0/1 flags:
#   ScreenResolution_Retina - text contains 'IPS Panel Retina Display'
#   ScreenResolution_HD     - text contains 'HD' (so 'Full HD' also matches)
#   ScreenResolution_Other  - text contains neither of the two markers above
resolution_text = df['ScreenResolution']
has_retina = resolution_text.str.contains('IPS Panel Retina Display', regex=False)
has_hd = resolution_text.str.contains('HD', regex=False)

df['ScreenResolution_Retina'] = has_retina.astype('int64')
df['ScreenResolution_HD'] = has_hd.astype('int64')
df['ScreenResolution_Other'] = (~(has_retina | has_hd)).astype('int64')

display(df.head())

Unnamed: 0,laptop_ID,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros,...,Company_Xiaomi,TypeName_2 in 1 Convertible,TypeName_Gaming,TypeName_Netbook,TypeName_Notebook,TypeName_Ultrabook,TypeName_Workstation,ScreenResolution_Retina,ScreenResolution_HD,ScreenResolution_Other
0,1,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69,...,0,0,0,0,0,1,0,1,0,0
1,2,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94,...,0,0,0,0,0,1,0,0,0,1
2,3,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.0,...,0,0,0,0,1,0,0,0,1,0
3,4,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45,...,0,0,0,0,0,1,0,1,0,0
4,5,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6,...,0,0,0,0,0,1,0,1,0,0


In [None]:
# The 'ScreenResolution' text has been summarised by the ScreenResolution_*
# flags, so the raw column is removed.
df = df.drop(columns='ScreenResolution')
display(df.head())

Unnamed: 0,laptop_ID,Inches,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros,Company_Acer,...,Company_Xiaomi,TypeName_2 in 1 Convertible,TypeName_Gaming,TypeName_Netbook,TypeName_Notebook,TypeName_Ultrabook,TypeName_Workstation,ScreenResolution_Retina,ScreenResolution_HD,ScreenResolution_Other
0,1,13.3,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69,0,...,0,0,0,0,0,1,0,1,0,0
1,2,13.3,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94,0,...,0,0,0,0,0,1,0,0,0,1
2,3,15.6,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.0,0,...,0,0,0,0,1,0,0,0,1,0
3,4,15.4,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45,0,...,0,0,0,0,0,1,0,1,0,0
4,5,13.3,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6,0,...,0,0,0,0,0,1,0,1,0,0


In [None]:
# Convert 'Ram' from text such as "8GB" to the integer 8 by deleting the
# literal "GB" and casting what is left.
ram_text = df['Ram']
df['Ram'] = ram_text.str.replace('GB', '', regex=False).astype(int)
display(df.head())

Unnamed: 0,laptop_ID,Inches,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros,Company_Acer,...,Company_Xiaomi,TypeName_2 in 1 Convertible,TypeName_Gaming,TypeName_Netbook,TypeName_Notebook,TypeName_Ultrabook,TypeName_Workstation,ScreenResolution_Retina,ScreenResolution_HD,ScreenResolution_Other
0,1,13.3,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69,0,...,0,0,0,0,0,1,0,1,0,0
1,2,13.3,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94,0,...,0,0,0,0,0,1,0,0,0,1
2,3,15.6,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.0,0,...,0,0,0,0,1,0,0,0,1,0
3,4,15.4,Intel Core i7 2.7GHz,16,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45,0,...,0,0,0,0,0,1,0,1,0,0
4,5,13.3,Intel Core i5 3.1GHz,8,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6,0,...,0,0,0,0,0,1,0,1,0,0


In [None]:
# Import the regular expressions library.
import re

def extract_cpu_speed(cpu_string):
    """Return the clock speed found in a CPU description, as text.

    The pattern captures the number (integer or decimal) that is written
    immediately before "GHz", e.g. "Intel Core i5 2.3GHz" -> "2.3".

    Parameters
    ----------
    cpu_string : str
        CPU description to scan.

    Returns
    -------
    str or None
        The captured digits (still a string), or None when no "<number>GHz"
        token is present.
    """
    found = re.search(r'(\d+\.?\d*)GHz', cpu_string)
    return found.group(1) if found else None

# Pull the GHz figure out of every 'Cpu' description and store it as a numeric
# 'cpu_speed' column (the extractor returns text, so it is converted here).
df['cpu_speed'] = pd.to_numeric(df['Cpu'].map(extract_cpu_speed))

display(df.head())

Unnamed: 0,laptop_ID,Inches,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros,Company_Acer,...,TypeName_2 in 1 Convertible,TypeName_Gaming,TypeName_Netbook,TypeName_Notebook,TypeName_Ultrabook,TypeName_Workstation,ScreenResolution_Retina,ScreenResolution_HD,ScreenResolution_Other,cpu_speed
0,1,13.3,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69,0,...,0,0,0,0,1,0,1,0,0,2.3
1,2,13.3,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94,0,...,0,0,0,0,1,0,0,0,1,1.8
2,3,15.6,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.0,0,...,0,0,0,1,0,0,0,1,0,2.5
3,4,15.4,Intel Core i7 2.7GHz,16,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45,0,...,0,0,0,0,1,0,1,0,0,2.7
4,5,13.3,Intel Core i5 3.1GHz,8,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6,0,...,0,0,0,0,1,0,1,0,0,3.1


In [None]:
# Storage features derived from the raw 'Memory' text (values look like
# "256GB SSD" or "128GB Flash Storage").
#
# The 'Memory' column itself is deliberately left untouched: the later
# extract_memory_info() cell searches it for the literal "GB"/"TB" units, and
# stripping them here made that cell report a size of 0 for every row.
memory_normalised = (
    df['Memory']
    .str.replace('Flash Storage', 'SSD', regex=False)  # flash is counted as SSD
    .str.replace('Hybrid', 'HDD', regex=False)          # hybrid is counted as HDD
)
df['Memory_ssd'] = memory_normalised.str.contains('SSD', regex=False).astype(int)
df['Memory_hdd'] = memory_normalised.str.contains('HDD', regex=False).astype(int)

# Size of the first drive listed, in GB. The number and its unit are captured
# together so that "1.0TB" becomes 1000.0 rather than 1.0 (1 TB is counted as
# 1000 GB, as before). A raw string is used for the pattern so that "\d" is not
# treated as an invalid string escape.
first_drive = df['Memory'].str.extract(r'(\d+(?:\.\d+)?)\s*(GB|TB)')
df['Memory_size'] = first_drive[0].astype(float) * first_drive[1].map({'GB': 1, 'TB': 1000})

  df['Memory_size'] = df['Memory'].str.extract('(\d+)').astype(float)


In [None]:
def extract_memory_info(memory_string):
    """Parse a storage description into (total size in GB, storage type).

    Parameters
    ----------
    memory_string : str
        Raw storage text such as "256GB SSD", "1.0TB Hybrid" or
        "128GB SSD +  1TB HDD".

    Returns
    -------
    tuple of (int, str or None)
        size : total capacity in GB, summed over every "<number>GB" or
            "<number>TB" token in the text (1 TB = 1024 GB); 0 when no such
            token is present.
        memory_type : the first of 'SSD', 'HDD', 'Flash Storage', 'Hybrid'
            that occurs in the text (checked in that order), or None.
    """
    size = 0
    # Capture the number together with its unit. The optional decimal part
    # matters: with a bare (\d+) the text "1.0TB" yields "0", i.e. 0 GB.
    for amount, unit in re.findall(r'(\d+(?:\.\d+)?)\s*(GB|TB)', memory_string):
        gb_per_unit = 1024 if unit == 'TB' else 1  # convert TB to GB
        size += int(round(float(amount) * gb_per_unit))

    memory_type = None
    for candidate in ('SSD', 'HDD', 'Flash Storage', 'Hybrid'):
        if candidate in memory_string:
            memory_type = candidate
            break

    return size, memory_type

# Parse every 'Memory' description into a (size, type) pair and spread the
# pairs over two new columns: 'memory_size_gb' and 'memory_type'.
parsed_memory = pd.DataFrame(
    df['Memory'].map(extract_memory_info).tolist(),
    index=df.index,
)
df[['memory_size_gb', 'memory_type']] = parsed_memory
df['memory_size_gb'] = pd.to_numeric(df['memory_size_gb'])

display(df.head())

Unnamed: 0,laptop_ID,Inches,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros,Company_Acer,...,TypeName_Workstation,ScreenResolution_Retina,ScreenResolution_HD,ScreenResolution_Other,cpu_speed,Memory_ssd,Memory_hdd,Memory_size,memory_size_gb,memory_type
0,1,13.3,Intel Core i5 2.3GHz,8,128SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69,0,...,0,1,0,0,2.3,1,0,128.0,0,SSD
1,2,13.3,Intel Core i5 1.8GHz,8,128SSD,Intel HD Graphics 6000,macOS,1.34kg,898.94,0,...,0,0,0,1,1.8,1,0,128.0,0,SSD
2,3,15.6,Intel Core i5 7200U 2.5GHz,8,256SSD,Intel HD Graphics 620,No OS,1.86kg,575.0,0,...,0,0,1,0,2.5,1,0,256.0,0,SSD
3,4,15.4,Intel Core i7 2.7GHz,16,512SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45,0,...,0,1,0,0,2.7,1,0,512.0,0,SSD
4,5,13.3,Intel Core i5 3.1GHz,8,256SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6,0,...,0,1,0,0,3.1,1,0,256.0,0,SSD


In [None]:
# Interaction feature: the row-wise product of CPU clock speed, RAM and
# storage size. It is 0 whenever any one of the three factors is 0.
df['processing_speed'] = df['cpu_speed'].mul(df['Ram']).mul(df['memory_size_gb'])

# Show the three inputs next to the result for a quick sanity check.
speed_columns = ['cpu_speed', 'Ram', 'memory_size_gb', 'processing_speed']
display(df[speed_columns].head())

Unnamed: 0,cpu_speed,Ram,memory_size_gb,processing_speed
0,2.3,8,0,0.0
1,1.8,8,0,0.0
2,2.5,8,0,0.0
3,2.7,16,0,0.0
4,3.1,8,0,0.0


In [None]:
# Remove the source columns of the engineered features above.
# NOTE(review): 'Ram' is already numeric at this point; after this drop it only
# survives inside 'processing_speed' -- confirm that is intended.
df = df.drop(columns=['Cpu', 'Memory', 'Ram'])
display(df.head())

Unnamed: 0,laptop_ID,Inches,Gpu,OpSys,Weight,Price_euros,Company_Acer,Company_Apple,Company_Asus,Company_Chuwi,...,ScreenResolution_Retina,ScreenResolution_HD,ScreenResolution_Other,cpu_speed,Memory_ssd,Memory_hdd,Memory_size,memory_size_gb,memory_type,processing_speed
0,1,13.3,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69,0,1,0,0,...,1,0,0,2.3,1,0,128.0,0,SSD,0.0
1,2,13.3,Intel HD Graphics 6000,macOS,1.34kg,898.94,0,1,0,0,...,0,0,1,1.8,1,0,128.0,0,SSD,0.0
2,3,15.6,Intel HD Graphics 620,No OS,1.86kg,575.0,0,0,0,0,...,0,1,0,2.5,1,0,256.0,0,SSD,0.0
3,4,15.4,AMD Radeon Pro 455,macOS,1.83kg,2537.45,0,1,0,0,...,1,0,0,2.7,1,0,512.0,0,SSD,0.0
4,5,13.3,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6,0,1,0,0,...,1,0,0,3.1,1,0,256.0,0,SSD,0.0


In [None]:
# One-hot encode the parsed storage type into 'MemoryType_<value>' 0/1 columns
# and replace the text column with them.
memory_type_dummies = pd.get_dummies(df['memory_type'], prefix='MemoryType', dtype=int)
df = df.drop(columns='memory_type').join(memory_type_dummies)
display(df.head())

Unnamed: 0,laptop_ID,Inches,Gpu,OpSys,Weight,Price_euros,Company_Acer,Company_Apple,Company_Asus,Company_Chuwi,...,ScreenResolution_HD,ScreenResolution_Other,cpu_speed,Memory_ssd,Memory_hdd,Memory_size,memory_size_gb,processing_speed,MemoryType_HDD,MemoryType_SSD
0,1,13.3,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69,0,1,0,0,...,0,0,2.3,1,0,128.0,0,0.0,0,1
1,2,13.3,Intel HD Graphics 6000,macOS,1.34kg,898.94,0,1,0,0,...,0,1,1.8,1,0,128.0,0,0.0,0,1
2,3,15.6,Intel HD Graphics 620,No OS,1.86kg,575.0,0,0,0,0,...,1,0,2.5,1,0,256.0,0,0.0,0,1
3,4,15.4,AMD Radeon Pro 455,macOS,1.83kg,2537.45,0,1,0,0,...,0,0,2.7,1,0,512.0,0,0.0,0,1
4,5,13.3,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6,0,1,0,0,...,0,0,3.1,1,0,256.0,0,0.0,0,1


In [None]:
# GPU vendor = first whitespace-separated word of the 'Gpu' description.
df['gpu_company'] = [gpu_name.split()[0] for gpu_name in df['Gpu']]

# Frequency table of the vendors, to spot misspellings or unexpected entries.
vendor_counts = df['gpu_company'].value_counts()
display(vendor_counts)

Unnamed: 0_level_0,count
gpu_company,Unnamed: 1_level_1
Intel,722
Nvidia,400
AMD,180
ARM,1


In [None]:
def extract_gpu_company(gpu_string):
    """Return the leading word of a GPU description (the vendor name).

    Parameters
    ----------
    gpu_string : str
        GPU description, e.g. "Intel HD Graphics 620".

    Returns
    -------
    str or None
        The run of word characters at the very start of the text, or None when
        the text does not begin with a word character.
    """
    leading_word = re.match(r'(\w+)', gpu_string)
    return leading_word.group(1) if leading_word else None

# (Re)compute the GPU vendor column with the regex-based extractor.
df['gpu_company'] = df['Gpu'].map(extract_gpu_company)
display(df.head())

Unnamed: 0,laptop_ID,Inches,Gpu,OpSys,Weight,Price_euros,Company_Acer,Company_Apple,Company_Asus,Company_Chuwi,...,ScreenResolution_Other,cpu_speed,Memory_ssd,Memory_hdd,Memory_size,memory_size_gb,processing_speed,MemoryType_HDD,MemoryType_SSD,gpu_company
0,1,13.3,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69,0,1,0,0,...,0,2.3,1,0,128.0,0,0.0,0,1,Intel
1,2,13.3,Intel HD Graphics 6000,macOS,1.34kg,898.94,0,1,0,0,...,1,1.8,1,0,128.0,0,0.0,0,1,Intel
2,3,15.6,Intel HD Graphics 620,No OS,1.86kg,575.0,0,0,0,0,...,0,2.5,1,0,256.0,0,0.0,0,1,Intel
3,4,15.4,AMD Radeon Pro 455,macOS,1.83kg,2537.45,0,1,0,0,...,0,2.7,1,0,512.0,0,0.0,0,1,AMD
4,5,13.3,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6,0,1,0,0,...,0,3.1,1,0,256.0,0,0.0,0,1,Intel


In [None]:
def extract_gpu_type(gpu_string):
    """Return the GPU family: the word(s) right after the vendor name.

    The pattern takes the first one or two words that are enclosed by
    whitespace on both sides, so the result depends on how many words follow:
    "Intel HD Graphics 620" -> "HD Graphics", "AMD Radeon 530" -> "Radeon".

    Parameters
    ----------
    gpu_string : str
        GPU description to scan.

    Returns
    -------
    str or None
        The captured word(s), or None when no word is surrounded by
        whitespace (for example a one-word description).
    """
    family = re.search(r'\s(\w+(?:\s\w+)?)\s', gpu_string)
    if family is None:
        return None
    return family.group(1)

# Store the GPU family extracted from each 'Gpu' description.
df['gpu_type'] = df['Gpu'].map(extract_gpu_type)
display(df.head())

Unnamed: 0,laptop_ID,Inches,Gpu,OpSys,Weight,Price_euros,Company_Acer,Company_Apple,Company_Asus,Company_Chuwi,...,cpu_speed,Memory_ssd,Memory_hdd,Memory_size,memory_size_gb,processing_speed,MemoryType_HDD,MemoryType_SSD,gpu_company,gpu_type
0,1,13.3,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69,0,1,0,0,...,2.3,1,0,128.0,0,0.0,0,1,Intel,Iris Plus
1,2,13.3,Intel HD Graphics 6000,macOS,1.34kg,898.94,0,1,0,0,...,1.8,1,0,128.0,0,0.0,0,1,Intel,HD Graphics
2,3,15.6,Intel HD Graphics 620,No OS,1.86kg,575.0,0,0,0,0,...,2.5,1,0,256.0,0,0.0,0,1,Intel,HD Graphics
3,4,15.4,AMD Radeon Pro 455,macOS,1.83kg,2537.45,0,1,0,0,...,2.7,1,0,512.0,0,0.0,0,1,AMD,Radeon Pro
4,5,13.3,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6,0,1,0,0,...,3.1,1,0,256.0,0,0.0,0,1,Intel,Iris Plus


In [None]:
def extract_gpu_details(gpu_string):
    """Return the trailing model token of a GPU description.

    The pattern must end at the end of the text and matches either a run of
    digits or a hyphenated token (such as "M1-70"), optionally followed by one
    more space-separated number. Because the search is not anchored to a word
    boundary it can begin inside a word: "AMD Radeon R5 430" -> "5 430".

    Parameters
    ----------
    gpu_string : str
        GPU description to scan.

    Returns
    -------
    str or None
        The matched text, or None when the description does not end that way
        (for example "Nvidia GeForce 940MX", which ends in letters).
    """
    model_suffix = re.search(r'(?:\d+|\w+-\w+)(?:\s\d+)?$', gpu_string)
    return None if model_suffix is None else model_suffix.group(0)

# Store the trailing model token of each 'Gpu' description.
df['gpu_details'] = df['Gpu'].map(extract_gpu_details)
display(df.head())

Unnamed: 0,laptop_ID,Inches,Gpu,OpSys,Weight,Price_euros,Company_Acer,Company_Apple,Company_Asus,Company_Chuwi,...,Memory_ssd,Memory_hdd,Memory_size,memory_size_gb,processing_speed,MemoryType_HDD,MemoryType_SSD,gpu_company,gpu_type,gpu_details
0,1,13.3,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69,0,1,0,0,...,1,0,128.0,0,0.0,0,1,Intel,Iris Plus,640
1,2,13.3,Intel HD Graphics 6000,macOS,1.34kg,898.94,0,1,0,0,...,1,0,128.0,0,0.0,0,1,Intel,HD Graphics,6000
2,3,15.6,Intel HD Graphics 620,No OS,1.86kg,575.0,0,0,0,0,...,1,0,256.0,0,0.0,0,1,Intel,HD Graphics,620
3,4,15.4,AMD Radeon Pro 455,macOS,1.83kg,2537.45,0,1,0,0,...,1,0,512.0,0,0.0,0,1,AMD,Radeon Pro,455
4,5,13.3,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6,0,1,0,0,...,1,0,256.0,0,0.0,0,1,Intel,Iris Plus,650


In [None]:
# Replace the two GPU text columns with 0/1 indicator columns: one per distinct
# vendor ('GPUCompany_<x>') followed by one per distinct family ('GPUType_<x>').
gpu_company_dummies = pd.get_dummies(df['gpu_company'], prefix='GPUCompany', dtype=int)
gpu_type_dummies = pd.get_dummies(df['gpu_type'], prefix='GPUType', dtype=int)

df = pd.concat(
    [
        df.drop(columns=['gpu_company', 'gpu_type']),
        gpu_company_dummies,
        gpu_type_dummies,
    ],
    axis=1,
)

display(df.head())

Unnamed: 0,laptop_ID,Inches,Gpu,OpSys,Weight,Price_euros,Company_Acer,Company_Apple,Company_Asus,Company_Chuwi,...,GPUType_R4,GPUType_Radeon,GPUType_Radeon Pro,GPUType_Radeon R2,GPUType_Radeon R4,GPUType_Radeon R5,GPUType_Radeon R7,GPUType_Radeon R9,GPUType_Radeon RX,GPUType_UHD Graphics
0,1,13.3,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,13.3,Intel HD Graphics 6000,macOS,1.34kg,898.94,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,15.6,Intel HD Graphics 620,No OS,1.86kg,575.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,15.4,AMD Radeon Pro 455,macOS,1.83kg,2537.45,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
4,5,13.3,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# The raw 'Gpu' text is no longer needed once its parts have been extracted.
df = df.drop(columns='Gpu')
display(df.head())

Unnamed: 0,laptop_ID,Inches,OpSys,Weight,Price_euros,Company_Acer,Company_Apple,Company_Asus,Company_Chuwi,Company_Dell,...,GPUType_R4,GPUType_Radeon,GPUType_Radeon Pro,GPUType_Radeon R2,GPUType_Radeon R4,GPUType_Radeon R5,GPUType_Radeon R7,GPUType_Radeon R9,GPUType_Radeon RX,GPUType_UHD Graphics
0,1,13.3,macOS,1.37kg,1339.69,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,13.3,macOS,1.34kg,898.94,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,15.6,No OS,1.86kg,575.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,15.4,macOS,1.83kg,2537.45,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,5,13.3,macOS,1.37kg,1803.6,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# One-hot encode the operating system into 'OpSys_<value>' 0/1 columns and
# replace the text column with them.
opsys_dummies = pd.get_dummies(df['OpSys'], prefix='OpSys', dtype=int)
df = df.drop(columns='OpSys').join(opsys_dummies)
display(df.head())

Unnamed: 0,laptop_ID,Inches,Weight,Price_euros,Company_Acer,Company_Apple,Company_Asus,Company_Chuwi,Company_Dell,Company_Fujitsu,...,GPUType_UHD Graphics,OpSys_Android,OpSys_Chrome OS,OpSys_Linux,OpSys_Mac OS X,OpSys_No OS,OpSys_Windows 10,OpSys_Windows 10 S,OpSys_Windows 7,OpSys_macOS
0,1,13.3,1.37kg,1339.69,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2,13.3,1.34kg,898.94,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,3,15.6,1.86kg,575.0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,15.4,1.83kg,2537.45,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,5,13.3,1.37kg,1803.6,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [None]:
# Re-check dtypes and non-null counts after the encoding steps; any column
# still listed as object has not been converted to a numeric form yet.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 81 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   laptop_ID                    1303 non-null   int64  
 1   Inches                       1303 non-null   float64
 2   Weight                       1303 non-null   object 
 3   Price_euros                  1303 non-null   float64
 4   Company_Acer                 1303 non-null   int64  
 5   Company_Apple                1303 non-null   int64  
 6   Company_Asus                 1303 non-null   int64  
 7   Company_Chuwi                1303 non-null   int64  
 8   Company_Dell                 1303 non-null   int64  
 9   Company_Fujitsu              1303 non-null   int64  
 10  Company_Google               1303 non-null   int64  
 11  Company_HP                   1303 non-null   int64  
 12  Company_Huawei               1303 non-null   int64  
 13  Company_LG        

In [None]:
# Convert 'Weight' from text such as "1.37kg" to the float 1.37 by deleting
# the literal "kg" and casting what is left.
weight_text = df['Weight']
df['Weight'] = weight_text.str.replace('kg', '', regex=False).astype(float)
display(df.head())

Unnamed: 0,laptop_ID,Inches,Weight,Price_euros,Company_Acer,Company_Apple,Company_Asus,Company_Chuwi,Company_Dell,Company_Fujitsu,...,GPUType_UHD Graphics,OpSys_Android,OpSys_Chrome OS,OpSys_Linux,OpSys_Mac OS X,OpSys_No OS,OpSys_Windows 10,OpSys_Windows 10 S,OpSys_Windows 7,OpSys_macOS
0,1,13.3,1.37,1339.69,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2,13.3,1.34,898.94,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,3,15.6,1.86,575.0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,15.4,1.83,2537.45,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,5,13.3,1.37,1803.6,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [None]:
# Frequency of each extracted 'gpu_details' token, most common first. Rows
# where the extractor returned None are not counted (value_counts skips
# missing values by default).
print(df['gpu_details'].value_counts())

gpu_details
620      355
520      202
1050      66
1060      49
530       42
500       39
400       37
1070      30
430       22
150       15
515       15
445       14
615       14
505       12
5         11
1200       9
405        9
640        8
420        8
330        7
1080       7
540        6
130        6
6000       5
2          5
550        5
440        5
580        5
510        4
630        4
4          4
560        2
650        2
5300       2
460        2
960        2
2200       2
455        1
555        1
5 430      1
M1-70      1
7          1
5 520      1
465        1
920        1
3          1
385        1
315        1
360        1
Name: count, dtype: int64


In [None]:
# Final schema check: print every column's dtype and non-null count once more
# now that 'Weight' has been converted.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 81 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   laptop_ID                    1303 non-null   int64  
 1   Inches                       1303 non-null   float64
 2   Weight                       1303 non-null   float64
 3   Price_euros                  1303 non-null   float64
 4   Company_Acer                 1303 non-null   int64  
 5   Company_Apple                1303 non-null   int64  
 6   Company_Asus                 1303 non-null   int64  
 7   Company_Chuwi                1303 non-null   int64  
 8   Company_Dell                 1303 non-null   int64  
 9   Company_Fujitsu              1303 non-null   int64  
 10  Company_Google               1303 non-null   int64  
 11  Company_HP                   1303 non-null   int64  
 12  Company_Huawei               1303 non-null   int64  
 13  Company_LG        