In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split as tts


In [2]:
# Read the raw laptop listings from the Excel workbook into a DataFrame.
data = pd.read_excel(io="../data_source/laptop_data.xlsx")
# Print the column names, non-null counts and dtypes before any cleaning.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        1303 non-null   int64  
 1   Company           1303 non-null   object 
 2   TypeName          1303 non-null   object 
 3   Inches            1303 non-null   float64
 4   ScreenResolution  1303 non-null   object 
 5   Cpu               1303 non-null   object 
 6   Ram               1303 non-null   object 
 7   Memory            1303 non-null   object 
 8   Gpu               1303 non-null   object 
 9   OpSys             1303 non-null   object 
 10  Weight            1303 non-null   object 
 11  Price             1303 non-null   float64
dtypes: float64(2), int64(1), object(9)
memory usage: 122.3+ KB


In [3]:
# Discard the leftover index column; errors='ignore' makes this a no-op if it is absent.
data.drop(columns=['Unnamed: 0'], errors='ignore', inplace=True)

# Remove the 'GB' suffix so Ram can be stored as a 32-bit integer.
data['Ram'] = (
    data['Ram']
    .str.replace('GB', '')
    .astype('int32')
)

# Remove the 'kg' suffix so Weight can be stored as a 32-bit float.
data['Weight'] = (
    data['Weight']
    .str.replace('kg', '')
    .astype('float32')
)

# Cast the float price column to integers.
data['Price'] = data['Price'].astype(int)


In [4]:
# 1 when the resolution text mentions the keyword, 0 otherwise.
data['Touchscreen'] = data['ScreenResolution'].map(lambda spec: int('Touchscreen' in spec))
data['IPS'] = data['ScreenResolution'].map(lambda spec: int('IPS' in spec))


In [5]:
# Split the resolution text once, at the first 'x': column 0 is everything before it,
# column 1 is everything after it.
new = data['ScreenResolution'].str.split(pat='x', n=1, expand=True)

# Left part still carries any descriptive prefix; it is reduced to digits in a later step.
data['x_res'] = new[0]
# Right part of the split.
data['y_res'] = new[1]


In [6]:
# Keep only the first multi-digit number found in the left-hand part of the resolution;
# rows with no such number fall back to the string '0'.
data['x_res'] = (
    data['x_res']
    .str.replace(',', '')
    .str.findall(r'(\d+\.?\d+)')
    .map(lambda found: found[0] if found else '0')
)

# Both resolution components are strings at this point; convert them to integers.
data['x_res'] = data['x_res'].astype(dtype=int)
data['y_res'] = data['y_res'].astype(dtype=int)


In [7]:
# ppi = sqrt(x_res^2 + y_res^2) / Inches, stored as float.
data['ppi'] = (
    data['x_res'].pow(2)
    .add(data['y_res'].pow(2))
    .pow(0.5)
    .div(data['Inches'])
    .astype(float)
)

# Drop the columns that were only needed to derive ppi and the screen flags.
data.drop(
    columns=['ScreenResolution', 'Inches', 'x_res', 'y_res'],
    inplace=True,
)


In [8]:
# CPU processing
# Keep the first three whitespace-separated words of the CPU description.
data['Cpu Name'] = data['Cpu'].map(lambda cpu: " ".join(cpu.split()[:3]))


In [9]:

def standardize_cpu_names(text):
    """Collapse a CPU name into one of five coarse brand labels.

    Args:
        text: CPU name string, e.g. ``'Intel Core i5'``.

    Returns:
        ``text`` itself when it is exactly ``'Intel Core i3'``, ``'Intel Core i5'``
        or ``'Intel Core i7'``; ``'Other Intel Processor'`` when its first word is
        ``'Intel'``; ``'AMD Processor'`` for every other name (including
        non-AMD vendors).
    """
    if text in ('Intel Core i5', 'Intel Core i7', 'Intel Core i3'):
        return text

    vendor = text.split()[0]
    return 'Other Intel Processor' if vendor == 'Intel' else 'AMD Processor'


In [10]:
# Map every three-word CPU name onto its coarse brand label.
data['Cpu brand'] = data['Cpu Name'].map(standardize_cpu_names)

# The raw and intermediate CPU columns are no longer needed.
data.drop(columns=['Cpu', 'Cpu Name'], inplace=True)


In [11]:
# Normalise the storage description so capacities can later be parsed as integers.
# A raw string is used for the regex: '\.' in a normal string literal is an invalid
# escape sequence and makes Python emit a warning.
# Removes every '.0' occurrence, e.g. '1.0TB' -> '1TB'.
data['Memory'] = data['Memory'].astype(str).replace(r'\.0', '', regex=True)

# Drop the 'GB' unit; the pattern is a literal, so regex matching is switched off.
data["Memory"] = data["Memory"].str.replace('GB', '', regex=False)
# Rewrite 'TB' as three zeros so that e.g. '1TB' becomes '1000'.
data["Memory"] = data["Memory"].str.replace('TB', '000', regex=False)

# Split at the first '+': column 0 is the first drive, column 1 the second
# (missing when the description contains no '+').
new = data["Memory"].str.split("+", n=1, expand=True)


  data['Memory'] = data['Memory'].astype(str).replace('\.0', '', regex=True)


In [12]:

# First drive description, with surrounding whitespace removed.
data["first"] = new[0].str.strip()

# Second drive description; missing when the Memory text had no '+'.
data["second"] = new[1]


In [13]:
# Create storage type indicators
data["Layer1HDD"] = data["first"].apply(lambda x: 1 if "HDD" in x else 0)
data["Layer1SSD"] = data["first"].apply(lambda x: 1 if "SSD" in x else 0)
data["Layer1Hybrid"] = data["first"].apply(lambda x: 1 if "Hybrid" in x else 0)
data["Layer1Flash_Storage"] = data["first"].apply(lambda x: 1 if "Flash Storage" in x else 0)


In [14]:
# Extract numeric values only
data['first'] = data['first'].str.extract('(\d+)', expand=False)
data['first'].fillna('0', inplace=True)

data["second"].fillna("0", inplace=True)


  data['first'] = data['first'].str.extract('(\d+)', expand=False)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['first'].fillna('0', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["second"].fillna("0", inplace=True)


In [15]:
# Storage-type indicators for the second drive: 1 if the keyword occurs in its text, else 0.
data["Layer2HDD"] = data["second"].map(lambda drive: int("HDD" in drive))
data["Layer2SSD"] = data["second"].map(lambda drive: int("SSD" in drive))
data["Layer2Hybrid"] = data["second"].map(lambda drive: int("Hybrid" in drive))
data["Layer2Flash_Storage"] = data["second"].map(lambda drive: int("Flash Storage" in drive))



In [16]:
# Keep only the first run of digits from the second drive's text; rows without digits
# become the string '0'. The regex is a raw string to avoid the invalid '\d' escape,
# and fillna is assigned back because chained `data[col].fillna(..., inplace=True)`
# stops updating `data` in pandas 3.0.
data['second'] = data['second'].str.extract(r'(\d+)', expand=False).fillna('0')


  data['second'] = data['second'].str.extract('(\d+)', expand=False)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['second'].fillna('0', inplace=True)


In [17]:
# The capacity columns hold digit strings at this point; convert them to integers.
data["first"] = data["first"].astype(dtype=int)
data["second"] = data["second"].astype(dtype=int)


In [18]:
# Capacity per storage type: each drive's size counts only when that drive's
# indicator for the type is 1, and the two drives are summed.
data["HDD"] = (
    data["first"].mul(data["Layer1HDD"])
    .add(data["second"].mul(data["Layer2HDD"]))
)
data["SSD"] = (
    data["first"].mul(data["Layer1SSD"])
    .add(data["second"].mul(data["Layer2SSD"]))
)
data["Hybrid"] = (
    data["first"].mul(data["Layer1Hybrid"])
    .add(data["second"].mul(data["Layer2Hybrid"]))
)
data["Flash_Storage"] = (
    data["first"].mul(data["Layer1Flash_Storage"])
    .add(data["second"].mul(data["Layer2Flash_Storage"]))
)


In [19]:
# Remove the storage helper columns. 'Hybrid' and 'Flash_Storage' are dropped as well,
# so only the 'HDD' and 'SSD' capacity columns remain from the Memory parsing.
data.drop(
    columns=[
        # raw text and per-drive capacities
        'Memory', 'first', 'second',
        # first-drive indicators
        'Layer1HDD', 'Layer1SSD', 'Layer1Hybrid', 'Layer1Flash_Storage',
        # second-drive indicators
        'Layer2HDD', 'Layer2SSD', 'Layer2Hybrid', 'Layer2Flash_Storage',
        # capacity totals that are not kept
        'Hybrid', 'Flash_Storage',
    ],
    inplace=True,
)


In [20]:
# The GPU brand is the first whitespace-separated word of the GPU description.
data['Gpu brand'] = data['Gpu'].map(lambda gpu: gpu.split()[0])

# Keep every row whose GPU brand is not 'ARM'.
data = data[data['Gpu brand'].ne('ARM')]

# The full GPU description is no longer needed once the brand is extracted.
data.drop(columns=['Gpu'], inplace=True)


In [21]:
def map_operating_system(inp):
    """Group an operating-system name into one of three categories.

    Args:
        inp: Operating-system name, compared by exact match.

    Returns:
        ``'windows'`` for ``'Windows 10'``, ``'Windows 7'`` or ``'Windows 10 S'``;
        ``'Mac'`` for ``'macOS'`` or ``'Mac OS X'``;
        ``'Other/Linux/No OS'`` for any other value.
    """
    if inp in ('Windows 10', 'Windows 7', 'Windows 10 S'):
        return 'windows'
    if inp in ('macOS', 'Mac OS X'):
        return 'Mac'
    return 'Other/Linux/No OS'


In [22]:
# Replace the detailed operating-system name with its coarse category.
data['os'] = data['OpSys'].map(map_operating_system)

# The original column is redundant once the category exists.
data.drop(columns=['OpSys'], inplace=True)


In [23]:
# Write the preprocessed table to CSV, leaving out the DataFrame index.
data.to_csv(path_or_buf="../data_source/laptop_data_preprocessed.csv", index=False)
