In [85]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import numpy as np

In [86]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [87]:
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/5243final/laptopData_CLEAN.csv')

In [88]:
print(data.isnull().sum())

Company          0
TypeName         0
Ram              0
OpSys            0
Weight           0
Price            0
PPI              0
Cpu processor    0
Gpu_brand        0
HDD              0
SSD              0
Hybrid           0
Flash_Storage    0
dtype: int64


In [89]:
print(data.columns)

Index(['Company', 'TypeName', 'Ram', 'OpSys', 'Weight', 'Price', 'PPI',
       'Cpu processor', 'Gpu_brand', 'HDD', 'SSD', 'Hybrid', 'Flash_Storage'],
      dtype='object')


In [90]:
# Get CPU brand
data['CPU_brand'] = data['Cpu processor'].map(lambda x: x.split()[0] if isinstance(x, str) else np.nan)

# print(data['CPU_brand'][5]) # test = "AMD"

In [91]:
# Get total memory with sum of 'HDD','SSD','Hybrid' and 'Flash_Storage'
data['Total_Memory_GB'] = data[['HDD', 'SSD', 'Hybrid', 'Flash_Storage']].sum(axis=1)

# print(data['Total_Memory_GB'][36]) # test = 1000 + 128 = 1128

In [92]:
# Easily find most of Ram are 4,8,16,32, so suppose when >=16, Ram is high Ram
# Suppose High_Ram is 1, lower Ram is 0.
data['High_RAM'] = data['Ram'].apply(lambda x: 1 if x >= 16 else 0)

# print(data['High_Ram'][36]) # test = 0



```
Feature: 'CPU_brand', 'Total_Memory_GB', 'High_RAM', 'GPU_brand', 'Company', 'TypeName', 'OpSys', 'PPI'.
```



In [93]:
# Feature Processing

In [94]:
numerical_features = ['Ram', 'Weight', 'Price', 'PPI', 'Total_Memory_GB']
categorical_features = ['Company', 'TypeName', 'OpSys', 'Gpu_brand', 'CPU_brand','High_RAM']

In [95]:
transformers=[('num', StandardScaler(), numerical_features), ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)]
preprocessor = ColumnTransformer(transformers)

processed_array = preprocessor.fit_transform(data)

In [96]:
find_encoder = preprocessor.named_transformers_['cat']
encoded_cat_features = find_encoder.get_feature_names_out(categorical_features)
final_features = list(encoded_cat_features) + numerical_features
processed_data = pd.DataFrame(
    processed_array.toarray() if hasattr(processed_array, 'toarray') else processed_array,
    columns=final_features)


In [97]:
processed_data.to_csv('/content/drive/MyDrive/Colab Notebooks/5243final/laptopData_PROCESSED.csv', index=False)

print("Feature engineering and preprocessing completed. Processed data saved as 'laptopData_PROCESSED.csv'.")


Feature engineering and preprocessing completed. Processed data saved as 'laptopData_PROCESSED.csv'.


In [98]:
test_a = data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/5243final/laptopData_PROCESSED.csv')
print(test_a.columns)

Index(['Company_Acer', 'Company_Apple', 'Company_Asus', 'Company_Dell',
       'Company_HP', 'Company_Lenovo', 'Company_MSI', 'Company_Other',
       'Company_Toshiba', 'TypeName_2 in 1 Convertible', 'TypeName_Gaming',
       'TypeName_Netbook', 'TypeName_Notebook', 'TypeName_Ultrabook',
       'TypeName_Workstation', 'OpSys_Mac', 'OpSys_Others/No OS/Linux',
       'OpSys_Windows', 'Gpu_brand_AMD', 'Gpu_brand_ARM', 'Gpu_brand_Intel',
       'Gpu_brand_Nvidia', 'CPU_brand_AMD', 'CPU_brand_Intel',
       'CPU_brand_Other', 'High_RAM_0', 'High_RAM_1', 'Ram', 'Weight', 'Price',
       'PPI', 'Total_Memory_GB'],
      dtype='object')
