In [15]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import numpy as np

In [16]:
# from google.colab import drive
# drive.mount('/content/drive')

In [17]:
# data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/5243final/laptopData_CLEAN.csv')

In [18]:
# Load the cleaned, outlier-free laptop dataset from the working directory.
data = pd.read_csv(filepath_or_buffer='laptopData_CLEAN_RemoveOutliers.csv')

In [19]:
# Count missing values per column (isna is the same check as isnull).
missing_per_column = data.isna().sum()
print(missing_per_column)

Company          0
TypeName         0
Ram              0
OpSys            0
Weight           0
Price            0
PPI              0
Cpu processor    0
Gpu_brand        0
HDD              0
SSD              0
Hybrid           0
Flash_Storage    0
dtype: int64


In [20]:
# Show the column labels available for feature engineering.
column_index = data.columns
print(column_index)

Index(['Company', 'TypeName', 'Ram', 'OpSys', 'Weight', 'Price', 'PPI',
       'Cpu processor', 'Gpu_brand', 'HDD', 'SSD', 'Hybrid', 'Flash_Storage'],
      dtype='object')


In [21]:
# Get CPU brand: the first whitespace-separated token of 'Cpu processor'.
def _first_token(value):
    """Return the first whitespace-separated token of ``value``, or NaN.

    Non-string values map to NaN. Strings that contain no tokens (empty or
    whitespace-only) also map to NaN instead of raising IndexError.
    """
    if not isinstance(value, str):
        return np.nan
    tokens = value.split()
    return tokens[0] if tokens else np.nan


data['CPU_brand'] = data['Cpu processor'].map(_first_token)

In [22]:
# Total storage per laptop: row-wise sum of the four storage columns
# ('HDD', 'SSD', 'Hybrid' and 'Flash_Storage').
data['Total_Memory_GB'] = data.loc[
    :, ['HDD', 'SSD', 'Hybrid', 'Flash_Storage']
].sum(axis='columns')

In [23]:
# Most Ram values are 4, 8, 16 or 32, so Ram >= 16 is treated as "high RAM".
# High_RAM is 1 for high-RAM laptops and 0 otherwise.
data['High_RAM'] = data['Ram'].ge(16).astype('int64')



```
Features: 'CPU_brand', 'Total_Memory_GB', 'High_RAM', 'Gpu_brand', 'Company', 'TypeName', 'OpSys', 'PPI', 'Ram', 'Weight'.
```



In [24]:
# Feature Processing

In [None]:
# Columns that are standardized as continuous values.
numerical_features = [
    'Ram',
    'Weight',
    'PPI',
    'Total_Memory_GB',
]

# Columns that are one-hot encoded (High_RAM is a 0/1 flag handled as a category).
categorical_features = [
    'Company',
    'TypeName',
    'OpSys',
    'Gpu_brand',
    'CPU_brand',
    'High_RAM',
]

In [26]:
# Standardize the numeric columns and one-hot encode the categorical ones.
# handle_unknown='ignore' makes the encoder tolerate categories not seen during fit.
numeric_step = ('num', StandardScaler(), numerical_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
transformers = [numeric_step, categorical_step]
preprocessor = ColumnTransformer(transformers=transformers)

# Fit on the full frame and transform it in one pass.
processed_array = preprocessor.fit_transform(data)

In [None]:
# Recover readable column names for the one-hot encoded block.
find_encoder = preprocessor.named_transformers_['cat']
encoded_cat_features = find_encoder.get_feature_names_out(categorical_features)

# ColumnTransformer concatenates transformer outputs in the order the
# transformers were listed: 'num' (scaled numeric columns) first, then 'cat'
# (one-hot columns). The labels must follow that same order; listing the
# categorical names first would attach every label to the wrong column.
final_features = numerical_features + list(encoded_cat_features)

# fit_transform may return a sparse matrix; densify it before building the frame.
processed_data = pd.DataFrame(
    processed_array.toarray() if hasattr(processed_array, 'toarray') else processed_array,
    columns=final_features
)

# Add Price column back without any transformations
processed_data['Price'] = data['Price'].values



In [28]:
# Colab / Google Drive alternative, kept for reference:
# processed_data.to_csv('/content/drive/MyDrive/Colab Notebooks/5243final/laptopData_PROCESSED.csv', index=False)

# Write the processed frame to the working directory without the row index.
processed_data.to_csv(path_or_buf='laptopData_PROCESSED.csv', index=False)

print(
    "Feature engineering and preprocessing completed. Processed data saved as 'laptopData_PROCESSED.csv'."
)


Feature engineering and preprocessing completed. Processed data saved as 'laptopData_PROCESSED.csv'.
