In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [20]:
# Load the laptop listings from a CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="cleaned_croma_laptops.csv")

In [21]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,Brand,Screen Size,Color,RAM,Storage,GPU,CPU,Price
0,Apple,13.3,Gray,8,256.0,Integrated,M1,54490.0
1,Apple,13.3,Gold,8,256.0,Integrated,M1,54490.0
2,Apple,13.3,Silver,8,256.0,Integrated,M1,54490.0
3,Apple,14.2,Black,24,512.0,Integrated,M4 Pro,187990.0
4,Apple,13.6,Silver,16,256.0,Integrated,M4,93990.0


In [22]:
# NOTE(review): this repeats the five-row preview already shown in the
# previous cell; it could be removed without affecting later cells.
df.head(n=5)

Unnamed: 0,Brand,Screen Size,Color,RAM,Storage,GPU,CPU,Price
0,Apple,13.3,Gray,8,256.0,Integrated,M1,54490.0
1,Apple,13.3,Gold,8,256.0,Integrated,M1,54490.0
2,Apple,13.3,Silver,8,256.0,Integrated,M1,54490.0
3,Apple,14.2,Black,24,512.0,Integrated,M4 Pro,187990.0
4,Apple,13.6,Silver,16,256.0,Integrated,M4,93990.0


In [23]:
# The regression target is the "Price" column; every remaining column is
# kept as a predictor.
y = df["Price"]
X = df.drop(columns="Price")

In [24]:
# Print column dtypes and non-null counts; the dtypes determine how the
# columns are split into numeric and categorical features further down.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 473 entries, 0 to 472
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Brand        473 non-null    object 
 1   Screen Size  473 non-null    float64
 2   Color        473 non-null    object 
 3   RAM          473 non-null    int64  
 4   Storage      473 non-null    float64
 5   GPU          473 non-null    object 
 6   CPU          473 non-null    object 
 7   Price        473 non-null    float64
dtypes: float64(3), int64(1), object(4)
memory usage: 29.7+ KB


In [25]:
# Split the predictors into numeric and categorical groups for the preprocessor.
#
# Categorical columns are defined as "every column that is not numeric" rather
# than "every object-dtype column". Text columns may be loaded with a dedicated
# string dtype instead of object depending on the pandas version/configuration,
# and any column that ends up in neither group would be silently discarded by
# ColumnTransformer (its default is remainder="drop"). Taking the complement
# guarantees each predictor lands in exactly one group; column order is kept.
numeric_features = X.select_dtypes(include=[np.number]).columns
categorical_features = X.columns[~X.columns.isin(numeric_features)]

In [26]:
# Column-wise preprocessing: standardise the numeric columns and one-hot
# encode the categorical ones. handle_unknown="ignore" lets the encoder
# transform categories it did not see during fitting instead of raising.
numeric_step = ("num", StandardScaler(), numeric_features)
categorical_step = (
    "cat",
    OneHotEncoder(handle_unknown="ignore"),
    categorical_features,
)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [27]:
# Chain preprocessing and the linear regressor so that fitting and predicting
# always apply the same transformations to the raw feature frame.
pipeline_steps = [
    ("preprocessor", preprocessor),
    ("regressor", LinearRegression()),
]
model = Pipeline(steps=pipeline_steps)

In [28]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [29]:
# Fit the preprocessing steps and the regressor on the training split only.
model.fit(X=X_train, y=y_train)

In [30]:
# Predict prices for the held-out rows with the fitted pipeline.
y_pred = model.predict(X=X_test)

In [31]:
# Display the predicted prices for the test split.
# NOTE(review): this prints the whole array; a slice such as y_pred[:5]
# would keep the notebook output short.
y_pred

array([ 61536.4009067 ,  76238.93980108,  81340.43260415, 127001.54229969,
        78040.09441479,  95530.34347255,  90816.29148146, 108238.16375641,
        43802.99306207,  97437.81057272,  60314.28031022,  59773.39672888,
        76314.06033365,  44297.06469718, 147494.04816685,  21823.13748065,
        90354.81532965, 170299.48790197,  46988.11179098,  44845.55785875,
        76028.60859922, 199081.04821461,  99919.82372467,  85334.27333414,
        78001.9439789 ,  85886.51677792,  93866.03515925,  68638.42992822,
        75690.44663951,  99481.97708758,  71576.09040097,  79887.77598613,
        77314.65467568, 205758.70985117,  28380.01478654,  84840.20169903,
        54664.01331558, 103956.31149041,  49947.05066181, 265689.65521323,
       126857.07937734,  35893.30011322, 142728.78717963, 208714.29276606,
       205758.70985117, 127313.03497004, 108238.16375641,  61039.72004933,
        48868.38375616,  44845.55785875, 159624.23470551,  69417.22342351,
       115836.8736353 , 1

In [32]:
# Evaluate the held-out predictions: mean absolute error, root mean squared
# error (square root of the MSE) and the coefficient of determination.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

In [33]:
# Print the evaluation metrics under a heading, one metric per line.
print("Linear Regression")
for label, value in (("MAE :", mae), ("RMSE:", rmse), ("R²  :", r2)):
    print(label, value)

Linear Regression
MAE : 24809.563854249373
RMSE: 34733.52336928688
R²  : 0.7170423970185733


In [34]:
import pickle

In [35]:
# Serialise the fitted pipeline (preprocessing + regressor) to disk with pickle.
# Only load this file back from a trusted location: unpickling executes code.
with open("linear_laptop_price_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

In [36]:
import joblib

# Also persist the same fitted pipeline with joblib; as the last expression of
# the cell, the list of written file names is displayed.
joblib.dump(value=model, filename="linear_model.joblib")


['linear_model.joblib']