# Model Training

In [1]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [23]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [13]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

In [None]:
df = pd.read_csv("processed_data/laptop_data_processed.csv")

In [5]:
df.head()

Unnamed: 0,Company,TypeName,Inches,Ram,OpSys,Weight,Price,HasIpsPanel,HasTouchScreen,ResWidth,ResHeight,ResCategory,Ppi,Ssd,Hdd,Flash,Hybrid,CpuCategory,CpuSpeedGhz,GpuCategory
0,Apple,Ultrabook,13.3,8,macOS,1.37,71379,1,0,2560,1600,High,226.98,128,0,0,0,Intel Core i5,2.3,Intel High-End
1,Apple,Ultrabook,13.3,8,macOS,1.34,47896,0,0,1440,900,Mid,127.68,0,0,128,0,Intel Core i5,1.8,Intel Low-End
2,HP,Notebook,15.6,8,No OS,1.86,30636,0,0,1920,1080,Mid,141.21,256,0,0,0,Intel Core i5,2.5,Intel Low-End
3,Apple,Ultrabook,15.4,16,macOS,1.83,135195,1,0,2880,1800,Ultra,220.53,512,0,0,0,Intel Core i7,2.7,AMD High-End
4,Apple,Ultrabook,13.3,8,macOS,1.37,96096,1,0,2560,1600,High,226.98,256,0,0,0,Intel Core i5,3.1,Intel High-End


In [6]:
numerical_columns = df.select_dtypes(include=["number"]).columns.tolist()
categorical_columns = df.select_dtypes(include=["object"]).columns.tolist()

In [7]:
numerical_columns.remove("Price")

In [8]:
len(numerical_columns), len(categorical_columns)

(13, 6)

In [9]:
X = df.drop(columns=["Price"])
y = df["Price"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (1016, 19)
X_test shape: (255, 19)
y_train shape: (1016,)
y_test shape: (255,)


In [10]:
# Create a pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_columns),
        (
            "cat",
            OneHotEncoder(handle_unknown="ignore", drop="first"),
            categorical_columns,
        ),
    ],
    remainder="passthrough",
)

pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", LinearRegression())])

In [11]:
param_grid = [
    {
        "model": [LinearRegression()],
        "model__fit_intercept": [True, False]
    },
    {
        "model": [Ridge()],
        "model__alpha": [0.1, 1, 10]
    },
    {
        "model": [Lasso()],
        "model__alpha": [0.1, 1, 10]
    },
    {
        "model": [ElasticNet()],
        "model__alpha": [0.1, 1, 10],
        "model__l1_ratio": [0.1, 0.5, 0.9]
    },
    {
        "model": [SVR()],
        "model__kernel": ["linear", "rbf"],
        "model__C": [0.1, 1, 10],
        "model__degree": [2, 3, 4],
        "model__gamma": ["scale", "auto"],
        "model__epsilon": [0.1, 0.2, 0.5]
    },
    {
        "model": [RandomForestRegressor()],
        "model__n_estimators": [50, 100, 200],
        "model__max_depth": [None, 10, 20],
        "model__min_samples_split": [2, 5, 10],
        "model__min_samples_leaf": [1, 2, 4],
        "model__max_features": ["auto", "sqrt", "log2"]
    }
]


In [12]:
gsc = GridSearchCV(estimator=pipeline, param_grid=param_grid)

In [18]:
gsc.fit(X_train, np.log(y_train))

In [19]:
gsc.best_params_

{'model': RandomForestRegressor(),
 'model__max_depth': None,
 'model__max_features': 'sqrt',
 'model__min_samples_leaf': 1,
 'model__min_samples_split': 2,
 'model__n_estimators': 200}

In [20]:
gsc.best_score_

np.float64(0.8836144090089174)

In [24]:
y_pred = gsc.predict(X_test)

mae = mean_absolute_error(y_test, np.exp(y_pred))
r2 = r2_score(y_test, np.exp(y_pred))

print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")

Mean Absolute Error: 9342.857303077628
R-squared: 0.8330407020242897


In [25]:
df.head()

Unnamed: 0,Company,TypeName,Inches,Ram,OpSys,Weight,Price,HasIpsPanel,HasTouchScreen,ResWidth,ResHeight,ResCategory,Ppi,Ssd,Hdd,Flash,Hybrid,CpuCategory,CpuSpeedGhz,GpuCategory
0,Apple,Ultrabook,13.3,8,macOS,1.37,71379,1,0,2560,1600,High,226.98,128,0,0,0,Intel Core i5,2.3,Intel High-End
1,Apple,Ultrabook,13.3,8,macOS,1.34,47896,0,0,1440,900,Mid,127.68,0,0,128,0,Intel Core i5,1.8,Intel Low-End
2,HP,Notebook,15.6,8,No OS,1.86,30636,0,0,1920,1080,Mid,141.21,256,0,0,0,Intel Core i5,2.5,Intel Low-End
3,Apple,Ultrabook,15.4,16,macOS,1.83,135195,1,0,2880,1800,Ultra,220.53,512,0,0,0,Intel Core i7,2.7,AMD High-End
4,Apple,Ultrabook,13.3,8,macOS,1.37,96096,1,0,2560,1600,High,226.98,256,0,0,0,Intel Core i5,3.1,Intel High-End


In [28]:
df['Company'].value_counts()

Unnamed: 0_level_0,count
Company,Unnamed: 1_level_1
Dell,291
Lenovo,287
HP,268
Asus,151
Acer,101
MSI,54
Toshiba,48
Apple,21
Samsung,8
Mediacom,7


In [34]:
input_df = pd.DataFrame({
    "Company": ["HP"],
    "TypeName": ["Notebook"],
    "Inches": [15.6],
    "Ram": [8],
    "OpSys": ["Windows 10"],
    "Weight": [1.69],
    "HasIpsPanel": [1],
    "HasTouchScreen": [0],
    "ResWidth": [1920],
    "ResHeight": [1080],
    "ResCategory": ["Mid"],
    "Ppi": [150.56],
    "Ssd": [512],
    "Hdd": [0],
    "Flash": [0],
    "Hybrid": [0],
    "CpuCategory": ["Intel Core i5"],
    "CpuSpeedGhz": [4.4],
    "GpuCategory": ["Intel Mid-End"]
})

In [30]:
np.exp(gsc.predict(input_df))

array([49259.61300521])

In [None]:
import pickle

pickle.dump(gsc, open("model/laptop_price_predictor.pkl", "wb"))

In [None]:
laptop_price_predictor = pickle.load(open("model/laptop_price_predictor.pkl", "rb"))

In [35]:
np.exp(laptop_price_predictor.predict(input_df))

array([65088.82278552])