In [19]:
import pandas as pd
import numpy as np


In [20]:
# Full preprocessed laptop dataset. It is not used for training in this notebook;
# it is re-indexed and pickled in the final cell.
df = pd.read_csv("../data_source/laptop_data_preprocessed.csv")

# Train/test feature frames (presumably written by an earlier split step — verify).
x_train = pd.read_csv("../data_split/x_train.csv")
x_test = pd.read_csv("../data_split/x_test.csv")

# Targets: squeeze along the column axis only, so a one-column frame always becomes
# a Series. A bare .squeeze() collapses *every* length-1 axis and would return a
# scalar if a split file ever contained a single row.
# NOTE(review): assumes each y_* file holds exactly one column — confirm.
y_train = pd.read_csv("../data_split/y_train.csv").squeeze("columns")
y_test = pd.read_csv("../data_split/y_test.csv").squeeze("columns")


In [21]:
import pickle

# SECURITY: pickle.load can execute arbitrary code stored in the file, so this path
# must only ever point at an artifact produced by a trusted source.
# NOTE(review): `step1` is rebound to a newly constructed ColumnTransformer in the
# training cell further down, so the object loaded here is not the one the
# pipelines are built from — confirm whether this load is still needed.
with open("../transformers/column_transformer.pkl", "rb") as transformer_file:
    step1 = pickle.load(transformer_file)


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [22]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor



# Single seed shared by every estimator that draws random numbers, so that a fresh
# "Restart Kernel -> Run All" reproduces the same scores. The value matches the one
# the Random Forest entry already used.
RANDOM_STATE = 3

# Candidate regressors keyed by the display name used in the printed report and in
# the saved results table. Each value is an unfitted estimator instance.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=15),
    'KNN': KNeighborsRegressor(n_neighbors=3),
    # Tree splitting permutes features randomly, so it needs a seed to be repeatable.
    'Decision Tree': DecisionTreeRegressor(max_depth=8, random_state=RANDOM_STATE),
    'SVM': SVR(kernel='rbf', C=10000, epsilon=0.1),
    'Random Forest': RandomForestRegressor(
        n_estimators=100, random_state=RANDOM_STATE, max_samples=0.5,
        max_features=0.75, max_depth=15
    ),
    'AdaBoost': AdaBoostRegressor(
        n_estimators=15, learning_rate=1.0, random_state=RANDOM_STATE
    ),
    'Gradient Boost': GradientBoostingRegressor(
        n_estimators=100, random_state=RANDOM_STATE
    ),
    'XGBoost': XGBRegressor(
        max_depth=5, learning_rate=0.5, random_state=RANDOM_STATE
    )
}


In [23]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.base import clone

# Columns that are one-hot encoded; every other column is passed through unchanged
# (remainder='passthrough').
categorical_cols = ['Company', 'TypeName', 'Cpu brand', 'Gpu brand', 'os']

# Preprocessing template. drop='first' removes one dummy column per feature;
# sparse_output=False makes the encoder emit a dense array.
# NOTE(review): no handle_unknown is set, so a category absent from x_train will
# raise at predict time — confirm that is the intended behaviour.
step1 = ColumnTransformer(
    transformers=[
        ('col_tnf', OneHotEncoder(drop='first', sparse_output=False), categorical_cols)
    ],
    remainder='passthrough'
)

# Keep `step1` itself fitted so it stays usable on its own after this cell.
# The pipelines below do not share it: each one fits an independent clone.
step1.fit(x_train)

results = {}        # display name -> {'r2': ..., 'mae': ...}
fitted_pipes = {}   # display name -> fitted Pipeline, kept so the best can be selected

for name, model in models.items():
    # clone() yields an unfitted copy with identical parameters, so no two
    # pipelines hold (and refit) the same transformer object.
    pipe = Pipeline([
        ('transformer', clone(step1)),
        ('model', model)
    ])

    # Train on the training split, then score on the held-out split.
    pipe.fit(x_train, y_train)
    y_pred = pipe.predict(x_test)

    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    results[name] = {'r2': r2, 'mae': mae}
    fitted_pipes[name] = pipe

    print(f"{name}")
    print(f"R² Score: {r2:.4f}")
    print(f"MAE Score: {mae:.4f}")
    print("-" * 50)

# Leave `pipe` bound to the best-scoring pipeline (highest test R²) rather than to
# whichever model happened to be iterated last, so that anything persisting `pipe`
# afterwards does not depend on the ordering of `models`.
if results:
    best_model_name = max(results, key=lambda model_name: results[model_name]['r2'])
    pipe = fitted_pipes[best_model_name]


Linear Regression
R² Score: 0.7969
MAE Score: 0.2039
--------------------------------------------------
Ridge Regression
R² Score: 0.7859
MAE Score: 0.2103
--------------------------------------------------
KNN
R² Score: 0.7710
MAE Score: 0.2115
--------------------------------------------------
Decision Tree
R² Score: 0.8074
MAE Score: 0.1950
--------------------------------------------------
SVM
R² Score: 0.8248
MAE Score: 0.1901
--------------------------------------------------
Random Forest
R² Score: 0.8646
MAE Score: 0.1638
--------------------------------------------------
AdaBoost
R² Score: 0.7467
MAE Score: 0.2379
--------------------------------------------------
Gradient Boost
R² Score: 0.8455
MAE Score: 0.1783
--------------------------------------------------
XGBoost
R² Score: 0.8708
MAE Score: 0.1572
--------------------------------------------------


In [24]:
# Turn the {model name: {'r2', 'mae'}} mapping into one row per model, expose the
# model name as a regular column, and rank the rows from highest to lowest R².
results_df = (
    pd.DataFrame(results)
    .T
    .reset_index()
    .rename(columns={'index': 'Model Name'})
    .sort_values(by='r2', ascending=False)
)

# Write the ranked table to disk without the positional index.
results_df.to_csv("../model_evaluation/model_performance.csv", index=False)
print("Model evaluation complete! Saved 'model_performance.csv'.")

Model evaluation complete! Saved 'model_performance.csv'.


In [25]:
# Persist the pipeline currently bound to `pipe`. The context manager guarantees
# the file is flushed and closed even if pickling raises; the previous bare
# open(...) left the handle to be closed by garbage collection.
with open('../models/pipe.pkl', 'wb') as pipe_file:
    pickle.dump(pipe, pipe_file)

# reset_index(drop=True) replaces the row index with a fresh 0..n-1 range and
# discards the old index; it does not remove any column from the frame.
df_for_app = df.reset_index(drop=True)
with open('../models/data.pkl', 'wb') as data_file:
    pickle.dump(df_for_app, data_file)
