#Data Preparation

In [1]:
# Install the category_encoders library, which provides methods for encoding categorical variables.
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.4-py2.py3-none-any.whl.metadata (8.0 kB)
Downloading category_encoders-2.6.4-py2.py3-none-any.whl (82 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.0/82.0 kB[0m [31m242.3 kB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.6.4


In [13]:
# Import necessary libraries
import pandas as pd # For data manipulation and analysis
import sklearn as sk # For machine learning tools
from plotly import figure_factory # For creating interactive visualisation like heatmaps
import category_encoders as ce # For encoding categorical variables
from sklearn import decomposition # For dimensionality reduction
from sklearn import ensemble # For ensemble learning

# Load the dataset into a DataFrame
features = pd.read_csv('/content/drive/MyDrive/College/Applied Statistics & Machine Learning/CA 2/laptop_prices.csv')

# Separate the dataset into input features and the target variable ('Price_euros').
input = features.drop(['Price_euros', 'Product'], axis = 1) # Input features: all columns except 'Price_euros'
output = features['Price_euros'] # Target variable: 'Price_euros'

# Encode binary categorical columns ('Touchscreen', 'IPSpanel', 'RetinaDisplay') into numerical values (1 for 'Yes' and 0 for 'No')
input['Touchscreen'] = input['Touchscreen'].map({'Yes':1,'No':0})
input['IPSpanel'] = input['IPSpanel'].map({'Yes':1,'No':0})
input['RetinaDisplay'] = input['RetinaDisplay'].map({'Yes':1,'No':0})

# Encode ordinal categorical columns ('Screen', 'PrimaryStorageType', 'SecondaryStorageType') into numerical hierarchical values mentioned below
input['Screen'] = input['Screen'].map({'Standard' : 0,'Full HD' : 1,'Quad HD+' : 2,'4K Ultra HD' : 3})
input['PrimaryStorageType'] = input['PrimaryStorageType'].map({'Flash Storage' : 0,'HDD' : 1,'Hybrid' : 2,'SSD' : 3})
input['SecondaryStorageType'] = input['SecondaryStorageType'].map({'No' : 0,'HDD' : 1,'Hybrid' : 2,'SSD' : 3})

# use Target Encoding for the 'Company' column to convert categories into numerical values based on target mean.
te_input = ce.TargetEncoder(cols=['Company', 'TypeName', 'OS', 'CPU_model', 'GPU_model']).fit(input,output) # Fit TargetEnoder on input and output
te_input2 = te_input.transform(input) # Transform the 'Company' column into target-encoded values

'''
Perform one-hot encoding for remaining categorical columns: 'TypeName', 'Gpu_brand', and 'Os'.
This creates binary columns for each unique category, excluding the first category to avoid multicollinearity.
'''
after_te_input = pd.get_dummies(te_input2,['CPU_company','GPU_company'], dtype=int, drop_first=True)

# Calculate the correlation matrix for all input features.
corr_input  = after_te_input.corr()

# Create and display an annotated heatmap to visualise correlations between features.
f = figure_factory.create_annotated_heatmap(corr_input.values,list(corr_input.columns),list(corr_input.columns),corr_input.round(2).values,showscale=True)
f.show()

In [3]:
# Based on heatmap analysis, drop the 'Inches','ScreenH', 'ScreenW', 'SecondaryStorageType', 'GPU_company_ARM' column die to high correlation and high causation.
after_hm_input = after_te_input.drop(['Inches','ScreenH', 'ScreenW', 'SecondaryStorageType', 'GPU_company_ARM'], axis = 1)

# Extract numerical columns ('Ram', 'Weight', 'CPU_freq', 'PrimaryStorage', 'SecondaryStorage') for applying PCA.
num_cols = after_hm_input[['Ram', 'Weight', 'CPU_freq', 'PrimaryStorage', 'SecondaryStorage']]

# Perform Principal Component Analysis (PCA) to reduce the dimensionality of numerical features.
pca_model = decomposition.PCA(n_components = 2) # Retain only 1 principal component
pca_model.fit(num_cols) # Fit the PCA model on numerical columns

# Transform numerical features into the single principal component.
pca_num_cols = pca_model.transform(num_cols)

# Convert the PCA output into a DataFram for easy integration with other features.
pca_num_cols2 = pd.DataFrame(pca_num_cols, columns=['PCA1', 'PCA2'])

# Display the explained variance ratio for the retained components
print(f"Variance from the individual n_components: {pca_model.explained_variance_ratio_}")

# Display the total variance explained by all components (useful for analysis when n_components > 1).
print(f"Total variance from all the n_components: {sum(pca_model.explained_variance_ratio_)}")

Variance from the individual n_components: [0.63879085 0.3611376 ]
Total variance from all the n_components: 0.9999284463313114


In [4]:
# Remove the original numerical columns from the dataset as they are now represented by the PCA component.
after_hm_input2 = after_hm_input.drop(columns=num_cols)

# Concatenate the processed input features (after dropping 'num_cols') with the PCA component.
pca_input = pd.concat([after_hm_input2, pca_num_cols2], axis=1)

# Standardise the input features to have zero mean ad unit variance, whic is necessary for machine learning models.
pca_input_scaled = sk.preprocessing.StandardScaler().fit_transform(pca_input)

#Linear Regression

In [5]:
# Initialise a Stochastic Gradient Descent (SGD) Regressor for Linear Regression without regularisation.
linear_model = sk.linear_model.SGDRegressor(random_state = 1, penalty = None)
'''
'random_state = 1': Ensures reproducibility by setting a fixed random seed.
'penalty = None': No regularisation is applied (pure Linear Regression).
'''

#Define a hyperparameter grid for tuning the model.
model_hypm = {'eta0': [.00001, .0001, .001, .01, .1], 'max_iter':[50, 100, 150, 175, 200]}

# Define the number of cross-validation splits.
linear_model_cv = 10 # Perform 10-fold cross-validation to ensure robust evaluation.

# Perform GridSearchCV to find the best hyperparameters for the Linear Regression model.
linear_grid = sk.model_selection.GridSearchCV(estimator=linear_model, param_grid=model_hypm, scoring='r2', cv = linear_model_cv)
linear_grid.fit(pca_input_scaled, output) # Train the model using scaled input features and target variable.

# Retrieve and print the best hyperparameters found during GridSearchCV
linear_best_param = linear_grid.best_params_
print(f"\nThe Best Parameters Using Linear Regressor for this Dataset is {linear_best_param}")

# Retrieve and print the best R-squared score obtained using the optimal hyperparameters.
linear_best_result = linear_grid.best_score_
print(f"\nThe Best R-squared Score Using on the Above Best Parameters is {round(linear_best_result*100, 2)}%")

# Retrieve the best Linear Regression model trained during GridSearchCV.
linear_best_model = linear_grid.best_estimator_
print(f"\nThe Intercept β0 Using on the Best Parameters is {linear_best_model}")

# Calculate Adjusted R-squared to account for the number of features in the model.
linear_row, linear_col=pca_input_scaled.shape # Get the niumber of rows and columns in the dataset.
linear_modified_r2 = 1-(1-linear_best_result)*((linear_model_cv-1)/linear_model_cv*linear_row-1)/((linear_model_cv-1)/linear_model_cv*linear_row-linear_col-1)
print(f"\nAfter Modification the Best R-squared Score is {round(linear_modified_r2*100, 2)}%\n")

# Display the coefficients of the best model alongside feature names for interpretation
print(pd.DataFrame(zip(input.columns, linear_best_model.coef_), columns=['Features','Coefficients']).sort_values(by=['Coefficients'],ascending=False))


Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.


Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.


Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.


Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.


Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.


Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.


Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.


Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.


Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.


Maximum number of iteration reached 


The Best Parameters Using Linear Regressor for this Dataset is {'eta0': 0.001, 'max_iter': 150}

The Best R-squared Score Using on the Above Best Parameters is 65.07%

The Intercept β0 Using on the Best Parameters is SGDRegressor(eta0=0.001, max_iter=150, penalty=None, random_state=1)

After Modification the Best R-squared Score is 64.71%

          Features  Coefficients
9      Touchscreen    199.224430
7          ScreenW    183.963863
1         TypeName    130.990792
8          ScreenH    116.045799
3              Ram     95.456896
2           Inches     95.385744
10        IPSpanel     53.517813
4               OS     51.261037
0          Company     36.695171
6           Screen     32.804170
12     CPU_company     10.578547
15  PrimaryStorage      9.581183
5           Weight      7.910388
11   RetinaDisplay      5.068118
13        CPU_freq    -19.287328
14       CPU_model    -68.325861


#Linear Regression with Regularisation

In [6]:
# Initialise an SGD Regressor for Linear Regression with ElasticNet regularisation.
linear_reg_model = sk.linear_model.SGDRegressor(random_state = 1, penalty = 'elasticnet')

# Define a hyperparameter grid for tuning the ElasticNet model.
linear_reg_model_hypm = {'eta0': [.0001, .001, .01, .1, 1], 'max_iter':[5, 10, 50, 75, 100],'alpha': [.01, .1, 1, 10, 100], 'l1_ratio': [0,0.25,0.5,0.75,1]}

# Define the number of cross-validation splits.
linear_reg_model_cv = 10

# Perform GridSearchCV to find the best hyperparameters for the ElasticNet model.
linear_reg_model_grid = sk.model_selection.GridSearchCV(estimator=linear_reg_model, param_grid=linear_reg_model_hypm, scoring='r2', cv=linear_reg_model_cv)
linear_reg_model_grid.fit(pca_input_scaled, output) # Train the model using scaled input features and target variable.

# Retrieve and print the best hyperparameters found during GridSearchCV.
linear_reg_best_param = linear_reg_model_grid.best_params_
print(f"\nThe Best Parameters using Linear Regression for this Dataset is {linear_reg_best_param}")

# Retrieve and print the best R-squared obtained using the optimal hyperparameters.
linear_reg_best_result = linear_reg_model_grid.best_score_
print(f"\nThe Best R-squared Score Using on the Best Parameters is {round(linear_reg_best_result*100, 2)}%")

# Retrieve the best Linear Regression model trained during GridSearchCV.
linear_reg_best_model = linear_reg_model_grid.best_estimator_

# Display the intercept (β0) of the best Elasticnet model.
print("Intercept β0: ", linear_reg_best_model.intercept_)
print(f"\nThe Intercept β0 Using on the Best Parameters is {linear_reg_best_model.intercept_}")

# Calculate Adjusted R-squared to account for the number of features in the model.
linear_reg_row, linear_reg_col=pca_input_scaled.shape # Get the number of rows and columns in the dataset.
linear_reg_modified_r2 = 1-(1-linear_reg_best_result)*((linear_reg_model_cv-1)/linear_reg_model_cv*linear_reg_row-1)/((linear_reg_model_cv-1)/linear_reg_model_cv*linear_reg_row-linear_reg_col-1)
print(f"\nAfter Modification the Best R-squared Score is {round(linear_reg_modified_r2*100, 2)}%")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.


Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.


Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.


Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.


Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.


Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.


Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.


Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.


Maximum number of iteration reached before convergence. Consider increasing ma


The Best Parameters using Linear Regression for this Dataset is {'alpha': 1, 'eta0': 0.01, 'l1_ratio': 1, 'max_iter': 50}

The Best R-squared Score Using on the Best Parameters is 65.33%
Intercept β0:  [1145.34610723]

The Intercept β0 Using on the Best Parameters is [1145.34610723]

After Modification the Best R-squared Score is 64.97%



Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.


Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.



#Random Forest Regressor

In [7]:
# Initialise the Random Forest Regressor model.
rf_model = ensemble.RandomForestRegressor(criterion='squared_error', max_features='sqrt', random_state=42)

# Define a hyperparamenter grid for tuning the number of trees in the forest.
rf_model_trees = {'n_estimators': [250, 275, 300, 325, 350]} # Number of decision trees in the forest.

# Perform GridSearchCV to find the best hyperparameters for the Random Forest model.
rf_model_cv = 10 # Use 19-fold cross-validation to evaluate the model.
rf_model_grid = sk.model_selection.GridSearchCV(estimator=rf_model, param_grid=rf_model_trees, scoring='r2', cv=rf_model_cv)
rf_model_grid.fit(pca_input_scaled, output) # Train the model using scaled input features and target variable.

# Retrieve and print the best number of trees found during GridSearchCV.
rf_best_param = rf_model_grid.best_params_
print(f"\nThe Best Number of Trees in Random Forest Regressor for this Dataset is {rf_best_param}")

# Retrieve and print the best R-squared score obtained using the optimal number of trees.
rf_best_result = rf_model_grid.best_score_
print(f"\nThe Best R-squared Score using the Number of Trees is {round(rf_best_result*100, 2)}%")

# Calculate Adjusted R-squared to account for the number of features in the model.
rf_row, rf_col=pca_input_scaled.shape # Get the number of rows and columns in the dataset.
rf_modified_r2 = 1-(1-rf_best_result)*((rf_model_cv-1)/rf_model_cv*rf_row-1)/((rf_model_cv-1)/rf_model_cv*rf_row-rf_col-1)
print(f"\nAfter Modification the Best R-squared Score is {round(rf_modified_r2*100, 2)}%")

# Extract and display the feature importance scores for the best Random Forest model.
rf_imp_feat = pd.Series(rf_model_grid.best_estimator_.feature_importances_, index=list(pca_input)).sort_values(ascending=False) # Getting feature importances list for the best model
print(f"\nThe Important Features are\n{rf_imp_feat}")


The Best Number of Trees in Random Forest Regressor for this Dataset is {'n_estimators': 300}

The Best R-squared Score using the Number of Trees is 90.29%

After Modification the Best R-squared Score is 90.19%

The Important Features are
CPU_model              0.233991
GPU_model              0.228891
TypeName               0.118037
PCA2                   0.092268
PCA1                   0.084535
PrimaryStorageType     0.072730
Screen                 0.054352
Company                0.042267
OS                     0.023990
Touchscreen            0.013829
GPU_company_Nvidia     0.013140
IPSpanel               0.010735
GPU_company_Intel      0.008933
CPU_company_Intel      0.001625
RetinaDisplay          0.000665
CPU_company_Samsung    0.000011
dtype: float64


In [8]:
# Create a new dataset using only the most important features from the previous Random Forest model.
new_pca_input = pca_input[['GPU_model', 'CPU_model', 'TypeName', 'PCA1', 'PCA2', 'PrimaryStorageType', 'Screen']]

# Scaled the new dataset to standardise feature values.
new_pca_input_scaled = sk.preprocessing.StandardScaler().fit_transform(new_pca_input)

# Initialise a new Random Forest Regressor model for the reduced feature set.
rf_imp_feat_model = sk.ensemble.RandomForestRegressor(criterion='squared_error', max_features='sqrt', random_state=42)

# Define a hyperparameter grid for tuning the number of trees in the forest.
rf_imp_feat_model_trees = {'n_estimators': [550, 575, 600, 625, 650]} # Fine-tune the number of decision trees.

# Perform GridSearchCV to find the best hyperparameters for the updated Random Forest model.
rf_imp_feat_model_cv = 10 # Use 10-fold cross-validation for robust evaluation.
rf_imp_feat_model_grid = sk.model_selection.GridSearchCV(estimator=rf_imp_feat_model, param_grid=rf_imp_feat_model_trees, scoring='r2', cv=rf_imp_feat_model_cv)
rf_imp_feat_model_grid.fit(new_pca_input_scaled, output) # Train the model using scaled input features and target variable.

# Retrieve and print the best number of trees found during GridSearchCV for the reduced feature set.
rf_imp_feat_model_best_parameters = rf_imp_feat_model_grid.best_params_
print(f"\nThe Best Number of Trees in Random Forest Regressor using the Important Feature is {rf_imp_feat_model_best_parameters}")

# Retrieve and print the best R-squared score obtained using the optimal number of trees.
rf_imp_feat_model_best_result = rf_imp_feat_model_grid.best_score_
print(f"\nThe Best R-squared Score Using on the Above Number of Trees is {round(rf_imp_feat_model_best_result*100, 2)}%")

# Calculate Adjusted R-squared for the reduced feature set.
rf_imp_feat_row, rf_imp_feat_col=new_pca_input_scaled.shape
rf_imp_feat_modified_r2 = 1-(1-rf_imp_feat_model_best_result)*((rf_imp_feat_model_cv-1)/rf_imp_feat_model_cv*rf_imp_feat_row-1)/((rf_imp_feat_model_cv-1)/rf_imp_feat_model_cv*rf_imp_feat_row-rf_imp_feat_col-1)
print(f"\nAfter Modification the Best R-squared Score is {round(rf_imp_feat_modified_r2*100, 2)}%")


The Best Number of Trees in Random Forest Regressor using the Important Feature is {'n_estimators': 600}

The Best R-squared Score Using on the Above Number of Trees is 88.29%

After Modification the Best R-squared Score is 88.24%


#Support Vector Regressor

In [9]:
# Initialise the Support Vector Regressor (SVR) model.
svr_model = sk.svm.SVR(max_iter=100000)
# The default SVR parameters will be overridden during hyperparameter tuning with GridSearchCV.

# Define the number of cross-validation folds.
svr_model_cv=5 # Use 5-fold cross-validation for robust evaluation of the model.

# Define a hyperparameter grid for tuning the SVR model.
svr_model_hypm = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'C': [4500, 5000, 5500, 6000, 6500], 'epsilon': [1, 10, 100, 1000, 5000]}

# Perform GridSearchCV to find the best hyperparameters for the SVR model.
svr_model_grid = sk.model_selection.GridSearchCV(estimator=svr_model, param_grid=svr_model_hypm, scoring='r2', cv=svr_model_cv)
svr_model_grid.fit(new_pca_input_scaled, output) # Train the model using scaled input features and the target variable.

# Retrieve and print the best hyperparameters found dring GridSearchCV.
svr_best_param = svr_model_grid.best_params_
print(f"\nThe Best Parameters Using Support Vector Regressor for this Dataset is {svr_best_param}")

# Retrieve and print the best R-squared score obtained using the optimal hyperparameters.
svr_best_result = svr_model_grid.best_score_
print(f"\nThe Best R-squared Score Using on the Above Best Parameters is {round(svr_best_result*100, 2)}%")

# Calculate Adjusted R-squared to account for the number of features in the dataset.
svr_row, svr_col=new_pca_input_scaled.shape # Get the number of rows and columns in the scaled dataset.
svr_modified_r2 = 1-(1-svr_best_result)*((svr_model_cv-1)/svr_model_cv*svr_row-1)/((svr_model_cv-1)/svr_model_cv*svr_row-svr_col-1)
print(f"\nAfter Modification the Best R-squared Score is {round(svr_modified_r2*100, 2)}%")


Solver terminated early (max_iter=100000).  Consider pre-processing your data with StandardScaler or MinMaxScaler.


Solver terminated early (max_iter=100000).  Consider pre-processing your data with StandardScaler or MinMaxScaler.


Solver terminated early (max_iter=100000).  Consider pre-processing your data with StandardScaler or MinMaxScaler.


Solver terminated early (max_iter=100000).  Consider pre-processing your data with StandardScaler or MinMaxScaler.


Solver terminated early (max_iter=100000).  Consider pre-processing your data with StandardScaler or MinMaxScaler.


Solver terminated early (max_iter=100000).  Consider pre-processing your data with StandardScaler or MinMaxScaler.


Solver terminated early (max_iter=100000).  Consider pre-processing your data with StandardScaler or MinMaxScaler.


Solver terminated early (max_iter=100000).  Consider pre-processing your data with StandardScaler or MinMaxScaler.


Solver terminated early (max_iter=100000).  Consider pre-proces


The Best Parameters Using Support Vector Regressor for this Dataset is {'C': 5500, 'epsilon': 100, 'kernel': 'rbf'}

The Best R-squared Score Using on the Above Best Parameters is 77.79%

After Modification the Best R-squared Score is 77.67%


#Evaluation

In [10]:
# Print the modified R-squared scores for Linear Regression without regularisation.
print(f"\nLinear Regression Modified Best R-squared Score: {round(linear_modified_r2*100, 2)}%")
'''
'linear_modified_r2': Adjusted R-squared score for the Linear Regression model without regularisation.
The score reflects how well the model explains the variance in the target variable, adjusted for the number of features used.
'''

# Print the modified R-squared scores for Linear Regression with ElasticNet regularisation.
print(f"\nLinear Regression With Regularisation Modified Best R-squared Score: {round(linear_reg_modified_r2*100, 2)}%")
'''
'linear_reg_modified_r2': Adjusted R-squared score for the Linear Regression model with ElasticNet regularisation.
Regularisation prevents overfitting by penalising large coefficients, especially useful for noisy datasets.
'''

# Print the modified R-squared scores for Random Forest Regressor.
print(f"\nRandom Forest Regression Best R-squared Score: {round(rf_imp_feat_modified_r2*100, 2)}%")
'''
'rf_modified_r2': Adjusted R-squared score for the Random Forest model, which aggregates prediction from multiple decision trees.
This model is often effective in capturing complex, non-linear relationships.
'''

# Print the modified R-squared scored for Support Vector Regressor.
print(f"\nSupport Vector Regression Best R-squared Score: {round(svr_modified_r2*100, 2)}%")
'''
'svr_modified_r2': Adjused R-squared scores for the Support Vector Regressor, a powerful model for handling both linear and non-linear patterns.
'''

# Find the model with the highest adjusted R-squared score.
recommendation = max(linear_modified_r2, linear_reg_modified_r2, rf_imp_feat_modified_r2, svr_modified_r2)

# Recommend the model with the highest adjusted R-squared score.
if recommendation == linear_modified_r2:
  print(f"\nRecommendation: The Linear Regression model is preferred for deployment based on higher adjusted R-squared score.")
  # If the Linear Regression model without regularisation has the highest score, it is recommended for deployment.
  best_model = linear_best_model.best_estimator_

elif recommendation == linear_reg_modified_r2:
  print(f"\nRecommendation: The Linear Regression With Regularisation model is preferred for deployment based on higher adjusted R-squared score.")
  # If the Linear Regression model with ElasticNet regularisation has the highest score, it is recommended for deployment.
  best_model = linear_reg_best_model.best_estimator_

elif recommendation == rf_imp_feat_modified_r2:
  print(f"\nRecommendation: The Random Forest Regression model is preferred for deployment based on higher adjusted R-squared score.")
  # If the Random Forest model has the highest score, it is recommended for deployment.
  best_model = rf_imp_feat_model_grid.best_estimator_

else:
  print(f"\nRecommendation: The Support Vector Regression model is preferred for deployment based on higher adjusted R-squared score.")
  # If the Support Vector model has the highest score, it is recommended for deployment.
  best_model = svr_model_grid.best_estimator_


Linear Regression Modified Best R-squared Score: 64.71%

Linear Regression With Regularisation Modified Best R-squared Score: 64.97%

Random Forest Regression Best R-squared Score: 88.24%

Support Vector Regression Best R-squared Score: 77.67%

Recommendation: The Random Forest Regression model is preferred for deployment based on higher adjusted R-squared score.


#Best Model Implementation

In [11]:
import joblib

preprocessor = sk.compose.ColumnTransformer(transformers=[('binary', 'passthrough', ['Touchscreen', 'IPSpanel', 'RetinaDisplay']),
                                                          ('ordinal', 'passthrough', ['Screen', 'PrimaryStorageType', 'SecondaryStorageType']),
                                                          ('target', ce.TargetEncoder(cols=['Company', 'TypeName', 'OS', 'CPU_model', 'GPU_model']), ['Company', 'TypeName', 'OS', 'CPU_model', 'GPU_model']),
                                                          ['one_hot', sk.preprocessing.OneHotEncoder(drop='first'), ['CPU_company', 'GPU_company']],
                                                          ('scaler', sk.preprocessing.StandardScaler(), ['Inches', 'Ram', 'Weight', 'ScreenW', 'ScreenH', 'CPU_freq', 'PrimaryStorage', 'SecondaryStorage'])])

pipeline = sk.pipeline.Pipeline(steps=[('preprocessor', preprocessor),
                                       ('model', best_model)])

pipeline.fit(input, output)

joblib.dump(pipeline, 'pc_price_prediction_model.pkl')

print(input.columns.tolist())

['Company', 'TypeName', 'Inches', 'Ram', 'OS', 'Weight', 'Screen', 'ScreenW', 'ScreenH', 'Touchscreen', 'IPSpanel', 'RetinaDisplay', 'CPU_company', 'CPU_freq', 'CPU_model', 'PrimaryStorage', 'SecondaryStorage', 'PrimaryStorageType', 'SecondaryStorageType', 'GPU_company', 'GPU_model']


In [12]:
pc_price_prediction_model = joblib.load('pc_price_prediction_model.pkl')

user_input = [['Apple', 'Ultrabook', 16, 32, 'MacOS', 1.4, 3, 3072, 1920, 1, 1, 1, 'Intel', 3.1, 'Core i7', 512, 0, 3, 0, 'AMD', 'Radeon Pro 560']]

user_input_df = pd.DataFrame(user_input, columns=input.columns.tolist())

prediction = pc_price_prediction_model.predict(user_input_df)
print(f'Predicted Price: €{round(prediction[0], 2)}')

Predicted Price: €3137.12
