# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [2]:
data = pd.read_csv("laptopPrice.csv")

In [3]:
data.head()

Unnamed: 0,brand,processor_brand,processor_name,processor_gnrtn,ram_gb,ram_type,ssd,hdd,os,os_bit,graphic_card_gb,weight,warranty,Touchscreen,msoffice,Price,rating,Number of Ratings,Number of Reviews
0,ASUS,Intel,Core i3,10th,4 GB,DDR4,0 GB,1024 GB,Windows,64-bit,0 GB,Casual,No warranty,No,No,34649,2 stars,3,0
1,Lenovo,Intel,Core i3,10th,4 GB,DDR4,0 GB,1024 GB,Windows,64-bit,0 GB,Casual,No warranty,No,No,38999,3 stars,65,5
2,Lenovo,Intel,Core i3,10th,4 GB,DDR4,0 GB,1024 GB,Windows,64-bit,0 GB,Casual,No warranty,No,No,39999,3 stars,8,1
3,ASUS,Intel,Core i5,10th,8 GB,DDR4,512 GB,0 GB,Windows,32-bit,2 GB,Casual,No warranty,No,No,69990,3 stars,0,0
4,ASUS,Intel,Celeron Dual,Not Available,4 GB,DDR4,0 GB,512 GB,Windows,64-bit,0 GB,Casual,No warranty,No,No,26990,3 stars,0,0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 823 entries, 0 to 822
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   brand              823 non-null    object
 1   processor_brand    823 non-null    object
 2   processor_name     823 non-null    object
 3   processor_gnrtn    823 non-null    object
 4   ram_gb             823 non-null    object
 5   ram_type           823 non-null    object
 6   ssd                823 non-null    object
 7   hdd                823 non-null    object
 8   os                 823 non-null    object
 9   os_bit             823 non-null    object
 10  graphic_card_gb    823 non-null    object
 11  weight             823 non-null    object
 12  warranty           823 non-null    object
 13  Touchscreen        823 non-null    object
 14  msoffice           823 non-null    object
 15  Price              823 non-null    int64 
 16  rating             823 non-null    object
 1

# Data Exploration:

In [5]:
data.drop(columns=['Number of Ratings', 'Number of Reviews', 'rating'], inplace=True)

In [6]:
data['ram_gb'] = data['ram_gb'].str.replace('GB', '').astype(int)
data['ssd'] = data['ssd'].str.replace('GB', '').astype(int)
data['hdd'] = data['hdd'].str.replace('GB', '').astype(int)
data['graphic_card_gb'] = data['graphic_card_gb'].str.replace('GB', '').astype(int)

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 823 entries, 0 to 822
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   brand            823 non-null    object
 1   processor_brand  823 non-null    object
 2   processor_name   823 non-null    object
 3   processor_gnrtn  823 non-null    object
 4   ram_gb           823 non-null    int32 
 5   ram_type         823 non-null    object
 6   ssd              823 non-null    int32 
 7   hdd              823 non-null    int32 
 8   os               823 non-null    object
 9   os_bit           823 non-null    object
 10  graphic_card_gb  823 non-null    int32 
 11  weight           823 non-null    object
 12  warranty         823 non-null    object
 13  Touchscreen      823 non-null    object
 14  msoffice         823 non-null    object
 15  Price            823 non-null    int64 
dtypes: int32(4), int64(1), object(11)
memory usage: 90.1+ KB


In [8]:
data.isnull().sum()

brand              0
processor_brand    0
processor_name     0
processor_gnrtn    0
ram_gb             0
ram_type           0
ssd                0
hdd                0
os                 0
os_bit             0
graphic_card_gb    0
weight             0
warranty           0
Touchscreen        0
msoffice           0
Price              0
dtype: int64

# Feature Engineering:

In [49]:
categorical_features = ['brand', 'processor_brand', 'processor_name', 'ram_type', 'os', 'os_bit', 'warranty']

In [50]:
numeric_features = ['ram_gb', 'ssd', 'hdd', 'graphic_card_gb']

In [51]:
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [52]:
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

In [53]:
rf_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))])

# Model Training:

In [54]:
X = data.drop(columns=['Price'])
y = data['Price']

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [56]:
rf_pipeline.fit(X_train, y_train)

In [57]:
rf_predictions = rf_pipeline.predict(X_test)

In [58]:
rf_predictions

array([118114.43333333,  42097.86984127,  39656.18233333,  42044.53      ,
        73242.41566667,  74642.23647154, 160160.95238095,  44231.0534743 ,
        58187.51388961,  56043.36183333, 153711.78809524,  60941.96587302,
        86603.33333333,  49286.625     ,  56083.72571429,  49036.05396825,
        43248.29388889,  61864.35333333,  56541.53928571,  71596.7518872 ,
        96214.85790043,  63792.29370635,  67956.83352481,  55776.00904762,
        70159.28583333,  65664.93333333,  53522.76333333,  65137.01469272,
        39145.03333333,  41758.20174603,  71596.7518872 ,  92737.81212121,
        60252.18666667,  42299.78452381,  68022.49693074, 143226.14333333,
        42722.65266667,  38964.37464286, 125937.3675    ,  37211.91383333,
        66439.5939881 ,  44752.99452381,  63954.91833333, 161180.45      ,
        57143.78028752,  36439.47666667,  65137.01469272,  56380.29906025,
        37034.25310606,  82246.19      , 121006.07552381, 143226.14333333,
        74642.23647154,  

In [59]:
X_train.head(1)

Unnamed: 0,brand,processor_brand,processor_name,processor_gnrtn,ram_gb,ram_type,ssd,hdd,os,os_bit,graphic_card_gb,weight,warranty,Touchscreen,msoffice
239,ASUS,Intel,Core i5,11th,16,DDR4,512,0,Windows,64-bit,0,Casual,1 year,No,No


In [71]:
input_df = pd.DataFrame(
[['Lenovo','AMD','Core i5','12th',16,'DDR4',256,0,'Windows','64-bit',64,'ThinNlight','No','No','Yes']],
columns=['brand','processor_brand','processor_name','processor_gnrtn','ram_gb','ram_type','ssd','hdd','os','os_bit','graphic_card_gb','weight','warranty','Touchscreen','msoffice'])

In [72]:
input_df

Unnamed: 0,brand,processor_brand,processor_name,processor_gnrtn,ram_gb,ram_type,ssd,hdd,os,os_bit,graphic_card_gb,weight,warranty,Touchscreen,msoffice
0,Lenovo,AMD,Core i5,12th,16,DDR4,256,0,Windows,64-bit,64,ThinNlight,No,No,Yes


In [73]:
 rf_pipeline.predict(input_df)

array([81370.06666667])

# Model Evaluation:

In [74]:
rf_mae = mean_absolute_error(y_test, rf_predictions)

In [75]:
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_predictions))

In [76]:
print(f'Random Forest MAE: {rf_mae}')
print(f'Random Forest RMSE: {rf_rmse}')

Random Forest MAE: 15630.469038239387
Random Forest RMSE: 26423.818614490887


In [77]:
def cross_validate_rmse(model, X, y, cv=5):
    kf = KFold(n_splits=cv, shuffle=True, random_state=42)
    rmse_scores = []

    for train_index, val_index in kf.split(X):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        rmse_scores.append(rmse)
    
    return rmse_scores


In [78]:
cv_scores_rf = cross_validate_rmse(rf_pipeline, X, y)
print(f'Random Forest Cross-Validation RMSE Scores: {cv_scores_rf}')
print(f'Mean Random Forest RMSE: {np.mean(cv_scores_rf)}')

Random Forest Cross-Validation RMSE Scores: [26859.50554894225, 18961.245580170806, 20032.43239747986, 30033.3434552938, 22520.70928450653]
Mean Random Forest RMSE: 23681.44725327865
