In [27]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor,ExtraTreesRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler # Handling categorical data
from sklearn.impute import SimpleImputer # handling missing values

In [2]:
import numpy as np
import pandas as pd

In [4]:
## Data ingestion
df = pd.read_csv("data\cleaned_data.csv")
df.head(6)

Unnamed: 0,Company,TypeName,Ram,Weight,Price,TouchScreen,IPS,PPI,Cpu_Brand,HDD,SSD,Gpu_brand,os
0,Apple,Ultrabook,8,1.37,71378.6832,0,1,226.983005,Intel Core i5,0,128,Intel,Mac
1,Apple,Ultrabook,8,1.34,47895.5232,0,0,127.67794,Intel Core i5,0,0,Intel,Mac
2,HP,Notebook,8,1.86,30636.0,0,0,141.211998,Intel Core i5,0,256,Intel,Others/No OS/Linux
3,Apple,Ultrabook,16,1.83,135195.336,0,1,220.534624,Intel Core i7,0,512,AMD,Mac
4,Apple,Ultrabook,8,1.37,96095.808,0,1,226.983005,Intel Core i5,0,256,Intel,Mac
5,Acer,Notebook,4,2.1,21312.0,0,0,100.45467,AMD Processor,500,0,AMD,Windows


In [5]:
## Independent and Dependent column
X  = df.drop(labels=['Price'], axis=1)
Y  = df['Price']

In [6]:
# Define which column should be ordinal encoded and which should be one hot encoded
numerical_columns = X.select_dtypes(exclude="object").columns
categorical_columns = X.select_dtypes(include="object").columns

In [21]:
# Numerical pipeline

num_pipeline = Pipeline(
      steps=[
          ('imputer',SimpleImputer(strategy='median')),
          ('scalar',StandardScaler(with_mean=False))
      ]

)


# Categorical pipeline

cat_pipeline = Pipeline(
      steps=[
          ('imputer',SimpleImputer(strategy='most_frequent')),
          ('ordinal',OneHotEncoder(sparse=False,drop="first")),
          ('scalar',StandardScaler(with_mean=False))
      ]

)

proccessor = ColumnTransformer([
    ('num_pipeline',num_pipeline,numerical_columns),
    ('cat_pipeline', cat_pipeline, categorical_columns)
])

In [22]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, random_state=42)

In [23]:
X_train = pd.DataFrame(proccessor.fit_transform(X_train), columns=proccessor.get_feature_names_out())
X_test = pd.DataFrame(proccessor.transform(X_test), columns= proccessor.get_feature_names_out())



In [24]:
X_train

Unnamed: 0,num_pipeline__Ram,num_pipeline__Weight,num_pipeline__TouchScreen,num_pipeline__IPS,num_pipeline__PPI,num_pipeline__HDD,num_pipeline__SSD,cat_pipeline__Company_Apple,cat_pipeline__Company_Asus,cat_pipeline__Company_Chuwi,...,cat_pipeline__TypeName_Ultrabook,cat_pipeline__TypeName_Workstation,cat_pipeline__Cpu_Brand_Intel Core i3,cat_pipeline__Cpu_Brand_Intel Core i5,cat_pipeline__Cpu_Brand_Intel Core i7,cat_pipeline__Cpu_Brand_Other Intel Processor,cat_pipeline__Gpu_brand_Intel,cat_pipeline__Gpu_brand_Nvidia,cat_pipeline__os_Others/No OS/Linux,cat_pipeline__os_Windows
0,1.571126,2.165828,0.000000,0.000000,3.702241,0.000000,1.414793,0.0,0.0,0.0,...,0.000000,0.0,0.0,2.165185,0.000000,0.00000,2.010504,0.000000,0.000000,2.868271
1,4.713377,1.999226,0.000000,0.000000,4.936322,0.000000,2.829585,0.0,0.0,0.0,...,2.733506,0.0,0.0,0.000000,2.031984,0.00000,2.010504,0.000000,0.000000,2.868271
2,1.571126,3.332044,0.000000,0.000000,3.322524,0.000000,1.414793,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,2.031984,0.00000,0.000000,0.000000,3.045341,0.000000
3,1.571126,3.983307,0.000000,0.000000,2.496695,2.411146,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.0,2.165185,0.000000,0.00000,0.000000,2.176091,0.000000,2.868271
4,3.142251,2.574761,0.000000,0.000000,3.702241,0.000000,1.414793,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,2.031984,0.00000,0.000000,2.176091,0.000000,2.868271
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
906,1.571126,3.634957,0.000000,0.000000,3.322524,0.000000,1.414793,0.0,0.0,0.0,...,0.000000,0.0,0.0,2.165185,0.000000,0.00000,0.000000,2.176091,0.000000,2.868271
907,1.571126,3.089713,0.000000,0.000000,2.363560,4.822292,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,2.031984,0.00000,2.010504,0.000000,0.000000,2.868271
908,1.571126,3.483500,0.000000,0.000000,2.363560,2.411146,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,2.031984,0.00000,0.000000,0.000000,3.045341,0.000000
909,1.571126,3.483500,0.000000,2.216893,3.322524,2.411146,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.0,2.165185,0.000000,0.00000,2.010504,0.000000,0.000000,2.868271


In [26]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    r2_square = r2_score(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))

    return mae,mse,r2_square

In [30]:
## Train multiple models
## Model Evaluation

models  = {
    "LinearRegression":LinearRegression(),
    "Lasso":Lasso(),
    "Ridge": Ridge(),
    "DecisionTreeRegressor":DecisionTreeRegressor(),
    "ElasticNet": ElasticNet(),
    "KNeighborsRegressor":KNeighborsRegressor(n_neighbors=3),
    "RandomForestRegressor":RandomForestRegressor(),
    "GradientBoostingRegressor":GradientBoostingRegressor(n_estimators=500),
    "AdaBoostRegressor":AdaBoostRegressor(n_estimators=15,learning_rate=1.0),
    "ExtraTreesRegressor":ExtraTreesRegressor(),
    "SVR":SVR(kernel='rbf',C=10000,epsilon=0.1),
    "XGBRegressor":XGBRegressor(n_estimators=45,max_depth=5,learning_rate=0.5)
    
    
}

train_model_list = []
model_list = []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_test, y_test)

    # Make Prediction

    y_pred = model.predict(X_test)

    mae, rmse, r2_square =  evaluate_model(y_test,y_pred)


    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print("Model Training Performance")
    print("mea", mae)
    print("rmse", rmse)
    print("R2 Score", r2_square)


    r2_list.append(r2_square)

    print("="*35)

    print("\n")

LinearRegression
Model Training Performance
mea 12624.135202046036
rmse 336689676.2371898
R2 Score 0.7798696235642666


Lasso
Model Training Performance
mea 12622.786345577453
rmse 336688679.93677455
R2 Score 0.7798702749533674


Ridge
Model Training Performance
mea 12617.531349563815
rmse 336699138.3930525
R2 Score 0.7798634371318365


DecisionTreeRegressor
Model Training Performance
mea 233.34187212276214
rmse 1287336.7351125702
R2 Score 0.999158329048675


ElasticNet
Model Training Performance
mea 13596.013254692969
rmse 383000583.4549626
R2 Score 0.7495911857076009


KNeighborsRegressor
Model Training Performance
mea 9674.887635805626
rmse 233425049.6832109
R2 Score 0.847384854116833


RandomForestRegressor
Model Training Performance
mea 4502.696891580852
rmse 58133250.787544996
R2 Score 0.9619920203009727


GradientBoostingRegressor
Model Training Performance
mea 3457.8131670375024
rmse 22294529.56563213
R2 Score 0.9854236600284625


AdaBoostRegressor
Model Training Performance
me