In [76]:
import numpy as np
import pandas as pd

In [77]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score,mean_absolute_error

In [78]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor,ExtraTreesRegressor, VotingRegressor, StackingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

In [79]:
df = pd.read_csv("data/Final_Dataset.csv")
df

Unnamed: 0,Company,TypeName,Ram,Weight,Price,Touchscreen,Ips,Ppi,Cpu brand,HDD,SSD,Gpu brand,Os
0,Apple,Ultrabook,8,1.37,71378,0,1,226.983005,Intel Core i5,0,128,Intel,Mac
1,Apple,Ultrabook,8,1.34,47895,0,0,127.677940,Intel Core i5,0,0,Intel,Mac
2,HP,Notebook,8,1.86,30636,0,0,141.211998,Intel Core i5,0,256,Intel,Others/No OS/Linux
3,Apple,Ultrabook,16,1.83,135195,0,1,220.534624,Intel Core i7,0,512,AMD,Mac
4,Apple,Ultrabook,8,1.37,96095,0,1,226.983005,Intel Core i5,0,256,Intel,Mac
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1297,Lenovo,2 in 1 Convertible,4,1.80,33992,1,1,157.350512,Intel Core i7,0,128,Intel,Windows
1298,Lenovo,2 in 1 Convertible,16,1.30,79866,1,1,276.053530,Intel Core i7,0,512,Intel,Windows
1299,Lenovo,Notebook,2,1.50,12201,0,0,111.935204,Other Intel Processor,0,0,Intel,Windows
1300,HP,Notebook,6,2.19,40705,0,0,100.454670,Intel Core i7,1000,0,AMD,Windows


In [80]:
X = df.drop(columns=['Price'])
y = np.log(df['Price'])

In [81]:
X.head(5)

Unnamed: 0,Company,TypeName,Ram,Weight,Touchscreen,Ips,Ppi,Cpu brand,HDD,SSD,Gpu brand,Os
0,Apple,Ultrabook,8,1.37,0,1,226.983005,Intel Core i5,0,128,Intel,Mac
1,Apple,Ultrabook,8,1.34,0,0,127.67794,Intel Core i5,0,0,Intel,Mac
2,HP,Notebook,8,1.86,0,0,141.211998,Intel Core i5,0,256,Intel,Others/No OS/Linux
3,Apple,Ultrabook,16,1.83,0,1,220.534624,Intel Core i7,0,512,AMD,Mac
4,Apple,Ultrabook,8,1.37,0,1,226.983005,Intel Core i5,0,256,Intel,Mac


In [82]:
y.head(5)

0    11.175745
1    10.776766
2    10.329931
3    11.814473
4    11.473093
Name: Price, dtype: float64

In [83]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.15,random_state=2)

In [84]:
def save_result(result):
    with open('result.txt', 'a') as file:  # Open in append mode
        file.write(result + '\n')  # Write result with a new line


# Linear Regression

In [85]:
# Define the preprocessing step
step1 = ColumnTransformer(transformers=[
    ('col_tnf', OneHotEncoder(sparse_output=False, drop='first'), [0,1,7,10,11])
], remainder='passthrough')

# Define the model
step2 = LinearRegression()

# Create a pipeline
pipe = Pipeline([('step1', step1), ('step2', step2)])

# Train the model
pipe.fit(X_train, y_train)

# Make predictions
y_pred = pipe.predict(X_test)

# Calculate metrics
r2_value = 'r2_score = {:.4f}\n'.format(r2_score(y_test, y_pred))  # Avoid overriding function name
mae_value = 'mae = {:.4f}\n'.format(mean_absolute_error(y_test, y_pred))

# Print results
print(r2_value)
print(mae_value)

# Save results
save_result('LinearRegression:\n' + r2_value + mae_value)


r2_score = 0.8073

mae = 0.2102



# Ridge Regression

In [86]:
# Define the preprocessing step
step1 = ColumnTransformer(transformers=[
    ('col_tnf', OneHotEncoder(sparse_output=False, drop='first'), [0,1,7,10,11])  # One-hot encode categorical columns
], remainder='passthrough')

# Define the Ridge regression model with alpha=10
step2 = Ridge(alpha=10)

# Create a pipeline
pipe = Pipeline([('step1', step1), ('step2', step2)])

# Train the model
pipe.fit(X_train, y_train)

# Make predictions
y_pred = pipe.predict(X_test)

# Calculate metrics
r2_value = 'r2_score = {:.4f}\n'.format(r2_score(y_test, y_pred))  # Compute R-squared score
mae_value = 'mae = {:.4f}\n'.format(mean_absolute_error(y_test, y_pred))  # Compute Mean Absolute Error

# Print results
print(r2_value)
print(mae_value)

# Save results
save_result('Ridge Regression:\n' + r2_value + mae_value)


r2_score = 0.8127

mae = 0.2093



# Lasso Regression

In [87]:
# Define the preprocessing step
step1 = ColumnTransformer(transformers=[
    ('col_tnf', OneHotEncoder(sparse_output=False, drop='first'), [0,1,7,10,11])
], remainder='passthrough')

# Define the model
step2 = Lasso(alpha=0.001)

# Create a pipeline
pipe = Pipeline([('step1', step1), ('step2', step2)])

# Train the model
pipe.fit(X_train, y_train)

# Make predictions
y_pred = pipe.predict(X_test)

# Calculate metrics
r2_value = 'r2_score = {:.4f}\n'.format(r2_score(y_test, y_pred))
mae_value = 'mae = {:.4f}\n'.format(mean_absolute_error(y_test, y_pred))

# Print results
print(r2_value)
print(mae_value)

# Save results
save_result('Lasso Regression:\n' + r2_value + mae_value)

r2_score = 0.8072

mae = 0.2111



# KNN

In [88]:
# Define the preprocessing step
step1 = ColumnTransformer(transformers=[
    ('col_tnf', OneHotEncoder(sparse_output=False, drop='first'), [0,1,7,10,11])
], remainder='passthrough')

# Define the model
step2 = KNeighborsRegressor(n_neighbors=5)

# Create a pipeline
pipe = Pipeline([('step1', step1), ('step2', step2)])

# Train the model
pipe.fit(X_train, y_train)

# Make predictions
y_pred = pipe.predict(X_test)

# Calculate metrics
r2_value = 'r2_score = {:.4f}\n'.format(r2_score(y_test, y_pred))
mae_value = 'mae = {:.4f}\n'.format(mean_absolute_error(y_test, y_pred))

# Print results
print(r2_value)
print(mae_value)

# Save results
save_result('KNN Regression:\n' + r2_value + mae_value)

r2_score = 0.8045

mae = 0.1991



# Decision Tree

In [89]:
# Define the preprocessing step
step1 = ColumnTransformer(transformers=[
    ('col_tnf', OneHotEncoder(sparse_output=False, drop='first'), [0,1,7,10,11])
], remainder='passthrough')

# Define the model
step2 = DecisionTreeRegressor(random_state=42)

# Create a pipeline
pipe = Pipeline([('step1', step1), ('step2', step2)])

# Train the model
pipe.fit(X_train, y_train)

# Make predictions
y_pred = pipe.predict(X_test)

# Calculate metrics
r2_value = 'r2_score = {:.4f}\n'.format(r2_score(y_test, y_pred))
mae_value = 'mae = {:.4f}\n'.format(mean_absolute_error(y_test, y_pred))

# Print results
print(r2_value)
print(mae_value)

# Save results
save_result('Decision Tree Regression:\n' + r2_value + mae_value)


r2_score = 0.7806

mae = 0.2085



# SVM

In [90]:
# Define the preprocessing step
step1 = ColumnTransformer(transformers=[
    ('col_tnf', OneHotEncoder(sparse_output=False, drop='first'), [0,1,7,10,11])
], remainder='passthrough')

# Define the model
step2 = SVR(kernel='rbf')

# Create a pipeline
pipe = Pipeline([('step1', step1), ('step2', step2)])

# Train the model
pipe.fit(X_train, y_train)

# Make predictions
y_pred = pipe.predict(X_test)

# Calculate metrics
r2_value = 'r2_score = {:.4f}\n'.format(r2_score(y_test, y_pred))
mae_value = 'mae = {:.4f}\n'.format(mean_absolute_error(y_test, y_pred))

# Print results
print(r2_value)
print(mae_value)

# Save results
save_result('Support Vector Regression (SVR):\n' + r2_value + mae_value)


r2_score = 0.6111

mae = 0.2959



# Random Forest

In [91]:
# Define the preprocessing step
step1 = ColumnTransformer(transformers=[
    ('col_tnf', OneHotEncoder(sparse_output=False, drop='first'), [0,1,7,10,11])
], remainder='passthrough')

# Define the model
step2 = RandomForestRegressor(n_estimators=100, random_state=42)

# Create a pipeline
pipe = Pipeline([('step1', step1), ('step2', step2)])

# Train the model
pipe.fit(X_train, y_train)

# Make predictions
y_pred = pipe.predict(X_test)

# Calculate metrics
r2_value = 'r2_score = {:.4f}\n'.format(r2_score(y_test, y_pred))
mae_value = 'mae = {:.4f}\n'.format(mean_absolute_error(y_test, y_pred))

# Print results
print(r2_value)
print(mae_value)

# Save results
save_result('Random Forest Regressor:\n' + r2_value + mae_value)


r2_score = 0.8857

mae = 0.1565



# Extra Trees

In [92]:
# Define the preprocessing step
step1 = ColumnTransformer(transformers=[
    ('col_tnf', OneHotEncoder(sparse_output=False, drop='first'), [0,1,7,10,11])
], remainder='passthrough')

# Define the model
step2 = ExtraTreesRegressor(n_estimators=100, random_state=42)

# Create a pipeline
pipe = Pipeline([('step1', step1), ('step2', step2)])

# Train the model
pipe.fit(X_train, y_train)

# Make predictions
y_pred = pipe.predict(X_test)

# Calculate metrics
r2_value = 'r2_score = {:.4f}\n'.format(r2_score(y_test, y_pred))
mae_value = 'mae = {:.4f}\n'.format(mean_absolute_error(y_test, y_pred))

# Print results
print(r2_value)
print(mae_value)

# Save results
save_result('Extra Trees Regressor:\n' + r2_value + mae_value)

r2_score = 0.8687

mae = 0.1599



# AdaBoost

In [93]:
# Define the preprocessing step
step1 = ColumnTransformer(transformers=[
    ('col_tnf', OneHotEncoder(sparse_output=False, drop='first'), [0,1,7,10,11])
], remainder='passthrough')

# Define the model
step2 = AdaBoostRegressor(n_estimators=50, random_state=42)

# Create a pipeline
pipe = Pipeline([('step1', step1), ('step2', step2)])

# Train the model
pipe.fit(X_train, y_train)

# Make predictions
y_pred = pipe.predict(X_test)

# Calculate metrics
r2_value = 'r2_score = {:.4f}\n'.format(r2_score(y_test, y_pred))
mae_value = 'mae = {:.4f}\n'.format(mean_absolute_error(y_test, y_pred))

# Print results
print(r2_value)
print(mae_value)

# Save results
save_result('AdaBoost Regressor:\n' + r2_value + mae_value)

r2_score = 0.7996

mae = 0.2271



# Gradient Boost

In [94]:
# Define the preprocessing step
step1 = ColumnTransformer(transformers=[
    ('col_tnf', OneHotEncoder(sparse_output=False, drop='first'), [0,1,7,10,11])
], remainder='passthrough')

# Define the model
step2 = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)

# Create a pipeline
pipe = Pipeline([('step1', step1), ('step2', step2)])

# Train the model
pipe.fit(X_train, y_train)

# Make predictions
y_pred = pipe.predict(X_test)

# Calculate metrics
r2_value = 'r2_score = {:.4f}\n'.format(r2_score(y_test, y_pred))
mae_value = 'mae = {:.4f}\n'.format(mean_absolute_error(y_test, y_pred))

# Print results
print(r2_value)
print(mae_value)

# Save results
save_result('Gradient Boosting Regressor:\n' + r2_value + mae_value)

r2_score = 0.8672

mae = 0.1754



# Xg Boost

In [95]:
# Define the preprocessing step
step1 = ColumnTransformer(transformers=[
    ('col_tnf', OneHotEncoder(sparse_output=False, drop='first'), [0,1,7,10,11])
], remainder='passthrough')

# Define the model
step2 = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)

# Create a pipeline
pipe = Pipeline([('step1', step1), ('step2', step2)])

# Train the model
pipe.fit(X_train, y_train)

# Make predictions
y_pred = pipe.predict(X_test)

# Calculate metrics
r2_value = 'r2_score = {:.4f}\n'.format(r2_score(y_test, y_pred))
mae_value = 'mae = {:.4f}\n'.format(mean_absolute_error(y_test, y_pred))

# Print results
print(r2_value)
print(mae_value)

# Save results
save_result('XGBoost Regressor:\n' + r2_value + mae_value)

r2_score = 0.8881

mae = 0.1545



# Voting Regression

In [96]:
# Define the preprocessing step
step1 = ColumnTransformer(transformers=[
    ('col_tnf', OneHotEncoder(sparse_output=False, drop='first'), [0,1,7,10,11])
], remainder='passthrough')

# Define the base models
model1 = LinearRegression()
model2 = Ridge(alpha=10)
model3 = Lasso(alpha=0.001)
model4 = RandomForestRegressor(n_estimators=100, random_state=42)
model5 = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)

# Define the ensemble model using Voting Regressor
step2 = VotingRegressor(estimators=[
    ('lr', model1),
    ('ridge', model2),
    ('lasso', model3),
    ('rf', model4),
    ('xgb', model5)
])

# Create a pipeline
pipe = Pipeline([('step1', step1), ('step2', step2)])

# Train the model
pipe.fit(X_train, y_train)

# Make predictions
y_pred = pipe.predict(X_test)

# Calculate metrics
r2_value = 'r2_score = {:.4f}\n'.format(r2_score(y_test, y_pred))
mae_value = 'mae = {:.4f}\n'.format(mean_absolute_error(y_test, y_pred))

# Print results
print(r2_value)
print(mae_value)

# Save results
save_result('Voting Regressor:\n' + r2_value + mae_value)

r2_score = 0.8654

mae = 0.1771



# Stacking

In [97]:
# Define the preprocessing step
step1 = ColumnTransformer(transformers=[
    ('col_tnf', OneHotEncoder(sparse_output=False, drop='first'), [0,1,7,10,11])
], remainder='passthrough')

# Define base models
model1 = LinearRegression()
model2 = Ridge(alpha=10)
model3 = Lasso(alpha=0.001)
model4 = RandomForestRegressor(n_estimators=100, random_state=42)
model5 = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)

# Define meta-model
meta_model = GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, random_state=42)

# Define Stacking Regressor
step2 = StackingRegressor(estimators=[
    ('lr', model1),
    ('ridge', model2),
    ('lasso', model3),
    ('rf', model4),
    ('xgb', model5)
], final_estimator=meta_model)

# Create a pipeline
pipe = Pipeline([('step1', step1), ('step2', step2)])

# Train the model
pipe.fit(X_train, y_train)

# Make predictions
y_pred = pipe.predict(X_test)

# Calculate metrics
r2_value = 'r2_score = {:.4f}\n'.format(r2_score(y_test, y_pred))  
mae_value = 'mae = {:.4f}\n'.format(mean_absolute_error(y_test, y_pred))

# Print results
print(r2_value)
print(mae_value)

# Save results
save_result('Stacking Regressor:\n' + r2_value + mae_value)


r2_score = 0.8867

mae = 0.1600



# Exporting the model

In [None]:
import pickle

In [101]:
# Define the preprocessing step
step1 = ColumnTransformer(transformers=[
    ('col_tnf', OneHotEncoder(sparse_output=False, drop='first'), [0,1,7,10,11])
], remainder='passthrough')

# Define the model
step2 = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)

# Create a pipeline
pipe = Pipeline([('step1', step1), ('step2', step2)])

# Train the model
pipe.fit(X_train, y_train)

# Save the trained pipeline to a file
with open('xgboost_pipeline.pkl', 'wb') as file:
    pickle.dump(pipe, file)

print("Model saved successfully as 'xgboost_pipeline.pkl'")


Model saved successfully as 'xgboost_pipeline.pkl'


In [103]:
with open('model/xgboost_pipeline.pkl', 'rb') as file:
    loaded_pipe = pickle.load(file)

# Use the loaded model for prediction
new_predictions = loaded_pipe.predict(X_test)
print("Predictions:", new_predictions)


Predictions: [10.580947   9.73939    9.841462  10.129512  11.397754  11.528799
 10.138639  10.479771  11.105549   9.716559  11.146655  10.898071
 11.46136   10.821007  11.468856  10.998773  11.021063   9.744024
 11.5392275 11.566503  11.50012   11.269388  10.558029  10.937949
 10.218255  11.576468  10.900183  10.054203  11.005717  10.536584
 11.134018  10.018679  10.859791  10.870575  11.007909  10.72758
 11.029966  11.053832  11.426454  11.396256  11.450376  11.165958
  9.8561945  9.743316  10.217672   9.758047  10.85878   11.662454
 11.391047  11.03015   11.698503  11.532744   9.907879   9.306399
 10.494925  10.841766  10.075022  11.162067  10.253377  10.345437
 11.187673  11.0127     9.489701  11.149599  10.94297   10.426765
  9.902262  11.178947  10.813228   9.766197  10.956848  10.593768
 10.536826  10.495383  11.232699  10.880398  11.276783  11.944663
 11.25757   10.549853  10.854066  11.048683  11.426963  10.126625
 11.130385  10.401399  11.031125  11.398439  10.100945  10.70301

In [104]:
pickle.dump(df, open('model/df.pkl', 'wb'))