In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


In [7]:
# Load your processed dataset
df = pd.read_csv("engineered_furniture_data.csv")

# Quick check
df.head()


Unnamed: 0,price,sold,tagText,shipping_price,tagText_encoded,final_price,log_sold,adjustable,and,bed,...,up,upholstered,vanity,velvet,wardrobe,white,wicker,with,wood,wooden
0,46.79,600,Free shipping,0.0,1,46.79,6.398595,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.299913,0.0,0.0,0.119967,0.0,0.0
1,169.72,0,Free shipping,0.0,1,169.72,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.30009,0.0,0.0,0.0
2,39.46,7,Free shipping,0.0,1,39.46,2.079442,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.126664,0.0,0.0
3,111.99,0,Free shipping,0.0,1,111.99,0.0,0.0,0.0,0.0,...,0.0,0.339159,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,21.37,1,Free shipping,0.0,1,21.37,0.693147,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.550932


In [8]:
# Drop non-numeric or unused columns
drop_cols = [col for col in ['sold', 'log_sold', 'tagText', 'productTitle'] if col in df.columns]
X = df.drop(columns=drop_cols)

# Target variable
y = df['log_sold']


In [13]:
from sklearn.impute import SimpleImputer

# Fill missing values with mean (or use strategy='median' / 'constant')
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)


In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_imputed, y, test_size=0.2, random_state=42
)


In [15]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred_lr = lr.predict(X_test)

mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

print("🔹 Linear Regression")
print(f"  MSE: {mse_lr:.4f}")
print(f"  R² Score: {r2_lr:.4f}")


🔹 Linear Regression
  MSE: 1.5539
  R² Score: 0.1658


In [16]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print("🔹 Random Forest Regressor")
print(f"  MSE: {mse_rf:.4f}")
print(f"  R² Score: {r2_rf:.4f}")


🔹 Random Forest Regressor
  MSE: 1.2961
  R² Score: 0.3042


In [17]:
from sklearn.ensemble import GradientBoostingRegressor

gb = GradientBoostingRegressor(n_estimators=100, random_state=42)
gb.fit(X_train, y_train)

y_pred_gb = gb.predict(X_test)

mse_gb = mean_squared_error(y_test, y_pred_gb)
r2_gb = r2_score(y_test, y_pred_gb)

print("🔹 Gradient Boosting Regressor")
print(f"  MSE: {mse_gb:.4f}")
print(f"  R² Score: {r2_gb:.4f}")


🔹 Gradient Boosting Regressor
  MSE: 1.3175
  R² Score: 0.2927
