In [None]:
# 1. Import Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

In [None]:
# 2. Load Dataset
# Read the CSV (path is relative to the notebook's working directory),
# report its dimensions, and preview the first rows as the cell output.
dataset_path = "../data/ecommerce_furniture_dataset.csv"
df = pd.read_csv(dataset_path)
print("Initial Shape:", df.shape)
df.head()

In [None]:
# 3. Data Preprocessing
# Expects `df` exactly as produced by the load cell. Run once per fresh load:
# a second run stops at the drop below (KeyError) because 'originalPrice' is
# already gone, which also prevents re-encoding the already-encoded tagText.

# Per-column null counts, printed on their own lines below the label.
print("Missing values:\n", df.isnull().sum())

# Tag values kept as their own category; every other tag becomes 'others'.
kept_tags = ['Free shipping', '+Shipping: $5.09']

df = (
    df.drop(columns=['originalPrice'])  # column is discarded entirely
      .dropna()                         # then drop any row that still has a null
      .assign(
          # Strip '$' and ',' from the price text before converting to float.
          # Raw string: '\$' is a regex escape, not a Python string escape.
          price=lambda d: d['price'].replace(r'[\$,]', '', regex=True).astype(float),
          # Vectorised bucketing: keep listed tags, replace the rest.
          tagText=lambda d: d['tagText'].where(d['tagText'].isin(kept_tags), 'others'),
      )
)

# Map the remaining tag categories to integer codes.
# NOTE(review): the codes carry an arbitrary order that LinearRegression will
# treat as numeric; consider one-hot encoding if that matters for the model.
le = LabelEncoder()
df['tagText'] = le.fit_transform(df['tagText'])

In [None]:
# 4. Exploratory Data Analysis
# Histogram with a KDE overlay for each numeric column of interest.
histogram_specs = (
    ('price', 'Price Distribution'),
    ('sold', 'Sold Units Distribution'),
)
for column_name, chart_title in histogram_specs:
    sns.histplot(df[column_name], kde=True)
    plt.title(chart_title)
    plt.show()

# Scatter of price against units sold.
sns.scatterplot(x='price', y='sold', data=df)
plt.title('Price vs Sold Units')
plt.show()

In [None]:
# 5. Feature Engineering
# Vectorise product titles with TF-IDF, capped at 50 vocabulary terms.
# NOTE(review): the vectoriser is fit on every row before the train/test
# split, so its statistics include rows that later land in the test set.
tfidf = TfidfVectorizer(max_features=50)
title_features = tfidf.fit_transform(df['productTitle'])

# Densify the matrix into a frame with one column per vocabulary term.
title_df = pd.DataFrame(
    title_features.toarray(),
    columns=tfidf.get_feature_names_out(),
)

# title_df has a fresh 0..n-1 index, so reset df's index to align rows
# positionally, then discard the raw title text now that it is encoded.
df = pd.concat(
    [df.reset_index(drop=True), title_df],
    axis=1,
).drop(columns=['productTitle'])

In [None]:
# 6. Train/Test Split
# 'sold' is the regression target; every other remaining column is a feature.
target_column = 'sold'
y = df[target_column]
X = df.drop(columns=[target_column])

# Hold out 20% of the rows for evaluation; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# 7. Model Training
# Fit both regressors on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
lr_model = LinearRegression().fit(X_train, y_train)

# 100-tree forest with a fixed seed; fitting is left as the final expression
# so the fitted estimator is shown as the cell output.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [None]:
# 8. Evaluation
# Predict on the held-out split with each fitted model.
y_pred_lr = lr_model.predict(X_test)
y_pred_rf = rf_model.predict(X_test)

# Print MSE and R2 for each model under its heading.
model_reports = (
    ("Linear Regression:", y_pred_lr),
    ("\nRandom Forest:", y_pred_rf),
)
for heading, predictions in model_reports:
    print(heading)
    print("MSE:", mean_squared_error(y_test, predictions))
    print("R2:", r2_score(y_test, predictions))