# Furniture Dataset Analytics

This notebook performs exploratory data analysis (EDA) on the furniture dataset to understand its structure, distributions, and key insights. We'll clean the data, handle missing values, and prepare it for modeling.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ast import literal_eval
import warnings
warnings.filterwarnings('ignore')

# Global figure appearance: apply the 'seaborn-v0_8' matplotlib style sheet
# and make "husl" the default seaborn palette for every plot below
PLOT_STYLE = 'seaborn-v0_8'
COLOR_PALETTE = "husl"

plt.style.use(PLOT_STYLE)
sns.set_palette(COLOR_PALETTE)

## 1. Data Loading and Initial Inspection

In [None]:
# Read the raw product table from a relative path
RAW_CSV_PATH = '../data/furniture_dataset.csv'
df = pd.read_csv(RAW_CSV_PATH)
print(f"Dataset shape: {df.shape}")

# List the column names, one bullet per line
print("\nColumns:")
print("\n".join(f"- {col}" for col in df.columns))

# The frame preview is the cell's rich output, so it stays the last expression
print("\nFirst 5 rows:")
df.head()

In [None]:
# Column dtypes and non-null counts (df.info() prints its report directly)
print("Data types and missing values:")
df.info()

# Number of missing entries per column, displayed as the cell output
print("\nMissing values count:")
missing_per_column = df.isna().sum()
missing_per_column

## 2. Data Cleaning and Preprocessing

In [None]:
# Clean and normalise the raw columns in place.
# Every step tolerates being executed again on already-cleaned data, so this
# cell can be re-run without restarting the kernel.


def _parse_list(value):
    """Turn a stringified Python list (e.g. "['a', 'b']") into a real list.

    Anything that is not a string (NaN, None) becomes []. A value that is
    already a list/tuple is returned as a list, which is the state of the
    column after a previous run of this cell. A literal that evaluates to a
    single scalar is wrapped in a one-element list so that downstream code
    can always iterate over categories.
    """
    if isinstance(value, (list, tuple)):
        return list(value)
    if not isinstance(value, str):
        return []
    parsed = literal_eval(value)
    return list(parsed) if isinstance(parsed, (list, tuple)) else [parsed]


def _first_image_url(value):
    """Return the first URL of a stringified image list, stripped, or None.

    Missing values, empty lists and blank URLs all map to None. A string that
    does not look like a list/tuple literal is treated as an already-extracted
    URL (the state of the column after a previous run of this cell).
    """
    if not isinstance(value, str):
        return None
    text = value.strip()
    if not text.startswith(('[', '(')):
        return text or None
    urls = literal_eval(text)  # parsed once per row
    return (urls[0].strip() or None) if urls else None


# Price: strip the currency symbol and thousands separators, then convert.
# regex=False is required because '$' is an end-of-string anchor when the
# pattern is interpreted as a regular expression (the default in older pandas
# releases), in which case nothing would be removed and the float conversion
# would fail.
if not pd.api.types.is_numeric_dtype(df['price']):
    df['price'] = (
        df['price']
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .str.strip()
        .astype(float)
    )

# Categories: stringified list -> list (empty list when missing)
df['categories'] = df['categories'].apply(_parse_list)

# Images: keep only the first URL of each stringified list
df['images'] = df['images'].apply(_first_image_url)

# Fill missing values: a sentinel label for categorical text columns, an
# empty string for free text, and the median for price
for column in ('brand', 'material', 'color', 'country_of_origin'):
    df[column] = df[column].fillna('Unknown')
df['description'] = df['description'].fillna('')
df['price'] = df['price'].fillna(df['price'].median())

print("Data after cleaning:")
df.info()

## 3. Exploratory Data Analysis

In [None]:
# Histogram of product prices (50 bins) with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['price'], bins=50, kde=True, ax=ax)
ax.set(title='Price Distribution', xlabel='Price ($)', ylabel='Frequency')
plt.show()

# Numeric summary of the same column
print("Price statistics:")
price_summary = df['price'].describe()
print(price_summary)

In [None]:
# Horizontal bar chart of the ten most frequent brand labels
top_brands = df['brand'].value_counts().iloc[:10]

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=top_brands.values, y=top_brands.index, ax=ax)
ax.set_title('Top 10 Brands')
ax.set_xlabel('Count')
plt.show()

In [None]:
# Flatten the per-product category lists into one long list of labels
all_categories = []
for product_categories in df['categories']:
    all_categories.extend(product_categories)

# Fifteen most frequent category labels
category_frequency = pd.Series(all_categories).value_counts()
category_counts = category_frequency.head(15)

fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x=category_counts.values, y=category_counts.index, ax=ax)
ax.set(title='Top 15 Categories', xlabel='Count')
plt.show()

In [None]:
# Ten most frequent materials and colours, drawn side by side
material_counts = df['material'].value_counts().head(10)
color_counts = df['color'].value_counts().head(10)

fig, axes = plt.subplots(1, 2, figsize=(15, 6))
ax1, ax2 = axes

# Left panel: materials
sns.barplot(x=material_counts.values, y=material_counts.index, ax=ax1)
ax1.set(title='Top 10 Materials', xlabel='Count')

# Right panel: colours
sns.barplot(x=color_counts.values, y=color_counts.index, ax=ax2)
ax2.set(title='Top 10 Colors', xlabel='Count')

fig.tight_layout()
plt.show()

In [None]:
# Compare price spreads across the five most frequent categories.
# explode() yields one row per (product, category) pair.
df_exploded = df.explode('categories')
category_frequency = df_exploded['categories'].value_counts()
top_categories = category_frequency.head(5).index
in_top_categories = df_exploded['categories'].isin(top_categories)
df_top_cat = df_exploded[in_top_categories]

fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df_top_cat, x='categories', y='price', ax=ax)
ax.set_title('Price Distribution by Top Categories')
# Tilt the category labels
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

In [None]:
# Horizontal bar chart of the ten most frequent country-of-origin labels
country_counts = df['country_of_origin'].value_counts().iloc[:10]

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=country_counts.values, y=country_counts.index, ax=ax)
ax.set(title='Top 10 Countries of Origin', xlabel='Count')
plt.show()

## 4. Text Analysis Preparation

In [None]:
# Build one text field per product from its title and description.
# Missing titles/descriptions are treated as empty strings: string
# concatenation with NaN yields NaN, so a product without a title would
# otherwise lose its description and get a NaN text length.
title_text = df['title'].fillna('').astype(str)
description_text = df['description'].fillna('').astype(str)
# strip() removes the dangling separator space left when either part is empty
df['combined_text'] = (title_text + ' ' + description_text).str.strip()

# Character count of the combined text
df['text_length'] = df['combined_text'].str.len()

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['text_length'], bins=50, kde=True, ax=ax)
ax.set(title='Text Length Distribution', xlabel='Text Length', ylabel='Frequency')
plt.show()

print("Text length statistics:")
print(df['text_length'].describe())

## 5. Image URL Availability

In [None]:
# Count how many products carry an image URL
has_image = df['images'].notnull()
n_with_images = has_image.sum()
n_products = len(df)

print(f"Products with images: {n_with_images}")
print(f"Total products: {n_products}")
print(f"Percentage with images: {n_with_images / n_products * 100:.2f}%")

# Print the first five non-missing URLs
print("\nSample image URLs:")
sample_urls = df.loc[has_image, 'images'].head(5)
for url in sample_urls:
    print(url)

## 6. Summary and Insights

In [None]:
# Headline numbers and insights. Everything printed here is computed from the
# current state of `df`, so the text cannot drift away from the data and the
# cell does not rely on intermediate variables left over from earlier cells.
n_products = len(df)
n_with_images = int(df['images'].notnull().sum())
image_pct = n_with_images / n_products * 100 if n_products else 0.0

# One entry per (product, category) pair; products with no categories drop out
summary_category_counts = df['categories'].explode().dropna().value_counts()

# Ignore the 'Unknown' placeholder label when ranking countries
known_countries = df.loc[df['country_of_origin'] != 'Unknown', 'country_of_origin']
summary_country_counts = known_countries.value_counts()

print("Dataset Summary:")
print(f"- Total products: {n_products}")
print(f"- Unique brands: {df['brand'].nunique()}")
print(f"- Unique categories: {summary_category_counts.size}")
print(f"- Price range: ${df['price'].min():.2f} - ${df['price'].max():.2f}")
print(f"- Average price: ${df['price'].mean():.2f}")
print(f"- Products with images: {n_with_images} ({image_pct:.1f}%)")

print("\nKey Insights:")

# 1. Most frequent category (value_counts sorts in descending order)
if summary_category_counts.empty:
    print("1. No category information available")
else:
    print(
        f"1. Most common category: {summary_category_counts.index[0]} "
        f"(listed {summary_category_counts.iloc[0]} times)"
    )

# 2. Most frequent known country of origin
if summary_country_counts.empty:
    print("2. No country of origin information available")
else:
    print(
        f"2. Most common known country of origin: {summary_country_counts.index[0]} "
        f"({summary_country_counts.iloc[0]} products)"
    )

# 3. Shape of the price distribution: the sign of the sample skewness gives
#    the direction of the longer tail
price_skew = df['price'].skew()
if price_skew > 0:
    skew_label = "right-skewed"
elif price_skew < 0:
    skew_label = "left-skewed"
else:
    skew_label = "not measurably skewed"
under_100_pct = (df['price'] < 100).mean() * 100
print(
    f"3. Price distribution is {skew_label} (skewness {price_skew:.2f}); "
    f"{under_100_pct:.1f}% of products cost under $100"
)

# 4. Image URL coverage
print(f"4. {image_pct:.1f}% of products have an image URL")

# 5. Spread of text lengths; the column only exists once the text-analysis
#    cell has been executed
if 'text_length' in df.columns:
    print(
        f"5. Combined text length ranges from {df['text_length'].min():.0f} to "
        f"{df['text_length'].max():.0f} characters (median {df['text_length'].median():.0f})"
    )
else:
    print("5. Text length not computed (run the text analysis cell first)")

In [None]:
# Write the cleaned frame to CSV, omitting the row index
CLEANED_CSV_PATH = '../data/furniture_dataset_cleaned.csv'
df.to_csv(CLEANED_CSV_PATH, index=False)
print(f"Cleaned dataset saved to {CLEANED_CSV_PATH}")