# Smart Product Pricing Challenge - Experimentation Notebook

This notebook helps you explore the data and test different approaches for the pricing challenge.

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import re

# Plot styling: use matplotlib's 'default' style sheet and switch seaborn's
# colour palette to "husl".
plt.style.use('default')
sns.set_palette("husl")

# Relative folder that the CSV files used below are read from.
DATASET_FOLDER = 'dataset/'

## 1. Load and Explore Sample Data

In [None]:
# Read the two sample CSV files from the dataset folder
sample_test_path = os.path.join(DATASET_FOLDER, 'sample_test.csv')
sample_out_path = os.path.join(DATASET_FOLDER, 'sample_test_out.csv')
sample_test = pd.read_csv(sample_test_path)
sample_out = pd.read_csv(sample_out_path)

print(f"Sample test shape: {sample_test.shape}")
print(f"Sample output shape: {sample_out.shape}")

# Join both frames on sample_id so each row carries its price
sample_data = pd.merge(sample_test, sample_out, on='sample_id')
print(f"Merged sample data shape: {sample_data.shape}")

In [None]:
# Preview the first rows of the merged sample data
sample_data.head()

## 2. Price Distribution Analysis

In [None]:
# Summary statistics of the price column
print("Price Statistics:")
print(sample_data['price'].describe())

# One figure with two panels: histogram (left) and box plot (right)
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
hist_ax, box_ax = axes

# Left panel: price histogram
hist_ax.hist(sample_data['price'], bins=30, alpha=0.7, edgecolor='black')
hist_ax.set(title='Price Distribution', xlabel='Price ($)', ylabel='Frequency')

# Right panel: price box plot
box_ax.boxplot(sample_data['price'])
box_ax.set(title='Price Box Plot', ylabel='Price ($)')

plt.tight_layout()
plt.show()

## 3. Text Analysis

In [None]:
def extract_text_stats(text):
    """Extract basic text statistics.

    Args:
        text: Value to analyse. ``None`` and float NaN are treated as empty
            text; anything else is converted with ``str()``.

    Returns:
        dict with three integer entries:
            'length'       - number of characters,
            'word_count'   - number of whitespace-separated tokens,
            'number_count' - number of digit runs (optionally with a
                             decimal part) found in the text.
    """
    # Missing values would otherwise become the literal strings "None"/"nan"
    # and be counted as real text. NaN is the only float that differs from
    # itself, so `text != text` detects it without extra imports.
    if text is None or (isinstance(text, float) and text != text):
        text = ''
    else:
        text = str(text)

    return {
        'length': len(text),
        'word_count': len(text.split()),
        'number_count': len(re.findall(r'\d+\.?\d*', text))
    }

# Compute one dict of text statistics per catalog entry and expand the
# dicts into a DataFrame (one column per statistic)
text_stats = sample_data['catalog_content'].map(extract_text_stats)
text_df = pd.DataFrame(list(text_stats))

# Attach the statistics column-wise to the sample data
sample_analysis = pd.concat((sample_data, text_df), axis='columns')

print("Text Statistics:")
print(text_df.describe())

In [None]:
# Pairwise correlations between price and the text-statistic columns
correlations = sample_analysis.loc[
    :, ['price', 'length', 'word_count', 'number_count']
].corr()

# Annotated heatmap with the colour scale centred on zero
plt.figure(figsize=(8, 6))
sns.heatmap(correlations, cmap='coolwarm', center=0, annot=True)
plt.title('Feature Correlations with Price')
plt.show()

# Price column of the matrix, strongest positive correlation first
print("\nCorrelations with price:")
price_correlations = correlations['price']
print(price_correlations.sort_values(ascending=False))

## 4. Product Category Analysis

In [None]:
def categorize_product(text):
    """Assign a coarse product category based on keywords.

    The input is converted with ``str()`` and lower-cased, then checked
    against each category's keyword list in order (Food, Beauty, Cleaning);
    the first category with a hit wins. Matching is plain substring
    containment, so a keyword also matches inside longer words (for
    example 'lip' matches "clip").

    Returns:
        'Food', 'Beauty', 'Cleaning', or 'Other' when no keyword is found.
    """
    lowered = str(text).lower()

    # Order matters: earlier categories take precedence
    keyword_table = (
        ('Food', ('food', 'snack', 'candy', 'sauce', 'spice')),
        ('Beauty', ('cosmetic', 'beauty', 'lip', 'lotion')),
        ('Cleaning', ('cleaning', 'dish', 'soap')),
    )

    for label, keywords in keyword_table:
        for keyword in keywords:
            if keyword in lowered:
                return label

    return 'Other'

# Label every product with a keyword-based category
sample_analysis['category'] = sample_analysis['catalog_content'].map(categorize_product)

# How many products fall into each category
category_counts = sample_analysis['category'].value_counts()
print("Category Distribution:")
print(category_counts)

# Box plot of price per category
plt.figure(figsize=(10, 6))
sns.boxplot(x='category', y='price', data=sample_analysis)
plt.title('Price Distribution by Category')
plt.xticks(rotation=45)
plt.show()

# Mean price per category, most expensive category first
avg_price_by_category = (
    sample_analysis.groupby('category')['price']
    .mean()
    .sort_values(ascending=False)
)
print("\nAverage Price by Category:")
print(avg_price_by_category)

## 5. Quantity and Unit Analysis

In [None]:
# Numbers: a digit run with an optional decimal part
_NUMBER_PATTERN = re.compile(r'\d+\.?\d*')

# Unit detectors, applied to lower-cased text. The (?<![a-z]) / (?![a-z])
# guards require the unit not to be embedded in a longer word, so "12oz"
# and "oz." match while "frozen", "dozen", "bulb" or "discount" do not.
# 'pack' is intentionally left as a plain substring so that "packs",
# "package" and "multipack" keep counting.
_UNIT_PATTERNS = {
    'oz': re.compile(r'(?<![a-z])(?:oz|ounces?)(?![a-z])'),
    'lb': re.compile(r'(?<![a-z])(?:lbs?|pounds?)(?![a-z])'),
    'count': re.compile(r'(?<![a-z])counts?(?![a-z])|pack'),
    'fl_oz': re.compile(r'(?<![a-z])fl\.?\s*oz(?![a-z])'),
}


def extract_quantities(text):
    """Extract quantity information.

    Args:
        text: Value to analyse; converted with ``str()`` and lower-cased.

    Returns:
        dict with:
            'max_number' - largest number found in the text as a float,
                           or 0 when the text contains no number,
            'oz'         - True if "oz", "ounce" or "ounces" appears,
            'lb'         - True if "lb", "lbs", "pound" or "pounds" appears,
            'count'      - True if "count"/"counts" appears as a word or
                           "pack" appears anywhere,
            'fl_oz'      - True if "fl oz" appears (also "fl. oz", "floz").
        Unit words only count when they are not part of a longer word.
    """
    text = str(text).lower()

    numbers = _NUMBER_PATTERN.findall(text)
    max_number = max(float(value) for value in numbers) if numbers else 0

    return {
        'max_number': max_number,
        **{unit: bool(pattern.search(text)) for unit, pattern in _UNIT_PATTERNS.items()}
    }

# Compute one dict of quantity features per catalog entry and expand the
# dicts into a DataFrame
quantity_features = sample_analysis['catalog_content'].map(extract_quantities)
quantity_df = pd.DataFrame(list(quantity_features))

# Attach the quantity columns to the analysis frame
sample_analysis = pd.concat((sample_analysis, quantity_df), axis='columns')

# Number of products whose text mentions each unit
print("Quantity Analysis:")
print(f"Products with oz: {quantity_df['oz'].sum()}")
print(f"Products with count: {quantity_df['count'].sum()}")
print(f"Products with fl_oz: {quantity_df['fl_oz'].sum()}")

# Scatter plot: largest number in the description against price
plt.figure(figsize=(10, 6))
plt.scatter(
    sample_analysis['max_number'],
    sample_analysis['price'],
    alpha=0.6,
)
plt.xlabel('Max Number in Description')
plt.ylabel('Price ($)')
plt.title('Price vs Maximum Number in Description')
plt.show()

# Correlation between the two plotted columns
max_number_column = sample_analysis['max_number']
corr = max_number_column.corr(sample_analysis['price'])
print(f"\nCorrelation between max_number and price: {corr:.3f}")

## 6. Test Quick Solution

In [None]:
# Test the quick solution on sample data
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

def quick_test():
    """Quick baseline check on the sample data.

    Reads the global ``sample_analysis`` frame, trains a random forest on
    the first 70% of its rows using four numeric text features, and
    evaluates it on the remaining 30%. Prints MAE, RMSE and SMAPE.

    Returns:
        tuple: ``(predictions, y_test)`` - the predicted prices for the
        held-out rows and the corresponding actual prices.
    """
    feature_columns = ['length', 'word_count', 'number_count', 'max_number']
    X = sample_analysis[feature_columns]
    y = sample_analysis['price']

    # Ordered (unshuffled) 70/30 split by row position; .iloc makes the
    # positional slicing explicit
    split_idx = int(0.7 * len(X))
    X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
    y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

    # Scale features (scaler is fitted on the training rows only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train model
    model = RandomForestRegressor(n_estimators=50, random_state=42)
    model.fit(X_train_scaled, y_train)

    # Predict
    predictions = model.predict(X_test_scaled)

    # Evaluate
    mae = mean_absolute_error(y_test, predictions)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))

    print("Quick Test Results:")
    print(f"MAE: {mae:.2f}")
    print(f"RMSE: {rmse:.2f}")

    # SMAPE: |pred - actual| / mean(|actual|, |pred|), averaged, in percent.
    # When actual and prediction are both 0 the denominator is 0; such rows
    # contribute 0 error instead of producing NaN.
    actual_values = y_test.to_numpy(dtype=float)
    abs_error = np.abs(predictions - actual_values)
    denominator = (np.abs(actual_values) + np.abs(predictions)) / 2
    ratios = np.divide(
        abs_error,
        denominator,
        out=np.zeros_like(abs_error),
        where=denominator != 0,
    )
    smape = np.mean(ratios) * 100
    print(f"SMAPE: {smape:.2f}%")

    return predictions, y_test

# Run the quick test and keep the predictions and actual prices for plotting
pred, actual = quick_test()

In [None]:
# Scatter of predicted against actual prices
plt.figure(figsize=(10, 6))
plt.scatter(actual, pred, alpha=0.6)

# Dashed red diagonal spanning the actual price range (the y = x line)
price_range = [actual.min(), actual.max()]
plt.plot(price_range, price_range, 'r--', linewidth=2)

plt.xlabel('Actual Price ($)')
plt.ylabel('Predicted Price ($)')
plt.title('Predicted vs Actual Prices')
plt.show()

## 7. Next Steps

Based on this analysis, consider:

1. **Feature Engineering**: Extract more sophisticated features from text
2. **Image Features**: Add visual features using pre-trained models
3. **Advanced Models**: Try XGBoost, neural networks, or ensemble methods
4. **Text Processing**: Use more advanced NLP techniques
5. **Cross-validation**: Implement a proper validation strategy

Run the full solution with `python solution.py`, or the quick version with `python quick_solution.py`.