# Kaggle Sticker Sales Forecasting Competition
## Playground Series S5E1 - January 2025

### Overview
This notebook presents a solution for the Kaggle Playground Series competition on forecasting sticker sales. The goal is to predict sticker sales across different countries using various store and time-based features.

### Import Libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are read from the "./input/" directory (on Kaggle itself the read-only input directory is "../input/")
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input tree and print the full path of every file found in it
for root, _subdirs, files in os.walk('./input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_percentage_error
import lightgbm as lgb
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Set styling for visualizations.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*" and
# newer releases no longer accept the bare "seaborn" name, so use whichever
# name this installation actually provides instead of hard-coding one.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
sns.set_palette("husl")

### Load Data

In [None]:
# Load the competition's train and test CSV files into DataFrames
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        './input/playground-series-s5e1/train.csv',
        './input/playground-series-s5e1/test.csv',
    )
)

# Report (rows, columns) for each frame
print("Training set shape:", train.shape)
print("Test set shape:", test.shape)

### Exploratory Data Analysis

In [None]:
# Display basic information about the training data.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
print("\nTraining data info:")
train.info()

# Show first few rows
print("\nFirst few rows of training data:")
display(train.head())

# Check for missing values (count of nulls per column)
print("\nMissing values in training data:")
print(train.isnull().sum())

# Basic statistics of numerical columns
print("\nDescriptive statistics:")
display(train.describe())

### Data Visualization

In [None]:
# One figure laid out as a 2x2 grid of panels (three of them are filled below)
plt.figure(figsize=(15, 10))

# Parse the dates once and derive the weekday index used by the first panel
train['date'] = pd.to_datetime(train['date'])
train['dayofweek'] = train['date'].dt.dayofweek

# Panel 1: mean sales per weekday
avg_sales_by_day = train.groupby('dayofweek')['num_sold'].mean()
plt.subplot(2, 2, 1)
sns.barplot(x=avg_sales_by_day.index, y=avg_sales_by_day.values)
plt.gca().set(
    title='Average Sales by Day of Week',
    xlabel='Day of Week',
    ylabel='Average Sales',
)

# Panel 2: histogram of the target column
plt.subplot(2, 2, 2)
sns.histplot(train['num_sold'], bins=50)
plt.gca().set(
    title='Distribution of Sales',
    xlabel='Number Sold',
    ylabel='Count',
)

# Panel 3: mean sales per country, largest first
country_means = train.groupby('country')['num_sold'].mean()
avg_sales_by_country = country_means.sort_values(ascending=False)
plt.subplot(2, 2, 3)
sns.barplot(x=avg_sales_by_country.index, y=avg_sales_by_country.values)
plt.xticks(rotation=45)
plt.gca().set(
    title='Average Sales by Country',
    xlabel='Country',
    ylabel='Average Sales',
)

plt.tight_layout()
plt.show()

### Feature Engineering

In [None]:
def create_features(df):
    """Return a copy of ``df`` with calendar and integer-encoded features added.

    Adds ``year``, ``month``, ``day``, ``dayofweek`` and ``is_weekend`` derived
    from the ``date`` column, a ``season`` label, and a ``<col>_encoded``
    integer column for every categorical column that is present.

    Args:
        df: DataFrame with a ``date`` column that ``pd.to_datetime`` can parse.
            The input frame itself is not modified.

    Returns:
        A new DataFrame holding the original columns plus the added features.
    """
    # Work on a copy so the caller's frame is left untouched
    df = df.copy()

    # Convert date to datetime if it's not already
    df['date'] = pd.to_datetime(df['date'])

    # Time-based features
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['dayofweek'] = df['date'].dt.dayofweek
    df['is_weekend'] = df['dayofweek'].isin([5, 6]).astype(int)

    # Seasonal features. The bins are right-inclusive three-month groups
    # (1-3, 4-6, 7-9, 10-12), so e.g. December is labelled 'Fall'.
    df['season'] = pd.cut(df['month'],
                          bins=[0, 3, 6, 9, 12],
                          labels=['Winter', 'Spring', 'Summer', 'Fall'])

    # Encode categorical variables. 'product' is included because the model's
    # feature list uses 'product_encoded'; 'item' is kept so frames that use
    # that column name are still handled.
    categorical_cols = ['country', 'store', 'product', 'item', 'season']

    for col in categorical_cols:
        if col in df.columns:
            # Code = position of the value among the sorted unique values of
            # this frame. Each call encodes independently, so two frames only
            # share codes when they contain the same set of values.
            codes, _ = pd.factorize(df[col].astype(str), sort=True)
            df[f'{col}_encoded'] = codes

    return df

# Run the same feature pipeline over both the training and the test frame
train_processed, test_processed = (
    create_features(frame) for frame in (train, test)
)

# Show every column now present on the processed training frame
print("New features created:")
print(list(train_processed.columns))

### Model Training

In [None]:
# NOTE(review): the triple-quoted string below is a disabled earlier version of
# the training code, kept as a bare string expression so none of it executes.
# It selects 'item_encoded' and passes early_stopping_rounds/verbose directly to
# fit(); the live code that follows uses 'product_encoded' and LightGBM
# callbacks instead. Presumably safe to delete once no longer needed for
# reference - confirm with the notebook owner.
"""# Prepare features for modeling
feature_cols = ['year', 'month', 'day', 'dayofweek', 'is_weekend',
               'country_encoded', 'store_encoded', 'item_encoded']

X = train_processed[feature_cols]
y = train_processed['num_sold']

# Split data
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Initialize and train model
model = lgb.LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=8,
    num_leaves=31,
    random_state=42
)

model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='mape',
    early_stopping_rounds=50,
    verbose=100
)

# Calculate validation score
val_predictions = model.predict(X_val)
val_mape = mean_absolute_percentage_error(y_val, val_predictions)
print(f"\nValidation MAPE: {val_mape:.4f}")"""


import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb

# Read the raw training and test CSVs into fresh frames for the modelling pipeline
train_data = pd.read_csv('./input/playground-series-s5e1/train.csv')
test_data = pd.read_csv('./input/playground-series-s5e1/test.csv')

print("Training set shape:", train_data.shape)
print("Test set shape:", test_data.shape)

# Rows without a target cannot be used for supervised training. Take an explicit
# copy so the column assignments below write to an independent frame rather
# than to a filtered view of the loaded data.
train_data = train_data.dropna(subset=['num_sold']).copy()

# Calendar features derived from the date column
train_data['date'] = pd.to_datetime(train_data['date'])
train_data['year'] = train_data['date'].dt.year
train_data['month'] = train_data['date'].dt.month
train_data['day'] = train_data['date'].dt.day
train_data['dayofweek'] = train_data['date'].dt.dayofweek
train_data['is_weekend'] = (train_data['dayofweek'] >= 5).astype(int)

# Fit one encoder per categorical column and keep it, so the identical
# category -> integer mapping can be re-applied to other data later.
label_encoders = {}
for col in ['country', 'store', 'product']:
    le = LabelEncoder()
    train_data[col + '_encoded'] = le.fit_transform(train_data[col])
    label_encoders[col] = le

feature_cols = ['year', 'month', 'day', 'dayofweek', 'is_weekend',
                'country_encoded', 'store_encoded', 'product_encoded']
X = train_data[feature_cols]
y = train_data['num_sold']

# NOTE(review): this is a random row split, not a split by date, so validation
# rows are interleaved in time with training rows - confirm that is intended
# for a forecasting task.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model = lgb.LGBMRegressor(
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=64,
    max_depth=8,
    min_gain_to_split=0.1,
    random_state=42
)

# Stop once the validation metric has not improved for 50 rounds; log every 100
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='mape',
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=100)
    ]
)

# Score the hold-out split with the same metric used for early stopping
val_predictions = model.predict(X_val)
val_mape = mean_absolute_percentage_error(y_val, val_predictions)
print(f"Validation MAPE: {val_mape:.4f}")


### Feature Importance Analysis

In [None]:
# Pair each feature name with the fitted model's importance score, highest first
feature_importance = (
    pd.DataFrame({'feature': feature_cols, 'importance': model.feature_importances_})
    .sort_values('importance', ascending=False)
)

# Bar chart of importance per feature
plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance, x='importance', y='feature')
plt.title('Feature Importance')
plt.show()

### Make Predictions and Create Submission

In [None]:
# Build the test feature matrix with the same steps used for the training
# matrix. The categorical codes come from the encoders fitted on the training
# data, so every category maps to the integer the model saw during training
# (a separately fitted encoder gives no such guarantee).
test_features = test_data.copy()
test_features['date'] = pd.to_datetime(test_features['date'])
test_features['year'] = test_features['date'].dt.year
test_features['month'] = test_features['date'].dt.month
test_features['day'] = test_features['date'].dt.day
test_features['dayofweek'] = test_features['date'].dt.dayofweek
test_features['is_weekend'] = (test_features['dayofweek'] >= 5).astype(int)

for col, encoder in label_encoders.items():
    # transform() raises ValueError for a category that was absent from the
    # training data, which surfaces the mismatch instead of mis-encoding it.
    test_features[col + '_encoded'] = encoder.transform(test_features[col])

# Generate predictions for test set
test_predictions = model.predict(test_features[feature_cols])

# Create submission file; ids are taken from the same frame that was predicted
submission = pd.DataFrame({
    'id': test_features['id'],
    'num_sold': test_predictions
})

# Save submission file
submission.to_csv('submission.csv', index=False)
print("Submission file created!")

### Next Steps for Improvement
1. Feature Engineering:
   - Add lag features for time series aspects
   - Create rolling statistics
   - Add holiday indicators
   - Include price-related features if available

2. Modeling:
   - Experiment with other algorithms (XGBoost, CatBoost)
   - Implement cross-validation
   - Add hyperparameter tuning
   - Consider ensemble methods

3. Analysis:
   - Analyze prediction errors
   - Investigate seasonal patterns
   - Study country-specific trends