In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error
import lightgbm as lgb

# Load data
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Explore the data
print("Training data shape:", df_train.shape)
print("Test data shape:", df_test.shape)
print(df_train.head())
print(df_train.info())

# Check for missing values
print("Missing values in training data:\n", df_train.isnull().sum())


Training data shape: (230130, 6)
Test data shape: (98550, 5)
   id        date country              store             product  num_sold
0   0  2010-01-01  Canada  Discount Stickers   Holographic Goose       NaN
1   1  2010-01-01  Canada  Discount Stickers              Kaggle     973.0
2   2  2010-01-01  Canada  Discount Stickers        Kaggle Tiers     906.0
3   3  2010-01-01  Canada  Discount Stickers            Kerneler     423.0
4   4  2010-01-01  Canada  Discount Stickers  Kerneler Dark Mode     491.0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230130 entries, 0 to 230129
Data columns (total 6 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   id        230130 non-null  int64  
 1   date      230130 non-null  object 
 2   country   230130 non-null  object 
 3   store     230130 non-null  object 
 4   product   230130 non-null  object 
 5   num_sold  221259 non-null  float64
dtypes: float64(1), int64(1), object(4)
memory usage: 10.5+

In [21]:
# Fill missing `num_sold` values using grouped median imputation
df_train['num_sold'] = df_train.groupby(['store', 'product'])['num_sold'].transform(lambda x: x.fillna(x.median()))


In [22]:
# Combine train and test datasets for easier feature engineering
df_test['num_sold'] = -1  # Placeholder for the test target
df_combined = pd.concat([df_train, df_test], axis=0, ignore_index=True)



In [23]:

# Time-based features
df_combined['date'] = pd.to_datetime(df_combined['date'])
df_combined['day_of_week'] = df_combined['date'].dt.dayofweek
df_combined['month'] = df_combined['date'].dt.month
df_combined['year'] = df_combined['date'].dt.year
df_combined['is_weekend'] = (df_combined['day_of_week'] >= 5).astype(int)


In [24]:

# Interaction features
df_combined['country_product'] = df_combined['country'] + '_' + df_combined['product']
df_combined['store_product'] = df_combined['store'] + '_' + df_combined['product']


In [25]:
# Add cyclical encoding for time-based features (e.g., month, day_of_week)
df_combined['month_sin'] = np.sin(2 * np.pi * df_combined['month'] / 12)
df_combined['month_cos'] = np.cos(2 * np.pi * df_combined['month'] / 12)
df_combined['day_of_week_sin'] = np.sin(2 * np.pi * df_combined['day_of_week'] / 7)
df_combined['day_of_week_cos'] = np.cos(2 * np.pi * df_combined['day_of_week'] / 7)


In [26]:
# Country-specific holiday features (update with your holiday logic if available)
country_holidays = {
    'Canada': pd.to_datetime(['2010-07-01', '2010-12-25']),
    'Finland': pd.to_datetime(['2010-06-25', '2010-12-25']),
    'Italy': pd.to_datetime(['2010-08-15', '2010-12-25']),
    'Kenya': pd.to_datetime(['2010-12-12', '2010-12-25']),
    'Norway': pd.to_datetime(['2010-05-17', '2010-12-25']),
    'Singapore': pd.to_datetime(['2010-08-09', '2010-12-25']),
}

In [27]:
df_combined['is_holiday'] = 0
for country, holidays in country_holidays.items():
    df_combined.loc[df_combined['country'] == country, 'is_holiday'] = df_combined.loc[
        df_combined['country'] == country, 'date'
    ].isin(holidays).astype(int)

df_combined

Unnamed: 0,id,date,country,store,product,num_sold,day_of_week,month,year,is_weekend,country_product,store_product,month_sin,month_cos,day_of_week_sin,day_of_week_cos,is_holiday
0,0,2010-01-01,Canada,Discount Stickers,Holographic Goose,104.0,4,1,2010,0,Canada_Holographic Goose,Discount Stickers_Holographic Goose,5.000000e-01,0.866025,-0.433884,-0.900969,0
1,1,2010-01-01,Canada,Discount Stickers,Kaggle,973.0,4,1,2010,0,Canada_Kaggle,Discount Stickers_Kaggle,5.000000e-01,0.866025,-0.433884,-0.900969,0
2,2,2010-01-01,Canada,Discount Stickers,Kaggle Tiers,906.0,4,1,2010,0,Canada_Kaggle Tiers,Discount Stickers_Kaggle Tiers,5.000000e-01,0.866025,-0.433884,-0.900969,0
3,3,2010-01-01,Canada,Discount Stickers,Kerneler,423.0,4,1,2010,0,Canada_Kerneler,Discount Stickers_Kerneler,5.000000e-01,0.866025,-0.433884,-0.900969,0
4,4,2010-01-01,Canada,Discount Stickers,Kerneler Dark Mode,491.0,4,1,2010,0,Canada_Kerneler Dark Mode,Discount Stickers_Kerneler Dark Mode,5.000000e-01,0.866025,-0.433884,-0.900969,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
328675,328675,2019-12-31,Singapore,Premium Sticker Mart,Holographic Goose,-1.0,1,12,2019,0,Singapore_Holographic Goose,Premium Sticker Mart_Holographic Goose,-2.449294e-16,1.000000,0.781831,0.623490,0
328676,328676,2019-12-31,Singapore,Premium Sticker Mart,Kaggle,-1.0,1,12,2019,0,Singapore_Kaggle,Premium Sticker Mart_Kaggle,-2.449294e-16,1.000000,0.781831,0.623490,0
328677,328677,2019-12-31,Singapore,Premium Sticker Mart,Kaggle Tiers,-1.0,1,12,2019,0,Singapore_Kaggle Tiers,Premium Sticker Mart_Kaggle Tiers,-2.449294e-16,1.000000,0.781831,0.623490,0
328678,328678,2019-12-31,Singapore,Premium Sticker Mart,Kerneler,-1.0,1,12,2019,0,Singapore_Kerneler,Premium Sticker Mart_Kerneler,-2.449294e-16,1.000000,0.781831,0.623490,0


In [28]:
from sklearn.preprocessing import LabelEncoder

# Drop unnecessary columns
df_combined = df_combined.drop(['id', 'date'], axis=1)

# Encode categorical features
le = LabelEncoder()
for col in ['country', 'store', 'product', 'country_product', 'store_product']:
    df_combined[col] = le.fit_transform(df_combined[col])

In [29]:
# Encode categorical features
le = LabelEncoder()
for col in ['country', 'store', 'product', 'country_product', 'store_product']:
    df_combined[col] = le.fit_transform(df_combined[col])

In [30]:
# Split train and test datasets
df_train = df_combined[df_combined['num_sold'] != -1]
df_test = df_combined[df_combined['num_sold'] == -1].drop('num_sold', axis=1)


In [31]:
X = df_train.drop('num_sold', axis=1)
y = df_train['num_sold']


In [32]:
# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [38]:
# Define LightGBM model
model = lgb.LGBMRegressor(
    objective='regression',
    metric='mape',
    random_state=42,
    learning_rate=0.01,
    max_depth=7,
    num_leaves=31,
    n_estimators=1000,
    min_child_samples=20,
    reg_alpha=1.0,
    reg_lambda=1.0,
    early_stopping_rounds=50,  # Add early_stopping_rounds here
    verbosity=-1  # Set verbosity level (-1 for no output, 1 for info, 0 for warnings)
)

# Train the model
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='mape'
)

In [39]:
# Validate the model
y_val_pred = model.predict(X_val)
mape = mean_absolute_percentage_error(y_val, y_val_pred)
print(f"Validation MAPE: {mape * 100:.2f}%")

Validation MAPE: 32.94%


In [40]:
test_predictions = model.predict(df_test)


In [60]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit
from lightgbm import LGBMRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
#from fbprophet import Prophet
from sklearn.metrics import mean_absolute_percentage_error

# Load data
train = pd.read_csv('train.csv', parse_dates=['date'])
test = pd.read_csv('test.csv', parse_dates=['date'])

test['num_sold']=0
test.head()

Unnamed: 0,id,date,country,store,product,num_sold
0,230130,2017-01-01,Canada,Discount Stickers,Holographic Goose,0
1,230131,2017-01-01,Canada,Discount Stickers,Kaggle,0
2,230132,2017-01-01,Canada,Discount Stickers,Kaggle Tiers,0
3,230133,2017-01-01,Canada,Discount Stickers,Kerneler,0
4,230134,2017-01-01,Canada,Discount Stickers,Kerneler Dark Mode,0


In [None]:
pip install fbprophet

In [49]:
# Handle missing values
def handle_missing_values(df):
    # Forward fill missing 'num_sold' values within each group
    group_cols = ['country', 'store', 'product']
    df['num_sold'] = df.groupby(group_cols)['num_sold'].ffill()
    return df

In [54]:
train = handle_missing_values(train)



Unnamed: 0,id,date,country,store,product,num_sold
0,0,2010-01-01,Canada,Discount Stickers,Holographic Goose,
1,1,2010-01-01,Canada,Discount Stickers,Kaggle,973.0
2,2,2010-01-01,Canada,Discount Stickers,Kaggle Tiers,906.0
3,3,2010-01-01,Canada,Discount Stickers,Kerneler,423.0
4,4,2010-01-01,Canada,Discount Stickers,Kerneler Dark Mode,491.0


In [61]:
# Feature engineering
def create_features(df):
    # Date-based features
    df['year'] = df.date.dt.year
    df['month'] = df.date.dt.month
    df['day'] = df.date.dt.day
    df['dayofweek'] = df.date.dt.dayofweek
    df['is_weekend'] = df.dayofweek >= 5
    df['is_month_end'] = df.date.dt.is_month_end
    
    # Group by country-store-product for temporal features
    group_cols = ['country', 'store', 'product']
    
    # Lag features
    for lag in [1, 7, 14, 30]:
        df[f'lag_{lag}'] = df.groupby(group_cols)['num_sold'].shift(lag)
    
    # Rolling features (7-day and 30-day windows)
    df['rolling_7_mean'] = df.groupby(group_cols)['num_sold'].transform(
        lambda x: x.rolling(7, min_periods=1).mean()
    )
    df['rolling_7_std'] = df.groupby(group_cols)['num_sold'].transform(
        lambda x: x.rolling(7, min_periods=1).std()
    )
    df['rolling_30_mean'] = df.groupby(group_cols)['num_sold'].transform(
        lambda x: x.rolling(30, min_periods=1).mean()
    )
    df['rolling_30_std'] = df.groupby(group_cols)['num_sold'].transform(
        lambda x: x.rolling(30, min_periods=1).std()
    )
    
    return df

In [62]:
# Process both train and test
train = create_features(train)
test = create_features(test)


In [63]:
# Encode categorical variables
cat_cols = ['country', 'store', 'product']
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col])
    test[col] = le.transform(test[col])
    label_encoders[col] = le

In [64]:
# Drop rows with missing lag features (from feature engineering)
train = train.dropna()


In [65]:
# Split features
X = train.drop(['id', 'date', 'num_sold'], axis=1)
y = train['num_sold']
X_test = test.drop(['id', 'date'], axis=1)

In [76]:
# LightGBM model with grouped cross-validation
lgb_params = {
    'objective': 'mape',
    'metric': 'mape',
    'n_estimators': 2000,
    'learning_rate': 0.05,
    'num_leaves': 50,
    'colsample_bytree': 0.8,
    'subsample': 0.8,
    'random_state': 42,
    'early_stopping_rounds': 100,  # Add early stopping here
    'verbose': -1  # Suppress LightGBM logs
}

# Time-series cross-validation
tscv = TimeSeriesSplit(n_splits=5)
best_iterations = []
scores = []


In [77]:

for train_idx, val_idx in tscv.split(X):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    model = LGBMRegressor(**lgb_params)
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)]
    )
    best_iterations.append(model.best_iteration_)
    val_pred = model.predict(X_val)
    score = mean_absolute_percentage_error(y_val, val_pred)
    scores.append(score)
    print(f"Fold MAPE: {score:.4f}")

print(f"Average MAPE: {np.mean(scores):.4f}")


Fold MAPE: 0.1254
Fold MAPE: 0.1065
Fold MAPE: 0.0966
Fold MAPE: 0.0838
Fold MAPE: 0.0756
Average MAPE: 0.0976


In [79]:

# Ensure best_iterations contains valid values
best_iterations = [max(1, x) for x in best_iterations]  # Ensure no value is 0

# Retrain on full data with average best iteration
final_model = LGBMRegressor(
    **{**lgb_params, 'n_estimators': int(np.mean(best_iterations)), 'early_stopping_rounds': None}  # Disable early stopping
)
final_model.fit(X, y)


In [85]:
X_test = test.drop(['id','date','num_sold'], axis=1)


In [86]:
X_test

Unnamed: 0,country,store,product,year,month,day,dayofweek,is_weekend,is_month_end,lag_1,lag_7,lag_14,lag_30,rolling_7_mean,rolling_7_std,rolling_30_mean,rolling_30_std
0,0,0,0,2017,1,1,6,True,False,,,,,0.0,,0.0,
1,0,0,1,2017,1,1,6,True,False,,,,,0.0,,0.0,
2,0,0,2,2017,1,1,6,True,False,,,,,0.0,,0.0,
3,0,0,3,2017,1,1,6,True,False,,,,,0.0,,0.0,
4,0,0,4,2017,1,1,6,True,False,,,,,0.0,,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98545,5,1,0,2019,12,31,1,False,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98546,5,1,1,2019,12,31,1,False,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98547,5,1,2,2019,12,31,1,False,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98548,5,1,3,2019,12,31,1,False,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [87]:
X_train

Unnamed: 0,country,store,product,year,month,day,dayofweek,is_weekend,is_month_end,lag_1,lag_7,lag_14,lag_30,rolling_7_mean,rolling_7_std,rolling_30_mean,rolling_30_std
2701,0,0,1,2010,1,31,6,True,True,620.0,811.0,727.0,973.0,634.428571,54.082829,676.866667,98.275381
2702,0,0,2,2010,1,31,6,True,True,624.0,714.0,628.0,906.0,573.857143,44.749036,607.500000,83.632921
2703,0,0,3,2010,1,31,6,True,True,313.0,340.0,319.0,423.0,264.714286,34.581856,294.000000,48.003592
2704,0,0,4,2010,1,31,6,True,True,373.0,427.0,442.0,491.0,368.142857,19.003759,377.366667,42.093069
2705,0,2,0,2010,1,31,6,True,True,219.0,223.0,249.0,300.0,220.500000,23.444971,229.850000,25.676684
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192452,2,0,2,2015,11,9,0,False,False,458.0,365.0,353.0,403.0,416.285714,46.381441,403.200000,42.475470
192453,2,0,3,2015,11,9,0,False,False,271.0,215.0,219.0,247.0,243.142857,20.144715,239.700000,21.471954
192454,2,0,4,2015,11,9,0,False,False,243.0,190.0,192.0,217.0,205.142857,23.596812,193.766667,19.732143
192455,2,2,0,2015,11,9,0,False,False,173.0,132.0,124.0,140.0,145.142857,17.228161,137.533333,14.090186


In [88]:

# Generate predictions
test_pred = final_model.predict(X_test)


In [89]:

# Create submission
submission = pd.DataFrame({
    'id': test['id'],
    'num_sold': test_pred.clip(0)  # Ensure non-negative predictions
})
submission.to_csv('submission222.csv', index=False)
