In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt
import warnings
# Suppress DeprecationWarning and FutureWarning
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [7]:
def preprocess_data(train_df, test_df):
    """
    Preprocess both training and test data
    
    Parameters:
    train_df (DataFrame): Training data
    test_df (DataFrame): Test data
    
    Returns:
    tuple: Processed training data, processed test data, encoders, and scalers
    """
    # Create copies to avoid modifying original data
    train = train_df.copy()
    test = test_df.copy()
    
    # Convert dates to datetime
    train['date'] = pd.to_datetime(train['date'])
    test['date'] = pd.to_datetime(test['date'])

    
    # Initialize encoders and scalers
    family_encoder = LabelEncoder()
    scalers = {
        'sales': StandardScaler(),
        'onpromotion': StandardScaler(),
        'store_nbr': StandardScaler()
    }
    
    # Encode categorical variables 
    train['family'] = family_encoder.fit_transform(train['family'])
    test['family'] = family_encoder.transform(test['family'])
    
    # Scale numerical features
    train['sales'] = scalers['sales'].fit_transform(train[['sales']])
    
    for col in ['onpromotion']:
        train[f'{col}'] = scalers[col].fit_transform(train[[col]])
        test[f'{col}'] = scalers[col].transform(test[[col]])
    
    return train, test, family_encoder, scalers

In [13]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
sub = pd.read_csv("store-sales-time-series-forecasting/sample_submission.csv")
train_df.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0
1,1,2013-01-01,1,BABY CARE,0.0,0
2,2,2013-01-01,1,BEAUTY,0.0,0
3,3,2013-01-01,1,BEVERAGES,0.0,0
4,4,2013-01-01,1,BOOKS,0.0,0


In [14]:
train, test, family_encoder, scalers = preprocess_data(train_df, test_df)

In [15]:
def create_train_sequences(train_df, time_steps=6):
    """
    Create training sequences efficiently using grouped data
    
    Parameters:
    train_df: DataFrame containing training data
    time_steps: Number of previous time steps to use (default 6)
    
    Returns:
    X_train: Array of input sequences
    y_train: Array of target values
    """
    # Sort the DataFrame by date within each group
    train_df = train_df.sort_values(['store_nbr', 'family', 'date'])
    
    # Initialize lists to store sequences
    X, y = [], []
    
    # Group by store_nbr and family
    groups = train_df.groupby(['store_nbr', 'family'])
    
    # Iterate through each group
    for _, group in tqdm(groups, desc="Creating train sequences"):
        # Convert relevant columns to numpy for faster processing
        sales = group['sales'].values
        features = group[['onpromotion', 'family','store_nbr']].values
        
        # Create sequences for this group
        for i in range(len(group) - time_steps):
            X.append(features[i:i+time_steps])
            y.append(sales[i+time_steps])
    
    return np.array(X), np.array(y)

In [16]:
x,y = create_train_sequences(train, 6)
x.shape, y.shape

Creating train sequences: 100%|███████████████████████████████████████████████████| 1782/1782 [00:08<00:00, 205.56it/s]


((2990196, 6, 3), (2990196,))

In [17]:
def create_complete_test_sequences(test_df, train_df, time_steps=6):
    """
    Create test sequences for every single row in test_df with multiple features

    Parameters:
    test_df: DataFrame containing test data
    train_df: DataFrame containing training data
    time_steps: Number of previous time steps to use (default 6)

    Returns:
    X_test: Array of input sequences matching test_df length with multiple features
    """
    # Combine train and test data, keeping only necessary columns
    combined_df = pd.concat([
        train_df[['id', 'date', 'store_nbr', 'family', 'onpromotion']],
        test_df[['id', 'date', 'store_nbr', 'family', 'onpromotion']]
    ]).sort_values(['store_nbr', 'family', 'date'])

    # Initialize array to store all sequences (with 3 features now: family, store_nbr, onpromotion)
    X_test = np.zeros((len(test_df), time_steps, 3))  # 3 features

    # Create a dictionary for faster lookups, using (store_nbr, family) as the key
    combined_dict = {}
    for (store_nbr, family), group in combined_df.groupby(['store_nbr', 'family']):
        combined_dict[(store_nbr, family)] = group

    # Iterate through test_df
    for idx, test_row in tqdm(test_df.iterrows(), total=len(test_df), desc="Creating test sequences"):
        store = test_row['store_nbr']
        family = test_row['family']
        test_date = test_row['date']

        # Get the corresponding group using (store_nbr, family) as the key
        group = combined_dict.get((store, family))

        # # Handle cases where no matching group is found (if any)
        # if group is None:
        #     continue  # or handle as needed (e.g., set sequence to zeros)

        # Find the index of the current test date in this group
        try:
            date_idx = group[group['date'] == test_date].index[0]
            group_date_idx = list(group.index).index(date_idx)
        except IndexError:
            # Handle missing date in the group (if necessary)
            print("problem with the data")

        # Create sequence for multiple features: family, store_nbr, onpromotion
        sequence = np.zeros((time_steps, 3))  # 3 features in the sequence

        for i in range(time_steps):
            if group_date_idx - (time_steps - i) >= 0:
                # Feature 1: family
                sequence[i, 0] = group.iloc[group_date_idx - (time_steps - i)]['family']
                # Feature 2: store_nbr
                sequence[i, 1] = group.iloc[group_date_idx - (time_steps - i)]['store_nbr']
                # Feature 3: onpromotion
                sequence[i, 2] = group.iloc[group_date_idx - (time_steps - i)]['onpromotion']

        # Store the sequence in X_test
        X_test[idx] = sequence

    return X_test


In [18]:
x_test = create_complete_test_sequences(test, train)

Creating test sequences: 100%|██████████████████████████████████████████████████| 28512/28512 [01:51<00:00, 254.88it/s]


In [20]:
x_test.shape

(28512, 6, 3)

In [19]:
# Model building function
def build_model(input_shape):
    """
    Build and compile the LSTM model
    
    Parameters:
    input_shape (tuple): Shape of input data
    
    Returns:
    Sequential: Compiled Keras model
    """
    model = Sequential()
    # Define the input layer explicitly
    model.add(Input(shape=input_shape))
    
    # First LSTM layer with Dropout to prevent overfitting
    model.add(LSTM(units=32, return_sequences=True))
    model.add(Dropout(0.2))
    
    # Second LSTM layer
    model.add(LSTM(units=16, return_sequences=False))
    model.add(Dropout(0.2))
    
    # Dense layer for output
    model.add(Dense(units=1))

    # Compile the model
    model.compile(optimizer='adam', loss='mean_squared_logarithmic_error')

    return model

In [21]:
model = build_model((6,3))

In [22]:
model.summary()

In [23]:
def train_model(model, X_train, y_train, epochs=2, batch_size = 64):
    """
    Train the LSTM model
    
    Parameters:
    model: Compiled Keras model
    X_train: Training sequences
    y_train: Target values
    epochs (int): Number of epochs to train
    batch_size (int): Batch size for training
    validation_split (float): Fraction of data to use for validation
    
    Returns:
    History: Training history
    """
    history = model.fit(
        X_train, y_train,
        epochs=epochs,
        batch_size = batch_size,
        verbose=True
    )
    return history

In [None]:
history = train_model(model, x, y)
history

Epoch 1/2
[1m20558/46722[0m [32m━━━━━━━━[0m[37m━━━━━━━━━━━━[0m [1m4:00[0m 9ms/step - loss: 0.0323

In [None]:
plt.figure(figsize=(10, 6))
plt.plot(history.history['loss'], label='Training Loss')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
predictions_scaled = model.predict(x_test)

In [None]:
predictions = scalers['sales'].inverse_transform(predictions_scaled)
predictions = predictions.flatten()
predictions

In [None]:
sub['sales'] = predictions
sub

In [None]:
sub.to_csv('submission.csv', index = False)