In [1]:
import pandas as pd
import tensorflow as tf
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

In [2]:
# Read the three competition CSVs (training rows, rows to predict, and the
# submission template) in a single unpacking.
df_train, df_test, df_sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s5e1/train.csv',
        '/kaggle/input/playground-series-s5e1/test.csv',
        '/kaggle/input/playground-series-s5e1/sample_submission.csv',
    )
)

In [3]:
# Work on a deep copy so the raw training frame stays untouched.
df = df_train.copy(deep=True)

In [4]:
# Discard every row that contains a missing value; the original index labels
# of the surviving rows are kept.
df = df.dropna()

In [5]:
# The `id` column is not used as a model input, so remove it here.
df = df.drop(columns='id')
df

Unnamed: 0,date,country,store,product,num_sold
1,2010-01-01,Canada,Discount Stickers,Kaggle,973.0
2,2010-01-01,Canada,Discount Stickers,Kaggle Tiers,906.0
3,2010-01-01,Canada,Discount Stickers,Kerneler,423.0
4,2010-01-01,Canada,Discount Stickers,Kerneler Dark Mode,491.0
5,2010-01-01,Canada,Stickers for Less,Holographic Goose,300.0
...,...,...,...,...,...
230125,2016-12-31,Singapore,Premium Sticker Mart,Holographic Goose,466.0
230126,2016-12-31,Singapore,Premium Sticker Mart,Kaggle,2907.0
230127,2016-12-31,Singapore,Premium Sticker Mart,Kaggle Tiers,2299.0
230128,2016-12-31,Singapore,Premium Sticker Mart,Kerneler,1242.0


In [6]:
# Turn the date strings into whole seconds elapsed since 1970-01-01 so the
# column becomes a plain integer feature.
unix_epoch = pd.Timestamp("1970-01-01")
parsed_dates = pd.to_datetime(df['date'])
df['date'] = (parsed_dates - unix_epoch) // pd.Timedelta('1s')
df

Unnamed: 0,date,country,store,product,num_sold
1,1262304000,Canada,Discount Stickers,Kaggle,973.0
2,1262304000,Canada,Discount Stickers,Kaggle Tiers,906.0
3,1262304000,Canada,Discount Stickers,Kerneler,423.0
4,1262304000,Canada,Discount Stickers,Kerneler Dark Mode,491.0
5,1262304000,Canada,Stickers for Less,Holographic Goose,300.0
...,...,...,...,...,...
230125,1483142400,Singapore,Premium Sticker Mart,Holographic Goose,466.0
230126,1483142400,Singapore,Premium Sticker Mart,Kaggle,2907.0
230127,1483142400,Singapore,Premium Sticker Mart,Kaggle Tiers,2299.0
230128,1483142400,Singapore,Premium Sticker Mart,Kerneler,1242.0


In [7]:
# Fit a min-max scaler on the epoch-second dates of this frame and map them
# into [0, 1]. The fitted scaler is kept in `date_scaler` because the same
# mapping is applied to the test frame further down.
date_scaler = MinMaxScaler().fit(df[['date']])
df['date'] = date_scaler.transform(df[['date']])

df

Unnamed: 0,date,country,store,product,num_sold
1,0.0,Canada,Discount Stickers,Kaggle,973.0
2,0.0,Canada,Discount Stickers,Kaggle Tiers,906.0
3,0.0,Canada,Discount Stickers,Kerneler,423.0
4,0.0,Canada,Discount Stickers,Kerneler Dark Mode,491.0
5,0.0,Canada,Stickers for Less,Holographic Goose,300.0
...,...,...,...,...,...
230125,1.0,Singapore,Premium Sticker Mart,Holographic Goose,466.0
230126,1.0,Singapore,Premium Sticker Mart,Kaggle,2907.0
230127,1.0,Singapore,Premium Sticker Mart,Kaggle Tiers,2299.0
230128,1.0,Singapore,Premium Sticker Mart,Kerneler,1242.0


In [8]:
# Integer-encode the string columns. One fitted LabelEncoder per column is
# stored in `encoders` so the identical mapping can be applied to other frames.
categorical_cols = ['country', 'store', 'product']
encoders = {col: LabelEncoder().fit(df[col]) for col in categorical_cols}
for col, fitted_encoder in encoders.items():
    df[col] = fitted_encoder.transform(df[col])

print(df.head())

   date  country  store  product  num_sold
1   0.0        0      0        1     973.0
2   0.0        0      0        2     906.0
3   0.0        0      0        3     423.0
4   0.0        0      0        4     491.0
5   0.0        0      2        0     300.0


In [9]:
# Separate the regression target from the feature columns.
target_col = 'num_sold'
y = df[target_col]
X = df.drop(columns=[target_col])

In [10]:
# Hold out 20% of the rows for evaluation, with a fixed seed for a repeatable
# shuffle.
# NOTE(review): this is a random split, not a chronological one, whereas the
# competition test dates scale to values above 1.0 (later than every training
# date) — the hold-out score may be optimistic for that setting; confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Report how many GPU devices TensorFlow can see.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))


Num GPUs Available:  1


In [14]:
# Number of feature columns fed to the network (date, country, store, product).
X_train.shape[1]

4

In [19]:
# Fully connected regressor: three ReLU hidden layers of decreasing width
# followed by a single linear output unit for the predicted `num_sold`.
model = Sequential()
for hidden_units in (128, 64, 32):
    model.add(Dense(hidden_units, activation='relu'))
model.add(Dense(1))


In [20]:
# Optimise mean absolute percentage error directly, using Adam with a
# learning rate of 0.01.
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=adam_optimizer, loss='mean_absolute_percentage_error')

In [21]:
# Train for a fixed 50 epochs on the training split. No validation data or
# callbacks are passed, so only the training loss is reported per epoch.
model.fit(x=X_train, y=y_train, epochs=50)

Epoch 1/50
[1m5532/5532[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1ms/step - loss: 46.2278
Epoch 2/50
[1m5532/5532[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1ms/step - loss: 13.9701
Epoch 3/50
[1m5532/5532[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1ms/step - loss: 12.7708
Epoch 4/50
[1m5532/5532[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1ms/step - loss: 12.6280
Epoch 5/50
[1m5532/5532[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1ms/step - loss: 12.3498
Epoch 6/50
[1m5532/5532[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1ms/step - loss: 12.1912
Epoch 7/50
[1m5532/5532[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step - loss: 12.3339
Epoch 8/50
[1m5532/5532[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step - loss: 12.2393
Epoch 9/50
[1m5532/5532[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1ms/step - loss: 12.0848
Epoch 10/50
[1m5532/5532[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

<keras.src.callbacks.history.History at 0x79b3d81638b0>

In [22]:
# Predict `num_sold` for the held-out rows.
yhat = model.predict(x=X_test)

[1m1383/1383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step


In [23]:
# Display the hold-out predictions produced by model.predict(X_test).
yhat

array([[1370.674   ],
       [1016.6443  ],
       [ 328.3687  ],
       ...,
       [  16.898098],
       [1250.903   ],
       [ 905.4142  ]], dtype=float32)

In [24]:
# Display the true hold-out targets for a visual comparison with yhat.
y_test

60922     1486.0
10862     1015.0
47898      362.0
214250       5.0
229502     586.0
           ...  
7103       555.0
70301     1792.0
109584      20.0
73627     1275.0
67949      993.0
Name: num_sold, Length: 44252, dtype: float64

In [25]:
# Score the hold-out predictions against the true targets. The metric value is
# a fraction, so it is multiplied by 100 when printed as a percentage.
mape = mean_absolute_percentage_error(y_test, yhat)

print(f'MAPE: {mape * 100:.2f}%')

MAPE: 11.40%


In [26]:
# Display the unrounded MAPE as a fraction (the print above shows it times 100).
mape

0.11397010708121691

In [29]:
# Apply the training-time date preprocessing to the competition test rows:
# drop `id`, convert dates to epoch seconds, then reuse the scaler that was
# fitted on the training dates (so values may fall outside [0, 1]).
# Note that `df` is rebound here and no longer refers to the training frame.
test_epoch = pd.Timestamp("1970-01-01")
df = df_test.copy().drop(columns=['id'])
test_dates = pd.to_datetime(df['date'])
df['date'] = (test_dates - test_epoch) // pd.Timedelta('1s')
df['date'] = date_scaler.transform(df[['date']])
df

Unnamed: 0,date,country,store,product
0,1.000391,Canada,Discount Stickers,Holographic Goose
1,1.000391,Canada,Discount Stickers,Kaggle
2,1.000391,Canada,Discount Stickers,Kaggle Tiers
3,1.000391,Canada,Discount Stickers,Kerneler
4,1.000391,Canada,Discount Stickers,Kerneler Dark Mode
...,...,...,...,...
98545,1.428404,Singapore,Premium Sticker Mart,Holographic Goose
98546,1.428404,Singapore,Premium Sticker Mart,Kaggle
98547,1.428404,Singapore,Premium Sticker Mart,Kaggle Tiers
98548,1.428404,Singapore,Premium Sticker Mart,Kerneler


In [30]:
# Encode the test frame's categorical columns with the encoders fitted on the
# training data, so each category maps to the same integer as during training.
for col, le in encoders.items():
    df[col] = le.transform(df[col])
print(df.head())

       date  country  store  product
0  1.000391        0      0        0
1  1.000391        0      0        1
2  1.000391        0      0        2
3  1.000391        0      0        3
4  1.000391        0      0        4


In [33]:
# Predict `num_sold` for the preprocessed test rows. This overwrites the
# hold-out predictions previously stored in `yhat`.
yhat = model.predict(x=df)

[1m3080/3080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step


In [35]:
# Store the predictions in the submission template. The model output has shape
# (n, 1), so it is flattened to a 1-D column; rows are matched by position.
df_sub['num_sold'] = yhat.ravel()

In [36]:
# Write the submission file. index=False keeps the DataFrame's row index out of
# the CSV, so the file contains only the columns of the submission frame
# instead of gaining an extra unnamed leading column.
df_sub.to_csv('submission.csv', index=False)