In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

#import numpy as np # linear algebra
#import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

#import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
    #for filename in filenames:
        #print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [16]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate
from tensorflow.keras.models import Model, load_model

# Optimize Data Types

In [5]:
# Convert to more efficient types where possible
articles_df = pd.read_csv("/kaggle/input/h-and-m-personalized-fashion-recommendations/articles.csv", dtype={
    'article_id': 'int32',
    'product_code': 'int32',
    'product_type_no': 'int16',
    'graphical_appearance_no': 'int16',
    'colour_group_code': 'int16',
    'perceived_colour_value_id': 'int16',
    'perceived_colour_master_id': 'int16',
    'department_no': 'int16',
    'index_group_no': 'int16',
    'section_no': 'int16',
    'garment_group_no': 'int16',
    # Convert object columns with few unique values to categorical
    'prod_name': 'category',
    'product_type_name': 'category',
    'product_group_name': 'category',
    'graphical_appearance_name': 'category',
    'colour_group_name': 'category',
    'perceived_colour_value_name': 'category',
    'perceived_colour_master_name': 'category',
    'department_name': 'category',
    'index_code': 'category',
    'index_name': 'category',
    'index_group_name': 'category',
    'section_name': 'category',
    'garment_group_name': 'category'
})


In [6]:
transactions_df =  pd.read_csv("/kaggle/input/h-and-m-personalized-fashion-recommendations/transactions_train.csv", dtype={
    'customer_id': 'category',  # Assuming many unique customer IDs
    'article_id': 'int32',
    'price': 'float32',
    'sales_channel_id': 'int8'
})
transactions_df['t_dat'] = pd.to_datetime(transactions_df['t_dat'])


In [7]:
customers_df = pd.read_csv("/kaggle/input/h-and-m-personalized-fashion-recommendations/customers.csv", dtype={
    'customer_id': 'category',
    'FN': 'float32',
    'Active': 'float32',
    'club_member_status': 'category',
    'fashion_news_frequency': 'category',
    'age': 'float32',
    'postal_code': 'category'
})


# Filter and downsize transactions data

In [8]:
# Load your transactions data here, e.g.
# transactions_df = pd.read_csv("path_to_transactions.csv")

# Step 1: Find active users
user_counts = transactions_df['customer_id'].value_counts()
active_users = user_counts[user_counts > 20].index  # Set threshold for 'active' (e.g., users with > 20 purchases)

# Step 2: Find popular items
item_counts = transactions_df['article_id'].value_counts()
popular_items = item_counts[item_counts > 50].index  # Set threshold for 'popular' (e.g., items with > 50 purchases)

# Step 3: Filter transactions_df to include only active users and popular items
filtered_transactions = transactions_df[
    (transactions_df['customer_id'].isin(active_users)) & 
    (transactions_df['article_id'].isin(popular_items))
]

# Step 4: Further downsample if necessary to reach exactly 4,000,000 rows
if len(filtered_transactions) > 4000000:
    filtered_transactions = filtered_transactions.sample(n=4000000, random_state=1)  # Randomly sample 4 million rows

# Check the result
print(f"Filtered transactions size: {len(filtered_transactions)}")


Filtered transactions size: 4000000


# Create User-Item Interactions

In [10]:
# Encode user and item IDs
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

filtered_transactions['user_id_encoded'] = user_encoder.fit_transform(filtered_transactions['customer_id'])
filtered_transactions['item_id_encoded'] = item_encoder.fit_transform(filtered_transactions['article_id'])

# NCF Model

In [12]:
# Define embedding sizes
embedding_dim = 50

# User and item embeddings
user_input = Input(shape=(1,))
item_input = Input(shape=(1,))
user_embedding = Embedding(input_dim=len(user_encoder.classes_), output_dim=embedding_dim)(user_input)
item_embedding = Embedding(input_dim=len(item_encoder.classes_), output_dim=embedding_dim)(item_input)

# Flatten and concatenate
user_vec = Flatten()(user_embedding)
item_vec = Flatten()(item_embedding)
concat = Concatenate()([user_vec, item_vec])

# Add dense layers
dense = Dense(128, activation='relu')(concat)
output = Dense(1, activation='sigmoid')(dense)

# Build and compile the model
model = Model([user_input, item_input], output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [14]:
# Create the interaction labels
filtered_transactions['interaction'] = 1  # All transactions are positive interactions

# Prepare TensorFlow dataset
batch_size = 1024
train_dataset = tf.data.Dataset.from_tensor_slices((
    (filtered_transactions['user_id_encoded'].values, filtered_transactions['item_id_encoded'].values),
    filtered_transactions['interaction'].values
)).batch(batch_size).shuffle(10000)

In [15]:
# Train the model
epochs = 5
model.fit(train_dataset, epochs=epochs)

Epoch 1/5
[1m3907/3907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m816s[0m 206ms/step - accuracy: 0.9972 - loss: 0.0534
Epoch 2/5
[1m3907/3907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m804s[0m 203ms/step - accuracy: 1.0000 - loss: 2.3218e-06
Epoch 3/5
[1m3907/3907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m801s[0m 203ms/step - accuracy: 1.0000 - loss: 2.7911e-07
Epoch 4/5
[1m3907/3907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m797s[0m 202ms/step - accuracy: 1.0000 - loss: 4.5014e-08
Epoch 5/5
[1m3907/3907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m797s[0m 202ms/step - accuracy: 1.0000 - loss: 9.3867e-09


<keras.src.callbacks.history.History at 0x78e427682bc0>

In [17]:
# Save the model after training
model.save('ncf_model.h5')

In [18]:
# Load the saved model
loaded_model = load_model('ncf_model.h5')

# Now you can use the model for inference or further training

In [None]:
# Example: Predicting on new data
user_ids = [0, 1, 2]  # Replace with actual user ids
item_ids = [100, 200, 300]  # Replace with actual item ids

# Assuming user_encoder and item_encoder are the same as during training
user_input = user_encoder.transform(user_ids)
item_input = item_encoder.transform(item_ids)

# Predict using the loaded model
predictions = loaded_model.predict([user_input, item_input])

# Display predictions
print(predictions)


In [19]:
sample_submission_df = pd.read_csv("/kaggle/input/h-and-m-personalized-fashion-recommendations/sample_submission.csv")
sample_submission_df.shape

(1371980, 2)

In [25]:
sample_submission_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1371980 entries, 0 to 1371979
Data columns (total 2 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   customer_id  1371980 non-null  object
 1   prediction   1371980 non-null  object
dtypes: object(2)
memory usage: 20.9+ MB


In [27]:
sample_submission_df = sample_submission_df.astype({'customer_id': 'category'})

In [28]:
sample_submission_df['user_id_encoded'] = user_encoder.transform(sample_submission_df['customer_id'])

ValueError: y contains previously unseen labels: '000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318'