In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Concatenate, Input, Dropout
from tensorflow.keras.applications import VGG16
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications.vgg16 import preprocess_input
from sklearn.neighbors import NearestNeighbors


In [2]:
# Dataset locations: the image folder sits next to the metadata CSV.
images_directory = 'archive (4)/images'
# Product metadata table; later cells read its `id`, `productDisplayName`
# and `masterCategory` columns.
styles = pd.read_csv('archive (4)/styles.csv')


In [3]:
# Draw a reproducible random subset of 500 rows (fixed seed) to work with
selected_samples = styles.sample(n=500, random_state=42)
# Product ids of the sampled rows; used later to build the image file names
selected_images = selected_samples['id'].to_numpy()


In [4]:
# Textual data preprocessing
# Build TF-IDF features from the product display names (English stop words
# removed). Missing names are replaced with an empty string first:
# TfidfVectorizer raises a ValueError on NaN documents, whereas an empty
# document simply produces an all-zero feature row.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
text_features = tfidf_vectorizer.fit_transform(
    selected_samples['productDisplayName'].fillna('').astype(str)
)


In [5]:
# Image data preprocessing
def preprocess_image(image_path, target_size=(224, 224)):
    """Load one product image and apply the VGG16 input preprocessing.

    Parameters
    ----------
    image_path : int or str
        Image identifier relative to ``images_directory``, without the
        ``.jpg`` extension (it is converted with ``str`` and the extension
        is appended here).
    target_size : tuple of int, default (224, 224)
        Size forwarded to ``load_img`` when reading the file.

    Returns
    -------
    numpy.ndarray
        The image array after VGG16 preprocessing.
    """
    file_name = os.path.join(images_directory, str(image_path)) + ".jpg"
    img = load_img(file_name, target_size=target_size)
    img_array = img_to_array(img)
    # Call the VGG16 preprocessing through its fully-qualified path. A later
    # cell defines its own `preprocess_input(text, image_path)`, which rebinds
    # the bare global name; using the bare name here would raise a TypeError
    # on every call made after that cell has run.
    return tf.keras.applications.vgg16.preprocess_input(img_array)

# Preprocess every sampled image and collect the results into one array
image_data = np.array(list(map(preprocess_image, selected_images)))


In [6]:
# Text branch input: one feature per column of the TF-IDF matrix
text_input = Input(name='text_input', shape=(text_features.shape[1],))

# Image branch input: 224x224 images with 3 channels
visual_input = Input(name='visual_input', shape=(224, 224, 3))


In [7]:
# Text branch: 128-unit ReLU projection followed by dropout (rate 0.2)
text_model = Dropout(rate=0.2)(
    Dense(units=128, activation='relu')(text_input)
)


In [8]:
# Visual branch: ImageNet-pretrained VGG16 without its classifier head,
# global average pooling, then the same 128-unit ReLU + dropout (rate 0.2)
# head as the text branch.
# NOTE(review): nothing here freezes the VGG16 weights, so presumably the
# whole backbone is updated during `fit` -- confirm this is intended.
visual_model = VGG16(include_top=False, weights='imagenet')(visual_input)
visual_model = tf.keras.layers.GlobalAveragePooling2D()(visual_model)
visual_model = Dropout(rate=0.2)(
    Dense(units=128, activation='relu')(visual_model)
)


In [9]:
# Concatenate textual and visual features
concatenated = Concatenate()([text_model, visual_model])
# Size the softmax layer from the data instead of hard-coding 8: the labels
# are integer-encoded with one index per distinct `masterCategory` value, so
# the classifier needs exactly that many output units. A fixed width fails
# with sparse categorical cross-entropy if more categories are sampled and
# leaves unused output units if fewer are.
num_classes = len(selected_samples['masterCategory'].unique())
output = Dense(num_classes, activation='softmax')(concatenated)


In [10]:
# Integer-encode the target: each distinct masterCategory value gets the
# position at which `unique()` returns it.
label_map = dict(
    (label, idx)
    for idx, label in enumerate(selected_samples['masterCategory'].unique())
)
y_train_encoded = selected_samples['masterCategory'].map(label_map)


In [11]:
# Assemble the two-input classifier and configure it for training
model = Model(outputs=output, inputs=[text_input, visual_input])
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)



In [12]:
# Fit on the sampled data: 10 epochs, mini-batches of 32
model.fit(
    x=[text_features, image_data],
    y=y_train_encoded,
    batch_size=32,
    epochs=10,
)


Epoch 1/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m369s[0m 22s/step - accuracy: 0.2597 - loss: 12.9012
Epoch 2/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m375s[0m 23s/step - accuracy: 0.4214 - loss: 1.3724
Epoch 3/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m478s[0m 30s/step - accuracy: 0.4565 - loss: 1.1973
Epoch 4/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m377s[0m 22s/step - accuracy: 0.5290 - loss: 1.0458
Epoch 5/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m362s[0m 22s/step - accuracy: 0.5835 - loss: 0.9792
Epoch 6/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m347s[0m 21s/step - accuracy: 0.7566 - loss: 0.8260
Epoch 7/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m380s[0m 23s/step - accuracy: 0.7930 - loss: 0.6732
Epoch 8/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m413s[0m 24s/step - accuracy: 0.8945 - loss: 0.4377
Epoch 9/10
[1m16/16[0m [32m━━━━━━━━━

<keras.src.callbacks.history.History at 0x197b5b83430>

In [13]:
# Run the trained model over the training inputs; its outputs are the
# feature vectors that the nearest-neighbour index is fitted on.
features = model.predict(x=[text_features, image_data])


[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m397s[0m 10s/step


In [14]:
# Index the extracted features for 5-nearest-neighbour lookup
nn = NearestNeighbors(algorithm='auto', n_neighbors=5)
nn.fit(X=features)

In [15]:
def preprocess_input(text, image_path):
    """Convert a raw (text, image) query into the two inputs the model takes.

    Parameters
    ----------
    text : str
        Query text; vectorised with the already-fitted ``tfidf_vectorizer``.
    image_path : int or str
        Image identifier forwarded to ``preprocess_image``.

    Returns
    -------
    tuple
        ``(text_vector, image_batch)``: the TF-IDF row for ``text`` and the
        preprocessed image with a leading axis of length 1 added.
    """
    # NOTE: this definition rebinds the global name `preprocess_input`,
    # replacing the VGG16 function of the same name imported at the top of
    # the notebook for any code that looks the bare name up afterwards.
    query_vector = tfidf_vectorizer.transform([text])
    query_image = preprocess_image(image_path)
    return query_vector, query_image[np.newaxis, ...]

def get_recommendations(text, image_path):
    """Return the sampled products closest to a (text, image) query.

    The query is converted with ``preprocess_input``, passed through the
    trained ``model``, and the resulting vector is looked up in the fitted
    nearest-neighbour index ``nn``.

    Parameters
    ----------
    text : str
        Query text describing the product.
    image_path : int or str
        Image identifier forwarded to ``preprocess_input``.

    Returns
    -------
    pandas.DataFrame
        Rows of ``selected_samples`` at the positions of the nearest
        neighbours.
    """
    query_vector, query_image = preprocess_input(text, image_path)
    query_embedding = model.predict([query_vector, query_image])
    # kneighbors returns (distances, indices); only the indices are needed,
    # and only those of the single query row.
    _, neighbor_indices = nn.kneighbors(query_embedding.reshape(1, -1))
    return selected_samples.iloc[neighbor_indices[0]]

In [None]:
# Example usage
text = "blue cotton shirt"
# `preprocess_image` already prepends `images_directory` and appends ".jpg",
# so only the bare image id is passed here. The previous value 'images/1163'
# resolved to 'archive (4)/images/images/1163.jpg'.
image_path = '1163'
recommendations = get_recommendations(text, image_path)
print(recommendations[['id', 'productDisplayName']])
