**Using Content Based Filtering to recommend similar items by:**
1. Creating user-feature matrix
1. Creating item-feature matrix
1. Measuring similarity using the dot product as the metric
1. Recommending top-k similar items 


**Second Approach**

Perform dimensionality reduction with PCA: fit it on the user_feature matrix, then project both the user_feature and item_feature matrices into the reduced space

**Only the first 100,000 rows of the transactions file are used**

**Customers are limited to those who bought at least two distinct items**


In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import random
from skimage import io

In [None]:
# Categorical article attributes that span the feature space shared by the
# user and item matrices. Defined first so the column selection below can
# reuse it instead of repeating the same names.
feature_subset = ['product_group_name',
       'graphical_appearance_name', 'colour_group_name',
       'perceived_colour_value_name',
       'perceived_colour_master_name',
       'department_name', 'index_name',
       'index_group_name', 'section_name',
       'garment_group_name']

# Only the first 100,000 transactions are used. `nrows` reads exactly that
# prefix, whereas opening a chunked reader just to take its first chunk leaves
# an unclosed file handle behind.
users = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv', nrows=100000)
articles = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/articles.csv')

# Attach the article attributes to every transaction row, then keep the
# transaction keys, the descriptive columns and the feature columns.
df = users.merge(articles, on='article_id')
df = df[['t_dat', 'customer_id', 'article_id', 'prod_name', 'product_type_name']
        + feature_subset
        + ['detail_desc']]

In [None]:
# Show the (rows, columns) size of the merged transactions/articles frame.
df.shape

In [None]:
# Preview the first rows of the merged transactions/articles frame.
df.head()

In [None]:
# Choose features to build feature space
features = feature_subset
df1 = df[['customer_id', 'article_id'] + features]

# One-hot encode every chosen attribute. The dtype is pinned because the
# default differs between pandas releases (uint8 before 2.0, bool from 2.0);
# the later sums, dot products and concatenations with float user vectors
# need a numeric dtype.
dummies_df = pd.get_dummies(df1, columns=features, dtype=np.uint8)
dummies_df

In [None]:
# A customer must have bought at least this many distinct articles to be kept.
minimum_items = 2
groupby_customer = dummies_df.groupby('customer_id')

# Parallel accumulators: `l` holds one summed feature vector per kept
# customer, `cutomer_ids` the matching customer id, and `article_ids` every
# article bought by a kept customer (repeats included).
l, cutomer_ids, article_ids = [], [], []
for key, temp in groupby_customer:
    if temp.article_id.nunique() < minimum_items:
        continue
    feature_totals = temp.drop(columns='article_id').sum(numeric_only=True)
    l.append(feature_totals.values)
    cutomer_ids.append(key)
    article_ids.extend(temp.article_id.tolist())

In [None]:
# Raw per-customer feature counts; the first two columns of dummies_df
# (customer_id, article_id) are not features and are skipped.
user_feature = pd.DataFrame(l, columns=dummies_df.columns[2:])

# Divide each row by its own total so every customer's features sum to 1,
# then label the rows with the customer ids collected alongside `l`.
row_totals = user_feature.sum(axis=1)
normalized_user_feature = user_feature.div(row_totals, axis=0)
normalized_user_feature.index = pd.Index(cutomer_ids, name='customer_id')
normalized_user_feature

In [None]:
# One row of one-hot features per article bought by a retained customer,
# indexed by article_id. The customer column is dropped because it is not an
# article attribute.
bought_by_kept_customer = dummies_df.article_id.isin(article_ids)
item_feature = (
    dummies_df.loc[bought_by_kept_customer]
    .drop_duplicates(subset='article_id')
    .drop(columns='customer_id')
    .set_index('article_id')
)
item_feature

In [None]:
# Customer-by-article score table: dot product of each customer's normalized
# feature vector with each article's one-hot feature vector.
scores = normalized_user_feature @ item_feature.T
scores

In [None]:
def get_rcmnd(customer_id, scores):
    """Rank the articles a customer has not bought yet by their score.

    Args:
        customer_id: Row label of the customer in ``scores``; also used as the
            group key in the module-level ``groupby_customer``.
        scores: Customer-by-article score table (customers as rows, article
            ids as columns).

    Returns:
        A tuple ``(ordered, customer_prev_items)``: the customer's scores for
        all articles except the ones already bought, sorted from highest to
        lowest, and the ``article_id`` column of the customer's transactions.
    """
    score_row = scores.loc[customer_id]
    purchased = groupby_customer.get_group(customer_id)['article_id']
    # Already-bought articles are removed so they cannot be recommended again.
    unseen = score_row.drop(purchased.values)
    return unseen.sort_values(ascending=False), purchased

In [None]:
def plot_prev(prev_items):
    """Show the product images of a customer's previously bought articles.

    Images are read from ``<path>/<first three id characters>/<id>.jpg``,
    where ``path`` is the module-level image directory and the id is the
    article id with a leading ``'0'`` restored.

    Args:
        prev_items: Iterable of article ids (for example the Series returned
            by ``get_rcmnd``).
    """
    columns = 6
    items = list(prev_items)
    # Add whole rows as needed: a fixed 1x6 grid raises ValueError as soon as
    # a customer has more than six previous items. Up to six items the layout
    # is the same single row as before.
    rows = max(1, -(-len(items) // columns))

    fig = plt.figure(figsize=(20, 10))
    for position, item in enumerate(items, start=1):
        item = '0' + str(item)
        sub = item[:3]
        image = plt.imread(path + "/" + sub + "/" + item + ".jpg")
        fig.add_subplot(rows, columns, position)
        plt.imshow(image)
In [None]:
def plot_rcmnd(rcmnds):
    """Show the product images of the first ``k`` recommended articles.

    ``k`` and ``path`` are read from module level when the function is
    called. Images are read from
    ``<path>/<first three id characters>/<id>.jpg``, the id being the article
    id with a leading ``'0'`` restored.

    Args:
        rcmnds: Iterable of recommended article ids, best first.
    """
    columns = 6
    # At most k items are shown, or fewer if fewer were supplied.
    items = list(rcmnds)[:max(k, 0)]
    # Add whole rows as needed: a fixed 1x6 grid raises ValueError once k
    # exceeds six. Up to six items the layout is the same single row as before.
    rows = max(1, -(-len(items) // columns))

    fig = plt.figure(figsize=(20, 10))
    for position, item in enumerate(items, start=1):
        item = '0' + str(item)
        sub = item[:3]
        image = plt.imread(path + "/" + sub + "/" + item + ".jpg")
        fig.add_subplot(rows, columns, position)
        plt.imshow(image)

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Fit a 100-component PCA on the normalized user features (fit() returns the
# estimator itself), then report the share of variance those components keep.
pca = PCA(n_components=100).fit(normalized_user_feature)
pca.explained_variance_ratio_.sum()

In [None]:
# Column labels are derived from the fitted model so they always match the
# width of the transformed matrices, instead of repeating the literal 100.
component_names = ['component_{}'.format(i) for i in range(1, pca.n_components_ + 1)]

# Project users and items into the same reduced space (the PCA was fitted on
# the user matrix) and keep the original row labels.
user_feature_pca = pd.DataFrame(
    pca.transform(normalized_user_feature),
    columns=component_names,
    index=normalized_user_feature.index,
)
item_feature_pca = pd.DataFrame(
    pca.transform(item_feature),
    columns=component_names,
    index=item_feature.index,
)

In [None]:
# Customer-by-article scores computed in the PCA-reduced space.
scores_pca = user_feature_pca @ item_feature_pca.T

In [None]:
# Image directory and number of recommendations read by the plotting helpers.
path = "../input/h-and-m-personalized-fashion-recommendations/images"
k = 6

# Demo customer: the second row of the score table.
customer_id = scores.index[1]

# Rank unseen articles with the raw scores and with the PCA scores. Both
# calls return the same purchase history for this customer.
rcmnds, prev_items = get_rcmnd(customer_id, scores)
rcmnds_pca, prev_items = get_rcmnd(customer_id, scores_pca)

# Keep only the ids of the k best-scored articles from each ranking.
rcmnds, rcmnds_pca = rcmnds.index.values[:k], rcmnds_pca.index.values[:k]

In [None]:
# Show the images of the demo customer's previously bought articles.
plot_prev(prev_items)

In [None]:
# Show the images of the top recommendations from the raw feature scores.
plot_rcmnd(rcmnds)

In [None]:
# Show the images of the top recommendations from the PCA-space scores.
plot_rcmnd(rcmnds_pca)

In [None]:
# Number of random customer draws used to build positive (bought) examples.
pos = 2000
users_cnt = len(normalized_user_feature)
items_cnt = len(item_feature)

# Every training row is a customer's feature vector followed by an article's
# feature vector, so the column labels are the two label lists back to back.
train_columns = normalized_user_feature.columns.tolist() + item_feature.columns.tolist()

# Row blocks are collected here and stacked once at the end. Growing a frame
# with DataFrame.append inside the loop copies all rows on every iteration,
# and the method no longer exists in pandas 2.0.
positive_blocks = []
for _ in range(pos):
    # randint's upper bound is exclusive: `users_cnt` (not `users_cnt-1`)
    # lets every customer, including the last row, be drawn.
    idx = np.random.randint(0, users_cnt)
    user = normalized_user_feature.iloc[idx]
    temp = groupby_customer.get_group(normalized_user_feature.index[idx])
    if temp.article_id.nunique() >= minimum_items:
        # Use the distinct articles from a random 75% of the customer's
        # transactions as positives.
        user_items = item_feature.loc[temp.sample(frac=0.75).article_id.unique()]
        # Repeat the customer vector once per article and join it to the
        # article vectors column-wise.
        user_part = np.tile(user.to_numpy(dtype=float), (len(user_items), 1))
        positive_blocks.append(np.hstack([user_part, user_items.to_numpy(dtype=float)]))

if positive_blocks:
    train_df = pd.DataFrame(np.vstack(positive_blocks), columns=train_columns)
else:
    train_df = pd.DataFrame(columns=train_columns)
len(train_df)

In [None]:
# One label of 1.0 for every row currently in train_df (all positives so far).
positive_count = len(train_df)
pos_labels = pd.Series(np.full(positive_count, 1.0))

In [None]:
# Collapse the collected purchases to distinct article ids; as a set it
# supports the set difference used when drawing negative examples.
article_ids = set(article_ids)

# Number of distinct articles bought by the retained customers.
len(article_ids)

In [None]:
# Number of random customer draws used to build negative (not bought) examples.
neg = 60

# Frames are collected and concatenated once. DataFrame.append inside the loop
# copies the whole training frame on every iteration and no longer exists in
# pandas 2.0.
negative_blocks = []
for _ in range(neg):
    # randint's upper bound is exclusive: `users_cnt` (not `users_cnt-1`)
    # lets every customer, including the last row, be drawn.
    idx = np.random.randint(0, users_cnt)
    user = normalized_user_feature.iloc[idx]
    temp = groupby_customer.get_group(normalized_user_feature.index[idx])
    # Candidate negatives: every known article this customer never bought.
    user_articles_neg = list(article_ids - set(temp.article_id.unique()))
    # Draw 1/150th of the candidates at random (with replacement).
    items_current_user = item_feature.loc[np.random.choice(user_articles_neg,len(user_articles_neg)//150)]
    # Repeat the customer vector once per article and join it to the article
    # vectors column-wise, using the existing training column layout.
    user_part = np.tile(user.to_numpy(dtype=float), (len(items_current_user), 1))
    negative_blocks.append(
        pd.DataFrame(
            np.hstack([user_part, items_current_user.to_numpy(dtype=float)]),
            columns=train_df.columns,
        )
    )

train_df = pd.concat([train_df] + negative_blocks, ignore_index=True)
    

In [None]:
# Rows added after the positives are the negatives: one 0.0 label for each.
negative_count = len(train_df) - len(pos_labels)
neg_labels = pd.Series(np.zeros(negative_count))
len(neg_labels)

In [None]:
import tensorflow as tf
# Record which TensorFlow version the notebook ran with.
print(tf.__version__)

In [None]:
BATCH_SIZE = 64
SHUFFLE = 100          # minimum shuffle buffer size
TRAIN_PERCENT = 0.8
SPLIT_SEED = 42        # fixed seed so the train/validation split is reproducible

# A plain float32 matrix: the training frame may carry object dtype, which
# cannot be converted to a tensor.
features = train_df.to_numpy(dtype=np.float32)
# tf.one_hot needs integer indices, while the label Series hold 1.0 / 0.0.
labels = pd.concat([pos_labels,neg_labels],ignore_index = True).astype('int32')

# train_df holds all positives first and all negatives last. Without shuffling
# before the split, the last 20% used for validation would contain negatives
# only.
order = np.random.default_rng(SPLIT_SEED).permutation(len(features))
features = features[order]
labels = labels.iloc[order].reset_index(drop=True)

full_dataset = tf.data.Dataset.from_tensor_slices((features, tf.one_hot(indices = labels.to_numpy(),depth = 2)))
sep = int(len(features)*TRAIN_PERCENT)
train_features = features[:sep]
test_features = features[sep:]
train_labels = labels.iloc[:sep]
test_labels = labels.iloc[sep:]

train_dataset = tf.data.Dataset.from_tensor_slices((train_features, tf.one_hot(indices = train_labels.to_numpy(),depth = 2)))
test_dataset = tf.data.Dataset.from_tensor_slices((test_features, tf.one_hot(indices = test_labels.to_numpy(),depth = 2)))

# A buffer covering the whole dataset gives a full reshuffle every epoch; a
# 100-element buffer only mixes neighbouring rows.
train_dataset = train_dataset.shuffle(max(SHUFFLE, len(train_features))).batch(BATCH_SIZE)
# Evaluation does not depend on row order, so the validation set is only batched.
test_dataset = test_dataset.batch(BATCH_SIZE)
full_dataset = full_dataset.shuffle(max(SHUFFLE, len(features))).batch(BATCH_SIZE)

In [None]:
# Fully connected classifier over the concatenated (customer, article) feature
# vector. The last layer emits two raw logits: index 0 = "not bought",
# index 1 = "bought", matching the one-hot labels.
model = tf.keras.Sequential(
    [
        tf.keras.layers.Dense(512,activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(256,activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(128,activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(64,activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(32,activation='relu'),
        tf.keras.layers.Dense(2)
    ]
)
# The predictions are later turned into probabilities with a softmax over the
# two logits, so the model is trained with the matching softmax cross-entropy.
# Binary cross-entropy would train two independent sigmoids, and
# BinaryAccuracy would threshold the raw logits at 0.5. The metric keeps the
# name 'binary_accuracy' so the history keys used for plotting are unchanged.
model.compile(loss=tf.losses.CategoricalCrossentropy(from_logits=True),
                optimizer=tf.optimizers.Adam(learning_rate=0.0001),
             metrics=[tf.keras.metrics.CategoricalAccuracy(name='binary_accuracy')])

In [None]:
# Train for 30 epochs, validating on test_dataset; the returned history object
# is kept for the loss/accuracy plots.
history = model.fit(train_dataset, epochs=30,validation_data = test_dataset)

In [None]:
# Spot check: score a few randomly chosen articles for one fixed customer
# (row 30 of the user matrix) and print the softmax of the model output.
cnt_possible_rcmnd = 3
user = normalized_user_feature.iloc[30]
for _ in range(cnt_possible_rcmnd):
    random_row = np.random.randint(0, items_cnt)
    item = item_feature.iloc[random_row]
    print(item.name)
    # The model expects a single row: customer features followed by article
    # features.
    datapoint = pd.concat([user, item], ignore_index=True).to_numpy().reshape(1, -1)
    x = model.predict(datapoint)
    print(tf.nn.softmax(x))

In [None]:
# Evaluate the trained model on the held-out test_dataset.
results = model.evaluate(test_dataset)

In [None]:
# Training versus validation loss per epoch.
for curve in ('loss', 'val_loss'):
    plt.plot(history.history[curve])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train','val'], loc='upper left')
plt.show()

In [None]:
# Training versus validation accuracy per epoch.
for curve in ('binary_accuracy', 'val_binary_accuracy'):
    plt.plot(history.history[curve])
plt.title('model binary accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train','val'], loc='upper left')
plt.show()

In [None]:
# Reload the transactions file in full (no row limit), replacing the earlier
# merged `df`.
df = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv')

In [None]:
# Window bounds as ISO date strings: the latest transaction date in the data
# and the date seven days before it.
latest_day = pd.to_datetime(df.t_dat.max())
week_before = latest_day - pd.DateOffset(days=7)
end = latest_day.date().isoformat()
start = week_before.date().isoformat()


In [None]:
# Distinct customers with at least one transaction dated between `start` and
# `end`, both bounds included.
in_window = df.t_dat.between(start, end)
customers_to_rcmnd = df.loc[in_window, 'customer_id'].unique()
len(customers_to_rcmnd)

In [None]:
# Count transactions per article inside the date window and keep the seven
# articles with the highest counts.
window_rows = df[(df.t_dat >= start) & (df.t_dat <= end)]
sales_per_article = window_rows.groupby('article_id').size()
most_popular_items_of_the_week = sales_per_article.sort_values(ascending=False).iloc[:7]

In [None]:
# Turn the top articles into id strings for the submission. The ids were
# parsed as integers, which drops their leading zero (the plotting helpers
# put it back with '0' + str(id)), so it is restored here as well.
# NOTE(review): assumes full article ids are 10 characters wide; confirm
# against the competition's sample submission.
most_popular_items_of_the_week = [
    str(article_id).zfill(10)
    for article_id in most_popular_items_of_the_week.index.values
]
most_popular_items_of_the_week

In [None]:
# Start from an empty frame with the two columns written to the submission file.
submission = pd.DataFrame(columns = ['customer_id','prediction'])
submission

In [None]:
# Fallback recommendation list as zero-padded id strings. zfill leaves ids
# that already carry their leading zero untouched.
# NOTE(review): assumes full article ids are 10 characters wide; confirm
# against the competition's sample submission.
popular_fallback = [str(article_id).zfill(10) for article_id in most_popular_items_of_the_week]

# Rows are collected in a list and turned into a frame once.
# DataFrame.append inside the loop copies the whole frame per customer and no
# longer exists in pandas 2.0.
submission_rows = []
for el in customers_to_rcmnd:
    if el in normalized_user_feature.index:
        cnt_possible_rcmnd = 30
        user = normalized_user_feature.loc[el]
        # Draw distinct random candidate articles so the same id cannot appear
        # twice in one prediction.
        picks = np.random.choice(items_cnt, size=min(cnt_possible_rcmnd, items_cnt), replace=False)
        candidates = item_feature.iloc[picks]
        # Score all candidates in a single predict call: one row per
        # candidate, customer features followed by article features.
        user_part = np.tile(user.to_numpy(dtype=np.float32), (len(candidates), 1))
        datapoints = np.hstack([user_part, candidates.to_numpy(dtype=np.float32)])
        probabilities = tf.nn.softmax(model.predict(datapoints, verbose=0)).numpy()
        # Keep candidates whose "bought" probability (column 1) is at least 0.8.
        rcmnd = [
            str(article_id).zfill(10)
            for article_id, bought_probability in zip(candidates.index, probabilities[:, 1])
            if bought_probability >= 0.8
        ]
        if not rcmnd:
            # No candidate passed the threshold: use the popular items rather
            # than submitting an empty prediction.
            rcmnd = popular_fallback
    else:
        # Customer without a feature vector: recommend the popular items.
        rcmnd = popular_fallback
    submission_rows.append((el, " ".join(rcmnd)))

submission = pd.DataFrame(submission_rows, columns=['customer_id', 'prediction'])
        
        
                
            

In [None]:
!pwd

In [None]:
# Write the submission frame to submission.csv without the index column.
submission.to_csv('submission.csv',index=False)