# DS 5230 - Unsupervised Machine Learning and Data Mining
### Content Based Recommender Systems
#### Author - Shubhanshu Gupta

#### 1) Reading json data in PySpark and cleaning it 

In [None]:
# Install a headless Java 8 JDK, download and unpack the Spark 2.4.7 / Hadoop 2.7 build,
# and install findspark so the unpacked Spark can be located from Python.
# NOTE(review): downloads.apache.org may only keep current releases - confirm this URL still resolves.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-2.4.7/spark-2.4.7-bin-hadoop2.7.tgz
!tar xf spark-2.4.7-bin-hadoop2.7.tgz
!pip install -q findspark

In [None]:
# Packages used by the text cleaning class: num2words spells out numbers,
# wordninja splits run-together words.
pip install num2words
pip install wordninja

In [None]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.7-bin-hadoop2.7"
import findspark
findspark.init()
from google.colab import files
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import isnan, when, count, col, lit, trim, avg, ceil
from pyspark.sql.types import StringType
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import pyspark.sql.functions as f
from pyspark.sql.types import *
from pyspark.sql import Row
from pyspark.sql import SQLContext
import pandas as pd
from num2words import num2words
import nltk
nltk.download('punkt')
import re
from contractions import CONTRACTION_MAP
from nltk.tokenize import word_tokenize
import wordninja
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
import urllib.request
from sklearn.model_selection import train_test_split
import scipy
import sklearn
import random
from PIL import Image
import requests
from io import BytesIO
from textwrap import wrap
import matplotlib.gridspec as gridspec
nltk.download('punkt')
from nltk.corpus import stopwords
import random
import pickle as pk
from gensim import corpora, models, similarities
import gensim
from ast import literal_eval
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser
from gensim.models import KeyedVectors
from keras.applications import vgg16, resnet50
from keras.preprocessing import image
from keras.applications.resnet50 import preprocess_input, decode_predictions
from keras.layers import GlobalMaxPooling2D
import tensorflow as tf
import keras
from keras import Model
from scipy import sparse
# Names that later cells use (np.*, sqrt, mean_squared_error) but that were never imported
from math import sqrt
import numpy as np
from sklearn.metrics import mean_squared_error

In [None]:
# Download the review file and the product metadata file, saved locally as
# electronics.json and metadata.json.
!wget https://usmlproject.s3.amazonaws.com/Electronics.json -O electronics.json
!wget https://usmlproject.s3.amazonaws.com/meta_Electronics.json -O metadata.json

In [None]:
# Local Spark session that uses every available core.
sc = SparkSession.builder.master("local[*]").getOrCreate()

# SQLContext wraps a SparkContext, so pass the session's underlying context
# instead of the session object itself.
sqlContext = SQLContext(sc.sparkContext)
df = sqlContext.read.json("electronics.json")
df.printSchema()
# Only the product id, the rating and the reviewer id are used further down.
final_df = df.select("asin", "overall", "reviewerID")
final_df.show(10)
# repartition(1) produces a single part file; Spark still creates a directory
# at this path, with the part file inside it.
final_df.repartition(1).write.csv("/content/ratings.csv", header=True)

In [None]:
# Load the product metadata; this rebinds `df`, which held the reviews until now.
df = sqlContext.read.json("metadata.json")
df.printSchema()

In [None]:
# Remove the listed metadata columns, then keep a single row per product id (asin).
cols_to_drop = ['also_buy','also_view','similar_item','tech1','tech2','fit','details','rank']
df = df.drop(*cols_to_drop)
dup_df = df.drop_duplicates(subset=['asin'])

In [None]:
# Strip the dollar sign so the price can be cast to a number. The raw string
# keeps the same regex while avoiding the invalid "\$" string escape.
dup_df = dup_df.withColumn("price", f.regexp_replace(f.col("price"), r"[\$]", ""))
dup_df = dup_df.withColumn("price", dup_df["price"].cast(FloatType()))
# Undo the HTML-escaped ampersand in category names.
dup_df = dup_df.withColumn("main_cat", f.regexp_replace(f.col("main_cat"), "&amp;", "&"))
# NOTE(review): the downloaded metadata.json may already occupy this path, and
# Spark writes a directory here - confirm the target before running.
dup_df.coalesce(1).write.format('json').save('/content/metadata.json')

### Text Preprocessing

In [None]:
class TextProcessing:
    """Cleaning steps for product descriptions and feature lists.

    ``text_preprocessing`` runs the whole pipeline; every step can also be
    called on its own.
    """

    # Measurement units dropped by ``remove_units`` (exact, case-sensitive token match)
    unit_lst = ['g','kg','lbs','lb','oz','mm','cm','km','m','ft','in','inch','ml','kw','j','kj']

    # Compiled once when the class is created instead of on every call
    _HTML_PATTERN = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    _SINGLE_LETTER_PATTERN = re.compile('(?:^| )[b-hj-z](?= |$)')

    ## Removing html elements from the descriptions and features
    def cleanhtml(self, raw_html):
        """Return ``raw_html`` without tags and without named or numeric character entities."""
        return self._HTML_PATTERN.sub('', raw_html)

    # Removing punctuations from the text
    def remove_punctuations(self, text):
        """Keep only ASCII letters and whitespace."""
        # The range has to be A-Z: "A-z" also covers [ \ ] ^ _ and the backtick,
        # which sit between the upper- and lower-case letters.
        return re.sub(r'[^a-zA-Z\s]', '', text)

    # Removing all special_characters except english alphabets with option to remove digits
    def remove_special_characters(self, text, remove_digits=False):
        """Keep ASCII letters and whitespace, plus digits unless ``remove_digits`` is true."""
        pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
        return re.sub(pattern, '', text)

    # Convert numbers into words
    def convert_text(self, text):
        """Spell out every space-separated token that is a plain number (at most one dot)."""
        return ' '.join(
            num2words(token) if token.replace('.', '', 1).isdigit() else token
            for token in text.split(' ')
        )

    # Remove single letters like x b etc.
    def remove_single_letters(self, text):
        """Drop stand-alone lower-case letters other than "a" and "i"."""
        return self._SINGLE_LETTER_PATTERN.sub('', text)

    # expand contractions such as isn't to is not
    def expand_contractions(self, text, contraction_mapping=None):
        """Replace contractions with their expansions and remove remaining apostrophes.

        Args:
            text: input string.
            contraction_mapping: dict of contraction -> expansion. Defaults to
                ``CONTRACTION_MAP``, looked up when the method is called.

        Returns:
            The expanded text; the first character of every replaced
            contraction is kept as written in ``text``.
        """
        if contraction_mapping is None:
            contraction_mapping = CONTRACTION_MAP
        if not contraction_mapping:
            # An empty alternation would match everywhere; nothing to expand.
            return re.sub("'", "", text)

        lowered = {key.lower(): value for key, value in contraction_mapping.items()}
        # Longest keys first so "can't've" is not cut short by "can't"; keys are
        # escaped so they are always matched literally.
        ordered_keys = sorted(contraction_mapping.keys(), key=len, reverse=True)
        contractions_pattern = re.compile(
            '({})'.format('|'.join(re.escape(key) for key in ordered_keys)),
            flags=re.IGNORECASE|re.DOTALL)

        def expand_match(contraction):
            match = contraction.group(0)
            expanded = contraction_mapping.get(match) or lowered.get(match.lower())
            if not expanded:
                # No usable expansion: leave the matched text untouched.
                return match
            return match[0] + expanded[1:]

        expanded_text = contractions_pattern.sub(expand_match, text)
        return re.sub("'", "", expanded_text)

    # A lot of units are used so to remove common units
    def remove_units(self, text):
        """Tokenize ``text`` and drop every token listed in ``unit_lst``."""
        # The class attribute must be reached through self; a bare ``unit_lst``
        # is not visible inside a method.
        units = set(self.unit_lst)
        tokens = (token.strip() for token in word_tokenize(text))
        return ' '.join(token for token in tokens if token not in units)

    # Splitting joined english words like 'themoney' to 'the money'
    def split_words(self, text):
        """Tokenize ``text`` and split run-together words with wordninja."""
        tokens = (token.strip() for token in word_tokenize(text))
        return ' '.join(' '.join(wordninja.split(token)) for token in tokens)

    def text_preprocessing(self, text):
        """Run every cleaning step in order and return the lower-cased result."""
        text = self.cleanhtml(text)
        text = self.expand_contractions(text)
        text = self.convert_text(text)
        text = self.split_words(text)
        text = self.remove_single_letters(text)
        text = self.remove_units(text)
        text = self.remove_special_characters(text, True)
        # removing extra whitespace
        text = re.sub(' +', ' ', text)
        # stripping extra space
        text = text.strip()
        return text.lower()

text_processing = TextProcessing()

In [None]:
# Read the metadata as JSON Lines (one JSON object per line) into pandas.
# NOTE(review): the Spark cell saves to '/content/metadata.json', which Spark creates
# as a directory - confirm a single JSON Lines file exists at this relative path.
product_df = pd.read_json("metadata.json", lines=True)
product_df

In [None]:
# Make sure asin is a string, then strip the first and the last character of
# every description / feature value.
# NOTE(review): presumably the values are stringified lists such as "[...]"; if they
# are real lists, s[1:-1] drops the first and last element instead - confirm.
product_df['asin'] = product_df['asin'].astype('str')
product_df['description'] = product_df['description'].apply(lambda s: s[1:-1])
product_df['feature'] = product_df['feature'].apply(lambda s: s[1:-1])

In [None]:
# Run the cleaning pipeline over both free-text columns, join them into one
# 'text' column and persist the result.
product_df['description'] = product_df['description'].apply(text_processing.text_preprocessing)
product_df['feature'] = product_df['feature'].apply(text_processing.text_preprocessing)
product_df['text'] = product_df['description'] + ' ' + product_df['feature']
product_df.to_csv('product_desc.csv')

### Exploratory Data Analysis

In [None]:
# Load the asin / overall / reviewerID ratings exported by the Spark cell.
# NOTE(review): Spark saved '/content/ratings.csv' as a directory of part files -
# confirm a plain CSV file exists at this relative path.
rating_df = pd.read_csv("ratings.csv")
rating_df.head()

In [None]:
# Showing distribution of ratings in the user review data:
# one bar per distinct value of the 'overall' column.
sns.countplot(x="overall", data=rating_df, palette="Set3")

In [None]:
# Top 10 category of products in electronics dataset
cat_df = product_df["main_cat"].value_counts(sort = True)[:10].to_frame()
# After reset_index the plot expects the category names in a column called "index"
# and the counts in a column called "main_cat".
# NOTE(review): these column names depend on the installed pandas version - confirm.
cat_df = cat_df.reset_index()
sns.set(style="darkgrid")
g = sns.barplot(x="main_cat", y="index", data=cat_df)
# Write each count at the end of its bar.
for index, row in cat_df.iterrows():
  g.text(row.main_cat,index, row.main_cat, color='black', ha="left")

In [None]:
# Top 10 brands based on number of products in electronics dataset
# (rows with an empty brand or the brand 'Generic' are left out of the count).
brand_df = product_df[(product_df["brand"] != '') & (product_df["brand"] != 'Generic')]['brand'].value_counts(sort = True)[:10].to_frame()
# NOTE(review): the "index" / "brand" column names used below depend on the
# installed pandas version - confirm.
brand_df = brand_df.reset_index()
g = sns.barplot(x="brand", y="index", data=brand_df)
# Write each count at the end of its bar.
for index, row in brand_df.iterrows():
  g.text(row.brand,index, row.brand, color='black', ha="left")

In [None]:
# Review per category: attach the product metadata to every rating via asin,
# count the rows per main category and plot the ten largest counts.
full_df = rating_df.set_index('asin').join(product_df.set_index('asin'))
review_per_cat = full_df.groupby("main_cat").count()
# 'overall' holds the number of non-null ratings per category after count().
rev_df = review_per_cat.sort_values('overall', ascending=False)[:10].reset_index()
g = sns.barplot(x="overall", y="main_cat", data=rev_df)
# Write each count at the end of its bar.
for index, row in rev_df.iterrows():
  g.text(row.overall,index, row.overall, color='black', ha="left")

### Approach 1 : Content based Recommender based on only features and descriptions of products

In [None]:
## The full catalogue is too large for the recommenders, so shrink it:
## keep only the five most frequent categories and the products that list exactly five images.
# 'asin' is carried along because later cells filter the ratings on
# products['asin'] and re-index the saved file on it. copy() avoids assigning
# into a slice of product_df.
products = product_df[['asin','description','feature','text','brand','main_cat','title','image']].copy()
products['no_of_images'] = products['image'].apply(len)
temp = products["main_cat"].value_counts(sort = True)[:5].index
temp = temp.to_list()
products = products[(products['main_cat'].isin(temp))]
# Final copy so the next cell can overwrite the 'image' column without touching a view.
products = products[(products['no_of_images'] == 5)].copy()

In [None]:
# Changing url to get full image
# A ".<modifier>" segment that sits directly in front of the image extension.
# The dot of the extension is escaped so that only a real ".jpg" / ".jpeg" /
# ".png" / ".gif" suffix triggers the removal.
_IMAGE_MODIFIER_RE = re.compile(r"\.[^.]*(?=\.(?:jpg|jpeg|png|gif))")


def change_url(x):
    """Return the image URLs in ``x`` with the modifier segment before the extension removed.

    Args:
        x: iterable of URL strings.

    Returns:
        A new list with one cleaned URL per input URL, in the same order.
    """
    return [_IMAGE_MODIFIER_RE.sub("", im) for im in x]
# Rewrite every URL in each product's image list with change_url.
products['image'] = products['image'].apply(lambda x: change_url(x))

In [None]:
# Saving final product file
# reset_index moves the current row labels into an ordinary column before writing.
products.reset_index(inplace=True)
products.to_csv('amazon_product_final.csv')

In [None]:
# Keep only ratings of products in the reduced catalogue, and only users who rated
# at least 5 distinct products, to limit the cold start problem.
user_df = rating_df[rating_df['asin'].isin(products['asin'])]
# First size(): rows per (reviewer, product) pair; second size(): distinct products per reviewer.
users_interactions_count_df = user_df.groupby(['reviewerID', 'asin']).size().groupby('reviewerID').size()
users_with_enough_interactions_df = users_interactions_count_df[users_interactions_count_df >= 5].reset_index()[['reviewerID']]
print(f'Users with at least 5 interactions: {len(users_with_enough_interactions_df)}')
# The right join keeps exactly the ratings of the selected users.
interactions_from_selected_users_df = user_df.merge(users_with_enough_interactions_df, 
               how = 'right',
               left_on = 'reviewerID',
               right_on = 'reviewerID')

In [None]:
# Saving final user data (the ratings of the selected users) as CSV
interactions_from_selected_users_df.to_csv('amazon_user_final.csv')

### TF-IDF Feature based Recommender

In [1]:
def display_img(images, title, content):
    """Show product images side by side with a title above and a caption below each.

    Args:
        images: sequence of image URLs.
        title: sequence of titles, one per image.
        content: sequence of caption strings, one per image.

    An image that cannot be downloaded or decoded is reported with ``print``
    and its panel is drawn without a picture, so one bad URL does not abort
    the whole figure.
    """
    plt.figure(figsize=(20, 4))
    # Six columns is the original layout; more are added on demand so a
    # longer list no longer runs past the grid.
    gs = gridspec.GridSpec(1, max(6, len(images)))
    gs.update(hspace=0.05, wspace=0.05)

    for slot, (url, heading, caption) in enumerate(zip(images, title, content)):
        ax = plt.subplot(gs[slot])
        # str() keeps wrap() working when a title or caption is not a string (e.g. NaN)
        ax.set_title("\n".join(wrap(str(heading), 20)), {'fontsize': 15})
        ax.axis('off')
        ax.text(0.5, -0.1, "\n".join(wrap(str(caption), 20)), fontsize=15,
                transform=ax.transAxes, horizontalalignment='center',
                verticalalignment='center')
        try:
            # A timeout keeps an unresponsive host from hanging the notebook
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            img = Image.open(BytesIO(response.content))
        except (requests.RequestException, OSError) as exc:
            print(f"Could not load image {url}: {exc}")
            continue
        ax.imshow(img)
    plt.show()

In [None]:
def content_recommender(prod_asin, topn, matrix=None, model="vectorizer"):
    """Display and return the products most similar to ``prod_asin``.

    Args:
        prod_asin: asin of the query product; looked up in the notebook
            globals ``product_features`` and ``indices``.
        topn: number of recommendations to return.
        matrix: item-feature matrix for ``model="vectorizer"``, or the per-item
            query vectors for ``model="lda"``. Defaults to the global
            ``tfidf_matrix``; unused for the other models.
        model: one of "vectorizer", "lda", "word2vec", "cnn". The last three
            read the precomputed globals ``cos_similarities``,
            ``cos_similarities_word`` and ``cosSimilarities``.

    Returns:
        DataFrame with columns recStrength, asin, title, image and brand,
        sorted by recStrength in descending order.

    Raises:
        ValueError: if ``model`` is not one of the supported names.
    """
    orig_product = product_features[product_features.index == prod_asin].iloc[0]
    product_images = [orig_product['image']]
    title = [orig_product['title']]
    content = [f"Brand: {orig_product['brand']}"]

    index = indices[prod_asin]

    if matrix is None and model in ("vectorizer", "lda"):
        # Resolved at call time: a signature default of ``tfidf_matrix`` is
        # evaluated when the function is defined, before that matrix exists.
        matrix = tfidf_matrix

    if model == "vectorizer":
        scores = cosine_similarity(matrix[index:index + 1], matrix).flatten()
    elif model == "lda":
        scores = cos_similarities[matrix[index]]
    elif model == "word2vec":
        scores = cos_similarities_word[index]
    elif model == "cnn":
        scores = cosSimilarities[index]
    else:
        raise ValueError(f"Unknown model: {model!r}")

    # Exclude the query product by position; skipping the top-ranked entry
    # instead can drop a different product when scores are tied.
    sim_scores = [(pos, score) for pos, score in enumerate(scores) if pos != index]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    sim_scores = sim_scores[:topn]

    product_indices = [(indices.index[pos], score) for pos, score in sim_scores]
    recommendations_df = pd.DataFrame(product_indices, columns=['asin', 'recStrength'])

    items = product_features.reset_index()
    recommendations_df = recommendations_df.merge(
        items, how='left', left_on='asin', right_on='asin'
    )[['recStrength', 'asin', 'title', 'image', 'brand']]
    recommendations_df = recommendations_df.sort_values('recStrength', ascending=False)

    # Only the five best matches are shown next to the query product.
    for _, row in recommendations_df[0:5].iterrows():
        product_images.append(row['image'])
        title.append(row['title'])
        content.append(f"Brand: {row['brand']} \n Similarity: {round(row['recStrength'],3)}")

    display_img(product_images, title, content)
    return recommendations_df

In [None]:
# Reload the saved catalogue; literal_eval turns the stored text of the 'image'
# column back into Python objects. Rows are then indexed by asin.
products = pd.read_csv("amazon_product_final.csv",index_col=0, converters={'image': literal_eval})
products.set_index('asin',inplace=True)
products.head()

In [None]:
# Work on a copy so `products` keeps its full image lists.
product_features = products.copy()
# Keep only the first image of every product.
product_features['image'] = product_features['image'].apply(lambda x: x[0])
# Switch to numeric row labels so rows can be dropped by number, then restore asin as index.
product_features.reset_index(inplace = True)
# NOTE(review): hard-coded row numbers; why these four rows are removed is not
# visible here - confirm they still point at the intended products.
product_features.drop([21916,25449,37774,39105],inplace=True)
product_features.set_index('asin',inplace = True)

In [None]:
# tf-idf vectorizer over unigrams and bigrams with English stop words removed,
# limited to 20000 features
vectorizer = TfidfVectorizer(analyzer='word',
                     ngram_range=(1, 2),
                     max_features=20000,
                     stop_words='english')
# astype(str) turns non-string values (e.g. NaN) into text, as the per-row np.str_ call did
tfidf_matrix = vectorizer.fit_transform(product_features['text'].astype(str))
# Recent scikit-learn releases only provide get_feature_names_out()
if hasattr(vectorizer, 'get_feature_names_out'):
    tfidf_feature_name = list(vectorizer.get_feature_names_out())
else:
    tfidf_feature_name = vectorizer.get_feature_names()

# The context manager closes the pickle file instead of leaking the handle
with open('tf_idf_vectorizer.pkl','wb') as vectorizer_file:
    pk.dump(vectorizer, vectorizer_file)

# Last expression of the cell so the notebook displays it
tfidf_matrix.shape

In [None]:
# Map every product id (asin) to its row position in product_features. Built
# directly from a range, which also avoids rebinding the name `count` imported
# from pyspark.sql.functions.
indices = pd.Series(range(len(product_features)), index=product_features.index)
indices

In [None]:
# Recommend 20 products for one example asin with the default (TF-IDF) matrix
# and keep the similarity scores for the model comparison.
tfidf_df = content_recommender('B01HDB1SJU',20)
tfidf_score = tfidf_df['recStrength'].values

### Count Vectorizer Feature based recommender

In [None]:
# Bag-of-words counts over unigrams and bigrams with English stop words removed,
# limited to 20000 features
countVectorizer = CountVectorizer(analyzer='word',ngram_range=(1, 2),
                     max_features=20000,
                     stop_words='english')
# astype(str) turns non-string values (e.g. NaN) into text, as the per-row np.str_ call did
count_matrix = countVectorizer.fit_transform(product_features['text'].astype(str))
# Recent scikit-learn releases only provide get_feature_names_out()
if hasattr(countVectorizer, 'get_feature_names_out'):
    count_feature_name = list(countVectorizer.get_feature_names_out())
else:
    count_feature_name = countVectorizer.get_feature_names()
# The context manager closes the pickle file instead of leaking the handle
with open('count_vectorizer.pkl','wb') as vectorizer_file:
    pk.dump(countVectorizer, vectorizer_file)
# Last expression of the cell so the notebook displays it
count_matrix.shape

In [None]:
# Same example product, scored with the bag-of-words matrix.
# Note: this rebinds `count`-prefixed names only; the scores feed the model comparison.
count_df = content_recommender('B01HDB1SJU',20,matrix=count_matrix)
count_score = count_df['recStrength'].values

### Word2vec Model

In [None]:
# Download the GoogleNews word2vec binary that the next cell loads.
!wget https://usmlproject.s3.amazonaws.com/GoogleNews-vectors-negative300.bin.gz -O GoogleNews-vectors-negative300.bin.gz

In [None]:
# Training word2vec on our corpus
# Each product text is split on single spaces into a token list.
corpus = [words.split(' ') for words in product_features['text']]

# NOTE(review): google_word2vec is not referenced again in this notebook; it is
# kept so the name stays defined - consider dropping the load to save memory.
google_word2vec = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)
# Training our corpus with Google Pretrained Model
# workers=-1 starts no worker threads in gensim, so train() would process
# nothing; use the number of available cores instead.
google_model = Word2Vec(size = 300, window=5, min_count = 2, workers = os.cpu_count() or 1)
google_model.build_vocab(corpus)
# Overwrite the vectors of words shared with the GoogleNews file; lockf=1.0
# leaves those vectors free to change during training.
google_model.intersect_word2vec_format('GoogleNews-vectors-negative300.bin.gz', lockf=1.0, binary=True)
google_model.train(corpus, total_examples=google_model.corpus_count, epochs = 5)
# The context manager closes the pickle file instead of leaking the handle
with open('word2vecModel.pkl','wb') as model_file:
    pk.dump(google_model, model_file)

In [None]:
word_embeddings = []

def vectors(product_features):
    """Build one 300-dimensional vector per product text and store them in ``word_embeddings``.

    Each vector is the sum of the ``google_model`` vectors of the words found
    in the model vocabulary, divided by the total number of whitespace-separated
    words in the text (words missing from the vocabulary count towards the divisor).

    Args:
        product_features: DataFrame with a 'text' column.

    Returns:
        The array of vectors, which is also assigned to the global
        ``word_embeddings``.
    """
    global word_embeddings

    # Collected locally so the function can be called again: the global is an
    # ndarray after the first call and has no append().
    embeddings = []
    for line in product_features['text']:
        featureVec = np.zeros((300,), dtype="float32")
        words = line.split()
        for word in words:
            if word in google_model.wv.vocab:
                featureVec = np.add(featureVec, google_model.wv[word])
        if words:
            featureVec = np.divide(featureVec, len(words))
        embeddings.append(featureVec)
    word_embeddings = np.array(embeddings)
    return word_embeddings

word_embeddings = vectors(product_features)

In [None]:
# Pairwise cosine similarity between all averaged word2vec product vectors.
cos_similarities_word = cosine_similarity(word_embeddings, word_embeddings)

In [None]:
# Same example product, scored with the precomputed word2vec similarities.
word2vec_df = content_recommender('B01HDB1SJU',20,model = "word2vec")
word2vec_score = word2vec_df['recStrength'].values

### LDA Feature Based Recommender

In [None]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
def convert_to_token():
    """Tokenize every product text and drop English stop words.

    Returns:
        One token list per row of ``product_features['text']``, in row order.
    """
    return [
        [token for token in word_tokenize(text) if token not in stop_words]
        for text in product_features['text']
    ]

In [None]:
# Build the gensim dictionary and the bag-of-words corpus from the tokenized texts.
# Note: this rebinds `corpus`, which held the word2vec token lists until now.
product_text = convert_to_token()
dictionary = corpora.Dictionary(product_text)
corpus = [dictionary.doc2bow(txt) for txt in product_text]
print(f'Number of unique tokens: {len(dictionary)}') 
print(f'Number of articles:{len(corpus)}')

In [None]:
# Checking model on different number of topics: train one LDA model per topic
# count and pickle each of them.
topicnums = [10,15,20,25,30,35,40,45,50]
ldamodels_bow = {}
for i in topicnums:
    random.seed(42)
    ldamodels_bow[i] = models.LdaModel(corpus, num_topics=i, random_state=42, update_every=1, passes=10, id2word=dictionary)
    # The context manager closes each model file instead of leaking the handle
    with open('ldamodels_bow_'+str(i)+'.lda','wb') as model_file:
        pk.dump(ldamodels_bow[i], model_file)
    print(f'ldamodels_bow_{i}.lda created.')

In [None]:
# Finding similarity between different topics
def jaccard_similarity(query, document):
    """Return the Jaccard similarity of the distinct elements of two iterables.

    Args:
        query: iterable of hashable items.
        document: iterable of hashable items.

    Returns:
        ``len(intersection) / len(union)`` as a float. When both inputs are
        empty there is nothing to compare and 0.0 is returned instead of
        dividing by zero.
    """
    query_items = set(query)
    document_items = set(document)
    union = query_items | document_items
    if not union:
        return 0.0
    return len(query_items & document_items) / len(union)

In [None]:
# For every topic count, reload the pickled model and reduce each topic to the
# list of alphabetic words that appear in its printed form.
lda_topics = {}
for i in topicnums:
    # The context manager closes the model file instead of leaking the handle
    with open('ldamodels_bow_'+str(i)+'.lda','rb') as model_file:
        lda_model = pk.load(model_file)
    lda_topics_string = lda_model.show_topics(i)
    lda_topics[i] = ["".join([c if c.isalpha() else " " for c in topic[1]]).split() for topic in lda_topics_string]

# For every pair of consecutive topic counts, compare each topic of the smaller
# model with each topic of the next model (rows: smaller model, columns: next model).
lda_stability = {}
for current, following in zip(topicnums, topicnums[1:]):
    lda_stability[current] = [
        [jaccard_similarity(topic1, topic2) for topic2 in lda_topics[following]]
        for topic1 in lda_topics[current]
    ]

In [None]:
# Mean of all pairwise topic similarities for each topic count
# (the last topic count has no successor and is therefore left out).
mean_stability = [np.array(lda_stability[i]).mean() for i in topicnums[:-1]]

# Line plot of the mean similarity against the number of topics.
with sns.axes_style("darkgrid"):
    x = topicnums[:-1]
    y = mean_stability
    plt.figure(figsize=(20,10))
    plt.plot(x,y,label='Average Overlap Between Topics')
    plt.xlim([1, 55])
    plt.ylim([0, 1])
    plt.xlabel('Number of topics')
    plt.ylabel('Average Jaccard similarity')   
    plt.title('Average Jaccard Similarity Between Topics')
    plt.legend()    
    plt.show()

In [None]:
# Topic count of the model used for the recommendations.
# NOTE(review): the reason for choosing 25 is not visible here - presumably read
# off the stability plot; confirm.
num_topics = 25
# The context manager closes the model file instead of leaking the handle
with open('ldamodels_bow_'+str(num_topics)+'.lda','rb') as model_file:
    lda_model_final = pk.load(model_file)

In [None]:
# Build a gensim similarity index over the LDA representation of every product.
cos_similarities = similarities.MatrixSimilarity(lda_model_final[corpus])
corpus_lda_model = lda_model_final[corpus]
# Sanity check: shape of the similarities returned for the first product.
cos_similarities[corpus_lda_model[0]].shape

In [None]:
# Same example product, scored by querying the LDA similarity index with the
# product's LDA vector.
lda_df = content_recommender('B01HDB1SJU',20,matrix=corpus_lda_model,model = "lda")
lda_scores = lda_df['recStrength'].values

### Image based feature recommendations

In [None]:
# Download and unpack the product image archive.
!wget https://usmlproject.s3.amazonaws.com/train_images.zip
!unzip train_images.zip

In [None]:
# Input size of the network; the third value (channel count) is discarded here
# and written out literally in input_shape below.
img_width, img_height, _ = 224, 224, 3 #load_image(df.iloc[0].image).shape

# Pre-Trained Model: VGG16 with ImageNet weights and without its classification head
base_model = vgg16.VGG16(weights='imagenet', 
                      include_top=False, 
                      input_shape = (img_width, img_height, 3))
# Freeze the pretrained weights.
base_model.trainable = False

# Add Layer Embedding: global max pooling on top of the VGG16 output
vgg_model = keras.Sequential([
    base_model,
    GlobalMaxPooling2D()
])

vgg_model.summary()

In [None]:
def get_embedding(model, img_name):
    """Return the flattened prediction of ``model`` for one image file.

    The file is located with ``img_path``, resized to
    (``img_width``, ``img_height``) and passed through ``preprocess_input``
    as a batch of one.
    """
    # Load from disk at the network's input size
    loaded = image.load_img(img_path(img_name), target_size=(img_width, img_height))
    # Image -> array, add the leading batch axis, then preprocess
    batch = preprocess_input(np.expand_dims(image.img_to_array(loaded), axis=0))
    return model.predict(batch).reshape(-1)

def img_path(img):
    """Return the path of ``img`` inside the ``train_images/`` folder."""
    return "".join(("train_images/", img))

In [None]:
# Compute one embedding per product from the value in its 'image' column, then
# expand the embeddings into one DataFrame column per dimension.
# NOTE(review): get_embedding prefixes the value with "train_images/", while the
# 'image' column was derived from image URLs - confirm the values match file
# names inside train_images/.
map_embeddings = product_features['image'].apply(lambda img: get_embedding(vgg_model, img))
df_embs_vgg = map_embeddings.apply(pd.Series)

print(df_embs_vgg.shape)
df_embs_vgg.head()

In [None]:
# Pairwise cosine similarity between all image embeddings.
cosSimilarities = cosine_similarity(df_embs_vgg,df_embs_vgg)
cosSimilarities

In [None]:
# Same example product, scored with the precomputed image-embedding similarities.
vgg16_df = content_recommender('B01HDB1SJU',20, model = "cnn")
vgg16_score = vgg16_df['recStrength'].values

## Comparing cosine similarities of all models

In [None]:
# Average similarity of the recommendations returned by every model, in the
# same order as `objects` below. The image model's scores are stored in
# `vgg16_score`; no `cnn_score` variable exists.
num_results=20
cosin_distance = [
    sum(scores)/num_results
    for scores in (tfidf_score, count_score, lda_scores, word2vec_score, vgg16_score)
]

x=cosin_distance
# Evenly spaced x positions, one per model
y = list(range(0, 10, 2))

objects = ('tf_idf', 'bag_of_words', 'lda','avg_w2v', 'cnn')
y_pos = np.arange(len(objects))
plt.plot(y,x)
plt.xticks(y, objects)
# The plotted values are cosine similarities (higher means more alike), so the
# labels say so instead of "distance".
plt.ylabel('Average Cosine Similarity')
plt.title('Average Cosine Similarity per Model')

plt.show()

In [None]:
### Plotting barplot: the same per-model averages as bars, one bar per model name
plt.bar(objects,x)

### Approach 2: Based on Item and User profiles

In [None]:
from tqdm import tqdm  
# Presumably the number of non-interacted items to sample per user during evaluation.
# NOTE(review): not referenced anywhere else in this notebook - confirm it is still needed.
EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 100

### Get the items which user has already interacted with
def get_items_interacted(user_id, interactions_df):
    """Return the set of asins that ``user_id`` has rated in ``interactions_df``.

    Args:
        user_id: label to look up in the index of ``interactions_df``.
        interactions_df: DataFrame indexed by user id with an 'asin' column.

    Returns:
        A set of asins; a user with a single row yields a one-element set.
    """
    # Use the frame that was passed in, so callers asking for the train split
    # really get the train split.
    interacted_items = interactions_df.loc[user_id]['asin']
    if isinstance(interacted_items, pd.Series):
        return set(interacted_items)
    return {interacted_items}

# Module-level accumulators filled by ModelEvaluator: the held-out ratings
# (orig) and the matching predicted ratings (pred).
orig = []
pred = []

### Evaluate the predicted ratings with RMSE
class ModelEvaluator:
    """Predicts held-out ratings from item similarity and scores them with RMSE.

    The methods read these notebook globals: ``interactions_from_selected_users_df``,
    ``interactions_train_df`` and ``interactions_test_df`` (all looked up by
    user id through their index), ``product_features``, ``indices`` and the
    currently selected item-feature ``matrix``.
    """

    ## Get items which user has not interacted with
    def get_not_interacted_items_sample(self, user_id, sample_size, seed=42):
        """Return a random set of ``sample_size`` asins the user has not rated."""
        interacted_items = get_items_interacted(user_id, interactions_from_selected_users_df)
        non_interacted_items = set(product_features.index) - interacted_items
        # random.sample needs a sequence (sets are rejected from Python 3.11 on);
        # sorting also makes the draw reproducible for a given seed.
        candidates = sorted(non_interacted_items)
        random.seed(seed)
        return set(random.sample(candidates, sample_size))

    def RMSE(self, orig, pred):
        """Return the root mean squared error between two equally long sequences."""
        errors = np.asarray(orig, dtype=float) - np.asarray(pred, dtype=float)
        return float(np.sqrt(np.mean(np.square(errors))))

    # calculate similarity of items for each user and each item
    def get_item_similarity(self, user_id, user_product_ids, item_id, min=0.1):
        """Predict the rating of ``user_id`` for ``item_id``.

        Args:
            user_id: user whose ratings are used.
            user_product_ids: asins the user rated in the train split.
            item_id: asin to predict a rating for.
            min: similarities at or below this threshold are ignored.

        Returns:
            The user's own rating as a float when ``item_id`` is one of
            ``user_product_ids`` and a rating row exists; otherwise the
            similarity-weighted average of the user's ratings for the similar
            items, or 0.0 when no rated item is similar enough.
        """
        user_rows = interactions_from_selected_users_df[
            interactions_from_selected_users_df.index == user_id]

        if item_id in user_product_ids:
            known = user_rows[user_rows.asin == item_id]['overall'].values
            if len(known) > 0:
                return float(known[0])

        index = indices[item_id]
        similarities_to_item = cosine_similarity(matrix[index:index + 1], matrix).flatten()

        sim_num = 0.0
        sim_denom = 0.0
        for asin in set(user_product_ids):
            if asin not in indices.index:
                continue
            position = indices[asin]
            # Skip the item itself by position and anything below the threshold
            if position == index:
                continue
            strength = similarities_to_item[position]
            if strength <= min:
                continue
            ratings = user_rows[user_rows.asin == asin]['overall'].values
            if len(ratings) == 0:
                # No rating row for this product: nothing to weigh
                continue
            sim_num += strength * ratings[0]
            sim_denom += np.abs(strength)

        if sim_denom == 0.0:
            return 0.0
        return sim_num / sim_denom

    ## Evaluate recommender for each user
    def evaluate_model_for_user(self, model, user_id):
        """Append the held-out and predicted ratings of one user to ``orig`` / ``pred``.

        ``model`` is accepted for interface compatibility; the rating
        prediction itself does not use its ranked recommendations.
        """
        global orig
        global pred

        # .loc[[user_id]] always yields a DataFrame, also for a user with a
        # single held-out rating; one row per distinct product is evaluated.
        test_rows = interactions_test_df.loc[[user_id]].drop_duplicates(subset='asin')
        user_product_ids = interactions_train_df[
            interactions_train_df.index == user_id]['asin'].tolist()

        # Iterate asin and rating together so every prediction is compared
        # with the rating of the same product.
        for item_id, true_rating in zip(test_rows['asin'], test_rows['overall']):
            orig.append(true_rating)
            pred.append(self.get_item_similarity(user_id, user_product_ids, item_id))

    ## Evaluate model for all the users
    def evaluate_model(self, model):
        """Evaluate every user of the test split and return ``{'rmse': value}``."""
        global orig
        global pred
        # Start from empty lists so results of an earlier evaluation do not
        # leak into this one.
        orig = []
        pred = []

        user_ids = list(interactions_test_df.index.unique().values)
        for user_id in tqdm(user_ids):
            self.evaluate_model_for_user(model, user_id)
        print('%d users processed' % len(user_ids))

        return {'rmse': self.RMSE(orig, pred)}

model_evaluator = ModelEvaluator()

In [None]:
## class to create profile of users
class UserProfile:
    """Builds one normalized content profile per user from the items they rated.

    The item-feature ``matrix`` is read from the notebook globals on every
    call, so rebinding ``matrix`` switches the feature space without creating
    a new instance.
    """

    def __init__(self):
        # asin -> row position in the item-feature matrix
        self.indices = indices

    def get_item_profile(self, item_id):
        """Return the single matrix row that belongs to ``item_id``."""
        idx = self.indices[item_id]
        return matrix[idx:idx + 1]

    def get_item_profiles(self, ids):
        """Return the rows of all ``ids`` stacked into one sparse matrix."""
        return scipy.sparse.vstack([self.get_item_profile(x) for x in ids])

    def build_users_profile(self, user_id, interactions_indexed_df):
        """Return the rating-weighted, normalized profile of one user.

        Args:
            user_id: label to look up in the index of ``interactions_indexed_df``.
            interactions_indexed_df: DataFrame indexed by user id with 'asin'
                and 'overall' columns.

        Returns:
            Array with one row: the rating-weighted average of the user's item
            rows, scaled by ``sklearn.preprocessing.normalize``.
        """
        # .loc[[user_id]] keeps a DataFrame even for a user with one rating;
        # a bare .loc[user_id] would return a Series and 'asin' a single string.
        interactions_user_df = interactions_indexed_df.loc[[user_id]]
        user_item_profiles = self.get_item_profiles(interactions_user_df['asin'])
        user_item_strengths = np.array(interactions_user_df['overall']).reshape(-1, 1)
        weighted_sum = user_item_profiles.multiply(user_item_strengths).sum(axis=0)
        # The sparse sum is an np.matrix; convert it to a plain array before
        # handing it to scikit-learn.
        weighted_avg = np.asarray(weighted_sum) / np.sum(user_item_strengths)
        return sklearn.preprocessing.normalize(weighted_avg)

    def build_users_profiles(self):
        """Return ``{user_id: profile}`` for every user of the train split."""
        # Only ratings of products that are part of the catalogue can be profiled
        interactions_indexed_df = interactions_train_df[
            interactions_train_df['asin'].isin(product_features.index)]
        return {
            user_id: self.build_users_profile(user_id, interactions_indexed_df)
            for user_id in interactions_indexed_df.index.unique()
        }

user_profile = UserProfile()

In [None]:
# content based recommender class
class ContentBasedRecommender:
    """Ranks catalogue items by cosine similarity to a user's content profile.

    User profiles are read from the notebook global ``user_profiles`` when a
    recommendation is requested.
    """

    def __init__(self, matrix, items_df=None):
        # asin -> row position in the item-feature matrix
        self.item_ids = indices
        self.matrix = matrix
        self.items_df = items_df

    def _get_similar_items_to_user_profile(self, user_id, topn=10):
        """Return the ``topn`` most similar items as (asin, similarity), best first."""
        if topn <= 0:
            # A slice of [-0:] would return every item instead of none
            return []
        #Computes the cosine similarity between the user profile and all item profiles
        cosine_similarities = cosine_similarity(user_profiles[user_id], self.matrix)
        #Gets the top similar items
        similar_indices = cosine_similarities.argsort().flatten()[-topn:]
        #Sort the similar items by similarity; positions are translated to asins
        #so callers can compare them with their own asin collections
        return sorted(
            ((self.item_ids.index[i], cosine_similarities[0, i]) for i in similar_indices),
            key=lambda pair: -pair[1])

    def recommend_items(self, user_id, items_to_ignore=None, topn=10, verbose=False):
        """Return up to ``topn`` recommendations for ``user_id``.

        Args:
            user_id: key into the global ``user_profiles``.
            items_to_ignore: asins to leave out (e.g. items already rated).
            topn: maximum number of rows to return.
            verbose: accepted for interface compatibility; not used.

        Returns:
            DataFrame with columns asin and recStrength, best match first.
        """
        ignored = set(items_to_ignore) if items_to_ignore is not None else set()
        # Ask for enough candidates that topn are left after the ignored ones
        # have been removed.
        candidates = self._get_similar_items_to_user_profile(user_id, topn=topn + len(ignored))
        #Ignores items the user has already interacted
        kept = [(asin, strength) for asin, strength in candidates if asin not in ignored]
        return pd.DataFrame(kept[:max(topn, 0)], columns=['asin', 'recStrength'])

# NOTE(review): `matrix` is first assigned in the TF-IDF evaluation cell further
# down - make sure it is defined before this line runs. The recommender keeps
# the matrix it was constructed with.
content_based_recommender_model = ContentBasedRecommender(matrix,product_features)

### Evaluating for different methods

#### 1) TF-IDF

In [None]:
# Select the TF-IDF matrix as the global item-feature matrix read by UserProfile
# and ModelEvaluator, then rebuild the user profiles from it.
matrix = tfidf_matrix
user_profiles = user_profile.build_users_profiles()
len(user_profiles)

In [None]:
# Compute the RMSE of the predicted ratings with the currently selected matrix.
model_evaluator.evaluate_model(content_based_recommender_model)

#### 2) Bag Of Words

In [None]:
# Switch the global item-feature matrix to the bag-of-words counts, rebuild the
# user profiles and evaluate again.
matrix = count_matrix
user_profiles = user_profile.build_users_profiles()
len(user_profiles)
model_evaluator.evaluate_model(content_based_recommender_model)

#### 3) Average Word2Vec

In [None]:
# Switch the global item-feature matrix to the averaged word2vec vectors
# (converted to a sparse matrix), rebuild the user profiles and evaluate again.
matrix = sparse.csr_matrix(word_embeddings)
user_profiles = user_profile.build_users_profiles()
len(user_profiles)
model_evaluator.evaluate_model(content_based_recommender_model)

#### 4) Transfer Learning

In [None]:
# Switch the global item-feature matrix to the image embeddings. They are stored
# in `df_embs_vgg`; no `df_embs` variable exists.
matrix = sparse.csr_matrix(df_embs_vgg.values)
user_profiles = user_profile.build_users_profiles()
len(user_profiles)
model_evaluator.evaluate_model(content_based_recommender_model)