# Content-Based Recommendation Using Description of Content

In [1]:
import pandas as pd
import pickle as pickle
import numpy as np
import matplotlib as plt
from bs4 import BeautifulSoup
from html import unescape
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel 
from surprise.model_selection import KFold
from sklearn.metrics import mean_squared_error

In [2]:
data = pd.read_pickle('data_nov_8.pkl')

In [3]:
data['reviewerID'].unique().shape

(815053,)

###### To get Unique item/product
explode description as it is list of single entry


In [4]:
df2 = data.explode("description").drop_duplicates(subset=["asin"])
df2.reset_index(drop=True, inplace=True)

In [5]:
df2.shape

(6576, 16)

##### create category_str column from list type column 'category', with comma separated categories

In [6]:
category_values=[]
for val in data['category']:
    category_values.append(val)

In [7]:
#remove &amp from category values
for row in category_values:
    for i in range (0,len(row)):
        row[i]=row[i].replace('&amp;','and').replace('&', 'and').replace('$amp', 'and')

In [8]:
df2['category_str'] = [','.join(map(str, l)) for l in df2['category']]

In [9]:
print(df2['category_str'])

0       Electronics,Accessories and Supplies,Audio and...
1       Electronics,Computers and Accessories,Networki...
2       Electronics,Computers and Accessories,Computer...
3       Electronics,Accessories and Supplies,Audio and...
4       Electronics,Accessories and Supplies,Audio and...
                              ...                        
6571    Electronics,Home Audio,Turntables and Accessor...
6572    Electronics,Accessories and Supplies,Audio and...
6573    Electronics,Accessories and Supplies,Audio and...
6574    Electronics,Accessories and Supplies,Audio and...
6575    Electronics,Portable Audio and Video,CB and Tw...
Name: category_str, Length: 6576, dtype: object


### Recommendation Function

In [10]:
def recommend(res, asin, num):
    print("-------")
    recs = res[asin][:num]
    indexes = [rec[1] for rec in recs]
    return df2.iloc[indexes], [rec[0] for rec in recs]

#### Trying to use description (single column) for content based filtering

In [11]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')

In [12]:
tfidf_matrix_d = tf.fit_transform(df2['description'])

In [13]:
tfidf_matrix_d.shape

(6576, 466750)

In [14]:
cosine_similarities = linear_kernel(tfidf_matrix_d, tfidf_matrix_d) 

In [15]:
print(cosine_similarities)

[[1.         0.00103775 0.         ... 0.00526049 0.00385492 0.        ]
 [0.00103775 1.         0.         ... 0.         0.01341142 0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.00526049 0.         0.         ... 1.         0.01746951 0.00127807]
 [0.00385492 0.01341142 0.         ... 0.01746951 1.         0.00281316]
 [0.         0.         0.         ... 0.00127807 0.00281316 1.        ]]


###### Sort the cosine similarities(based on description) in descending order 

In [16]:
#######computation for cosimilarity based on description#####
consine_similarity_sorted=[]
results = {}
for idx, row in df2.iterrows():
    similar_indices = cosine_similarities[idx].argsort()[:-100:-1]
    x = []
    for i in similar_indices:
        x.append((cosine_similarities[idx][i], df2.index[i]))
    consine_similarity_sorted = x
    results[row['asin']] = consine_similarity_sorted[1:]


###### Get recommendation

In [17]:
asin = '0972683275'
recommendation, scores = recommend(results, asin=asin, num=5)
print("Recommending 5  products similar to:")
df2[df2['asin'] == asin][['title', 'category_str', 'description', 'asin', 'brand']]


-------
Recommending 5  products similar to:


Unnamed: 0,title,category_str,description,asin,brand
0,"VideoSecu 24"" Long Arm TV Wall Mount Low Profi...","Electronics,Accessories and Supplies,Audio and...",The videosecu TV mount is a mounting solution ...,972683275,VideoSecu


In [18]:
print("With Scores: ")
for s in scores:
    print(s)
recommendation[['title', 'category_str', 'description', 'asin', 'brand']]

With Scores: 
0.13492369655954947
0.11827004589395142
0.10566684617437384
0.08763883846681635
0.08631794804921392


Unnamed: 0,title,category_str,description,asin,brand
676,VideoSecu Tilt Flat Screen TV Wall Mount Brack...,"Electronics,Accessories and Supplies,Audio and...","The VideoSecu TV Mount is compatible with LED,...",B000NMFCIA,VideoSecu
958,VideoSecu Computer Monitor TV Wall Mount Brack...,"Electronics,Accessories and Supplies,Audio and...",This wall mount is designed for LCD Monitor an...,B0017D0ENM,VideoSecu
3003,Mounting Dream MD2393-MX TV Wall Mount Bracket...,"Electronics,Accessories and Supplies,Mounts,Co...","Fits most of 26-55"" <br>TVs This mount fits mo...",B00KXZ79VK,Mounting Dream
6089,InstallerParts 23&quot;-42&quot; Flat TV &ndas...,"Electronics,Accessories and Supplies,Audio and...",<br><b>Compatibility</b><br> InstallerParts Fl...,B00YTZDKC2,InstallerParts
3519,VIVO TV Ceiling Mount Height Adjustable and Ti...,"Electronics,Accessories and Supplies,Audio and...",MOUNT-VC55 is a brand new TV ceiling mount fro...,B01011SZJA,VIVO


#### Trying to use category (single column) for content based filtering

In [19]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')

In [20]:
tfid_matrix_c = tf.fit_transform(df2['category_str'])

In [21]:
cosine_similarities_cat = linear_kernel(tfid_matrix_c, tfid_matrix_c) 

In [22]:
print(cosine_similarities_cat)

[[1.         0.00941107 0.01390387 ... 0.13478387 0.16335222 0.02258274]
 [0.00941107 1.         0.04690622 ... 0.01295096 0.00936858 0.00491649]
 [0.01390387 0.04690622 1.         ... 0.01839206 0.01330462 0.00636967]
 ...
 [0.13478387 0.01295096 0.01839206 ... 1.         0.17419442 0.04743761]
 [0.16335222 0.00936858 0.01330462 ... 0.17419442 1.         0.02789585]
 [0.02258274 0.00491649 0.00636967 ... 0.04743761 0.02789585 1.        ]]


###### Sort the cosine similarities (based on category) in descending order 

In [23]:
#######computation for cosimilarity based on category#####
consine_similarity_cat_sorted=[]
results_cat = {}
for idx, row in df2.iterrows():
    similar_indices = cosine_similarities_cat[idx].argsort()[:-100:-1]
    x = []
    for i in similar_indices:
        x.append((cosine_similarities_cat[idx][i], df2.index[i]))
    consine_similarity_cat_sorted = x
    results_cat[row['asin']] = consine_similarity_cat_sorted[1:]
    

###### Get recommendation

In [24]:
asin = '0972683275'
recommendation, scores = recommend(results_cat, asin=asin, num=5)
print("Recommending 5  products similar to:")
df2[df2['asin'] == asin][['title', 'category_str', 'description', 'asin', 'brand']]


-------
Recommending 5  products similar to:


Unnamed: 0,title,category_str,description,asin,brand
0,"VideoSecu 24"" Long Arm TV Wall Mount Low Profi...","Electronics,Accessories and Supplies,Audio and...",The videosecu TV mount is a mounting solution ...,972683275,VideoSecu


In [25]:
print("With Scores: ")
for s in scores:
    print(s)
recommendation[['title', 'category_str', 'description', 'asin', 'brand']]

With Scores: 
1.0
1.0
1.0
1.0
1.0


Unnamed: 0,title,category_str,description,asin,brand
3105,Heavy Duty Dual Arm (32&quot; extension) Artic...,"Electronics,Accessories and Supplies,Audio and...","Heavy Duty Dual Arm (32"" extension) Articulati...",B00NBE8TLY,Universalmounts
3180,Fleximounts Curved TV Wall Mount Bracket for 3...,"Electronics,Accessories and Supplies,Audio and...",Specs: <br>Model: FM-CR1<br>Material: Cold-rol...,B00P5R13O0,FLEXIMOUNTS
2866,Swiveling TV Wall Mount with Two Shelves (Shelf),"Electronics,Accessories and Supplies,Audio and...",<b>TAKE BACK YOUR FLOOR SPACE </b> <br> We hav...,B00IUPHWYE,Aeon Stands and Mounts
898,Cheetah APTMM2B TV Wall Mount for 20-70&quot; ...,"Electronics,Accessories and Supplies,Audio and...","<strong>FITS 20-80"" TVs up to VESA 600 and 165...",B0012S4APK,Cheetah
958,VideoSecu Computer Monitor TV Wall Mount Brack...,"Electronics,Accessories and Supplies,Audio and...",This wall mount is designed for LCD Monitor an...,B0017D0ENM,VideoSecu


#### Trying to use Category, descripition, brand, price for content based filtering

In [26]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')

In [27]:
df2['corpus'] = (pd.Series(df2[['category_str', 'description', 'brand', 'price']]
                          .fillna('')
                          .values.tolist())
                 .apply(lambda x: ','.join(map(str, x))))


In [28]:
tfidf_matrix_corpus=tf.fit_transform(df2['corpus'])

In [29]:
cosine_similarities_corpus=linear_kernel(tfidf_matrix_corpus,tfidf_matrix_corpus)

In [30]:
print(cosine_similarities_corpus)

[[1.00000000e+00 1.26119730e-03 1.71339968e-03 ... 8.23557021e-03
  8.77037686e-03 8.19640791e-04]
 [1.26119730e-03 1.00000000e+00 1.28481114e-02 ... 3.92137882e-04
  1.18172706e-02 1.58967482e-03]
 [1.71339968e-03 1.28481114e-02 1.00000000e+00 ... 1.79018567e-03
  9.16682398e-04 1.13170120e-03]
 ...
 [8.23557021e-03 3.92137882e-04 1.79018567e-03 ... 1.00000000e+00
  1.79654903e-02 3.47319737e-03]
 [8.77037686e-03 1.18172706e-02 9.16682398e-04 ... 1.79654903e-02
  1.00000000e+00 3.79103243e-03]
 [8.19640791e-04 1.58967482e-03 1.13170120e-03 ... 3.47319737e-03
  3.79103243e-03 1.00000000e+00]]


###### Sort the cosine similarities in descending order 

In [31]:
consine_similarity_sorted=[]
results_corpus = {}
for idx, row in df2.iterrows():
    similar_indices = cosine_similarities_corpus[idx].argsort()[:-100:-1]
    x = []
    for i in similar_indices:
        x.append((cosine_similarities_corpus[idx][i], df2.index[i]))
    consine_similarity_sorted = x
#     consine_similarity_sorted = [(cosine_similarities[idx][i], df2['asin'][i]) for i in similar_indices] 
    results_corpus[row['asin']] = consine_similarity_sorted[1:]
    

###### Get recommendation

In [32]:
asin = '0972683275'
recommendation, scores = recommend(results_corpus, asin=asin, num=5)
print("Recommending 5  products similar to:")
df2[df2['asin'] == asin][['title', 'category_str', 'description', 'asin', 'brand']]



-------
Recommending 5  products similar to:


Unnamed: 0,title,category_str,description,asin,brand
0,"VideoSecu 24"" Long Arm TV Wall Mount Low Profi...","Electronics,Accessories and Supplies,Audio and...",The videosecu TV mount is a mounting solution ...,972683275,VideoSecu


In [33]:
print("With Scores: ")
for s in scores:
    print(s)
recommendation[['title', 'category_str', 'description', 'asin', 'brand']]

With Scores: 
0.17875194218537926
0.15484639884347345
0.1463680619673356
0.13378005363446765
0.11455405239155397


Unnamed: 0,title,category_str,description,asin,brand
676,VideoSecu Tilt Flat Screen TV Wall Mount Brack...,"Electronics,Accessories and Supplies,Audio and...","The VideoSecu TV Mount is compatible with LED,...",B000NMFCIA,VideoSecu
958,VideoSecu Computer Monitor TV Wall Mount Brack...,"Electronics,Accessories and Supplies,Audio and...",This wall mount is designed for LCD Monitor an...,B0017D0ENM,VideoSecu
3049,"Mount-It TV Wall Mount Full Motion LCD, LED 4K...","Electronics,Accessories and Supplies,Audio and...",,B00M0V02FA,Mount-It!
4302,Mounting Dream Ultra Slim TV Wall Mounts Brack...,"Electronics,Accessories and Supplies,Audio and...",,B01CG5BUKK,Mounting Dream
6089,InstallerParts 23&quot;-42&quot; Flat TV &ndas...,"Electronics,Accessories and Supplies,Audio and...",<br><b>Compatibility</b><br> InstallerParts Fl...,B00YTZDKC2,InstallerParts


#### Evaluation

In [34]:
review_df = df2[['reviewerID', 'asin', 'overall']].drop_duplicates()
review_df.reset_index(drop=True, inplace=True)
pivoted_df = review_df.pivot(index='reviewerID', columns='asin', values='overall').fillna(0)
pivoted_df.columns

Index(['0972683275', '980035977X', '9876035371', '998504780X', '9985609034',
       'B000001OKY', 'B00000J0AR', 'B00000J0D2', 'B00000J0QI', 'B00000J1U5',
       ...
       'B01HGAOQZ6', 'B01HGASSU0', 'B01HGDB3V8', 'B01HGDGDEK', 'B01HGOMMD0',
       'B01HHNTOV8', 'B01HIS5IBM', 'B01HISA452', 'B01HITUTV0', 'B01HIY64XM'],
      dtype='object', name='asin', length=6576)

In [35]:
min_max_scalar = MinMaxScaler(feature_range=(1, 5))
min_max_scalar.fit(cosine_similarities_corpus)
rating_pred_matrix = min_max_scalar.transform(cosine_similarities_corpus)
rating_pred_df = pd.DataFrame(rating_pred_matrix)
rating_pred_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6566,6567,6568,6569,6570,6571,6572,6573,6574,6575
0,5.000000,1.004957,1.006475,1.028389,1.025735,1.010019,1.012771,1.047785,1.019958,1.017125,...,1.019410,1.006659,1.006478,1.007277,1.007367,1.023964,1.100207,1.032877,1.034988,1.003096
1,1.004914,5.000000,1.051018,1.002246,1.002235,1.001975,1.004525,1.088477,1.069388,1.004601,...,1.011455,1.002360,1.002296,1.002579,1.002611,1.027632,1.003070,1.001503,1.047176,1.006176
2,1.006723,1.051306,5.000000,1.011318,1.011150,1.008496,1.021104,1.477050,1.130164,1.021459,...,1.005367,1.011005,1.010705,1.012025,1.012174,1.001962,1.015451,1.007095,1.003573,1.004344
3,1.028556,1.002457,1.011238,5.000000,3.686418,1.022855,1.026856,1.026580,1.064885,1.069721,...,1.142374,1.026310,1.013623,1.015303,1.015493,1.006255,1.067725,1.094829,1.138027,1.012216
4,1.025868,1.002413,1.011036,3.686407,5.000000,1.015670,1.026387,1.026112,1.063753,1.058686,...,1.139894,1.034456,1.013385,1.037651,1.038119,1.010451,1.093690,1.109676,1.160961,1.012000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6571,1.024000,1.027710,1.001750,1.006123,1.010353,1.030313,1.008944,1.002454,1.021162,1.001698,...,1.024944,1.004647,1.014980,1.005078,1.005141,5.000000,1.080904,1.024288,1.053322,1.012905
6572,1.100304,1.003213,1.015302,1.067657,1.093655,1.029579,1.029380,1.019968,1.005752,1.048813,...,1.022909,1.056210,1.035264,1.061422,1.062185,1.080967,5.000000,1.030712,1.024049,1.035218
6573,1.032812,1.001481,1.006782,1.094601,1.109482,1.028741,1.016507,1.008957,1.002549,1.016784,...,1.033082,1.013837,1.013461,1.015121,1.015308,1.024188,1.030548,5.000000,1.071770,1.013711
6574,1.034952,1.047182,1.003288,1.137829,1.160797,1.022093,1.008392,1.004441,1.077946,1.008533,...,1.050353,1.012259,1.011926,1.013396,1.013563,1.053251,1.023913,1.071798,5.000000,1.014982


In [36]:
mean_squared_error(pivoted_df, rating_pred_df)

1.4594055349765187
