In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel 

In [4]:
sample_data = pd.read_csv("./sample-data.csv")
sample_data.head()

Unnamed: 0,id,description
0,1,Active classic boxers - There's a reason why o...
1,2,Active sport boxer briefs - Skinning up Glory ...
2,3,Active sport briefs - These superbreathable no...
3,4,"Alpine guide pants - Skin in, climb ice, switc..."
4,5,"Alpine wind jkt - On high ridges, steep ice an..."


In [5]:
tfvectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
tfidf_matrix = tfvectorizer.fit_transform(sample_data['description'])

In [6]:
df = pd.DataFrame(tfidf_matrix.toarray(), columns = tfvectorizer.get_feature_names())
print(df)

     000  000 feet  000 feet chasm  000 feet form  000 miles  \
0    0.0       0.0             0.0            0.0        0.0   
1    0.0       0.0             0.0            0.0        0.0   
2    0.0       0.0             0.0            0.0        0.0   
3    0.0       0.0             0.0            0.0        0.0   
4    0.0       0.0             0.0            0.0        0.0   
..   ...       ...             ...            ...        ...   
495  0.0       0.0             0.0            0.0        0.0   
496  0.0       0.0             0.0            0.0        0.0   
497  0.0       0.0             0.0            0.0        0.0   
498  0.0       0.0             0.0            0.0        0.0   
499  0.0       0.0             0.0            0.0        0.0   

     000 miles annual   03  03 oz  03 oz br        10  ...  zones high  \
0                 0.0  0.0    0.0       0.0  0.000000  ...         0.0   
1                 0.0  0.0    0.0       0.0  0.000000  ...         0.0   
2        

In [16]:
len(tfvectorizer.get_feature_names())

52262

In [10]:
tfidf_matrix[0]

<1x52262 sparse matrix of type '<class 'numpy.float64'>'
	with 244 stored elements in Compressed Sparse Row format>

In [7]:
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix) 
results = {}
for idx, row in sample_data.iterrows():
   similar_indices = cosine_similarities[idx].argsort()[:-100:-1] 
   similar_items = [(cosine_similarities[idx][i], sample_data['id'][i]) for i in similar_indices] 
   results[row['id']] = similar_items[1:]

In [8]:
def item(id):  
  return sample_data.loc[sample_data['id'] == id]['description'].tolist()[0].split(' - ')[0] 
# Just reads the results out of the dictionary.def 
def recommend(item_id, num):
    print("Recommending " + str(num) + " products similar to " + item(item_id) + "...")   
    print("-------")    
    recs = results[item_id][:num]   
    for rec in recs: 
       print("Recommended: " + item(rec[1]) + " (score:" +      str(rec[0]) + ")")

In [9]:
recommend(item_id=11, num=5)

Recommending 5 products similar to Baby sunshade top...
-------
Recommended: Sunshade hoody (score:0.21330296021085024)
Recommended: Baby baggies apron dress (score:0.10975311296284812)
Recommended: Runshade t-shirt (score:0.09988151262780731)
Recommended: Runshade t-shirt (score:0.09530698241688207)
Recommended: Runshade top (score:0.08510550093018411)


In [17]:
cosine_similarities

array([[1.        , 0.10110642, 0.06487353, ..., 0.06097409, 0.06546914,
        0.06955608],
       [0.10110642, 1.        , 0.4181664 , ..., 0.03550042, 0.06936414,
        0.06480538],
       [0.06487353, 0.4181664 , 1.        , ..., 0.03402428, 0.0455137 ,
        0.05038512],
       ...,
       [0.06097409, 0.03550042, 0.03402428, ..., 1.        , 0.04187121,
        0.04958298],
       [0.06546914, 0.06936414, 0.0455137 , ..., 0.04187121, 1.        ,
        0.36281626],
       [0.06955608, 0.06480538, 0.05038512, ..., 0.04958298, 0.36281626,
        1.        ]])