In [4]:
import pandas as pd
import numpy as np

In [5]:
interest = pd.read_csv('userint.csv')
interest.head()

Unnamed: 0,id,name,interests,caption
0,1,Willard Riley,Landscape,dusk
1,2,Timmothy Hunt,Potrait,dawn
2,3,Thomas Wheeler,Wedding,felling sad
3,4,Lois Hunt,Mountains,hello
4,5,Joy Gonzales,Illustrations,Peace


In [6]:
interest_labels = set()
for s in interest['interests'].str.split(',').values:
    interest_labels = interest_labels.union(set(s))

In [7]:
def count_word(dataset, ref_col, census):
    count = dict()
    for s in census: 
        count[s] = 0
    for census_keywords in dataset[ref_col].str.split(','):        
        if type(census_keywords) == float and pd.isnull(census_keywords): 
            continue        
        for s in [s for s in census_keywords if s in census]: 
            if pd.notnull(s): 
                count[s] += 1

    # convert the dictionary in a list to sort the keywords by frequency
    keyword_occurences = []
    for k,v in count.items():
        keyword_occurences.append([k,v])
    keyword_occurences.sort(key = lambda x:x[1], reverse = True)
    return keyword_occurences, count

In [8]:
keyword_occurences, dum = count_word(interest, 'interests', interest_labels)
keyword_occurences[:5]

[['Wedding', 13],
 ['Potrait', 13],
 ['Landscape', 13],
 ['Mountains', 13],
 ['Sports', 12]]

In [9]:
# Break up the big genre string into a string array
interest['interests'] = interest['interests'].str.split(',')
# Convert genres to string value
interest['interests'] = interest['interests'].fillna("").astype('str')

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(interest['interests'])
tfidf_matrix

<184x21 sparse matrix of type '<class 'numpy.float64'>'
	with 196 stored elements in Compressed Sparse Row format>

In [21]:
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim[:4, :4]

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])

In [22]:
# Build a 1-dimensional array with user names
titles = interest['caption']
indices = pd.Series(interest.index, index=interest['caption'])

In [23]:
def interests_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:21]
    interests_indices = [i[0] for i in sim_scores]
    return titles.iloc[interests_indices]

In [24]:
interests_recommendations('dawn').head(10)

20                                              hello123
31                   Don’t Study me. You won’t Graduate!
50     sat back and let things happen to them. They w...
61                      If we could only turn back time…
80                         I got back with my Ex…Box 360
91                                 nothing gold can stay
110                     When I fell for you, I fell Hard
121    I don’t know what I did to have a best friend ...
140    Do you sit in a pile of sugar? cause you got a...
151                              My mind is full of You!
Name: caption, dtype: object

In [17]:
interests_recommendations('Forgive  yes  Forget  never ').head(10)

47     Life is not a problem to be solved, but a real...
77      Life isn’t perfect…But my Hair is! #selfieaddict
107                                    happy vibes today
137    I like being myself. Maybe just slimmer, with ...
167           Block the bullshit before they infect you.
0                                                   dusk
1                                                   dawn
2                                            felling sad
3                                                  hello
4                                                  Peace
Name: caption, dtype: object

In [18]:
interests_recommendations('Let your aura reflect who you truly are').head(10)

24               Let your aura reflect who you truly are
35                          Be a Warrior, not a Worrier.
54                      Watch more sunsets than Netflix.
65                                  Escape the ordinary.
84     The worst time to have a heart attack is durin...
95                            what if you had the chance
114    If you are Mine, You are Mine. I don’t like Sh...
125                 It’s me and my Best Friend for Life!
144              Nice t-shirt. Can I talk you out of it?
155    As beautiful on the inside as I am on the outs...
Name: caption, dtype: object

In [19]:
interests_recommendations('Thick thighs and pretty eyes').head(10)

23                          Thick thighs and pretty eyes
34                                  Go wild for a while.
53     When I feel a little down, I put on my favouri...
64                              Got my coffee and donut.
83     You only drink diet soda? You must be so healthy.
94                                             glittterr
113                               It will be Always YOU.
124                             Meet my Partner in Crime
143    Kiss me if I’m wrong but Dinosaurs still exist...
154             You don’t cross my mind. You live there!
Name: caption, dtype: object

In [25]:
interests_recommendations('Love in the air').head(10)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()