# POC

In [1]:
import pandas as pd
import numpy as np

from lightfm import LightFM
from lightfm.datasets import fetch_movielens
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score

from sklearn.metrics import roc_auc_score
from sklearn import preprocessing


from scipy import sparse
from scipy.sparse import coo_matrix 
from scipy.sparse import csr_matrix 

from sklearn.metrics.pairwise import cosine_similarity



# Implementation

In [2]:
from sklearn.metrics import roc_auc_score
from lightfm.evaluation import auc_score

In [3]:
# Read Long format DF
long_format_df=pd.read_csv('./data/csv/long_format_df.csv')

In [4]:
long_format_df.head()

Unnamed: 0,users,book,rating,user_id,book_id
0,Ben,The Hitchhiker's Guide To The Galaxy,5,1,1
1,Ben,Watership Down,0,1,2
2,Ben,The Five People You Meet in Heaven,0,1,3
3,Ben,Speak,0,1,4
4,Ben,I Know Why the Caged Bird Sings,0,1,5


In [5]:

reviewed_df=long_format_df[long_format_df['rating']>=0]

In [6]:
reviewed_df.head()

Unnamed: 0,users,book,rating,user_id,book_id
0,Ben,The Hitchhiker's Guide To The Galaxy,5,1,1
1,Ben,Watership Down,0,1,2
2,Ben,The Five People You Meet in Heaven,0,1,3
3,Ben,Speak,0,1,4
4,Ben,I Know Why the Caged Bird Sings,0,1,5


In [7]:
reviewed_df.head()

Unnamed: 0,users,book,rating,user_id,book_id
0,Ben,The Hitchhiker's Guide To The Galaxy,5,1,1
1,Ben,Watership Down,0,1,2
2,Ben,The Five People You Meet in Heaven,0,1,3
3,Ben,Speak,0,1,4
4,Ben,I Know Why the Caged Bird Sings,0,1,5


In [100]:


def create_interaction_matrix(df,user_col, item_col, rating_col, norm= False, threshold = None):
    '''
    Function to create an interaction matrix dataframe from transactional type interactions
    Required Input -
        - df = Pandas DataFrame containing user-item interactions
        - user_col = column name containing user's identifier
        - item_col = column name containing item's identifier
        - rating col = column name containing user feedback on interaction with a given item
        - norm (optional) = True if a normalization of ratings is needed
        - threshold (required if norm = True) = value above which the rating is favorable
    Expected output - 
        - Pandas dataframe with user-item interactions ready to be fed in a recommendation algorithm
    '''
    interactions = df.groupby([user_col, item_col])[rating_col] \
            .sum().unstack().reset_index(). \
            fillna(0).set_index(user_col)
    if norm:
        interactions = interactions.applymap(lambda x: 1 if x > threshold else 0)
    return interactions

def create_user_dict(interactions):
    '''
    Function to create a user dictionary based on their index and number in interaction dataset
    Required Input - 
        interactions - dataset create by create_interaction_matrix
    Expected Output -
        user_dict - Dictionary type output containing interaction_index as key and user_id as value
    '''
    user_id = list(interactions.index)
    user_dict = {}
    counter = 1 
    for i in user_id:
        user_dict[i] = counter
        counter += 1
    return user_dict
    
def create_item_dict(df,id_col,name_col):
    '''
    Function to create an item dictionary based on their item_id and item name
    Required Input - 
        - df = Pandas dataframe with Item information
        - id_col = Column name containing unique identifier for an item
        - name_col = Column name containing name of the item
    Expected Output -
        item_dict = Dictionary type output containing item_id as key and item_name as value
    '''
    item_dict ={}
    for i in range(df.shape[0]):
        item_dict[(df.loc[i,id_col])] = df.loc[i,name_col]
    return item_dict

def runMF(interactions, n_components=30, loss='warp', k=15, epoch=30,n_jobs = 4):
    '''
    Function to run matrix-factorization algorithm
    Required Input -
        - interactions = dataset create by create_interaction_matrix
        - n_components = number of embeddings you want to create to define Item and user
        - loss = loss function other options are logistic, brp
        - epoch = number of epochs to run 
        - n_jobs = number of cores used for execution 
    Expected Output  -
        Model - Trained model
    '''
#     x = sparse.csr_matrix(interactions.values)
    model = LightFM(no_components= n_components, loss=loss,k=k)
    model.fit(interactions,epochs=epoch,num_threads = n_jobs)
    return model

def sample_recommendation_user(model, interactions, user_id, user_dict, 
                               item_dict,threshold = 0,nrec_items = 10, show = True,debug=False):
    '''
    Function to produce user recommendations
    Required Input - 
        - model = Trained matrix factorization model
        - interactions = dataset used for training the model
        - user_id = user ID for which we need to generate recommendation
        - user_dict = Dictionary type input containing interaction_index as key and user_id as value
        - item_dict = Dictionary type input containing item_id as key and item_name as value
        - threshold = value above which the rating is favorable in new interaction matrix
        - nrec_items = Number of output recommendation needed
    Expected Output - 
        - Prints list of items the given user has already bought
        - Prints list of N recommended items  which user hopefully will be interested in
    '''
    n_users, n_items = interactions.shape
    user_x = user_dict[user_id]
    scores = pd.Series(model.predict(user_x,np.arange(n_items)))
    scores.index = interactions.columns
    scores = list(pd.Series(scores.sort_values(ascending=False).index))
    
    known_items = list(pd.Series(interactions.loc[user_id,:] \
                                 [interactions.loc[user_id,:] > threshold].index) \
								 .sort_values(ascending=False))
    
    read_items = list(pd.Series(interactions.loc[user_id,:] \
                                 [interactions.loc[user_id,:] > 0].index) \
								 .sort_values(ascending=False))
    
    scores = [x for x in scores if x not in read_items]
    return_score_list = scores[0:nrec_items]
    read_items_val = list(pd.Series(read_items).apply(lambda x: item_dict[x]))
    known_items_val = list(pd.Series(known_items).apply(lambda x: item_dict[x]))
    scores = list(pd.Series(return_score_list).apply(lambda x: item_dict[x]))
    if show == True:
        print("\n Books read by {}:".format(user_id))
        counter = 1
        for i in read_items_val:
            print(str(counter) + '- ' + i)
            counter+=1
        
        print("\n Known Likes of {}:".format(user_x))
        counter = 1
        for i in known_items_val:
            print(str(counter) + '- ' + i)
            counter+=1

        print("\n Recommended Items of {}:".format(user_id))
        counter = 1
        for i in scores:
            print(str(counter) + '- ' + i)
            counter+=1
    if debug:
        print(read_items)
        print(known_items)
        print(return_score_list)
        
        
    return return_score_list
    

def sample_recommendation_item(model,interactions,item_id,user_dict,item_dict,number_of_user):
    '''
    Funnction to produce a list of top N interested users for a given item
    Required Input -
        - model = Trained matrix factorization model
        - interactions = dataset used for training the model
        - item_id = item ID for which we need to generate recommended users
        - user_dict =  Dictionary type input containing interaction_index as key and user_id as value
        - item_dict = Dictionary type input containing item_id as key and item_name as value
        - number_of_user = Number of users needed as an output
    Expected Output -
        - user_list = List of recommended users 
    '''
    n_users, n_items = interactions.shape
    x = np.array(interactions.columns)
    scores = pd.Series(model.predict(np.arange(n_users), np.repeat(x.searchsorted(item_id),n_users)))
    user_list = list(interactions.index[scores.sort_values(ascending=False).head(number_of_user).index])
    return user_list 


def create_item_emdedding_distance_matrix(model,interactions):
    '''
    Function to create item-item distance embedding matrix
    Required Input -
        - model = Trained matrix factorization model
        - interactions = dataset used for training the model
    Expected Output -
        - item_emdedding_distance_matrix = Pandas dataframe containing cosine distance matrix b/w items
    '''
    df_item_norm_sparse = sparse.csr_matrix(model.item_embeddings)
    similarities = cosine_similarity(df_item_norm_sparse)
    item_emdedding_distance_matrix = pd.DataFrame(similarities)
    item_emdedding_distance_matrix.columns = interactions.columns
    item_emdedding_distance_matrix.index = interactions.columns
    return item_emdedding_distance_matrix

def item_item_recommendation(item_emdedding_distance_matrix, item_id, 
                             item_dict, n_items = 10, show = True):
    '''
    Function to create item-item recommendation
    Required Input - 
        - item_emdedding_distance_matrix = Pandas dataframe containing cosine distance matrix b/w items
        - item_id  = item ID for which we need to generate recommended items
        - item_dict = Dictionary type input containing item_id as key and item_name as value
        - n_items = Number of items needed as an output
    Expected Output -
        - recommended_items = List of recommended items
    '''
    recommended_items = list(pd.Series(item_emdedding_distance_matrix.loc[item_id,:]. \
                                  sort_values(ascending = False).head(n_items+1). \
                                  index[1:n_items+1]))
    if show == True:
        print("Item of interest :{0}".format(item_dict[item_id]))
        print("Item similar to the above item:")
        counter = 1
        for i in recommended_items:
            print(str(counter) + '- ' +  item_dict[i])
            counter+=1
    return recommended_items

In [101]:
interactions = create_interaction_matrix(df = reviewed_df,
                                         user_col = 'users',
                                         item_col = 'book_id',
                                         rating_col = 'rating',
                                         threshold = '3')
interactions.shape

(85, 55)

In [102]:
user_dict = create_user_dict(interactions=interactions)

In [103]:
{key : user_dict[key] for key in list(user_dict.keys())[0:5]}

{'Albus Dumbledore': 1, 'Alexandra': 2, 'Anna-Carol': 3, 'Apollo': 4, 'Ben': 5}

In [104]:
movies_dict = create_item_dict(df = long_format_df,
                               id_col = 'book_id',
                               name_col = 'book')

In [105]:
{key : movies_dict[key] for key in list(movies_dict.keys())[0:5]}

{1: "The Hitchhiker's Guide To The Galaxy",
 2: 'Watership Down',
 3: 'The Five People You Meet in Heaven',
 4: 'Speak',
 5: 'I Know Why the Caged Bird Sings'}

In [106]:
def _shuffle(uids, iids, data, random_state):

    shuffle_indices = np.arange(len(uids))
    random_state.shuffle(shuffle_indices)

    return (uids[shuffle_indices],
            iids[shuffle_indices],
            data[shuffle_indices])

def random_train_test_split(interactions_df,
                            test_percentage=0.10,
                            random_state=None):
    """
    Randomly split interactions between training and testing.

    This function takes an interaction set and splits it into
    two disjoint sets, a training set and a test set. Note that
    no effort is made to make sure that all items and users with
    interactions in the test set also have interactions in the
    training set; this may lead to a partial cold-start problem
    in the test set.

    Parameters
    ----------

    interactions: a scipy sparse matrix containing interactions
        The interactions to split.
    test_percentage: float, optional
        The fraction of interactions to place in the test set.
    random_state: np.random.RandomState, optional
        The random state used for the shuffle.

    Returns
    -------

    (train, test): (scipy.sparse.COOMatrix,
                    scipy.sparse.COOMatrix)
         A tuple of (train data, test data)
    """
    interactions = csr_matrix(interactions_df.values)
    if random_state is None:
        random_state = np.random.RandomState()

    interactions = interactions.tocoo()

    shape = interactions.shape
    uids, iids, data = (interactions.row,
                        interactions.col,
                        interactions.data)

    uids, iids, data = _shuffle(uids, iids, data, random_state)

    cutoff = int((1.0 - test_percentage) * len(uids))

    train_idx = slice(None, cutoff)
    test_idx = slice(cutoff, None)

    train = coo_matrix((data[train_idx],
                           (uids[train_idx],
                            iids[train_idx])),
                          shape=shape,
                          dtype=interactions.dtype)
    test = coo_matrix((data[test_idx],
                          (uids[test_idx],
                           iids[test_idx])),
                         shape=shape,
                         dtype=interactions.dtype)

    return train, test

In [107]:
train, test = random_train_test_split(interactions,test_percentage=0.2)

In [108]:
test

<85x55 sparse matrix of type '<class 'numpy.longlong'>'
	with 218 stored elements in COOrdinate format>

In [109]:
train

<85x55 sparse matrix of type '<class 'numpy.longlong'>'
	with 872 stored elements in COOrdinate format>

In [110]:
mf_model = runMF(interactions = train,
                 n_components = 30,
                 loss = 'warp',
                 k = 15,
                 epoch = 30,
                 n_jobs = 4)

In [111]:
mf_model

<lightfm.lightfm.LightFM at 0x7faf81dc3a90>

In [112]:
print("Train precision: %.2f" % precision_at_k(mf_model, train).mean())
print("Test precision: %.2f" % precision_at_k(mf_model, test).mean())

Train precision: 0.77
Test precision: 0.04


In [113]:
auc_train = auc_score(mf_model, train).mean()
auc_test = auc_score(mf_model, test).mean()

In [114]:
print("auc_train :", auc_train)
print("auc_test :", auc_test)

auc_train : 0.9947712
auc_test : 0.5849261


In [137]:
rec_list = sample_recommendation_user(model = mf_model, 
                                      interactions = interactions, 
                                      user_id = 'Tiffany', 
                                      user_dict = user_dict,
                                      item_dict = movies_dict, 
                                      threshold = 3,
                                      nrec_items = 5,
                                      debug=True)


 Books read by Tiffany:
1- The Chrysalids
2- The War Of The Worlds
3- Shattered
4- The Hobbit
5- The Lord of the Rings
6- Bone Series
7- Holes
8- Harry Potter Series
9- The Golden Compass
10- My Sister's Keeper
11- Hatchet
12- Eragon
13- Twilight Series
14- The Lion the Witch and the Wardrobe
15- To Kill a Mockingbird
16- Bleach (graphic novel)
17- Naruto
18- The Princess Bride
19- Lord of the Flies
20- Inkheart
21- The Great Gatsby
22- The Princess Diaries
23- The Da Vinci Code
24- The Sisterhood of the Travelling Pants
25- Speak
26- The Hitchhiker's Guide To The Galaxy

 Known Likes of 69:
1- The Chrysalids
2- The War Of The Worlds
3- Shattered
4- The Hobbit
5- The Lord of the Rings
6- Holes
7- Harry Potter Series
8- The Golden Compass
9- My Sister's Keeper
10- Hatchet
11- The Lion the Witch and the Wardrobe
12- To Kill a Mockingbird
13- The Princess Bride
14- Inkheart
15- The Great Gatsby
16- The Da Vinci Code
17- The Hitchhiker's Guide To The Galaxy

 Recommended Items of Tiffany:

In [116]:
# df filter with known likes 

In [141]:
# def build_filter_df(read_list,like_list):
#     for i in read_list:
#         if i in like_list:
#             print("(interactions[{}]>=4) &".format(i))
#         else:
#             print("(interactions[{}]<=3) &".format(i,i))
# build_filter_df([55, 53, 52, 51, 50, 47, 44, 43, 42, 41, 40, 39, 36, 32, 31, 29, 28, 20, 19, 17, 15, 11, 10, 8, 4, 1],[55, 53, 52, 51, 50, 44, 43, 42, 41, 40, 32, 31, 20, 17, 15, 10, 1])

(interactions[55]>=4) &
(interactions[53]>=4) &
(interactions[52]>=4) &
(interactions[51]>=4) &
(interactions[50]>=4) &
(interactions[47]<=3) &
(interactions[44]>=4) &
(interactions[43]>=4) &
(interactions[42]>=4) &
(interactions[41]>=4) &
(interactions[40]>=4) &
(interactions[39]<=3) &
(interactions[36]<=3) &
(interactions[32]>=4) &
(interactions[31]>=4) &
(interactions[29]<=3) &
(interactions[28]<=3) &
(interactions[20]>=4) &
(interactions[19]<=3) &
(interactions[17]>=4) &
(interactions[15]>=4) &
(interactions[11]<=3) &
(interactions[10]>=4) &
(interactions[8]<=3) &
(interactions[4]<=3) &
(interactions[1]>=4) &


In [142]:
result = interactions[
(interactions[55]>=4) &
(interactions[53]>=4) &
(interactions[52]>=4) &
(interactions[51]>=4) &
(interactions[50]>=4) &
(interactions[47]<=3) &
(interactions[44]>=4) &
(interactions[43]>=4) &
(interactions[42]>=4) &
(interactions[41]>=4) &
(interactions[40]>=4) &
(interactions[39]<=3) &
(interactions[36]<=3) &
(interactions[32]>=4) &
(interactions[31]>=4) &
(interactions[29]<=3) &
(interactions[28]<=3) &
(interactions[20]>=4) &
(interactions[19]<=3) &
(interactions[17]>=4) &
(interactions[15]>=4) &
(interactions[11]<=3) &
(interactions[10]>=4) &
(interactions[8]<=3) &
(interactions[4]<=3) &
(interactions[1]>=4)   ]

In [143]:
result[[2, 7, 14, 25, 16]]

book_id,2,7,14,25,16
users,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Tiffany,0,0,0,0,0


In [120]:
result

book_id,1,2,3,4,5,6,7,8,9,10,...,46,47,48,49,50,51,52,53,54,55
users,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Albus Dumbledore,0,0,0,0,0,0,0,3,0,0,...,0,0,0,0,0,0,0,0,0,0
Apollo,0,0,5,0,0,0,0,4,0,0,...,0,0,0,4,0,2,0,0,0,3
Claire,5,4,0,0,0,0,0,5,0,4,...,0,0,0,0,5,5,0,3,0,3
Cust10,5,0,0,0,3,0,0,0,0,1,...,0,0,5,0,0,0,0,0,0,0
Cust5,0,2,0,0,0,0,0,0,0,0,...,0,0,4,0,5,5,0,0,5,0
Cust8,4,0,0,5,0,0,0,5,0,5,...,0,5,0,0,5,5,0,5,0,5
Dude,0,0,0,0,0,0,0,0,0,0,...,0,5,0,0,0,2,0,0,0,0
Joshua,0,0,0,0,0,0,0,3,0,0,...,0,0,0,0,0,0,0,0,0,0
McLean,0,0,0,0,0,0,0,0,0,5,...,0,0,0,0,0,5,0,3,0,5
Megan,5,5,0,0,0,0,0,0,0,4,...,0,0,0,0,5,5,0,5,0,0


In [121]:
interactions

book_id,1,2,3,4,5,6,7,8,9,10,...,46,47,48,49,50,51,52,53,54,55
users,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Albus Dumbledore,0,0,0,0,0,0,0,3,0,0,...,0,0,0,0,0,0,0,0,0,0
Alexandra,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Anna-Carol,0,0,4,0,0,0,0,3,0,0,...,0,0,0,0,0,4,0,0,0,0
Apollo,0,0,5,0,0,0,0,4,0,0,...,0,0,0,4,0,2,0,0,0,3
Ben,5,0,0,0,0,0,0,3,0,3,...,5,5,0,0,0,5,5,5,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
joe,4,0,0,0,0,0,0,0,0,4,...,0,3,0,0,3,5,0,0,0,0
ma,0,0,0,0,5,0,0,0,0,4,...,0,0,0,0,2,2,0,0,0,0
matt c,0,4,0,0,0,0,0,0,0,5,...,4,0,0,0,5,5,4,0,0,0
sage32,0,0,0,0,0,0,0,2,0,4,...,0,0,0,3,0,0,0,0,0,5
