In [27]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
from lightfm import LightFM
from lightfm.data import Dataset
from sklearn.model_selection import train_test_split
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os


print(os.listdir("../data"))

# Any results you write to the current directory are saved as output.

['category_tree.csv', 'events.csv', 'item_properties_part1.csv', 'item_properties_part2.csv']


** Preprocessing data **
+ Remove user that appears once
+ Remove the same index
+ Lots of visitorid and itemd have the largest rating

In [28]:
def LoadData():
    category_tree = pd.read_csv("../data/category_tree.csv", header= 0)
    events = pd.read_csv("../data/events.csv", header= 0)
    item_properties_part1 = pd.read_csv("../data/item_properties_part1.csv", header= 0)
    item_properties_part2 = pd.read_csv("../data/item_properties_part2.csv", header= 0)
    item_properties_part = pd.concat([item_properties_part1, item_properties_part2])
    return category_tree, events,item_properties_part


def TransfromData(category_tree, events,item_properties_part):
    data_raw = events[['visitorid','event','itemid']]
    data = data_raw.copy()
    transfrom_rating = []
    for event in data.event:
        if(event == 'view'):
            transfrom_rating.append(1)
        if(event == 'addtocart'):
            transfrom_rating.append(2)
        if(event == 'transaction'):
            transfrom_rating.append(3)
    data['rating']= transfrom_rating
    return data[['visitorid','itemid','rating']]


def RedundantData_VisistorOnlyApper(transform_data):
    data_examining = transform_data.copy()
    visitorid_size = data_examining.groupby(['visitorid']).size().reset_index(name='Size').sort_values("visitorid")
    visitorid_only_appear = visitorid_size[visitorid_size['Size']== 1]['visitorid'].tolist()
    data_surprise_remove_only_appear = data_examining[~data_examining['visitorid'].isin(visitorid_only_appear)]
    return data_surprise_remove_only_appear



def RedundantData_DropDuplicatesFeature(data_surprise_remove_only_appear):
    drop_feature = ['visitorid','itemid','rating']
    data_surprise_drop_duplicates_3_feature = data_surprise_remove_only_appear.drop_duplicates(subset=drop_feature)
    return data_surprise_drop_duplicates_3_feature



def RedundantData_SelectMaxRating(data_surprise_drop_duplicates_3_feature):
    drop_feature = ['visitorid','itemid']
    data_examining = data_surprise_drop_duplicates_3_feature.copy()
    data_seclect_max_rating = data_examining.groupby(drop_feature).max()['rating'].reset_index()
    return data_seclect_max_rating

In [29]:
category_tree, events,item_properties_part = LoadData()
transform_data = TransfromData(category_tree, events,item_properties_part)
data_surprise_remove_only_appear = RedundantData_VisistorOnlyApper(transform_data)
data_surprise_drop_duplicates = RedundantData_DropDuplicatesFeature(data_surprise_remove_only_appear)
data_seclect_max_rating = RedundantData_SelectMaxRating(data_surprise_drop_duplicates)

In [30]:
data_seclect_max_rating.head()

Unnamed: 0,visitorid,itemid,rating
0,0,67045,1
1,0,285930,1
2,0,357564,1
3,2,216305,1
4,2,259884,1


In [31]:
data_seclect_max_rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1143619 entries, 0 to 1143618
Data columns (total 3 columns):
 #   Column     Non-Null Count    Dtype
---  ------     --------------    -----
 0   visitorid  1143619 non-null  int64
 1   itemid     1143619 non-null  int64
 2   rating     1143619 non-null  int64
dtypes: int64(3)
memory usage: 26.2 MB


In [32]:
data = data_seclect_max_rating.copy()

In [33]:
data.head()

Unnamed: 0,visitorid,itemid,rating
0,0,67045,1
1,0,285930,1
2,0,357564,1
3,2,216305,1
4,2,259884,1


** Model building **
+ Interaction and weigth: prepare to add the model
+ Create query data: users are not continuous so they need to create a conversion

In [34]:
def ProcessData(data_form_pandas):
    data_form_pandas.sort_values(by =['visitorid','itemid'], inplace = True)
    data_form_pandas.reset_index(drop=True, inplace = True)
    return data_form_pandas
def Onehot(list_sample):
    if(list_sample != 0):
        return 1
    else:
        return 0
def CountSame(frist_list, second_list):
    return list(set(frist_list).intersection(set(second_list)))
def InteractionTransform(transform_data):
    data_numpy = np.array(transform_data)
    data_lightfm = Dataset()
    data_lightfm.fit(users= transform_data['visitorid'].unique(),items= transform_data['itemid'].unique())
    interactions, weigths = data_lightfm.build_interactions(
        (feature[0],feature[1],feature[2]) for feature in data_numpy)
    return interactions, weigths
def MergeInteraction(original_data):
    interaction, weight = InteractionTransform(original_data)
    interaction_user = interaction.row
    interaction_item = interaction.col
    original_data['transform user'] = interaction_user
    original_data['transform item'] = interaction_item
    query_data = original_data.copy()
    return query_data
def BuildLightFmModel(data_form_pandas):
    data = data_form_pandas.copy()
    model = LightFM(loss='warp')
    interaction, weight = InteractionTransform(data)
    model.fit_partial(interactions= interaction, sample_weight= weight)
    return model

In [35]:
data_train, data_test = train_test_split(data, test_size = 0.25)

** Due to too many data we only build with data_test **

In [36]:
data_test.head()

Unnamed: 0,visitorid,itemid,rating
957606,1170165,201023,1
1136142,1397787,308917,1
411006,504846,413048,1
806257,985214,215439,1
933730,1144838,90904,1


In [37]:
dataset = data_test.copy()
print(dataset.shape)

(285905, 3)


In [38]:
data_tuning = ProcessData(dataset)
print(data_tuning.head())
interactions, weights = InteractionTransform(data_tuning)
query_data = MergeInteraction(data_tuning)
model_lightfm = BuildLightFmModel(data_tuning)

   visitorid  itemid  rating
0          2  259884       1
1          2  325215       1
2         46    3780       1
3         51  198762       1
4         51  358388       1


In [39]:
query_data.head()

Unnamed: 0,visitorid,itemid,rating,transform user,transform item
0,2,259884,1,0,0
1,2,325215,1,0,1
2,46,3780,1,1,2
3,51,198762,1,2,3
4,51,358388,1,2,4


In [40]:
model_lightfm

<lightfm.lightfm.LightFM at 0x131beabfb08>

In [41]:
def PredictModel(lightfm_model, data_model, user_ids, verbose= False):
    query_data = MergeInteraction(data_model)
    query_user = query_data[query_data['visitorid'].isin(user_ids)]['transform user'].unique()
    count = 0
    original_item = []
    recommend_item = []
    item_same = []
    length_item_same = []
    for user_id in query_user:
        known_item = query_data[query_data['transform user'] == user_id]['itemid'].tolist()
        item_for_user = query_data['transform item'].unique().tolist()
        scores = lightfm_model.predict([user_id], item_for_user)
        top_items = query_data['itemid'][np.argsort(-scores)].tolist()
        if(verbose == True):
            print("User %s" % user_ids[count])
            print("     Known positives: ",known_item[:5])
            print("     Recommended: ",top_items[:5])
        count+= 1
        original_item.append(known_item[:5])
        recommend_item.append(top_items[:5])
        item_same.append(CountSame(known_item[:5], top_items[:5]))
        length_item_same.append(len(CountSame(known_item[:5], top_items[:5])))
        
    recommend = pd.DataFrame(user_ids, columns={'visitorid'})
    recommend['original item'] = original_item
    recommend['recommend item'] = recommend_item
    recommend['item same'] = item_same
    recommend['length'] = length_item_same
    recommend['one hot'] = recommend['length'].apply(lambda x: Onehot(x))
    return recommend       
def Accuracy(recommend_form_pandas):
    positive_user = recommend_form_pandas[recommend_form_pandas['length'] != 0].shape[0]
    return positive_user/recommend_form_pandas.shape[0]

In [42]:
user = dataset['visitorid'].unique().tolist()
user[:5]

[2, 46, 51, 55, 74]

In [43]:
user = query_data['visitorid'].unique().tolist()
user[:5]

[2, 46, 51, 55, 74]

In [44]:
recommend_data = PredictModel(model_lightfm, dataset, user[:5000])

In [45]:
recommend_data.head()

Unnamed: 0,visitorid,original item,recommend item,item same,length,one hot
0,2,"[259884, 325215]","[218794, 33695, 408737, 234255, 309778]",[],0,0
1,46,[3780],"[218794, 33695, 408737, 29940, 234255]",[],0,0
2,51,"[198762, 358388, 429304]","[218794, 33695, 408737, 309778, 234255]",[],0,0
3,55,[399556],"[218794, 33695, 309778, 408737, 234255]",[],0,0
4,74,[55826],"[218794, 33695, 408737, 29940, 234255]",[],0,0


In [46]:
dataset[dataset['visitorid']==553]


Unnamed: 0,visitorid,itemid,rating,transform user,transform item
90,553,104746,1,63,90


In [47]:
recommend_data[recommend_data['length'] != 0]

Unnamed: 0,visitorid,original item,recommend item,item same,length,one hot
23,230,"[234255, 395845]","[218794, 408737, 33695, 234255, 309778]",[234255],1,1
51,479,[309778],"[218794, 33695, 408737, 309778, 234255]",[309778],1,1
132,1136,"[211422, 309778, 412280]","[218794, 33695, 408737, 309778, 234255]",[309778],1,1
620,4716,[408737],"[33695, 218794, 408737, 234255, 29940]",[408737],1,1
924,6952,[218794],"[218794, 33695, 408737, 234255, 309778]",[218794],1,1
1206,8938,[234255],"[33695, 218794, 408737, 234255, 29940]",[234255],1,1
1237,9106,"[14555, 33695]","[218794, 33695, 408737, 29940, 234255]",[33695],1,1
1812,13660,"[34915, 64951, 218626, 278893, 290999]","[218794, 408737, 290999, 33695, 234255]",[290999],1,1
1840,13848,"[78883, 290999]","[33695, 408737, 218794, 234255, 290999]",[290999],1,1
3605,26998,"[231482, 234255]","[33695, 408737, 218794, 234255, 29940]",[234255],1,1


In [48]:
print("ACC : ",Accuracy(recommend_data))

ACC :  0.0022


+ The accuracy of 500 samples is too low
