In [74]:
import numpy as np
import pandas as pd

In [75]:
events_df = pd.read_csv('ecommerce-dataset/events.csv')
events_df.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,


In [76]:
item_df = pd.read_csv('ecommerce-dataset/item_properties_part1.csv')
item_df.head()

Unnamed: 0,timestamp,itemid,property,value
0,1435460400000,460429,categoryid,1338
1,1441508400000,206783,888,1116713 960601 n277.200
2,1439089200000,395014,400,n552.000 639502 n720.000 424566
3,1431226800000,59481,790,n15360.000
4,1431831600000,156781,917,828513


In [77]:
item_df = pd.concat( [item_df, pd.read_csv('ecommerce-dataset/item_properties_part2.csv')], axis=0)
item_df.head()

Unnamed: 0,timestamp,itemid,property,value
0,1435460400000,460429,categoryid,1338
1,1441508400000,206783,888,1116713 960601 n277.200
2,1439089200000,395014,400,n552.000 639502 n720.000 424566
3,1431226800000,59481,790,n15360.000
4,1431831600000,156781,917,828513


In [78]:
# filter out the items not in events df
item_df = item_df.loc[item_df.itemid.isin(events_df.itemid)]
item_df.nunique()

timestamp         18
itemid        185246
property        1098
value        1000100
dtype: int64

In [79]:
# keep only the latest snapshot
item_df = item_df.drop_duplicates(subset=['itemid','property'], keep='last')
item_df.nunique()

timestamp        18
itemid       185246
property       1098
value        932001
dtype: int64

In [80]:
item_df.property.value_counts().to_frame().head(10)

Unnamed: 0,property
764,185246
888,185246
159,185246
112,185246
283,185246
790,185246
364,185246
categoryid,185246
available,185246
678,185243


In [81]:
item_df = item_df.loc[item_df.property.isin(['categoryid','888','790','678'])]
item_df

Unnamed: 0,timestamp,itemid,property,value
0,1435460400000,460429,categoryid,1338
16,1435460400000,178601,790,n5400.000
40,1433646000000,152892,888,599031
42,1431831600000,125874,790,n39588.000
46,1433646000000,272201,790,n10320.000
...,...,...,...,...
9275873,1435460400000,444741,categoryid,511
9275876,1439694000000,4849,888,297765 406279 907471 88645 279913 1223352 1318567
9275877,1436670000000,147935,790,n42720.000
9275891,1432436400000,206640,790,n9600.000


In [82]:
i_df = item_df.loc[item_df.property == '888']
i_df.rename(columns={'value':'888'}, inplace = True)
i_df = i_df.drop(['property'], axis=1)
i_df.head()
i_df['888'].values

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


array(['599031',
       '150169 1219716 136963 442519 243135 656611 n96.000 1318853',
       '911581 794767', ...,
       '1320974 56529 237874 583354 1297729 n96.000 350726 30603 832471',
       '297765 406279 907471 88645 279913 1223352 1318567',
       '5135 1233825'], dtype=object)

In [83]:
import re
def get_eight(df):
    # extract eight
    eights = i_df['888'].values
    
    # split to list
    eights = [eight.split(' ') for eight in  eights]
    
    return eights
eights = get_eight(i_df)
eights[:10]

[['599031'],
 ['150169',
  '1219716',
  '136963',
  '442519',
  '243135',
  '656611',
  'n96.000',
  '1318853'],
 ['911581', '794767'],
 ['n9720372.000'],
 ['1226747'],
 ['1196769', 'n480.000', '95022'],
 ['488331'],
 ['955912'],
 ['730323', '945357'],
 ['n4437420.000']]

In [84]:
from collections import Counter
from sklearn.feature_extraction.text import TfidfTransformer

max_words = 1000
def get_tfidf_matrix(title_overviews):
    '''
    args:
        title_overviews: list(list(str)), list of list of string representing each document
    return:
        tfidf_matrix: np.array, TF-IDF matrix
    '''
    # get count of the matrix
    counter = Counter(np.hstack(title_overviews))

    # create mapping from word to index, get the most frequent 10000 words    
    word2index =  {unique_word: idx for idx, (unique_word, count) in enumerate(sorted(counter.items(), key=lambda x:-x[1])) if idx < max_words}

    #initialize empty count matrix
    count_matrix = np.zeros([len(title_overviews), max_words], dtype=np.int32)
    
    # start counting terms
    for idx, title_overview in enumerate(title_overviews):
        for word in title_overview:
            if word in word2index:
                count_matrix[idx][word2index[word]] += 1
    # tfidf
    transformer = TfidfTransformer()
    tfidf_matrix = transformer.fit_transform(count_matrix)
    
    # convert to numpy array for easier operation
    return tfidf_matrix.toarray()
tfidf_matrix = get_tfidf_matrix(eights)
tfidf_matrix.shape

(185246, 1000)

In [85]:
tfidf_matrix

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.36036903, 0.37309698, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [86]:
def get_most_similar_items(tfidf_matrix, idx, top_n):
    '''
    args:
        tfidf_matrix: np.array, TF-IDF matrix
        idx: int, index of the movie
        top_n: int, how many recommendation to be returned
    return:
        most_similar_items: np.array, a list of  item index
        
    '''
    # 1d array (max_words, )
    tfidf_matrix[idx]
    
    # 2d array (num_movies, max_words)
    tfidf_matrix
    
    # compute dot product to obtain similarity scores, output shape: (num_movies, )
    scores = np.matmul(tfidf_matrix, tfidf_matrix[idx].reshape(-1,1)).reshape(-1)

    # obtain the highest score items, flip because arg
    most_similar_items = np.flip(np.argsort(scores))
    
    # filter out the item itself
    most_similar_items = most_similar_items[most_similar_items != idx][:top_n]
    
    return most_similar_items

In [87]:
i_df.iloc[get_most_similar_items(tfidf_matrix, 152892, 10)].itemid.tolist()

[294586, 427476, 5332, 410951, 253764, 103821, 225927, 18268, 316554, 229473]