<a href="https://colab.research.google.com/github/rase26/Alphacamp_Recommendation/blob/main/W2_A4_content_based.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sample Code

## 基礎建設

In [1]:
import pandas as pd
import gzip, json

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

## 載入資料

In [2]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz

--2022-01-02 13:20:10--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15499476 (15M) [application/octet-stream]
Saving to: ‘All_Beauty.csv.1’


2022-01-02 13:20:10 (20.1 MB/s) - ‘All_Beauty.csv.1’ saved [15499476/15499476]

--2022-01-02 13:20:11--  http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10329961 (9.9M) [application/octet-stream]
Saving to: ‘meta_All_Beauty.json.gz.1’


2022-01-02 13:20:11 (15.1 MB/s) - ‘meta_All_Beauty.json.gz.1’ saved [10329961/10329961]



In [3]:
metadata = getDF('/content/meta_All_Beauty.json.gz')
ratings = pd.read_csv('/content/All_Beauty.csv', names=['asin', 'reviewerID', 'overall', 'unixReviewTime'], header=None)

In [None]:
metadata.head()

Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes
0,[],,[Loud 'N Clear Personal Sound Amplifier allows...,,Loud 'N Clear&trade; Personal Sound Amplifier,[],,idea village,[],"2,938,573 in Beauty & Personal Care (",[],{'ASIN: ': '6546546450'},All Beauty,,,,6546546450,[],[]
1,[],,[No7 Lift & Luminate Triple Action Serum 50ml ...,,No7 Lift &amp; Luminate Triple Action Serum 50...,"[B01E7LCSL6, B008X5RVME]",,,[],"872,854 in Beauty & Personal Care (",[],"{'Shipping Weight:': '0.3 ounces (', 'ASIN: ':...",All Beauty,"class=""a-bordered a-horizontal-stripes a-spa...",,$44.99,7178680776,[],[]
2,[],,[No7 Stay Perfect Foundation now stays perfect...,,No7 Stay Perfect Foundation Cool Vanilla by No7,[],,No7,[],"956,696 in Beauty & Personal Care (","[B01B8BR0O8, B01B8BR0NO, B014MHXXM8]","{'Shipping Weight:': '3.5 ounces (', 'ASIN: ':...",All Beauty,,,$28.76,7250468162,[],[]
3,[],,[],,Wella Koleston Perfect Hair Colour 44/44 Mediu...,[B0041PBXX8],,,[],"1,870,258 in Beauty & Personal Care (",[],"{'  Item Weight: ': '1.76 ounces', 'Sh...",All Beauty,,,,7367905066,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
4,[],,[Lacto Calamine Skin Balance Daily Nourishing ...,,Lacto Calamine Skin Balance Oil control 120 ml...,[],,Pirmal Healthcare,[],"67,701 in Beauty & Personal Care (","[3254895630, B007VL1D9S, B00EH9A0RI, B0773MBG4...","{'Shipping Weight:': '12 ounces (', 'ASIN: ': ...",All Beauty,,,$12.15,7414204790,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...


In [None]:
ratings.head()

Unnamed: 0,asin,reviewerID,overall,unixReviewTime
0,143026860,A1V6B6TNIC10QE,1.0,1424304000
1,143026860,A2F5GHSXFQ0W6J,4.0,1418860800
2,143026860,A1572GUYS7DGSR,4.0,1407628800
3,143026860,A1PSGLFK1NSVO,5.0,1362960000
4,143026860,A6IKXKZMTKGSC,5.0,1324771200


## 資料整理

In [4]:
ratings['DATE'] = pd.to_datetime(ratings['unixReviewTime'], unit='s')

## 資料切分

In [5]:
ratings_trainings = ratings[
    (ratings['DATE'] < '2018-09-01')
]
ratings_testings = ratings[
    (ratings['DATE'] >= '2018-09-01') & 
    (ratings['DATE'] <= '2018-09-30')
]
ratings_testings_by_user = ratings_testings.groupby('reviewerID').agg(list).reset_index()[['reviewerID', 'asin']].to_dict('records')
ratings_testings_by_user = { rating['reviewerID']: rating['asin'] for rating in ratings_testings_by_user }
users = list(ratings_testings_by_user.keys())

## 產生推薦

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# 計算商品用標題所表示的 tfidf 矩陣
df = metadata.drop_duplicates('title')
tf = TfidfVectorizer(analyzer='word')
tfidf_matrix = tf.fit_transform(df['title'])

# 計算商品間的相似程度
from sklearn.metrics.pairwise import cosine_similarity
similarity_matrix = cosine_similarity(tfidf_matrix)
mapping = pd.Series(df.index,index = df['title'])




In [None]:
# 每個商品回傳 k 個最相近的商品
def recommend_item(item_input, k=2):
    try:
        item_index = mapping[item_input]
        similarity_score = list(enumerate(similarity_matrix[item_index]))
        similarity_score = sorted(similarity_score, key=lambda x: x[1], reverse=True)
        similarity_score = similarity_score[:k]
        item_indices = [i[0] for i in similarity_score]
        return (df['asin'].iloc[item_indices].tolist())
    except:
        return []

# 利用使用者購買過的商品產生推薦
def recommend_items(items, k):
    res = []
    for d in items:
        res.extend(recommend_item(d, k))
    return res

In [None]:
def recommender(training_data, users=[], k=10):
    '''
    * training_data: dataframe 輸入的訓練資料集（2018-09-01 以前資料）
    * users: [] 需要被推薦的使用者
    * k: int 每個使用者需要推薦的商品數
    * recommendations: dict
      {
          使用者一： [推薦商品一, 推薦商品二, ...],
          使用者二： [...], ...
      }
    '''
    recommendations = {}
    '''
    content-based
    '''
    ratings_trainings = training_data
    recommendations = {user: recommend_items(metadata[metadata['asin'].isin(ratings_trainings[ratings_trainings['reviewerID'] == user]['asin'].tolist())]['title'].tolist(), k) for user in users}
    return recommendations

ratings_by_user = recommender(ratings_trainings, users)
ratings_by_user

# Use Glove embedding 

In [6]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


### load glove model 

In [8]:
import numpy as np

def load_emb_model(emb_path):

  emb_dict={}
  with open(emb_path) as fh:
    for line in fh:
      word,emb=line.strip().split(' ',1)
      emb_dict[word]=np.array([float(i) for i in emb.split(' ')])
  return emb_dict

In [10]:
glove_emb_path='/content/gdrive/My Drive/Colab_Notebooks/NLP/glove.42b.300d/glove.42B.300d.txt'
glove_emb_dict=load_emb_model(glove_emb_path)

### text preprocess

#### function

In [11]:
# 匯入停用字字典 變成 stopWords list

def load_stopword(filepath):
  stopWords=[]
  with open(filepath, 'r', encoding='utf8') as f: 
      for data in f.readlines():
          data = data.strip()
          stopWords.append(data)

  #由於一些符號 直接在txt檔輸入 無法被辨識出來 所以直接append到stopWords上 

  extra=['：','，','！','(','、','）','》','《','　','「','－','.  ','★',' & ','…','／',' ','\r\n','.','\r','\t','','●','or','of','. ','•','【','】','?',
         '（','0','1','2','3','4','5','6','7','8','9','。','#','」','？','．','『','』','；','—','‧',')','Ü',':',',','“','”','/','⋯','─','!','-','→','→',
         'A','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z',
         'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']

  stopWords.extend(extra)
  
  return stopWords

#alphabet=['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']

# load punctuation list
def load_punctuation(filepath):
  
  punctuation=[]

  with open(filepath, 'r', encoding='utf8') as f: 
      for data in f.readlines():
          data = data.strip()
          punctuation.append(data)
  return punctuation

In [12]:
import re

def en_text_preprocess(text,stopWord=True,punctuation=True):
  
  tt=[]

  if type(text)== str:
    
    for t in re.split(r'[-（）//()"““@%?!®＆&*#_:;,.\s]\s*',text.lower().strip()):   
        t= t.strip("'s")
        t=t.strip('()_"').split(' ')
        if stopWord == True:
          t= filter(lambda a: a not in en_stopWords,t)
        if punctuation == True:
          t= filter(lambda a: a not in punctuation_list,t)
          
        t=list(t)

        for w,word in enumerate(t):
          
          if 'http' in word:
            #print('word in t:',word)
            t.pop(w)

        tt.extend(t)
  
  tt=list(filter(None,tt))
  
  return tt
  

In [28]:
def eng_preprocess_textlist(eng_text_list,stopWord=True,punctuation=True):

  eng_process=[]

  for i,text in enumerate(eng_text_list):
    tt=en_text_preprocess(text,stopWord,punctuation)
    eng_process.append(tt)
    # print('NO. {}, {} \n=>list: {}'.format(i,text,tt))
  
  return eng_process
  

In [14]:
def save_list_txt(list,savefilepath):
  with  open(savefilepath, 'w') as f:
      for item in list:
        f.write("%s\n" % item)
      print('已存成txt:', savefilepath)
  

#### work

In [16]:
stopWord_path='/content/gdrive/My Drive/Colab_Notebooks/NLP/english_stopword.txt'
en_stopWords=load_stopword(stopWord_path)
punctuation_file='/content/gdrive/My Drive/Colab_Notebooks/NLP/punctuation.txt'
punctuation_list=load_punctuation(punctuation_file)

In [33]:
prod_df=metadata[['asin','title']].drop_duplicates()
prod_list=prod_df.title.to_list()

In [34]:
prod_list_preprocess=eng_preprocess_textlist(prod_list)

In [44]:
savefolder='/content/gdrive/MyDrive/Colab_Notebooks/Colab Notebooks/Alphacamp_dataai/'
savefilepath= savefolder+'prod_preprocess.txt'
save_list_txt(prod_list_preprocess,savefilepath)

已存成txt: /content/gdrive/MyDrive/Colab_Notebooks/Colab Notebooks/Alphacamp_dataai/prod_preprocess.txt


### text embedding

#### function

In [10]:
import numpy as np
#encode txt
#char_emb:embedding dict
def char_emb_text(char_emb,text):
  txt_seq=[]
  notexistword=[]
  for char in text:
    #encode char
    if char in char_emb:
      encode_txt=char_emb[char]
      # store
      txt_seq.append(encode_txt)  
    else:
      print(char,'not exist')
      notexistword.append(char)
    
  txt_seq=np.array(txt_seq)
  #計算txt_seq平均
  avg_txt_seq=np.mean(txt_seq,axis=0)
  
  return avg_txt_seq,notexistword

  print('num of char: {}, num of vector: {}'.format(len(txt_seq),len(txt_seq[0])))
  print('avg_txt_seq:{}\nshape: {}'.format(avg_txt_seq,avg_txt_seq.shape))

In [11]:
import csv
def char_emb_text_list(char_emb,raw_text_list):

  ask_all_seq=[]
  notexistword_list=[]

  for i,raw_text in enumerate(raw_text_list):
    
    # print('NO.{} text:{} '.format(i,raw_text))
    
    avg_txt_seq,notexistword=char_emb_text(char_emb,raw_text)
    
    # if not isinstance(avg_txt_seq,np.float64):
    #   print('notexistword_list: {} \n處理後 len of seq: {}'.format(notexistword,len(avg_txt_seq)))
    # else:
    #   print('notexistword_list: {} \n處理後 沒有seq'.format(notexistword))
    
    ask_all_seq.append(avg_txt_seq)
    notexistword_list.extend(notexistword)
  # print('num of ask_all_seq:{}, len of each one:{}'.format(len(ask_all_seq),len(ask_all_seq[0])) )
  
  return ask_all_seq,set(notexistword_list)

#### work

In [45]:
save_emb_path= savefolder+'prod_seq.txt'
save_noexist_path=savefolder+'prod_noexistword.txt'
save_list_txt(prod_seq,save_emb_path)
save_list_txt(notexistword_list,save_noexist_path)

已存成txt: /content/gdrive/MyDrive/Colab_Notebooks/Colab Notebooks/Alphacamp_dataai/prod_seq.txt
已存成txt: /content/gdrive/MyDrive/Colab_Notebooks/Colab Notebooks/Alphacamp_dataai/prod_noexistword.txt


In [39]:
prodseq_df=pd.DataFrame({ 'asin':prod_df["asin"].to_list(),
    'title':prod_df["title"].to_list(),
             'prod_seq':prod_seq,
              
             })

In [42]:
prodseq_df.index[prodseq_df['prod_seq'].isnull()]

Int64Index([  531,  1081,  1160,  1570,  3109,  3252,  3594,  4245,  4918,
             4985,  5462,  5891,  6364,  6750,  7814,  7957,  8095,  9384,
            10508, 11360, 11367, 12825, 13744, 13936, 14147, 14530, 14965,
            16409, 19450, 19944, 21211, 21803, 21863, 24614, 26613, 27245,
            27279, 29015, 29624, 32343],
           dtype='int64')

In [43]:
prodseq_df=prodseq_df.dropna(subset=['prod_seq'])

In [47]:
prodseq_df.to_csv(savefolder+'seq_df.csv',index=False)

#### load seq df

In [8]:
def load_seq_df(filepath):
  
  seq_df=pd.read_csv(filepath)

  en_p_seq=[]
  i=0
  for seq in seq_df['prod_seq']:
    n=re.split(r'[\s]\s*',seq.strip('[]'))
    n=list(filter(None,n))
    print(i,len(n))
    v=np.array([float(i) for i in n])
    print('len of seq: {}'.format(len(v)))
    en_p_seq.append(v)
    i+=1

  en_p_seq=np.array(en_p_seq)
  print('type of seq: {}, len of seq: {}'.format(type(en_p_seq),len(en_p_seq)))
  
  return seq_df,en_p_seq


In [None]:
folder='/content/gdrive/MyDrive/Colab_Notebooks/Colab Notebooks/Alphacamp_dataai/'
seq_path=folder+'seq_df.csv'
seq_df,en_p_seq=load_seq_df(seq_path)

In [21]:
en_p_seq

array([[ 0.11207333, -0.07762433,  0.02542933, ...,  0.08056267,
         0.07792017, -0.232519  ],
       [-0.12912567,  0.06618182,  0.09260178, ..., -0.21888422,
        -0.16878589,  0.06405019],
       [ 0.00107314, -0.21919   ,  0.13568571, ..., -0.12807429,
         0.03458129, -0.00842443],
       ...,
       [-0.01849133,  0.16797755,  0.21168117, ...,  0.05627833,
         0.0180615 , -0.05004   ],
       [ 0.05263814,  0.20173257, -0.17663329, ...,  0.06288434,
         0.02189086,  0.07119371],
       [ 0.07370167, -0.02320561, -0.02638671, ..., -0.27898032,
         0.07241095, -0.1549929 ]])

In [22]:
from sklearn.metrics.pairwise import cosine_similarity
similarity_matrix = cosine_similarity(en_p_seq)


In [23]:
seq_df.head()

Unnamed: 0,asin,title,prod_seq
0,6546546450,Loud 'N Clear&trade; Personal Sound Amplifier,[ 1.12073333e-01 -7.76243333e-02 2.54293333e-...
1,7178680776,No7 Lift &amp; Luminate Triple Action Serum 50...,[-1.29125667e-01 6.61818222e-02 9.26017778e-...
2,7250468162,No7 Stay Perfect Foundation Cool Vanilla by No7,[ 1.07314286e-03 -2.19190000e-01 1.35685714e-...
3,7367905066,Wella Koleston Perfect Hair Colour 44/44 Mediu...,[-2.33595000e-01 -8.20225000e-03 -1.43110083e-...
4,7414204790,Lacto Calamine Skin Balance Oil control 120 ml...,[-1.88602125e-01 1.28237625e-01 7.51135500e-...


In [24]:
df=seq_df
mapping=pd.Series(df.index,index = df['title'])

In [25]:
# 每個商品回傳 k 個最相近的商品
def recommend_item(item_input, k=2):
    try:
        item_index = mapping[item_input]
        similarity_score = list(enumerate(similarity_matrix[item_index]))
        similarity_score = sorted(similarity_score, key=lambda x: x[1], reverse=True)
        similarity_score = similarity_score[:k]
        item_indices = [i[0] for i in similarity_score]
        return (df['asin'].iloc[item_indices].tolist())
    except:
        return []

In [20]:
ratings_trainings.asin.unique()

array(['0143026860', '014789302X', '0992916305', ..., 'B01HJ1K3YK',
       'B01HJ84SGM', 'B01HJASD20'], dtype=object)

In [26]:
# 利用使用者購買過的商品產生推薦
def recommend_items(items, k):
    res = []
    for d in items:
        res.extend(recommend_item(d, k))
    return res

In [27]:
def recommender(training_data, users=[], k=10):
    '''
    * training_data: dataframe 輸入的訓練資料集（2018-09-01 以前資料）
    * users: [] 需要被推薦的使用者
    * k: int 每個使用者需要推薦的商品數
    * recommendations: dict
      {
          使用者一： [推薦商品一, 推薦商品二, ...],
          使用者二： [...], ...
      }
    '''
    recommendations = {}
    '''
    content-based
    '''
    ratings_trainings = training_data
    recommendations = {user: recommend_items(metadata[metadata['asin'].isin(ratings_trainings[ratings_trainings['reviewerID'] == user]['asin'].tolist())]['title'].tolist(), k) for user in users}
    return recommendations

ratings_by_user = recommender(ratings_trainings, users)
ratings_by_user

{'A100XQFWKQ30O2': [],
 'A103T1QOGFCSEH': [],
 'A106UKKSJ2KXPF': [],
 'A10A7GV4D5A11V': [],
 'A1119JJ37ZLB8R': [],
 'A113UOOLBSZN52': [],
 'A12M4U7WK4ALCR': [],
 'A12T8YTW6VWT7S': [],
 'A1364JXGKB46MM': [],
 'A137DALOQFKBTI': [],
 'A13FEZ3WV7S2EY': [],
 'A13IV4I1B0RXMG': [],
 'A13JU88JAHN72I': ['B009YDVB6Q',
  'B009YDVBIE',
  'B0090BJ4UM',
  'B017BJ7TC4',
  'B00B5MFYM6',
  'B00J1TSNZU',
  'B00N7AUJE2',
  'B017BI8YMY',
  'B00J897JU8',
  'B00AKPQY96'],
 'A13K55R6VH1OOD': [],
 'A13P7VFU075A': [],
 'A13SWYE4QLB6NG': [],
 'A13ZTQ0Q4ATA41': [],
 'A142EDN04OD62U': [],
 'A142I22FIC8MZK': [],
 'A14834QTII5TLT': [],
 'A14A447VPACTBC': [],
 'A14AP6MN5XO6LB': [],
 'A14CLF25IX25US': [],
 'A14LYXC3HTBAHI': [],
 'A14VUW4KZ34EOE': [],
 'A14Y32P26G9YL': [],
 'A157T25PBS7MX4': [],
 'A15HZDSERD85C8': [],
 'A15JJ8J1FGADIX': [],
 'A15ZCL70JXXH89': [],
 'A1617KN2IAWZ6J': [],
 'A16E0O88262HKA': [],
 'A16NSZ58PTVIYF': [],
 'A16UGDXRTDLJG5': [],
 'A16X9HR3UFQQXY': [],
 'A16Y7V1CZCWKFV': ['B015ZOWER2',
  'B017X

## 結果評估

In [28]:
def evaluate(ratings_testings_by_user={}, ratings_by_user={}, method=None):
    '''
    * ratings_testings_by_user: dict 真實被購買的商品資料（2018-09-01 以後資料）
    * ratings_by_user: dict 利用訓練資料學習的推薦商品
    * method: str
    * score: float
    '''
    total = 0
    for d in ratings_testings_by_user:
        if d in ratings_by_user:
            total += len(set(ratings_by_user[d]) & set(ratings_testings_by_user[d]))

    score = total / len(ratings_testings)
    return score

evaluate(ratings_testings_by_user, ratings_by_user)

0.003389830508474576