<a href="https://colab.research.google.com/github/rase26/Alphacamp_Recommendation/blob/main/W1_A2_rule_based.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# W1_A2_rule-based

## import data & package

In [1]:
import pandas as pd
import gzip, json

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

In [None]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz

In [10]:
prod_meta = getDF('/content/meta_All_Beauty.json.gz') ## 商品的基本資訊
ratings = pd.read_csv('/content/All_Beauty.csv', names=['asin', 'reviewerID', 'overall', 'unixReviewTime'], header=None)  ## 使用者購買商品的紀錄

In [54]:
ratings['DATE'] = pd.to_datetime(ratings['unixReviewTime'], unit='s')

In [11]:
prod_meta.head()

Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes
0,[],,[Loud 'N Clear Personal Sound Amplifier allows...,,Loud 'N Clear&trade; Personal Sound Amplifier,[],,idea village,[],"2,938,573 in Beauty & Personal Care (",[],{'ASIN: ': '6546546450'},All Beauty,,,,6546546450,[],[]
1,[],,[No7 Lift & Luminate Triple Action Serum 50ml ...,,No7 Lift &amp; Luminate Triple Action Serum 50...,"[B01E7LCSL6, B008X5RVME]",,,[],"872,854 in Beauty & Personal Care (",[],"{'Shipping Weight:': '0.3 ounces (', 'ASIN: ':...",All Beauty,"class=""a-bordered a-horizontal-stripes a-spa...",,$44.99,7178680776,[],[]
2,[],,[No7 Stay Perfect Foundation now stays perfect...,,No7 Stay Perfect Foundation Cool Vanilla by No7,[],,No7,[],"956,696 in Beauty & Personal Care (","[B01B8BR0O8, B01B8BR0NO, B014MHXXM8]","{'Shipping Weight:': '3.5 ounces (', 'ASIN: ':...",All Beauty,,,$28.76,7250468162,[],[]
3,[],,[],,Wella Koleston Perfect Hair Colour 44/44 Mediu...,[B0041PBXX8],,,[],"1,870,258 in Beauty & Personal Care (",[],"{'  Item Weight: ': '1.76 ounces', 'Sh...",All Beauty,,,,7367905066,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
4,[],,[Lacto Calamine Skin Balance Daily Nourishing ...,,Lacto Calamine Skin Balance Oil control 120 ml...,[],,Pirmal Healthcare,[],"67,701 in Beauty & Personal Care (","[3254895630, B007VL1D9S, B00EH9A0RI, B0773MBG4...","{'Shipping Weight:': '12 ounces (', 'ASIN: ': ...",All Beauty,,,$12.15,7414204790,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...


In [7]:
ratings.head()

Unnamed: 0,asin,reviewerID,overall,unixReviewTime,DATE
0,143026860,A1V6B6TNIC10QE,1.0,1424304000,2015-02-19
1,143026860,A2F5GHSXFQ0W6J,4.0,1418860800,2014-12-18
2,143026860,A1572GUYS7DGSR,4.0,1407628800,2014-08-10
3,143026860,A1PSGLFK1NSVO,5.0,1362960000,2013-03-11
4,143026860,A6IKXKZMTKGSC,5.0,1324771200,2011-12-25


In [55]:
ratings['DATE'].min()

Timestamp('2000-01-10 00:00:00')

## split data
- train < 2018-09-01
- test  2018-09-01 ~2018-0930

In [5]:
ratings_trainings = ratings[
    (ratings['DATE'] < '2018-09-01')
]
ratings_testings = ratings[
    (ratings['DATE'] >= '2018-09-01') & 
    (ratings['DATE'] <= '2018-09-30')
]

ratings_testings_by_user = ratings_testings.groupby('reviewerID').agg(list).reset_index()[['reviewerID', 'asin']].to_dict('records')
ratings_testings_by_user = { rating['reviewerID']: rating['asin'] for rating in ratings_testings_by_user }
users = list(ratings_testings_by_user.keys())

## Rule-based Recommendation

### by prod category

In [12]:
prod_meta.columns

Index(['category', 'tech1', 'description', 'fit', 'title', 'also_buy', 'tech2',
       'brand', 'feature', 'rank', 'also_view', 'details', 'main_cat',
       'similar_item', 'date', 'price', 'asin', 'imageURL', 'imageURLHighRes'],
      dtype='object')

In [19]:
prod_meta.explode('category').category.unique()
## no any category information => cannot be the based

array([nan], dtype=object)

In [21]:
prod_meta.main_cat.unique()

array(['All Beauty'], dtype=object)

### by user past top k product 
每位user過去買的產品

In [22]:
ratings.columns

Index(['asin', 'reviewerID', 'overall', 'unixReviewTime'], dtype='object')

In [50]:

def recommender(training_data, users=[], k=10):
    '''
    * training_data: dataframe 輸入的訓練資料集（2018-09-01 以前資料）
    * users: [] 需要被推薦的使用者
    * k: int 每個使用者需要推薦的商品數
    * recommendations: dict
      {
          使用者一： [推薦商品一, 推薦商品二, ...],
          使用者二： [...], ...
      }
    '''
    recommendations = {}
    ratings_trainings = training_data
    count1=training_data.groupby(['reviewerID','asin']).agg(buy_count=("asin", 'count')).reset_index()
    count1['rank']=count1.groupby('reviewerID')['buy_count'].rank("dense", ascending=False)
    count1=count1[count1['rank']<=k]
    recommendations =count1.groupby('reviewerID')['asin'].apply(list).to_dict()
    return recommendations


ratings_by_user = recommender(ratings_trainings, users)
print(len(ratings_by_user))

323489


####  結果評估

In [49]:

def evaluate(ratings_testings_by_user={}, ratings_by_user={}, method=None):
    '''
    檢查使用者真正會購買的商品中，有哪些會出現在推薦給使用者的商品中裡
    * ratings_testings_by_user: dict 真實被購買的商品資料（2018-09-01 以後資料）
    * ratings_by_user: dict 利用訓練資料學習的推薦商品
    * method: str
    * score: float
    '''
    total = 0
    for d in ratings_testings_by_user: ## test_user
        if d in ratings_by_user: ## prediction of user
                     ## 兩者購買產品的交集產品數
            total += len(set(ratings_by_user[d]) & set(ratings_testings_by_user[d]))

    score = total / len(ratings_testings) ## 九月購買的商品數
    return score

r1= evaluate(ratings_testings_by_user, ratings_by_user)
print(r1)

0.0


by past same period

In [56]:
#四季 12-2 3-5 6-8 9-11 
dict_cat={'冬':[12,1,2],'春':[3,4,5],'夏':[6,7,8],'秋':[9,10,11]} 
d={k:oldk for oldk,oldv in dict_cat.items() for k in oldv} 
print(d)
ratings_trainings['季節']=ratings_trainings.DATE.dt.month.map(d) 

{12: '冬', 1: '冬', 2: '冬', 3: '春', 4: '春', 5: '春', 6: '夏', 7: '夏', 8: '夏', 9: '秋', 10: '秋', 11: '秋'}


In [70]:
ratings_trainings['month']=ratings_trainings.DATE.dt.month

In [71]:

def recommender2(training_data,period, k=10):
    '''
    * training_data: dataframe 輸入的訓練資料集（2018-09-01 以前資料）
    * users: [] 需要被推薦的使用者
    * k: int 每個使用者需要推薦的商品數
    * recommendations: dict
      {
          使用者一： [推薦商品一, 推薦商品二, ...],
          使用者二： [...], ...
      }
    '''
    recommendations = {}
    ratings_trainings = training_data
    count1=training_data.groupby([period,'asin']).agg(buy_count=("asin", 'count')).reset_index()
    count1['rank']=count1.groupby(period)['buy_count'].rank("dense", ascending=False)
    count1=count1[count1['rank']<=k]
    recommendations =count1.groupby(period)['asin'].apply(list).to_dict()
    return recommendations


ratings_by_season = recommender2(ratings_trainings,'季節')
print(len(ratings_by_season))
# ratings_by_season

4


In [73]:
ratings_by_month = recommender2(ratings_trainings,'month')
print(len(ratings_by_month))
# ratings_by_month

12


####  結果評估

In [64]:
ratings_testings['季節']=ratings_testings.DATE.dt.month.map(d) 
ratings_testings_by_season=ratings_testings.groupby('季節')['asin'].apply(list).to_dict()

In [74]:
ratings_testings['month']=ratings_testings.DATE.dt.month
ratings_testings_by_month=ratings_testings.groupby('month')['asin'].apply(list).to_dict()

In [69]:

def evaluate(ratings_testings_by_period={}, ratings_by_period={}, method=None):
    '''
    檢查使用者真正會購買的商品中，有哪些會出現在推薦給使用者的商品中裡
    * ratings_testings_by_user: dict 真實被購買的商品資料（2018-09-01 以後資料）
    * ratings_by_user: dict 利用訓練資料學習的推薦商品
    * method: str
    * score: float
    '''
    total = 0
    for d in ratings_testings_by_period: ## test_user
        if d in ratings_by_period: ## prediction of user
                     ## 兩者購買產品的交集產品數
            total += len(set(ratings_by_period[d]) & set(ratings_testings_by_period[d]))

    score = total / len(ratings_testings) ## 九月購買的商品數
    return score

r2= evaluate(ratings_testings_by_season, ratings_by_season)
print(r2)

0.001694915254237288


In [75]:
r3= evaluate(ratings_testings_by_month, ratings_by_month)
print(r3)

0.001694915254237288
