In [1]:
!pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[K     |████████████████████████████████| 981 kB 6.1 MB/s eta 0:00:01
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25ldone
[?25h  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993221 sha256=404481feeb0bf7fa0d7032312d5262254cb1fc4ed301053342448a9da3552fd9
  Stored in directory: /home/phamnh1/.cache/pip/wheels/13/c7/b0/79f66658626032e78fc1a83103690ef6797d551cb22e56e734
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [2]:
import pandas as pd
import numpy as np
import csv
from tqdm.notebook import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
import math
import heapq
# from scipy.spatial import distance
import nltk
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from IPython.display import display
from langdetect import detect, detect_langs, DetectorFactory
DetectorFactory.seed = 0
from operator import itemgetter

# Load the raw CSV inputs. items.csv and item_with_lang.csv are pipe-separated
# and read with quoting=3 (csv.QUOTE_NONE), so quote characters are kept literally.
eval_csv = pd.read_csv("../data/evaluation.csv")
item_csv = pd.read_csv("../data/items.csv", sep="|", quoting=3)
tran_csv = pd.read_csv("../data/transactions.csv", sep="|")
item_with_lang_csv = pd.read_csv("../data/item_with_lang.csv", sep="|", quoting=3)

# Replace NaN with the empty string so missing values compare equal to ''.
item_with_lang_csv = item_with_lang_csv.replace(np.nan, '', regex=True)

# Relative frequency of every author / publisher value.
# NOTE(review): dropna() is a no-op here because NaN was already replaced by ''
# above, so '' is counted like a regular value and stays in both tables.
# NOTE(review): the denominator is len(item_csv), not len(item_with_lang_csv) --
# presumably both frames have the same number of rows; confirm.
authors_fre = item_with_lang_csv['author'].dropna().value_counts() / len(item_csv)
publishers_fre = item_with_lang_csv['publisher'].dropna().value_counts() / len(item_csv)

# Attach the item metadata (including detected language) to the evaluation
# items; the inner join drops evaluation items without a matching itemID.
eval_csv = eval_csv.merge(item_with_lang_csv, how='inner', on='itemID')



In [7]:
# Display the evaluation items after the merge with the item metadata.
eval_csv

Unnamed: 0,itemID,title,author,publisher,main topic,subtopics,lang,lang_prob
0,12,Breathtaking,Reva Ryann Thompson,Xlibris,FM,[],en,0.999996
1,45274,The Dead Man in Indian Creek,Mary Downing Hahn,AVON BOOKS,YFCF,"[YFZR,YXHB]",en,0.999997
2,10104,The Humble Seed,Linda Appleby,Linda Appleby,YFB,[YX],en,0.999994
3,41371,Dryadenhain & Dschinnenzauber (Märchenanthologie),"Philipp Busch, Jonathan Driedger, Lena Kalupne...",Ulisses Spiel & Medien,FM,"[FMB,WDHW]",de,0.999997
4,14015,The Man of Steel: Superman vs. the Moon Bandits,Scott Sonneborn,STONE ARCH BOOKS,YFCF,[XQK],en,0.999997
...,...,...,...,...,...,...,...,...
995,78384,Ice Cream for Breakfast,Casey Gillespie,Australian Publishing House,YFQ,[],en,0.999995
996,68157,Der ewige Krieg,Joe Haldeman,Heyne Taschenbuch,FLR,"[FJM,FLR,FM,FYT]",de,0.999995
997,44964,Back to Earth (Greek Edition): The Adventures ...,Danilo Clementoni,,FM,[],en,0.999996
998,12971,Sharks - Read it yourself with Ladybird: Level...,Ladybird,Penguin Random House Children's UK,YNNB,[YBL],en,0.999998


In [4]:
# Display the item catalogue with its lang / lang_prob columns.
item_with_lang_csv

Unnamed: 0,itemID,title,author,publisher,main topic,subtopics,lang,lang_prob
0,21310,Princess Poppy: The Big Mix Up,Janey Louise Jones,Penguin Random House Children's UK,YFB,[5AH],en,0.999996
1,73018,Einfach zeichnen! Step by Step,Wiebke Krabbe,Schwager und Steinlein,AGZ,"[5AJ,AGZ,WFA,YBG,YBL,YNA,YPA]",de,0.999996
2,19194,Red Queen 1,Victoria Aveyard,Orion Publishing Group,YFH,"[5AP,FBA]",en,0.571429
3,40250,Meine Kindergarten-Freunde (Pirat),,Ars Edition GmbH,YB,"[5AC,5AD,YBG,YBL,YF]",de,0.999993
4,46107,Mein großes Schablonen-Buch - Wilde Tiere,Elizabeth Golding,Edition Michael Fischer,WFTM,"[WD,WFTM,YBG,YBL,YBLD,YBLN1]",de,0.999996
...,...,...,...,...,...,...,...,...
78329,37678,Timeless Fairy Tales,"Brothers Grimm, Marie-Catherine Baroness D'Aulnoy",MEDIAMORPHOSIS,YFA,[YFJ],en,0.999996
78330,68688,Demon Games,A. Witt Timothy a. Witt,iUniverse,FMB,[],de,0.684883
78331,57291,Lori and the Lion's Den,A. M. Glass,Xlibris,YFU,[],en,0.999997
78332,78130,The Everywhere Armchair,Ersila Bee,ELOQUENT BOOKS,YFC,"[YFG,YFH]",en,0.999997


In [49]:
# Local import: only this cell needs langdetect's exception type.
from langdetect.lang_detect_exception import LangDetectException

print("=============== Original Dataset ===============")
display(item_csv.head(10))

# Keep only predictions above lang_prob_threshold; titles shorter than
# lang_leng_threshold characters are not passed to the detector at all.
lang_prob_threshold = 0.9
lang_leng_threshold = 7

# Collect the results positionally and assign both columns once at the end.
# This avoids one .loc write per row and the float -> str upcast of a column
# that was pre-filled with NaN (np.NaN no longer exists in NumPy 2).
detected_lang = []
detected_prob = []
for row_title in tqdm(item_csv["title"], desc = "Processing dataframe rows", total=len(item_csv)):
    # Default for undetected rows: no language, probability 0.
    lang, prob = np.nan, 0.0

    # Skip missing (non-string) and too-short titles.
    if isinstance(row_title, str) and len(row_title) >= lang_leng_threshold:
        try:
            candidates = detect_langs(row_title)
            if candidates:
                # First candidate is the one the original code used; read its
                # attributes directly instead of parsing the "lang:prob" string.
                lang, prob = candidates[0].lang, float(candidates[0].prob)
        except LangDetectException:
            # Detection is best effort: titles the detector cannot handle keep
            # the defaults. Anything else (incl. Ctrl-C) is no longer swallowed.
            pass

    detected_lang.append(lang)
    detected_prob.append(prob)

item_csv["lang"] = detected_lang
item_csv["lang_prob"] = detected_prob

item_csv_with_lang = item_csv.loc[item_csv["lang_prob"]>lang_prob_threshold].sort_values("lang_prob", ascending=False)
print("Dataframe rows after processing:{} ({:.0%})".format(len(item_csv_with_lang), len(item_csv_with_lang)/len(item_csv)))
print("=============== Recommendation ===============")
display(item_csv_with_lang.head(100))



Unnamed: 0,itemID,title,author,publisher,main topic,subtopics
0,21310,Princess Poppy: The Big Mix Up,Janey Louise Jones,Penguin Random House Children's UK,YFB,[5AH]
1,73018,Einfach zeichnen! Step by Step,Wiebke Krabbe,Schwager und Steinlein,AGZ,"[5AJ,AGZ,WFA,YBG,YBL,YNA,YPA]"
2,19194,Red Queen 1,Victoria Aveyard,Orion Publishing Group,YFH,"[5AP,FBA]"
3,40250,Meine Kindergarten-Freunde (Pirat),,Ars Edition GmbH,YB,"[5AC,5AD,YBG,YBL,YF]"
4,46107,Mein großes Schablonen-Buch - Wilde Tiere,Elizabeth Golding,Edition Michael Fischer,WFTM,"[WD,WFTM,YBG,YBL,YBLD,YBLN1]"
5,34217,Ewig geliebt,J. R. Ward,Heyne Taschenbuch,FMR,"[1KBB-US-NAK,FMX,FRX,3MRBF]"
6,31436,Meine Sticker-Tiere,,Ars Edition GmbH,YBG,"[5AD,YBG,YBLL]"
7,14576,Unsterblich 01 - Tor der Dämmerung,Julie Kagawa,Heyne Taschenbuch,YFE,"[5AQ,FM,YFE,YFH]"
8,17731,Unsterblich 02 - Tor der Nacht,Julie Kagawa,Heyne Taschenbuch,YFH,"[5AQ,FM,YFE,YFH]"
9,58723,Pedro und die Bettler von Cartagena,Ursula Hasler,dtv Verlagsgesellschaft,YFB,"[5AM,1KLSC]"


Processing dataframe rows:   0%|          | 0/78334 [00:00<?, ?it/s]

Dataframe rows after processing:63313 (81%)


Unnamed: 0,itemID,title,author,publisher,main topic,subtopics,lang,lang_prob
69805,78137,The Shape of Fear,Jj Toner,JJ Toner Publishing,FLW,[],en,1.000000
48572,27137,Brexston the Bear and Ferndelia the Mermaid,Elise Morris Toucet,Xlibris,YFJ,[],en,1.000000
9961,44993,Autos und Fahrzeuge Malbuch für Kinder von 4-8...,Michelle Brilliant,Michelle Brilliant,YBG,[],de,1.000000
70530,50319,Autos und Fahrzeuge Malbuch für Kinder von 4-8...,Michelle Brilliant,Michelle Brilliant,YBG,[],de,1.000000
41684,69385,Fantasy Notizbuch: Adler im Sturm - weiße Seit...,Samuriel Sternenfeuer,Arthanan Verlag,FMM,[],de,1.000000
...,...,...,...,...,...,...,...,...
62075,13722,The Ghosts in the Garage,Kimi Cook,Austin Macauley,YFH,[],en,0.999999
28102,20433,The Days of Thy Youth,Donald J. Richardson,AuthorHouse,FMH,[],en,0.999999
15215,12794,Kleines Malbuch. Fahrzeuge,,Tessloff Verlag,YBGC,"[5AC,YBGC,YBL]",de,0.999999
63067,57067,Beast Quest: Petorix the Winged Slicer,Adam Blade,Hachette Children's Group,YFC,"[5AH,4CD,YFH]",en,0.999999


In [50]:
# Display item_csv, which now carries the lang / lang_prob columns added above.
item_csv

Unnamed: 0,itemID,title,author,publisher,main topic,subtopics,lang,lang_prob
0,21310,Princess Poppy: The Big Mix Up,Janey Louise Jones,Penguin Random House Children's UK,YFB,[5AH],en,0.999996
1,73018,Einfach zeichnen! Step by Step,Wiebke Krabbe,Schwager und Steinlein,AGZ,"[5AJ,AGZ,WFA,YBG,YBL,YNA,YPA]",de,0.999996
2,19194,Red Queen 1,Victoria Aveyard,Orion Publishing Group,YFH,"[5AP,FBA]",en,0.571429
3,40250,Meine Kindergarten-Freunde (Pirat),,Ars Edition GmbH,YB,"[5AC,5AD,YBG,YBL,YF]",de,0.999993
4,46107,Mein großes Schablonen-Buch - Wilde Tiere,Elizabeth Golding,Edition Michael Fischer,WFTM,"[WD,WFTM,YBG,YBL,YBLD,YBLN1]",de,0.999996
...,...,...,...,...,...,...,...,...
78329,37678,Timeless Fairy Tales,"Brothers Grimm, Marie-Catherine Baroness D'Aulnoy",MEDIAMORPHOSIS,YFA,[YFJ],en,0.999996
78330,68688,Demon Games,A. Witt Timothy a. Witt,iUniverse,FMB,[],de,0.684883
78331,57291,Lori and the Lion's Den,A. M. Glass,Xlibris,YFU,[],en,0.999997
78332,78130,The Everywhere Armchair,Ersila Bee,ELOQUENT BOOKS,YFC,"[YFG,YFH]",en,0.999997


In [51]:
# item_csv.to_csv("../data/item_with_lang.csv", sep="|", index=False)

In [61]:
# Number of missing values per column of item_csv.
print("=====NA Counts=====")
na_counts = item_csv.isna().sum()
display(na_counts)

=====NA Counts=====


itemID           0
title            0
author        3247
publisher        9
main topic     259
subtopics        0
dtype: int64

In [25]:
####
## TASK 1: Find a suitable similarity measure between branches of the topic tree (main topic)

## Similarity = 1 / (1 + Distance)

## IDEA 1: Distance = length(v_i, lca_ij) + length(v_j, lca_ij) 

## IDEA 2: Euclidean distance = sqrt(length(v_i, lca_ij) + length(v_j, lca_ij))

## IDEA 3: Distance = (length(v_i, lca_ij) + length(v_j, lca_ij)) / (1 + length(lca_ij, root)), 
##       This emphasises that the deeper the common ancestor lies in the tree, the more similar the two topics are

## IDEA 4: Tanimoto similarity: Similarity = (1 + length(lca_ij, root)) / (1 + length(lca_ij, root) + length(lca_ij, v_i) + length(lca_ij, v_j))
##                              Distance = (length(v_i, lca_ij) + length(v_j, lca_ij)) / (1 + length(lca_ij, root) + length(v_i, lca_ij) + length(v_j, lca_ij)), 


## TASK 2: Find a suitable similarity measure between branches of the topic tree (sub-topics)

## Compare every pair of sub-topics and average the n best pair scores, where n is the length of the longer sub-topic list

## Reference: https://elar.urfu.ru/bitstream/10995/3713/2/RuSSIR_2011_07.pdf

####
## TASK 3: Combine the distance together

### Topic measurement

In [21]:
### USAGE:
# 1. Create an instance: c = HierarchialDistance()
# 2. Call one of its methods:
#    a. Main topic: c.main_topic(option, main_topic_1, main_topic_2)
#          option (1,2,3,4): selects the calculation method (IDEA 1-4 above)
#          main_topic_1, main_topic_2: topic codes as strings
#    b. Sub-topics: c.sub_topic_compare(option, sub_topic_1, sub_topic_2)
#          option (1,2,3,4): selects the calculation method, as for the main topic
#          sub_topic_1, sub_topic_2: lists of topic-code strings (the best pair scores are averaged)

import numpy as np

class HierarchialDistance:
    """Distance and similarity measures between hierarchical topic codes.

    A topic code is compared character by character from its start, so two
    codes are related through their common prefix, whose last position plays
    the role of the lowest common ancestor (LCA). Every measure returns a
    ``(distance, similarity)`` pair; codes that already differ in their first
    character get distance ``np.inf`` and a similarity of zero.
    """

    def __init__(self):
        # Stateless helper: nothing to set up.
        pass

    def find_lca(self, a, b):
        """Return the index of the last position at which ``a`` and ``b``
        still agree, or -1 when they differ from the first character on."""
        last_common = -1
        for char_a, char_b in zip(a, b):
            if char_a != char_b:
                break
            last_common += 1
        return last_common

    def similarity(self, distance):
        """Convert a distance into a similarity: 1 / (1 + distance)."""
        return 1/(1+distance)

    def _legs(self, a, b):
        """Return ``(lca, steps_a, steps_b)``: the LCA index and the number of
        levels between each code's end and the LCA; ``None`` without an LCA."""
        lca = self.find_lca(a, b)
        if lca == -1:
            return None
        return lca, len(a) - lca - 1, len(b) - lca - 1

    def pathbased_distance(self, a, b):
        """Option 1: path length between the two codes through their LCA."""
        legs = self._legs(a, b)
        if legs is None:
            distance = np.inf
        else:
            _, steps_a, steps_b = legs
            distance = steps_a + steps_b
        return distance, self.similarity(distance)

    def euclidean_distance(self, a, b):
        """Option 2: square root of the path length through the LCA."""
        legs = self._legs(a, b)
        if legs is None:
            distance = np.inf
        else:
            _, steps_a, steps_b = legs
            distance = np.sqrt(steps_a + steps_b)
        return distance, self.similarity(distance)

    def adjust_pathbased_distance(self, a, b):
        """Option 3: path length divided by (1 + LCA index), so a deeper
        common ancestor yields a smaller distance."""
        legs = self._legs(a, b)
        if legs is None:
            distance = np.inf
        else:
            root_dis, steps_a, steps_b = legs
            distance = (steps_a + steps_b) / (1 + root_dis)
        return distance, self.similarity(distance)

    def tanimoto_similarity(self, a, b):
        """Option 4: Tanimoto-style measure; similarity is ``1 - distance``
        and distance is the path length over (1 + LCA index + path length)."""
        legs = self._legs(a, b)
        if legs is None:
            return np.inf, 0
        root_dis, steps_a, steps_b = legs
        distance = (steps_a + steps_b) / (1 + root_dis + steps_a + steps_b)
        return distance, 1 - distance

    def compare_topic(self, option, a, b):
        """Compare two topic codes with the measure selected by ``option``
        (1, 2 or 3; any other value selects the Tanimoto measure).

        An empty/missing code on either side gives ``(np.inf, 0)``.
        """
        if not a or not b:
            return np.inf, 0
        if option == 1:
            return self.pathbased_distance(a, b)
        if option == 2:
            return self.euclidean_distance(a, b)
        if option == 3:
            return self.adjust_pathbased_distance(a, b)
        return self.tanimoto_similarity(a, b)

    def main_topic(self, option, main_topic_1, main_topic_2):
        """Compare two main-topic codes; see ``compare_topic``."""
        return self.compare_topic(option, main_topic_1, main_topic_2)

    def sub_topic_compare(self, option, sub_topic_1, sub_topic_2):
        """Compare two lists of sub-topic codes.

        Every code of the first list is compared with every code of the
        second; the result averages the n smallest distances and the n largest
        similarities, with n the length of the longer list. An empty list on
        either side gives ``(np.inf, 0)``.
        """
        if not sub_topic_1 or not sub_topic_2:
            return np.inf, 0

        # NEED TO NORMALIZE THIS
        pair_scores = [
            self.compare_topic(option, code_1, code_2)
            for code_1 in sub_topic_1
            for code_2 in sub_topic_2
        ]
        # Number of pair scores to average: the size of the longer list.
        n = max(len(sub_topic_1), len(sub_topic_2))
        best_distances = sorted(pair[0] for pair in pair_scores)[:n]
        best_sims = sorted((pair[1] for pair in pair_scores), reverse=True)[:n]
        return np.average(best_distances), np.average(best_sims)

In [22]:
# Notebook-level instance of the topic-distance helper, referenced as `c` by later cells.
c = HierarchialDistance()

In [23]:
%%timeit
# Micro-benchmark: option 4 (Tanimoto) comparison of two main-topic codes
# that differ in their first character.
a = "ACD"
b = "BDF"

c.main_topic(4, a, b)

1.18 µs ± 8.85 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [24]:
# Scratch example: scale a vector to unit Euclidean (L2) length.
lst = [1, 2, 3, 4, 5]

lst = lst / np.sqrt(np.sum(np.square(lst)))

### Title similarity measurement

In [25]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from IPython.display import display

In [26]:
def fuzzy_score(src_title, tgt_title):
    """Fuzzy title similarity, scaled to the range 0..1.

    Takes the larger of ``fuzz.ratio`` and ``fuzz.token_sort_ratio`` for the
    two titles and divides it by 100. Returns 0 when either title is the
    empty string.
    """
    if src_title == "" or tgt_title == "":
        return 0

    plain = fuzz.ratio(src_title, tgt_title)
    token_sorted = fuzz.token_sort_ratio(src_title, tgt_title)
    return max(plain, token_sorted) / 100

### Author and Publisher with GoodAll measurement

In [27]:
## get frequency of item

def author_publisher_measurement(author_1, publisher_1, author_2, publisher_2):
    """Frequency-weighted similarity of two items' author and publisher.

    A matching attribute contributes ``weight * (1 - f**2)``, where ``f`` is
    the relative frequency of the shared value read from the notebook-level
    tables ``authors_fre`` / ``publishers_fre``; agreeing on a rare value
    therefore counts more than agreeing on a common one. The author weight is
    2/3 and the publisher weight is 1/3.

    An attribute that is missing (``''``) contributes nothing. Its frequency is
    not looked up at all, so the function no longer depends on ``''`` being
    present in the frequency tables.

    Returns 0 when nothing matches, otherwise the sum of the contributions.
    Raises KeyError if a matching non-empty value is absent from its table.
    """
    author_weight = 2/3
    publisher_weight = 1/3

    total = 0
    # Only a real (non-empty) match can contribute; skipping the table lookup
    # otherwise gives the same score and avoids a needless Series access.
    if author_1 != '' and author_1 == author_2:
        total += author_weight * (1 - np.square(authors_fre[author_1]))
    if publisher_1 != '' and publisher_1 == publisher_2:
        total += publisher_weight * (1 - np.square(publishers_fre[publisher_1]))
    return total

In [28]:
%%timeit
# Micro-benchmark: identical authors, different publishers.
author_1 = "Mary Downing Hahn"
publisher_1 = "Schwager und Steinlein"
author_2 = "Mary Downing Hahn"
publisher_2 = "Australian Publishing House"
author_publisher_measurement(author_1, publisher_1, author_2, publisher_2)

15.7 µs ± 2.34 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


### Language Measurement

In [29]:
# Reload the language-annotated catalogue. The NaN -> '' replacement from the
# loading cell has to be repeated here: a plain reload would put NaN back into
# author / publisher / main topic, and the topic comparison in the scoring
# loop calls len() on those values, which fails for a float NaN.
item_with_lang_csv = pd.read_csv("../data/item_with_lang.csv", sep="|", quoting=3).replace(np.nan, '', regex=True)

def lang_compare(title_1, lang_1, conf_1, title_2, lang_2, conf_2):
    """Language agreement score of two titles, or -1 when it is not usable.

    The score ``conf_1 * conf_2`` is returned only if both titles have at
    least 10 characters, both detected languages are equal and both detection
    confidences exceed 0.9. In every other case the result is -1.
    """
    usable = (
        len(title_1) >= 10
        and len(title_2) >= 10
        and lang_1 == lang_2
        and conf_1 > 0.9
        and conf_2 > 0.9
    )
    if usable:
        return conf_1 * conf_2
    return -1

In [30]:
%%timeit
# Micro-benchmark: two sufficiently long English titles, both above the 0.9 confidence cut-off.
title_1 = "The Dead Man in Indian Creek"
lang_1 = "en"
conf_1 = 0.99
title_2 = "Timeless Fairy Tales"
lang_2 = "en"
conf_2 = 0.92

lang_compare(title_1, lang_1, conf_1, title_2, lang_2, conf_2)

419 ns ± 45.7 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


### Weight for item feature in item_with_lang_csv

1. Lang + confident (0 or 0.1)
2. Title (0.3 or 0.2)
3. Author + Publisher (0.4)
4. Main topic (0.2)
5. Sub topic (0.1)

In [31]:
# pandas function
# Notebook-level topic-distance helper; similarity_score below reads it as the global `c`.
c = HierarchialDistance()
def similarity_score(title_1, author_1, publisher_1, main_topic_1, subtopics_1, lang_1, lang_prob_1, title_2, author_2, publisher_2, main_topic_2, subtopics_2, lang_2, lang_prob_2):
    """Weighted overall similarity of item 1 (``*_1``) and item 2 (``*_2``).

    Combines five partial scores: language agreement (weight 0.1), fuzzy title
    similarity (0.2), author/publisher similarity (0.4), main-topic similarity
    (0.2) and sub-topic similarity (0.1). When ``lang_compare`` reports -1 the
    language weight becomes 0 and the title weight becomes 0.3 instead.
    Both topic comparisons use option 4 of the global ``c``.
    """
    topic_measure = 4

    # 1. Language: -1 means "not usable", which moves its weight to the title.
    lang_part = lang_compare(title_1, lang_1, lang_prob_1, title_2, lang_2, lang_prob_2)
    if lang_part == -1:
        lang_weight, title_weight = 0, 0.3
    else:
        lang_weight, title_weight = 0.1, 0.2

    # 2. Title
    title_part = fuzzy_score(title_1, title_2)
    # 3. Author + publisher
    people_part = author_publisher_measurement(author_1, publisher_1, author_2, publisher_2)
    # 4. Main topic (only the similarity half of the result is used)
    main_part = c.main_topic(topic_measure, main_topic_1, main_topic_2)[1]
    # 5. Sub-topics
    sub_part = c.sub_topic_compare(topic_measure, subtopics_1, subtopics_2)[1]

    return lang_weight*lang_part + title_weight*title_part + 0.4*people_part + 0.2*main_part + 0.1*sub_part

In [32]:
%%timeit
# Micro-benchmark of one full pairwise similarity_score call on two sample items.
title_1="The Dead Man in Indian Creek"
author_1="Mary Downing Hahn"
publisher_1="AVON BOOKS"
main_topic_1="YFCF"
subtopics_1=["YFZR","YXHB"]
lang_1="en"
lang_prob_1=0.999997
title_2="The Man of Steel: Superman vs. the Moon Bandits"
author_2="Scott Sonneborn"
publisher_2="STONE ARCH BOOKS"
main_topic_2="YFCF"
subtopics_2=["XQK"]
lang_2="en"
lang_prob_2=0.999997
similarity_score(title_1, author_1, publisher_1, main_topic_1, subtopics_1, lang_1, lang_prob_1, title_2, author_2, publisher_2, main_topic_2, subtopics_2, lang_2, lang_prob_2)

300 µs ± 4.89 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [39]:
# Evaluation rows with an index <= RESUME_AFTER_INDEX are skipped because their
# results were already appended to result.txt by earlier, interrupted runs
# (first stop: 329, second stop: 753). Set to -1 to process every row.
RESUME_AFTER_INDEX = 753


def _parse_subtopics(raw_subtopics):
    """Parse the serialized list '[A,B]' into ['A', 'B']; '[]' gives []."""
    parsed = raw_subtopics.lstrip("[").rstrip("]").split(",")
    return [] if parsed == [''] else parsed


# The candidate catalogue is identical for every evaluation row, so its fields
# are extracted and its sub-topics parsed once, not once per evaluation row.
candidate_items = [
    (cand['itemID'], cand['title'], cand['author'], cand['publisher'],
     cand['main topic'], _parse_subtopics(cand['subtopics']),
     cand['lang'], cand['lang_prob'])
    for cand in item_with_lang_csv.to_dict('records')
]

for index_1, row_1 in eval_csv.iterrows():
    if index_1 <= RESUME_AFTER_INDEX:
        continue
    itemID_1     = row_1['itemID']
    title_1      = row_1['title']
    author_1     = row_1['author']
    publisher_1  = row_1['publisher']
    main_topic_1 = row_1['main topic']
    subtopics_1  = _parse_subtopics(row_1['subtopics'])
    lang_1       = row_1['lang']
    lang_prob_1  = row_1['lang_prob']
    sim_dict = {}

    for (itemID_2, title_2, author_2, publisher_2, main_topic_2,
         subtopics_2, lang_2, lang_prob_2) in candidate_items:
        # skip same book
        # NOTE(review): "same" is decided by title, not itemID, so every other
        # item with an identical title is excluded as well -- confirm intended.
        if title_2 == title_1:
            continue

        sim_score = similarity_score(title_1, author_1, publisher_1, main_topic_1, subtopics_1, lang_1, lang_prob_1, title_2, author_2, publisher_2, main_topic_2, subtopics_2, lang_2, lang_prob_2)
        # Store a plain Python float: the dict is written with str() below and
        # NumPy scalars would otherwise leak their repr into result.txt.
        sim_dict[str(itemID_2)] = float(round(sim_score, 4))

    # Ten most similar candidates, best first; one dict per line in result.txt.
    max_10 = dict(sorted(sim_dict.items(), key = itemgetter(1), reverse = True)[:10])
    with open("../data/result.txt", "a") as myfile:
        myfile.write(str(max_10)+"\n")
    print("Finished row {}".format(index_1))

Finished row 754
Finished row 755
Finished row 756
Finished row 757
Finished row 758
Finished row 759
Finished row 760
Finished row 761
Finished row 762
Finished row 763
Finished row 764
Finished row 765
Finished row 766
Finished row 767
Finished row 768
Finished row 769
Finished row 770
Finished row 771
Finished row 772
Finished row 773
Finished row 774
Finished row 775
Finished row 776
Finished row 777
Finished row 778
Finished row 779
Finished row 780
Finished row 781
Finished row 782
Finished row 783
Finished row 784
Finished row 785
Finished row 786
Finished row 787
Finished row 788
Finished row 789
Finished row 790
Finished row 791
Finished row 792
Finished row 793
Finished row 794
Finished row 795
Finished row 796
Finished row 797
Finished row 798
Finished row 799
Finished row 800
Finished row 801
Finished row 802
Finished row 803
Finished row 804
Finished row 805
Finished row 806
Finished row 807
Finished row 808
Finished row 809
Finished row 810
Finished row 811
Finished row 8

In [38]:
%%timeit
# Benchmark: score the first evaluation item against the whole catalogue and
# pick its ten most similar items (same steps as one pass of the scoring loop,
# without rounding the scores and without writing to result.txt).
row_1 = eval_csv.iloc[0]
itemID_1     = row_1['itemID']
title_1      = row_1['title']
author_1     = row_1['author']
publisher_1  = row_1['publisher']
main_topic_1 = row_1['main topic']
# '[A,B]' -> ['A', 'B']; an empty list is serialized as '[]'.
subtopics_1  = row_1['subtopics'].lstrip("[").rstrip("]").split(",")
if subtopics_1 == ['']:
    subtopics_1 = []
lang_1       = row_1['lang']
lang_prob_1  = row_1['lang_prob']
sim_dict = {}

for index_2, row_2 in item_with_lang_csv.iterrows():
    itemID_2     = row_2['itemID']
    title_2      = row_2['title']
    author_2     = row_2['author']
    publisher_2  = row_2['publisher']
    main_topic_2 = row_2['main topic']
    subtopics_2  = row_2['subtopics'].lstrip("[").rstrip("]").split(",")
    if subtopics_2 == ['']:
        subtopics_2 = []
    lang_2       = row_2['lang']
    lang_prob_2  = row_2['lang_prob']

    # skip same book (decided by identical title)
    if title_2 == title_1:
        continue

    sim_score = similarity_score(title_1, author_1, publisher_1, main_topic_1, subtopics_1, lang_1, lang_prob_1, title_2, author_2, publisher_2, main_topic_2, subtopics_2, lang_2, lang_prob_2)
    sim_dict[str(itemID_2)] = sim_score
# Keep the ten highest-scoring candidates, best first.
max_10 = dict(sorted(sim_dict.items(), key = itemgetter(1), reverse = True)[:10])
# print (max_10)
# print (max_10)

22.9 s ± 524 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [63]:
# Count the lines of result.txt, i.e. how many evaluation rows have been written so far.
with open("../data/result.txt", "r") as myfile:
    i = sum(1 for _ in myfile)
print (i)

330


# Scoring Explanation

The process consists of two steps.

### Step 1: Scoring based on the item information in items.csv

The relevance of a candidate item is computed from the following elements (every element score lies in the range 0 to 1):
#### 1. Lang + confidence (Weight = 0 or 0.1)
    
    We use a language detection model to predict the language of each title. The language score is only computed when both titles have at least 10 characters, both detections have a confidence > 0.9 and both titles have the same language; the score is then confidence_item_1 * confidence_item_2.
    Weight: 0.1 if computed or 0.0 if not computed
    
#### 2. Title (0.3 or 0.2)

    We use fuzzy string matching (the larger of fuzz.ratio and fuzz.token_sort_ratio, divided by 100) to compute the similarity between the two titles.
    Weight: 0.2 if language is computed or 0.3 if language is not computed

#### 3. Author + Publisher (0.4)

    We use a Goodall-style measure to compute the similarity of author and publisher between 2 rows: a match counts more the rarer the shared value is. The author receives 2/3 of the importance, the publisher 1/3.
    Weight: 0.4

#### 4. Main topic (0.2)

    We use tanimoto similarity to compute the similarity between 2 rows.
    Weight: 0.2

#### 5. Sub topic (0.1)

    We compute the Tanimoto similarity for every pair of sub-topics of the 2 rows and average the n best pairs, where n is the length of the longer sub-topic list.
    Weight: 0.1

### Step 2: Scoring based on popularity

    After getting 5 recommendations in step 1, we use a popularity score to rearrange the order of the recommendations.
    The popularity score is computed as follows:
        1. Sum the numbers of clicks, baskets and orders per "itemID"
        2. Normalize them with RobustScaler
        3. popularity_score = 0.2*click + 0.5*basket + 0.3*order

# Popularity score

In [3]:
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler

# Total click / basket / order counts per item, summed over all sessions.
popularity = tran_csv.drop(columns=['sessionID']).groupby('itemID').sum().reset_index()

# Set the item ids aside so that only the count columns are scaled.
itemID = popularity.pop('itemID')

# Robust scaling of the counts, then re-attach the ids.
scaler = RobustScaler()
scaled_counts = scaler.fit_transform(popularity)
df_std = pd.DataFrame(scaled_counts, columns=popularity.columns)
df_std['itemID'] = itemID

## READ RESULT FROM ITEM.CSV

In [4]:
import json
# Map each evaluation itemID to its stored {candidate itemID: score} dict.
# Line k of result.txt is taken to belong to the k-th row of eval_csv.
d = {}
with open("../data/result.txt") as f:
    for i, line in enumerate(f):
        # The lines are Python dict reprs; swap the quote style so json can parse them.
        d2 = json.loads(line.replace("\'", "\""))
        d[eval_csv['itemID'].iloc[i]] = d2

In [5]:
# Pick one evaluation item (row position 234) for a manual spot check.
i=234
eval_csv.iloc[i]

itemID                                  23643
title                      Up, Up, in the Air
author                          Adrian Powell
publisher     Prolific Imagination Publishing
main topic                               YNTR
subtopics                               [YFB]
lang                                       en
lang_prob                            0.999999
Name: 234, dtype: object

In [6]:
# Take the first five stored recommendations of the selected item (keys are
# itemIDs stored as strings) and show their catalogue metadata.
ids = eval_csv['itemID'].iloc[i]
test_list = list(map(int, d[ids].keys()))[:5]
item_with_lang_csv.set_index('itemID').loc[test_list].reset_index()

Unnamed: 0,itemID,title,author,publisher,main topic,subtopics,lang,lang_prob
0,32468,Quint the Bookmobile,Kathleen Quinton,Quintessential Productions,YNTR,[YFB],en,0.999996
1,48921,THE SANDMAN,William J. Hopkins,Yesterday's Classics,YNTR,[YFB],en,0.999993
2,52601,THE SANDMAN,William J. Hopkins,Yesterday's Classics,YNTR,[YFB],en,0.999993
3,36507,Billy the Bus,Mark McDaid,New Generation Publishing,YNTR,[YFB],en,0.999998
4,71133,Lewis and the Lighthouse,Bill Mckibben,Just Write Books,YNTR,[YFB],en,0.999997


In [7]:
def compute(click, basket, order):
    """Popularity score: 0.2 * click + 0.5 * basket + 0.3 * order."""
    weighted_click = click*0.2
    weighted_basket = basket*0.5
    weighted_order = order*0.3
    return weighted_click + weighted_basket + weighted_order
    
# Re-rank the sample recommendations by popularity. Items that never occur in
# the transactions get the sentinel score -10000 and therefore sort last.
diction = {}
list_id = df_std['itemID'].values.tolist()
for item in test_list:
    if item in list_id:
        item_rows = df_std[df_std['itemID']==item]
        score = compute(item_rows['click'], item_rows['basket'], item_rows['order']).values[0]
    else:
        score = -10000
    diction[item] = score

# Highest popularity first; the sort is stable, so ties keep their similarity order.
diction = dict(sorted(diction.items(), key=lambda pair: pair[1], reverse=True))
diction

{32468: -10000, 48921: -10000, 52601: -10000, 36507: -10000, 71133: -10000}

In [8]:
# Show the catalogue metadata of the sample recommendations in their re-ranked order.
final_list = list(map(int, diction.keys()))
item_with_lang_csv.set_index('itemID').loc[final_list].reset_index()

Unnamed: 0,itemID,title,author,publisher,main topic,subtopics,lang,lang_prob
0,32468,Quint the Bookmobile,Kathleen Quinton,Quintessential Productions,YNTR,[YFB],en,0.999996
1,48921,THE SANDMAN,William J. Hopkins,Yesterday's Classics,YNTR,[YFB],en,0.999993
2,52601,THE SANDMAN,William J. Hopkins,Yesterday's Classics,YNTR,[YFB],en,0.999993
3,36507,Billy the Bus,Mark McDaid,New Generation Publishing,YNTR,[YFB],en,0.999998
4,71133,Lewis and the Lighthouse,Bill Mckibben,Just Write Books,YNTR,[YFB],en,0.999997


# Output to submitted file

In [10]:
# Popularity score of every item that occurs in the transactions, computed once
# for the whole table instead of filtering df_std three times per recommended item.
popularity_by_item = dict(zip(
    df_std['itemID'].values.tolist(),
    compute(df_std['click'], df_std['basket'], df_std['order']).values.tolist(),
))

# One output row per evaluation item: its id followed by its five
# recommendations ordered by popularity. The rows are collected in a list and
# turned into a DataFrame once, not appended to the frame one by one.
final_rows = []
for ids in eval_csv['itemID']:
    # The first five stored recommendations (keys are itemIDs stored as strings).
    test_list = list(map(int, d[ids].keys()))[:5]
    ### check with transaction
    # Items without any transaction get the sentinel -10000 and sort last.
    diction = {item: popularity_by_item.get(item, -10000) for item in test_list}
    # Stable sort: equally popular items keep their similarity order.
    diction = dict(sorted(diction.items(), reverse=True, key=lambda item: item[1]))
    final_list = list(map(int, diction.keys()))
    final_rows.append([ids] + final_list)

final_df = pd.DataFrame(final_rows, columns=['itemID', 'rec_1', 'rec_2', 'rec_3', 'rec_4', 'rec_5'])


In [366]:
# Display the submission table: one row per evaluation item with its five recommendations.
final_df

Unnamed: 0,itemID,rec_1,rec_2,rec_3,rec_4,rec_5
0,12,60968,16667,65244,68590,78903
1,45274,7262,6210,1746,18662,15116
2,10104,10143,58537,5743,55137,37904
3,41371,52305,21810,66046,72384,68753
4,14015,59077,54587,19563,47952,13103
...,...,...,...,...,...,...
995,78384,56354,9143,12464,1778,74303
996,68157,12224,41558,41398,46377,39215
997,44964,74918,3817,21218,45452,26648
998,12971,49248,32816,15800,56107,63032


In [404]:
# Write the submission file: pipe-separated, UTF-8, without the index column.
final_df.to_csv("../data/final_result.csv", sep="|", index=False, encoding='utf-8')