In [1]:
import pandas as pd
import numpy as np
import csv
from tqdm.notebook import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
import math
# from scipy.spatial import distance
import nltk
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from IPython.display import display
from langdetect import detect, detect_langs, DetectorFactory
DetectorFactory.seed = 0

# Read the three raw CSV files. Items and transactions are pipe-delimited;
# quoting is switched off for items so quote characters in the data are kept as-is.
eval_csv = pd.read_csv("../data/evaluation.csv")
item_csv = pd.read_csv(
    "../data/items.csv",
    sep="|",
    quoting=csv.QUOTE_NONE,
)
tran_csv = pd.read_csv("../data/transactions.csv", sep="|")

In [2]:
# Quick sanity check of the item table: first five rows, then missing values per column.
display(item_csv.head())

print("=====NA Counts=====")
na_counts = item_csv.isna().sum()
display(na_counts)

Unnamed: 0,itemID,title,author,publisher,main topic,subtopics
0,21310,Princess Poppy: The Big Mix Up,Janey Louise Jones,Penguin Random House Children's UK,YFB,[5AH]
1,73018,Einfach zeichnen! Step by Step,Wiebke Krabbe,Schwager und Steinlein,AGZ,"[5AJ,AGZ,WFA,YBG,YBL,YNA,YPA]"
2,19194,Red Queen 1,Victoria Aveyard,Orion Publishing Group,YFH,"[5AP,FBA]"
3,40250,Meine Kindergarten-Freunde (Pirat),,Ars Edition GmbH,YB,"[5AC,5AD,YBG,YBL,YF]"
4,46107,Mein großes Schablonen-Buch - Wilde Tiere,Elizabeth Golding,Edition Michael Fischer,WFTM,"[WD,WFTM,YBG,YBL,YBLD,YBLN1]"


=====NA Counts=====


itemID           0
title            0
author        3247
publisher        9
main topic     259
subtopics        0
dtype: int64

In [3]:
print("=============== Original Dataset ===============")
display(item_csv.head(10))

# Keep only predictions whose probability exceeds the threshold; titles shorter
# than the minimum length are not passed to the detector at all.
lang_prob_threshold = 0.9
lang_leng_threshold = 7

# Imported here so this cell is self-contained; only this cell needs the name.
from langdetect.lang_detect_exception import LangDetectException


def detect_title_language(title, min_length):
    """Return ``(language_code, probability)`` for langdetect's top guess.

    Parameters
    ----------
    title : object
        The title to classify. Anything that is not a ``str`` is treated as
        "not detectable".
    min_length : int
        Titles with fewer characters than this are skipped.

    Returns
    -------
    tuple
        ``(lang, prob)`` of the most probable language, or ``(np.nan, 0.0)``
        when the title is skipped or langdetect cannot classify it.
    """
    if not isinstance(title, str) or len(title) < min_length:
        return np.nan, 0.0
    try:
        best_guess = detect_langs(title)[0]
    except (LangDetectException, IndexError):
        # Only "nothing detectable" is treated as best-effort. A bare except
        # would also swallow KeyboardInterrupt and hide genuine bugs.
        return np.nan, 0.0
    # Read the attributes directly instead of parsing str(best_guess).
    return best_guess.lang, best_guess.prob


detections = [
    detect_title_language(title, lang_leng_threshold)
    for title in tqdm(item_csv["title"], desc="Processing dataframe rows", total=len(item_csv))
]

# Assign each column once: this avoids per-row .loc writes and avoids writing
# strings into a column that was first created as float NaN.
item_csv["lang"] = [lang for lang, prob in detections]
item_csv["lang_prob"] = [prob for lang, prob in detections]

item_csv_with_lang = item_csv.loc[item_csv["lang_prob"] > lang_prob_threshold].sort_values("lang_prob", ascending=False)
print("Dataframe rows after processing:{} ({:.0%})".format(len(item_csv_with_lang), len(item_csv_with_lang)/len(item_csv)))
print("=============== Recommendation ===============")
display(item_csv_with_lang.head(100))



Unnamed: 0,itemID,title,author,publisher,main topic,subtopics
0,21310,Princess Poppy: The Big Mix Up,Janey Louise Jones,Penguin Random House Children's UK,YFB,[5AH]
1,73018,Einfach zeichnen! Step by Step,Wiebke Krabbe,Schwager und Steinlein,AGZ,"[5AJ,AGZ,WFA,YBG,YBL,YNA,YPA]"
2,19194,Red Queen 1,Victoria Aveyard,Orion Publishing Group,YFH,"[5AP,FBA]"
3,40250,Meine Kindergarten-Freunde (Pirat),,Ars Edition GmbH,YB,"[5AC,5AD,YBG,YBL,YF]"
4,46107,Mein großes Schablonen-Buch - Wilde Tiere,Elizabeth Golding,Edition Michael Fischer,WFTM,"[WD,WFTM,YBG,YBL,YBLD,YBLN1]"
5,34217,Ewig geliebt,J. R. Ward,Heyne Taschenbuch,FMR,"[1KBB-US-NAK,FMX,FRX,3MRBF]"
6,31436,Meine Sticker-Tiere,,Ars Edition GmbH,YBG,"[5AD,YBG,YBLL]"
7,14576,Unsterblich 01 - Tor der Dämmerung,Julie Kagawa,Heyne Taschenbuch,YFE,"[5AQ,FM,YFE,YFH]"
8,17731,Unsterblich 02 - Tor der Nacht,Julie Kagawa,Heyne Taschenbuch,YFH,"[5AQ,FM,YFE,YFH]"
9,58723,Pedro und die Bettler von Cartagena,Ursula Hasler,dtv Verlagsgesellschaft,YFB,"[5AM,1KLSC]"


Processing dataframe rows:   0%|          | 0/78334 [00:00<?, ?it/s]

Dataframe rows after processing:63313 (81%)


Unnamed: 0,itemID,title,author,publisher,main topic,subtopics,lang,lang_prob
69805,78137,The Shape of Fear,Jj Toner,JJ Toner Publishing,FLW,[],en,1.000000
48572,27137,Brexston the Bear and Ferndelia the Mermaid,Elise Morris Toucet,Xlibris,YFJ,[],en,1.000000
9961,44993,Autos und Fahrzeuge Malbuch für Kinder von 4-8...,Michelle Brilliant,Michelle Brilliant,YBG,[],de,1.000000
70530,50319,Autos und Fahrzeuge Malbuch für Kinder von 4-8...,Michelle Brilliant,Michelle Brilliant,YBG,[],de,1.000000
41684,69385,Fantasy Notizbuch: Adler im Sturm - weiße Seit...,Samuriel Sternenfeuer,Arthanan Verlag,FMM,[],de,1.000000
...,...,...,...,...,...,...,...,...
62075,13722,The Ghosts in the Garage,Kimi Cook,Austin Macauley,YFH,[],en,0.999999
28102,20433,The Days of Thy Youth,Donald J. Richardson,AuthorHouse,FMH,[],en,0.999999
15215,12794,Kleines Malbuch. Fahrzeuge,,Tessloff Verlag,YBGC,"[5AC,YBGC,YBL]",de,0.999999
63067,57067,Beast Quest: Petorix the Winged Slicer,Adam Blade,Hachette Children's Group,YFC,"[5AH,4CD,YFH]",en,0.999999


# Time test for language detection

With a cut-off at 0.9 and a minimum of 7 characters in the title

In [6]:
%%timeit

# Timing benchmark: detect the title language for one author's books.
# The list selector keeps the result a DataFrame even if only one row matches;
# a scalar label would return a Series and break the following set_index.
auth = item_csv.set_index("author").loc[["Victoria Aveyard"]].set_index("itemID")

# No probability cut-off or minimum title length is applied in this cell:
# every title goes through the detector, and detection errors propagate.
top_guesses = [detect_langs(title)[0] for title in auth["title"]]

# Assign each column once, reading the attributes of the detection result
# rather than parsing its string form.
auth["lang"] = [guess.lang for guess in top_guesses]
auth["lang_prob"] = [guess.prob for guess in top_guesses]

# Nothing is displayed here on purpose: the cell body runs many times under %%timeit.

146 ms ± 1.47 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
