In [1]:
import csv
import datetime
import os
from difflib import SequenceMatcher
from operator import itemgetter

import fasttext
import pandas as pd
import pytz

base_path = "../"

In [2]:
# pd.read_csv('W_SC_11_6.csv').sort_values(['score'], ascending=False)

In [3]:
# Items as stored in items_wd_v2.pkl; this frame keeps the original "main topic" column name.
# NOTE(review): read_pickle executes pickled code on load - only use with trusted files.
orig_items = pd.read_pickle(f"{base_path}data/items_wd_v2.pkl")

# Evaluation rows, enriched with the item columns that share the same itemID.
test = (
    pd.read_csv(f"{base_path}data/evaluation.csv")
    .join(orig_items.set_index('itemID'), on='itemID')
    .rename(columns={"main topic": "main_topic"})
)

# loading train items without any duplicates
items = (
    pd.read_pickle(f"{base_path}data/items_v2.pkl")
    .set_index('itemID')
    .rename(columns={"main topic": "main_topic"})
)

print(f'Item columns {items.columns.tolist()}')

Item columns ['title', 'author', 'publisher', 'main_topic', 'subtopics']


# Detect language and score

In [4]:
# Load the fastText model used by langdetect_score / langdetect_lang below.
# NOTE(review): presumably fastText's pre-trained language-identification model (lid.176) - confirm.
model = fasttext.load_model(f"{base_path}fast_text_models/lid.176.bin")

def langdetect_score(title):
    """Return the score the loaded fastText model gives its top prediction for ``title``.

    Newline characters are removed from the title before it is passed to
    ``model.predict`` (presumably because predict expects single-line text -
    confirm against the fastText docs).

    Args:
        title: Title text to classify.

    Returns:
        The first entry of the second element of ``model.predict``'s result.
    """
    single_line = title.replace('\n','')
    prediction = model.predict(single_line)
    top_scores = prediction[1]
    return top_scores[0]

def langdetect_lang(title):
    """Return the stringified label output of the fastText model for ``title``.

    Newline characters are removed from the title before prediction. The result
    is ``str()`` of the first element returned by ``model.predict``, i.e. the raw
    label container rendered as text; ``trim`` is applied afterwards by the
    callers to reduce it to the bare language code.

    Args:
        title: Title text to classify.

    Returns:
        String representation of the predicted label(s).
    """
    single_line = title.replace('\n','')
    labels = model.predict(single_line)[0]
    return str(labels)

def trim(language):
    """Extract the bare language code from a stringified fastText label.

    The expected input is the text produced by ``langdetect_lang``, e.g.
    ``"('__label__en',)"``, for which ``"en"`` is returned. Instead of cutting
    a fixed number of characters from both ends, the code is located via the
    ``__label__`` marker, so a bare label (``"__label__en"``) or a container
    holding several labels also yields the first language code.

    Args:
        language: Stringified label output.

    Returns:
        The language code following the first ``__label__`` marker. If the
        marker is absent, the input with surrounding tuple/list punctuation,
        quotes and spaces stripped.
    """
    marker = '__label__'
    start = language.find(marker)
    if start == -1:
        # Nothing that looks like a fastText label: only remove wrapper characters.
        return language.strip("()[]'\", ")

    code = language[start + len(marker):]
    # The code ends at the first quote, separator, closing bracket or space.
    end = len(code)
    for stop_char in "'\",)] ":
        pos = code.find(stop_char)
        if pos != -1:
            end = min(end, pos)
    return code[:end]




In [5]:
# Add the detected title language and its score to each of the three frames.
for frame in (orig_items, items, test):
    frame['language'] = frame['title'].apply(langdetect_lang).apply(trim)
    frame['language_score'] = frame['title'].apply(langdetect_score)

# Generate recommendations

In [6]:
def similar(a, b):
    """Return the ``difflib.SequenceMatcher`` similarity ratio of ``a`` and ``b``.

    Args:
        a: First sequence (strings in this notebook).
        b: Second sequence.

    Returns:
        Float in [0, 1]; 1.0 means the sequences match completely.
    """
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()

def title_handle(title1, title2):
    """Score title similarity, giving no credit to titles that match completely.

    Args:
        title1: First title.
        title2: Second title.

    Returns:
        0 when the similarity ratio equals 1, otherwise the ratio itself.
    """
    ratio = similar(title1, title2)
    return 0 if ratio == 1 else ratio

def author_handle(author1, author2):
    """Score author similarity, ignoring placeholder and weak matches.

    Args:
        author1: First author string.
        author2: Second author string.

    Returns:
        0 if either author is the placeholder ``"Unknown"`` or the similarity
        ratio is below 0.8; otherwise the similarity ratio.
    """
    # Cheap placeholder check first: it makes the sequence comparison unnecessary.
    if author1 == "Unknown" or author2 == "Unknown":
        return 0

    sim_author = similar(author1, author2)
    # Ratios below 0.8 are treated as "different author" and contribute nothing.
    return sim_author if sim_author >= 0.8 else 0

def lang_handle(lang1, lang2, score1):
    """Return ``score1`` when both language codes are identical, else 0.

    Args:
        lang1: Language code of the first item.
        lang2: Language code of the second item.
        score1: Value returned when the two codes match (the callers pass the
            first item's language score).

    Returns:
        ``score1`` if ``lang1 == lang2``, otherwise 0.
    """
    # For language-code strings a similarity ratio of exactly 1 means the strings
    # are equal, so a direct comparison gives the same answer without the matcher.
    return score1 if lang1 == lang2 else 0


def measure_similarity(book1, book2, scorings):
    """Combine the per-field similarities of two books into one weighted score.

    Args:
        book1: Object exposing ``title``, ``author``, ``language``,
            ``language_score``, ``main_topic`` and ``publisher`` attributes.
            Its ``language_score`` is the value used when the languages match.
        book2: Object exposing ``title``, ``author``, ``language``,
            ``main_topic`` and ``publisher`` attributes.
        scorings: Mapping from component name (``'sim_author'``,
            ``'sim_main_topic'``, ``'sim_title'``, ``'sim_lang'``,
            ``'sim_publisher'``) to its weight. A component missing from the
            mapping is weighted 0.

    Returns:
        The weighted sum of the five component similarities.
    """
    sim_title = title_handle(book1.title, book2.title)
    sim_author = author_handle(book1.author, book2.author)
    sim_lang = lang_handle(book1.language, book2.language, book1.language_score)
    sim_main_topic = similar(book1.main_topic, book2.main_topic)
    sim_pub = similar(book1.publisher, book2.publisher)

    def weight(component):
        # Weight dicts that leave a component out count it as 0 instead of raising KeyError.
        return scorings.get(component, 0)

    score = (
        (sim_author * weight('sim_author')) +
        (sim_main_topic * weight('sim_main_topic')) +
        (sim_title * weight('sim_title')) +
        (sim_lang * weight('sim_lang')) +
        (sim_pub * weight('sim_publisher'))
    )

    return score


def get_top_n_recommendation(test_case, items, n, scorings):
    """Rank all ``items`` against ``test_case`` and return the ``n`` best matches.

    Args:
        test_case: Row-like object with an ``itemID`` attribute plus the fields
            read by ``measure_similarity``.
        items: DataFrame of candidates; its index values are reported as ``rec_id``.
        n: Maximum number of recommendations to return.
        scorings: Component weights forwarded to ``measure_similarity``.

    Returns:
        List of at most ``n`` dicts with keys ``'test_id'``, ``'rec_id'`` and
        ``'score'`` (rounded to 4 decimals), ordered by descending score.
    """
    candidates = []
    for rec_id, candidate in items.iterrows():
        similarity = measure_similarity(candidate, test_case, scorings)
        # Only scores strictly below 1.0 are kept.
        if not (similarity < 1.0):
            continue
        entry = {
            'test_id': test_case.itemID,
            'rec_id': rec_id,
            'score': round(similarity, 4)
        }
        candidates.append(entry)

    # Stable in-place sort, best score first; ties keep their iteration order.
    candidates.sort(key=itemgetter('score'), reverse=True)

    return candidates[:n]


def save_recommendations(result_df, prefix=None):
    """Write the recommendations to two CSV files and return the detailed frame.

    Two files are written: ``<prefix>_recommendation_ids.csv`` with
    ``result_df`` as-is, and ``<prefix>_recommendations.csv`` with the columns
    of ``orig_items`` joined in for both the test item (``test_id``) and the
    recommended item (``rec_id``; these columns get the ``_rec`` suffix).

    Args:
        result_df: DataFrame with ``test_id``, ``rec_id`` and ``score`` columns.
        prefix: Path prefix for both files. When ``None``, a prefix of the form
            ``{base_path}result/<dd_mm_YYYY_HH_MM_SS>`` is built from the
            current Europe/Berlin time.

    Returns:
        The detailed DataFrame, sorted by ``test_id`` ascending and ``score``
        descending.
    """
    # Saving recommendations
    if prefix is None:
        berlin_now = datetime.datetime.now(pytz.timezone('Europe/Berlin'))
        date_time = berlin_now.strftime("%d_%m_%Y_%H_%M_%S")
        prefix = f'{base_path}result/{date_time}'

    result_file_name_1 = f'{prefix}_recommendation_ids.csv'
    result_file_name_2 = f'{prefix}_recommendations.csv'

    # Both files share the prefix's directory; create it so to_csv cannot fail
    # just because the folder does not exist yet.
    output_dir = os.path.dirname(result_file_name_1)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    result_df.to_csv(result_file_name_1, index=False)

    # Build the itemID-indexed lookup once and reuse it for both joins.
    item_details = orig_items.set_index('itemID')

    result_df_with_details = result_df.join(
        item_details,
        on='test_id',
        rsuffix='_test'
    ).join(
        item_details,
        on='rec_id',
        rsuffix='_rec'
    ).sort_values(by=['test_id', 'score'], ascending=[True, False])

    result_df_with_details.to_csv(result_file_name_2, index=False)

    print(f'Recommendations generated successfully. Files: \n\t{result_file_name_1}\n\t{result_file_name_2}')

    return result_df_with_details


def generate_recommendations(test_cases, n=30, scorings=None):
    """Generate the top-``n`` recommendations for every row of ``test_cases``.

    Candidates are taken from the notebook-level ``items`` frame. A progress
    line is printed for each test item.

    Args:
        test_cases: DataFrame whose rows provide ``itemID``, ``title`` and the
            other fields read by ``measure_similarity``.
        n: Maximum number of recommendations per test item.
        scorings: Component weights passed on to the similarity measure. When
            ``None``, the default weights below are used.

    Returns:
        DataFrame with columns ``test_id``, ``rec_id`` and ``score``, sorted by
        ``test_id`` ascending and ``score`` descending.
    """
    if scorings is None:
        # Built per call so the default dict is never shared between invocations.
        scorings = {
            'sim_title': 0.44,
            'sim_author': 0.18,
            'sim_lang': 0.08,
            'sim_publisher': 0.28,
            'sim_main_topic': 0.003,
        }

    records = []
    for counter, (_, test_case) in enumerate(test_cases.iterrows(), start=1):
        print(f"({counter}) Working on the test item: id = {test_case['itemID']}, title = {test_case['title']}")
        records.extend(get_top_n_recommendation(test_case, items, n, scorings))

    # Explicit columns keep the frame well-formed (and sortable) even with no records.
    result_df = pd.DataFrame(
        records, columns=['test_id', 'rec_id', 'score']
    ).sort_values(by=['test_id', 'score'], ascending=[True, False])

    print(f'\n{n} recommendations generated for each of the {len(test_cases)} items\n')

    return result_df

In [18]:
# Smoke run: 5 recommendations for each of 2 randomly drawn evaluation items.
# NOTE(review): sample() is called without random_state, so every run draws different items.
result_df = generate_recommendations(test_cases=test.sample(2), n=5)
# result_df = generate_recommendations(test_cases=test.head(5), n=5)

# Saved under a fixed "test" prefix, so each run writes to the same two CSV paths.
result_df_with_details = save_recommendations(result_df, prefix=f'{base_path}result/test')

# Sanity check: distinct test items and total rows, before and after joining item details.
print(result_df.test_id.nunique(), len(result_df))
print(result_df_with_details.test_id.nunique(), len(result_df_with_details))
result_df_with_details

(1) Working on the test item: id = 55501, title = The Royal Ranger (Ranger's Apprentice Book 12)
(2) Working on the test item: id = 61339, title = Ein Pferd namens Milchmann

5 recommendations generated for each of the 2 items

Recommendations generated successfully. Files: 
	../result/test_recommendation_ids.csv
	../result/test_recommendations.csv
2 10
2 10


Unnamed: 0,test_id,rec_id,score,title,author,publisher,main_topic,subtopics,language,language_score,title_rec,author_rec,publisher_rec,main_topic_rec,subtopics_rec,language_rec,language_score_rec
0,55501,67879,0.8588,The Royal Ranger (Ranger's Apprentice Book 12),John (Author) Flanagan,Penguin Random House Children's UK,YFC,"[5AK,YFH]",en,0.530064,The Burning Bridge (Ranger's Apprentice Book 2),John (Author) Flanagan,Penguin Random House Children's UK,YFH,[5AK],en,0.70221
1,55501,22014,0.8535,The Royal Ranger (Ranger's Apprentice Book 12),John (Author) Flanagan,Penguin Random House Children's UK,YFC,"[5AK,YFH]",en,0.530064,The Icebound Land (Ranger's Apprentice Book 3),John (Author) Flanagan,Penguin Random House Children's UK,YFH,[5AK],en,0.589497
2,55501,28841,0.8425,The Royal Ranger (Ranger's Apprentice Book 12),John (Author) Flanagan,Penguin Random House Children's UK,YFC,"[5AK,YFH]",en,0.530064,Oakleaf Bearers (Ranger's Apprentice Book 4),John (Author) Flanagan,Penguin Random House Children's UK,YFH,[5AK],en,0.72318
3,55501,6722,0.8398,The Royal Ranger (Ranger's Apprentice Book 12),John (Author) Flanagan,Penguin Random House Children's UK,YFC,"[5AK,YFH]",en,0.530064,Erak's Ransom (Ranger's Apprentice Book 7),John (Author) Flanagan,Penguin Random House Children's UK,YFC,[5AK],en,0.584537
4,55501,73054,0.8388,The Royal Ranger (Ranger's Apprentice Book 12),John (Author) Flanagan,Penguin Random House Children's UK,YFC,"[5AK,YFH]",en,0.530064,The Siege of Macindaw (Ranger's Apprentice Boo...,John (Author) Flanagan,Penguin Random House Children's UK,YFC,[],en,0.801659
5,61339,53854,0.576,Ein Pferd namens Milchmann,Hilke Rosenboom,Carlsen Verlag GmbH,YFP,"[5AJ,YFP,YFQ]",de,0.843264,Ein Meer aus Tinte und Gold,Traci Chee,Carlsen Verlag GmbH,YFZV,"[5AQ,5LF,FDK,FJ,FM,WFU,YFB,YFC,YFH,YFZV,YXE,YX...",de,0.98068
6,61339,72375,0.5678,Ein Pferd namens Milchmann,Hilke Rosenboom,Carlsen Verlag GmbH,YFP,"[5AJ,YFP,YFQ]",de,0.843264,Elefanten sieht man nicht,Susan Kreller,Carlsen Verlag GmbH,JBFK3,"[5AQ,JBFK,JBFK3,YFB,YFN,YXE,YXF,YXQ]",de,0.999598
7,61339,8863,0.5485,Ein Pferd namens Milchmann,Hilke Rosenboom,Carlsen Verlag GmbH,YFP,"[5AJ,YFP,YFQ]",de,0.843264,Der mechanische Prinz,Andreas Steinhöfel,Carlsen Verlag GmbH,YFB,"[5AN,5HPD,YFB,YFC,YFH]",de,0.991266
8,61339,35421,0.5452,Ein Pferd namens Milchmann,Hilke Rosenboom,Carlsen Verlag GmbH,YFP,"[5AJ,YFP,YFQ]",de,0.843264,Sophie auf den Dächern,Katherine Rundell,Carlsen Verlag GmbH,YFH,"[5AM,YFN]",de,0.99846
9,61339,51671,0.5436,Ein Pferd namens Milchmann,Hilke Rosenboom,Carlsen Verlag GmbH,YFP,"[5AJ,YFP,YFQ]",de,0.843264,Hundert Stunden Nacht,Anna Woltz,Carlsen Verlag GmbH,YFB,"[5AQ,YFB,YFM,YFN]",de,0.929339


In [12]:
# Read the detailed recommendations file back from disk to inspect what was saved.
sanity_df = pd.read_csv(f"{base_path}result/test_recommendations.csv")
sanity_df

Unnamed: 0,test_id,rec_id,score,title,author,publisher,main_topic,subtopics,language,language_score,title_rec,author_rec,publisher_rec,main_topic_rec,subtopics_rec,language_rec,language_score_rec
0,56300,53497,0.7111,Die Maus - Meine Kindergarten-Freunde,Unknown,Ars Edition GmbH,Y,"[5A,5AC,YB,YF]",de,0.900756,Meine Kindergarten-Freunde,Unknown,Ars Edition GmbH,YBG,"[5AC,YBL,YXHB,YZG,YZS]",de,0.830578
1,56300,15084,0.6831,Die Maus - Meine Kindergarten-Freunde,Unknown,Ars Edition GmbH,Y,"[5A,5AC,YB,YF]",de,0.900756,Meine Kindergartenfreunde,Janosch,Ars Edition GmbH,Y,"[5AC,YB,YF]",de,0.743462
2,56300,29525,0.6725,Die Maus - Meine Kindergarten-Freunde,Unknown,Ars Edition GmbH,Y,"[5A,5AC,YB,YF]",de,0.900756,Meine Kindergarten-Freunde (Fußball),Unknown,Ars Edition GmbH,YBG,"[5AC,YBG,YBL,YXHB,YZG,YZS]",de,0.969145
3,56300,31293,0.6696,Die Maus - Meine Kindergarten-Freunde,Unknown,Ars Edition GmbH,Y,"[5A,5AC,YB,YF]",de,0.900756,Meine Kindergarten-Freunde (Pferde),Unknown,Ars Edition GmbH,YBG,"[5AC,5JA,YBG,YBL,YXHB,YZG,YZS]",de,0.879472
4,56300,40250,0.6665,Die Maus - Meine Kindergarten-Freunde,Unknown,Ars Edition GmbH,Y,"[5A,5AC,YB,YF]",de,0.900756,Meine Kindergarten-Freunde (Pirat),Unknown,Ars Edition GmbH,YB,"[5AC,5AD,YBG,YBL,YF]",de,0.778105
5,77134,7989,0.6476,Roxie and the Hooligans,Phyllis Reynolds Naylor,ALADDIN,YFC,"[YFQ,YXQ]",en,0.677107,Eloise and the Snowman,"Kay Thompson, Hilary Knight",ALADDIN,YFQ,[],en,0.90363
6,77134,68886,0.6262,Roxie and the Hooligans,Phyllis Reynolds Naylor,ALADDIN,YFC,"[YFQ,YXQ]",en,0.677107,The Witch Herself,Phyllis Reynolds Naylor,ALADDIN,YFCF,[YFD],en,0.670286
7,77134,8101,0.6157,Roxie and the Hooligans,Phyllis Reynolds Naylor,ALADDIN,YFC,"[YFQ,YXQ]",en,0.677107,Roxie and the Hooligans at Buzzard's Roost,Phyllis Reynolds Naylor,ATHENEUM BOOKS,YFCF,[YFQ],en,0.855458
8,77134,33814,0.6023,Roxie and the Hooligans,Phyllis Reynolds Naylor,ALADDIN,YFC,"[YFQ,YXQ]",en,0.677107,The Thief and the Beanstalk,P. W. Catanese,ALADDIN,YFH,[YFJ],en,0.924328
9,77134,70192,0.6002,Roxie and the Hooligans,Phyllis Reynolds Naylor,ALADDIN,YFC,"[YFQ,YXQ]",en,0.677107,Pongwiffy and the Goblins' Revenge,Kaye Umansky,ALADDIN,YFQ,[YFH],en,0.889788


In [21]:
# List the columns available in the re-loaded recommendations file.
sanity_df.columns

Index(['test_id', 'rec_id', 'score', 'title', 'author', 'publisher',
       'main_topic', 'subtopics', 'language', 'language_score', 'title_rec',
       'author_rec', 'publisher_rec', 'main_topic_rec', 'subtopics_rec',
       'language_rec', 'language_score_rec'],
      dtype='object')

In [24]:
# Export only the (test item, recommended item) ID pairs, renamed to
# test_itemID / rec_itemID; the file is written relative to the working directory.
sanity_df[['test_id', 'rec_id']].rename(
    columns={"test_id": "test_itemID", "rec_id": "rec_itemID"}
).to_csv('item_similarity_recom.csv', index=False)

# Re-read the export to check its contents.
pd.read_csv(f"item_similarity_recom.csv")

Unnamed: 0,test_itemID,rec_itemID
0,12,65244
1,12,72230
2,12,28265
3,12,75432
4,12,32703
...,...,...
4995,79016,77768
4996,79016,53240
4997,79016,22802
4998,79016,46681


In [28]:
# Show the last 15 rows of the recommendations file.
# NOTE(review): this path is relative to the working directory, unlike the
# f"{base_path}result/..." path used for sanity_df - confirm it is the intended file.
pd.read_csv('test_recommendations.csv').tail(15)

Unnamed: 0,test_id,rec_id,score,title,author,publisher,main_topic,subtopics,language,language_score,title_rec,author_rec,publisher_rec,main_topic_rec,subtopics_rec,language_rec,language_score_rec
14985,79016,77768,0.8023,Counting Dinosaurs,Speedy Publishing Llc,Speedy Publishing LLC,YBLC,[],en,0.748285,Counting Colors Book,Speedy Publishing Llc,Speedy Publishing LLC,YBLC,[],en,0.767845
14986,79016,53240,0.7738,Counting Dinosaurs,Speedy Publishing Llc,Speedy Publishing LLC,YBLC,[],en,0.748285,Counting And Colors,Speedy Publishing Llc,Speedy Publishing Books,YBLC,[],en,0.656508
14987,79016,22802,0.7718,Counting Dinosaurs,Speedy Publishing Llc,Speedy Publishing LLC,YBLC,[],en,0.748285,Counting Book For Toddlers,Speedy Publishing Llc,Speedy Publishing LLC,YBLC,[],en,0.859498
14988,79016,46681,0.7611,Counting Dinosaurs,Speedy Publishing Llc,Speedy Publishing LLC,YBLC,[],en,0.748285,Counting Book For Big Kids,Speedy Publishing Llc,Speedy Publishing LLC,YBLC,[],en,0.726456
14989,79016,6624,0.7469,Counting Dinosaurs,Speedy Publishing Llc,Speedy Publishing LLC,YBLC,[],en,0.748285,Counting Bears and Cubs,Speedy Publishing Llc,Speedy Publishing Books,YBLC,[],en,0.965962
14990,79016,16151,0.7347,Counting Dinosaurs,Speedy Publishing Llc,Speedy Publishing LLC,YBLC,[],en,0.748285,Counting Apples and Oranges,Speedy Publishing Llc,Speedy Publishing Books,YBLC,[],en,0.855112
14991,79016,48308,0.727,Counting Dinosaurs,Speedy Publishing Llc,Speedy Publishing LLC,YBLC,[],en,0.748285,Counting Activities For Kids,Speedy Publishing Llc,Speedy Publishing Books,YBLC,[],en,0.827514
14992,79016,1827,0.7156,Counting Dinosaurs,Speedy Publishing Llc,Speedy Publishing LLC,YBLC,[],en,0.748285,All about Me (Girl Diary),Speedy Publishing Llc,Speedy Publishing LLC,YBL,[],en,0.859941
14993,79016,64452,0.7102,Counting Dinosaurs,Speedy Publishing Llc,Speedy Publishing LLC,YBLC,[],en,0.748285,Counting Book For Big Kids,Speedy Publishing Llc,Speedy Publishing Books,YBLC,[],en,0.726456
14994,79016,56257,0.7037,Counting Dinosaurs,Speedy Publishing Llc,Speedy Publishing LLC,YBLC,[],en,0.748285,Underdogs Coloring Book (Dogs and Pups),Speedy Publishing Llc,Speedy Publishing LLC,YBL,[],en,0.890895


# Evaluate quality

In [None]:
def evaluate_scoring(cond, scorings, n=5):
    """Generate and save recommendations for the test rows selected by ``cond``.

    Prints the weights and their sum, runs ``generate_recommendations`` on
    ``test[cond]`` and saves the result under the ``{base_path}result/dummy``
    prefix.

    Args:
        cond: Selector applied to the notebook-level ``test`` frame
            (e.g. a boolean mask) to pick the test cases to evaluate.
        scorings: Component weights to evaluate.
        n: Number of recommendations per test item.

    Returns:
        The detailed recommendations DataFrame returned by
        ``save_recommendations``.
    """
    print(scorings, sum(scorings.values()))
    # Use the `cond` argument rather than whatever global mask happens to be defined.
    test_result_df = generate_recommendations(test_cases=test[cond], n=n, scorings=scorings)
    test_result_df_with_details = save_recommendations(test_result_df, prefix=f'{base_path}result/dummy')

    return test_result_df_with_details

## ItemID = 14015

### Current weights

In [None]:
# Current evaluation scoring weights
# Boolean mask selecting the evaluation row(s) with itemID 14015.
cond_test = test.itemID == 14015
# Four-component weighting (sums to 1.0); no 'sim_publisher' weight is given here.
scorings={
    'sim_author': 0.2,
    'sim_title': 0.3,
    'sim_main_topic': 0.3,
    'sim_lang': 0.2,
}
# Default of 5 recommendations for the selected item.
evaluate_scoring(cond_test, scorings)

In [None]:
# Same item and weights as defined in the preceding cell, but with 20 recommendations.
evaluate_scoring(cond_test, scorings, n=20)

### Alternative scorings

In [None]:
# Same test item (itemID 14015) as before.
cond_test = test.itemID == 14015
# Alternative weighting that puts most of the weight on the title; again no
# 'sim_publisher' weight is given.
scorings={
    'sim_author': 0.20,
    'sim_title': 0.60,
    'sim_lang': 0.15,
    'sim_main_topic': 0.05,
}
# We should consider publishers; there are more comics from STONE ARCH BOOKS
evaluate_scoring(cond_test, scorings)

In [None]:
# Same item and alternative weights as defined in the preceding cell, with 20 recommendations.
evaluate_scoring(cond_test, scorings, n=20)

In [None]:
# List the titles of all items published by 'STONE ARCH BOOKS'.
orig_items[
    orig_items.publisher == 'STONE ARCH BOOKS'
].title.tolist()

In [None]:
# Distinct first characters of main_topic among the evaluation items.
test.main_topic.str[0].unique()

In [None]:
# Distinct first characters of 'main topic' across orig_items (column not renamed in this frame).
orig_items['main topic'].str[0].unique()

In [None]:
# Show the items whose 'main topic' starts with 'L'.
orig_items[orig_items['main topic'].str[0] == 'L']