In [1]:
import pandas as pd
import datetime
import pytz
from operator import itemgetter
import fasttext
import csv
from difflib import SequenceMatcher

base_path = "../"

In [18]:
# Full item catalogue: pipe-delimited, quoting disabled, first row is the
# header. Missing authors are replaced by the placeholder 'Unknown'.
orig_items = pd.read_csv(
    f"{base_path}data/items.csv",
    sep='|',
    quoting=csv.QUOTE_NONE,
    encoding='utf-8',
    header=0,
).assign(author=lambda frame: frame['author'].fillna('Unknown'))

# Evaluation items enriched with their catalogue attributes; the column
# "main topic" is renamed so it can be used with attribute access.
test = (
    pd.read_csv(f"{base_path}data/evaluation.csv")
    .join(orig_items.set_index('itemID'), on='itemID')
    .rename(columns={"main topic": "main_topic"})
)

train_data = pd.read_csv(f"{base_path}data/train_data.csv")

# Train items without any duplicates (per the original note; the pickle's
# provenance is not visible here).
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
items = pd.read_pickle(f"{base_path}data/items_v2.pkl").rename(
    columns={"main topic": "main_topic"}
)

print(f'Item columns {items.columns.tolist()}')

Item columns ['itemID', 'title', 'author', 'publisher', 'main_topic', 'subtopics']


# Detect language and score

In [19]:
# fastText model used by the language-detection helpers below. The path is
# resolved against base_path, like every other artefact this notebook reads,
# so relocating the notebook only requires changing base_path.
model = fasttext.load_model(f'{base_path}fast_text_models/lid.176.bin')

def langdetect_score(title):
    """Return the score the fastText model assigns to its prediction for ``title``.

    Line breaks are stripped from ``title`` before prediction. The returned
    value is element ``[1][0]`` of ``model.predict`` - presumably the
    probability of the best label (confirm against the fastText API).
    """
    single_line = title.replace('\n', '')
    prediction = model.predict(single_line)
    return prediction[1][0]

def langdetect_lang(title):
    """Return the stringified label part of the fastText prediction for ``title``.

    Line breaks are stripped from ``title`` before prediction. The result is
    ``str()`` applied to element ``[0]`` of ``model.predict``, so it still
    carries that container's punctuation; callers pass it through ``trim``.
    """
    single_line = title.replace('\n', '')
    prediction = model.predict(single_line)
    return str(prediction[0])

def trim(language):
    """Cut the fixed wrapper off a stringified label and return what is left.

    The first 11 and the last 3 characters are dropped, which turns text such
    as ``"('__label__en',)"`` into ``"en"``.

    Slicing is used instead of ``str.replace`` so that only the *leading*
    11 characters are removed; ``replace`` would also delete any later
    occurrence of the same character run inside the string.

    Parameters
    ----------
    language : str
        Stringified label as produced by ``langdetect_lang``.

    Returns
    -------
    str
        The text between the 11-character prefix and the 3-character suffix;
        empty when the input is too short to contain both.
    """
    prefix_length = 11  # len("('__label__")
    suffix_length = 3   # len("',)")
    return language[prefix_length:-suffix_length]




In [20]:
# Annotate the train items first and the evaluation items second with the
# language detected from the title and the model's score for that prediction.
for frame in (items, test):
    frame['language'] = frame['title'].apply(langdetect_lang).apply(trim)
    frame['language_score'] = frame['title'].apply(langdetect_score)

# items.to_csv("./item_with_lang.csv")

# Generate item-item similarity scores

In [None]:
def similar(a, b):
    """Return the ``difflib.SequenceMatcher`` similarity ratio of ``a`` and ``b``.

    No junk filter is used. The ratio is a float in [0, 1]; identical
    sequences yield 1.0.
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()



In [21]:
# Attach catalogue attributes to both sides of every training pair: plain
# column names for the `itemId` item, `_rec`-suffixed names for the `rec` item.
j_train = (
    train_data
    .join(orig_items.set_index('itemID'), on='itemId')
    .join(orig_items.set_index('itemID'), on='rec', rsuffix='_rec')
)

In [29]:
# Build every (train item, catalogue item) pair by merging on a constant
# helper column `key`, which yields the cartesian product of the two frames.
train_item_ids = train_data.itemId.unique()

is_train_item = orig_items.itemID.isin(train_item_ids)
df1 = orig_items[is_train_item].assign(key=0)
df2 = orig_items.assign(key=0)

sim_df = df1.merge(df2, on='key', how='outer', suffixes=['_test', '_orig'])

# Sanity check: the row count should equal the product printed next to it.
print(len(sim_df), len(orig_items) * len(train_item_ids))

18173488 18173488


In [30]:
# Display the column names of the pairwise frame (suffixes `_test` / `_orig`
# come from the merge that built it).
sim_df.columns

Index(['itemID_test', 'title_test', 'author_test', 'publisher_test',
       'main topic_test', 'subtopics_test', 'key', 'itemID_orig', 'title_orig',
       'author_orig', 'publisher_orig', 'main topic_orig', 'subtopics_orig'],
      dtype='object')

# Generate recommendations

In [None]:
def measure_similarity(book1, book2, scorings):
    """Score how similar two book records are as a weighted sum of four signals.

    Parameters
    ----------
    book1, book2 : objects exposing ``title``, ``author``, ``language`` and
        ``main_topic`` attributes; ``book1`` must also expose
        ``language_score``.
    scorings : mapping
        Weights under the keys ``'sim_author'``, ``'sim_main_topic'``,
        ``'sim_title'`` and ``'sim_lang'``.

    Returns
    -------
    ``1`` when the two titles are identical according to ``similar``;
    otherwise the weighted sum of the author, main-topic, title and language
    components.
    """
    title_ratio = similar(book1.title, book2.title)
    if title_ratio == 1:
        # Identical titles short-circuit to the maximum score.
        return 1

    # Author similarity only counts when it is strong (>= 0.8) and both
    # authors are actually known.
    author_ratio = similar(book1.author, book2.author)
    both_authors_known = book1.author != "Unknown" and book2.author != "Unknown"
    if not (author_ratio >= 0.8 and both_authors_known):
        author_ratio = 0

    # Language contributes book1's detection score, but only on an exact match.
    same_language = similar(book1.language, book2.language) == 1
    language_component = book1.language_score if same_language else 0

    topic_ratio = similar(book1.main_topic, book2.main_topic)

    author_part = author_ratio * scorings['sim_author']
    topic_part = topic_ratio * scorings['sim_main_topic']
    title_part = title_ratio * scorings['sim_title']
    language_part = language_component * scorings['sim_lang']

    return author_part + topic_part + title_part + language_part


def get_top_n_recommendation(test_case, items, n, scorings):
    """Return the ``n`` best-scoring candidate items for one test item.

    Parameters
    ----------
    test_case : row-like object with an ``itemID`` attribute plus whatever
        attributes ``measure_similarity`` reads.
    items : pandas.DataFrame
        Candidate items, iterated row by row.
    n : int
        Maximum number of recommendations to return.
    scorings : mapping
        Weights forwarded unchanged to ``measure_similarity``.

    Returns
    -------
    list of dict
        At most ``n`` dicts with keys ``'test_id'``, ``'rec_id'`` and
        ``'score'`` (rounded to 4 decimals), ordered by descending score.
        Candidates whose score is not below 1.0 are skipped;
        ``measure_similarity`` returns 1 for identical titles, so this
        drops the test item itself and exact-title duplicates.
    """
    recommendations = []
    for index, item in items.iterrows():
        score = measure_similarity(item, test_case, scorings)
        if score < 1.0:
            recommendations.append(
                {
                    'test_id': test_case.itemID,
                    # Use the catalogue itemID so rec_id lives in the same id
                    # space as test_id; the row's index label is only a
                    # fallback for frames that carry no itemID column.
                    'rec_id': item.get('itemID', index),
                    'score': round(score, 4),
                }
            )

    # Stable sort: candidates with equal scores keep their iteration order.
    recommendations.sort(key=itemgetter('score'), reverse=True)
    return recommendations[:n]


def save_recommendations(result_df, prefix=None):
    """Write the recommendations to two CSV files and return the detailed frame.

    Files written:
    ``{prefix}_recommendation_ids.csv`` holds ``result_df`` as given.
    ``{prefix}_recommendations.csv`` holds ``result_df`` joined with
    ``orig_items`` once on ``test_id`` and once on ``rec_id`` (the second
    join's clashing columns get the ``_rec`` suffix), sorted by ``test_id``
    ascending and ``score`` descending.

    Parameters
    ----------
    result_df : pandas.DataFrame
        Must contain ``test_id``, ``rec_id`` and ``score`` columns.
    prefix : str, optional
        Path prefix for both files. When omitted, a timestamp in the
        Europe/Berlin time zone under ``{base_path}result/`` is used.

    Returns
    -------
    pandas.DataFrame
        The joined, sorted frame that was written to the second file.
    """
    if prefix is None:
        date_time = datetime.datetime.now(
            pytz.timezone('Europe/Berlin')
        ).strftime("%d_%m_%Y_%H_%M_%S")
        prefix = f'{base_path}result/{date_time}'

    result_file_name_1 = f'{prefix}_recommendation_ids.csv'
    result_file_name_2 = f'{prefix}_recommendations.csv'

    # Plain id/score table.
    result_df.to_csv(result_file_name_1, index=False)

    # Same table enriched with the catalogue attributes of both items.
    catalogue = orig_items.set_index('itemID')
    with_test_details = result_df.join(catalogue, on='test_id', rsuffix='_test')
    with_both_details = with_test_details.join(catalogue, on='rec_id', rsuffix='_rec')
    result_df_with_details = with_both_details.sort_values(
        by=['test_id', 'score'], ascending=[True, False]
    )
    result_df_with_details.to_csv(result_file_name_2, index=False)

    print(f'Recommendations generated successfully. Files: \n\t{result_file_name_1}\n\t{result_file_name_2}')

    return result_df_with_details


def generate_recommendations(test_cases, n=30, scorings=None):
    """Generate the top-``n`` recommendations for every row of ``test_cases``.

    Parameters
    ----------
    test_cases : pandas.DataFrame
        One row per item to recommend for; each row needs ``itemID`` and
        ``title`` (used for the progress message) plus the attributes the
        similarity function reads.
    n : int, default 30
        Maximum number of recommendations per test item.
    scorings : mapping, optional
        Weights for the similarity components. Defaults to
        ``sim_author=0.2, sim_title=0.3, sim_main_topic=0.3, sim_lang=0.2``.
        The default is built per call rather than in the signature, so no
        mutable dict is shared between calls.

    Returns
    -------
    pandas.DataFrame
        Columns ``test_id``, ``rec_id``, ``score``, sorted by ``test_id``
        ascending and ``score`` descending.
    """
    if scorings is None:
        scorings = {
            'sim_author': 0.2,
            'sim_title': 0.3,
            'sim_main_topic': 0.3,
            'sim_lang': 0.2,
        }

    records = []
    for counter, (_, test_case) in enumerate(test_cases.iterrows(), start=1):
        print(f"({counter}) Working on the test item: id = {test_case['itemID']}, title = {test_case['title']}")
        # Candidates come from the notebook-level `items` frame.
        records.extend(get_top_n_recommendation(test_case, items, n, scorings))

    # Explicit columns keep the frame well-formed (and sortable) even when no
    # recommendation was produced.
    result_df = pd.DataFrame(
        records, columns=['test_id', 'rec_id', 'score']
    ).sort_values(by=['test_id', 'score'], ascending=[True, False])

    print(f'\n{n} recommendations generated for each of the {len(test_cases)} items\n')

    return result_df

In [None]:
# Smoke run: 5 recommendations for each of the first 5 evaluation items.
result_df = generate_recommendations(test_cases=test.head(5), n=5)

# Persist both CSV variants under a fixed 'test' prefix.
result_df_with_details = save_recommendations(result_df, prefix=f'{base_path}result/test')

# Number of distinct test items and total rows, before and after the joins;
# the two lines should agree if the joins did not add or drop rows.
print(result_df.test_id.nunique(), len(result_df))
print(result_df_with_details.test_id.nunique(), len(result_df_with_details))

In [None]:
# sanity_df = pd.read_csv(f"{base_path}result/test_recommendations.csv")
# sanity_df

# Evaluate quality

In [None]:
def evaluate_scoring(cond, scorings, n=5):
    """Run the recommender for a subset of ``test`` with the given weights.

    Parameters
    ----------
    cond : boolean mask (or any valid indexer) applied to ``test``
        Selects the evaluation rows to generate recommendations for.
    scorings : mapping
        Similarity weights; they are printed together with their sum so it
        is easy to see whether they add up to 1.
    n : int, default 5
        Number of recommendations per selected item.

    Returns
    -------
    pandas.DataFrame
        The detailed recommendation frame, which is also written to disk
        under the ``{base_path}result/dummy`` prefix.
    """
    print(scorings, sum(scorings.values()))
    # Select rows with the `cond` argument itself, not a same-purpose global.
    test_result_df = generate_recommendations(test_cases=test[cond], n=n, scorings=scorings)
    test_result_df_with_details = save_recommendations(test_result_df, prefix=f'{base_path}result/dummy')

    return test_result_df_with_details

## ItemID = 14015

### Current weights

In [None]:
# Current evaluation scoring weights, applied to the single evaluation item
# with itemID 14015. The cell displays the detailed frame that is returned.
cond_test = test.itemID == 14015
scorings={
    'sim_author': 0.2,
    'sim_title': 0.3,
    'sim_main_topic': 0.3,
    'sim_lang': 0.2,
}
evaluate_scoring(cond_test, scorings)

In [None]:
# Same item and weights as the previous cell, but list 20 recommendations.
evaluate_scoring(cond_test, scorings, n=20)

### Alternative scorings

In [None]:
# Alternative weighting for the same item: title similarity dominates (0.60)
# and the main topic contributes only marginally (0.05).
cond_test = test.itemID == 14015
scorings={
    'sim_author': 0.20,
    'sim_title': 0.60,
    'sim_lang': 0.15,
    'sim_main_topic': 0.05,
}
# TODO: consider the publisher as a feature - there are more comics from
# STONE ARCH BOOKS in the catalogue.
evaluate_scoring(cond_test, scorings)

In [None]:
# Same item and alternative weights, but list 20 recommendations.
evaluate_scoring(cond_test, scorings, n=20)

In [None]:
# List every catalogue title published by STONE ARCH BOOKS.
orig_items[
    orig_items.publisher == 'STONE ARCH BOOKS'
].title.tolist()

In [None]:
# Distinct first characters of the main topic among the evaluation items.
test.main_topic.str[0].unique()

In [None]:
# Distinct first characters of the main topic across the whole catalogue
# (orig_items still uses the raw column name "main topic").
orig_items['main topic'].str[0].unique()

In [None]:
# Catalogue items whose main topic starts with 'L'.
orig_items[orig_items['main topic'].str[0] == 'L']