In [2]:
import pandas as pd
import datetime
import pytz
from operator import itemgetter
import fasttext
import csv
from difflib import SequenceMatcher
import numpy as np
from numpy.random import seed
from numpy.random import rand

pd.set_option('display.max_columns', None)

base_path = "./"

In [2]:
sim_df = pd.read_csv("./train_data_with_sim.csv").set_index("itemID_test", drop=False)

best_5_trans = pd.read_csv(f"./data/best_5.csv").set_index("itemId")

In [3]:
best_5_trans.loc[817.0]['rec'].tolist()

[40176.0, 45060.0, 45872.0, 57687.0, 60257.0]

In [4]:
best_5_trans.head(10)

Unnamed: 0_level_0,rec
itemId,Unnamed: 1_level_1
817.0,40176.0
817.0,45060.0
817.0,45872.0
817.0,57687.0
817.0,60257.0
1168.0,6128.0
1168.0,26028.0
1168.0,37813.0
1168.0,41877.0
1168.0,50925.0


In [5]:
sim_df.head(5)
[21310.0, 73018.0, 19194.0, 40250.0, 46107.0]

[21310.0, 73018.0, 19194.0, 40250.0, 46107.0]

In [25]:
columns = ['a', 'b']
weights = [1, 2]

{key: value for (key, value) in zip(columns, weights)}


{'a': 1, 'b': 2}

In [None]:
def get_rand_weights(n):
    weights = rand(n)
#     weights=np.array([0.3, 0.2, 0.2, 0.3])
    sum_weights = sum(weights)
    norm_weights = weights / sum_weights
    
    return norm_weights

def get_score(item, w_c):
    score = 0
    for weight, column in w_c:
        product = weight * item[column]
        score = score + product
        
    return score

def evaluate(orig_recoms, gen_recoms):
    
#     print(f'\tOrig Ids = {orig_recoms}\n\tRec Ids = {gen_recoms}')
    
    common_items = set(orig_recoms).intersection(set(gen_recoms))
    
    score = len(common_items)
    
#     print(f'\tCommon items: {score}')
    return score


def test(test_ids, w_c, n=5):
    total_eval_score = 0
    
    for test_id in test_ids:
#         print(f'Evaluation test id: {test_id}\n')
        train = sim_df.loc[test_id]
        scores = []
        for index, row in train.iterrows():
            score = get_score(row, w_c)
            scores.append({
                'rec_id': row['itemID_orig'],
                'score': score,
            })
        
        top_n = sorted(scores, key=itemgetter('score'), reverse=True)[:n]
        top_n_rec_id = [ n['rec_id'] for n in top_n ]
        
        eval_score = evaluate(
            orig_recoms=best_5_trans.loc[test_id]['rec'].tolist(), # TODO: Can avoid creating list everytimne
            gen_recoms=top_n_rec_id
        )
        
        total_eval_score = total_eval_score + eval_score
    
    
    avg_score = total_eval_score / len(test_ids)
    
    return avg_score


def optimize(test_ids, no_iter=2):
    features = ['sim_title', 'sim_author', 'sim_lang', 'sim_publisher', 'sim_mainTopic']
    f_size = len(features)
    tot_iter = no_iter
    
    best_score=1.35
    
    weights_with_score = []
    while(no_iter > 0):
        weights = get_rand_weights(f_size)
        w_c = list(zip(weights, features))
        
        w_c_dict = {key: value for (key, value) in zip(features, weights)}
#         print(f'Trying combinations: {w_c_dict}')
    
        score = test(test_ids, w_c)
        
#         print(f'\tGot score: {score}')
        
        w_c_dict['score'] = score
        weights_with_score.append(w_c_dict)
        
        no_iter = no_iter - 1
        
        if(score>=best_score):
            print(f'Combinations: {w_c_dict}')
            print(f'Found in iteration: {tot_iter - no_iter}')
            best_score=score
    
    return weights_with_score

# 17 items
interest_ids = [
   27903, 51133, 11609,  8318, 24603, 15767, 77956, 39308,
   28844, 56282, 817, 71784, 41648, 61593, 61125, 21497,
   51443
]
test_ids = interest_ids
# test_ids = sim_df.itemID_test.unique()

weights_with_score = optimize(test_ids, no_iter=2000)

# 0 - 5 score, 5 means all match, 0 means no match
pd.DataFrame(weights_with_score)

Combinations: {'sim_title': 0.44539642992930695, 'sim_author': 0.1811377104785857, 'sim_lang': 0.08914077866552864, 'sim_publisher': 0.28106675008179, 'sim_mainTopic': 0.0032583308447887336, 'score': 1.3529411764705883}
Found in iteration: 179
Combinations: {'sim_title': 0.2880618955299064, 'sim_author': 0.16485819515896538, 'sim_lang': 0.2056062788317603, 'sim_publisher': 0.3392215524836328, 'sim_mainTopic': 0.002252077995735114, 'score': 1.3529411764705883}
Found in iteration: 1167
Combinations: {'sim_title': 0.4488124693639955, 'sim_author': 0.07931618373976651, 'sim_lang': 0.033701094979653325, 'sim_publisher': 0.4296711196830966, 'sim_mainTopic': 0.008499132233488008, 'score': 1.3529411764705883}
Found in iteration: 1504


# Weights found so far

default 0.3,0.2,0.2,0.3 gives 1.35

Trying combinations: {'sim_title': 0.054073284352889384, 'sim_author': 0.7851049324817732, 'sim_lang': 0.5915191750574874, 'sim_publisher': 0.5407117177241336}
	Got score: 1.1764705882352942
Trying combinations: {'sim_title': 0.3758848200149616, 'sim_author': 0.3743017618277995, 'sim_lang': 0.6808498026325662, 'sim_publisher': 0.8222389929077665}
	Got score: 1.2352941176470589
Trying combinations: {'sim_title': 0.9420405541118271, 'sim_author': 0.07735137025517957, 'sim_lang': 0.244343637953351, 'sim_publisher': 0.27282294928471773}
	Got score: 1.0588235294117647
Trying combinations: {'sim_title': 0.7661827322387778, 'sim_author': 0.3919521836823401, 'sim_lang': 0.17557108142049027, 'sim_publisher': 0.1741377537215386}
	Got score: 1.2352941176470589
Trying combinations: {'sim_title': 0.2002006161924058, 'sim_author': 0.5735477108252497, 'sim_lang': 0.8302436363185348, 'sim_publisher': 0.1406282197715728}
	Got score: 1.0
Trying combinations: {'sim_title': 0.23053689664614396, 'sim_author': 0.6441760206271187, 'sim_lang': 0.346272601577822, 'sim_publisher': 0.1550816744696888}
	Got score: 1.2941176470588236
Trying combinations: {'sim_title': 0.3107973473412178, 'sim_author': 0.9428192648226833, 'sim_lang': 0.6012854912648783, 'sim_publisher': 0.8357129296230348}
	Got score: 1.2941176470588236
Trying combinations: {'sim_title': 0.24013393713722053, 'sim_author': 0.6136094171259088, 'sim_lang': 0.5802855651370925, 'sim_publisher': 0.7510439496614834}

In [None]:
pd.DataFrame(weights_with_score).to_csv("W_SC_11_6.csv")

In [3]:
clusters = pd.read_pickle("./main_topic_clusters.pkl")

In [7]:
clusters[clusters.main_topic_cluster==3].main_topic_split.value_counts()

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 1709, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[F, M]       6344
[F, M, X]     808
[F, M, H]     665
[F, M, W]     422
[F, M, T]     299
[F, M, M]     257
[F, M, K]     256
[F, K, M]     147
[F, X, M]      70
[X, Q, M]      23
[F, J, M]      13
[5, A, M]       2
[F, N, M]       1
[P, G, M]       1
[J, M, U]       1
Name: main_topic_split, dtype: int64