# STARSHIP - PART 3

In [1]:
import numpy as np
import pandas as pd

y_train=pd.read_csv('y_train_data.csv')
X_train=pd.read_csv('X_train_data.csv')
y_test=pd.read_csv('y_test_data.csv')
X_test=pd.read_csv('X_test_data.csv')

## Prepare 2 test set: a. All users with 0 and 1 clicks. B. all users with 2 and more clicks

In [5]:
test_set = pd.concat([X_test, y_test], axis=1)
test_set_a = test_set.loc[test_set['user_clicks']<2] #0-1 clicks
test_set_b = test_set.loc[test_set['user_clicks']>1] #more then 1 click

In [6]:
# change the indexes 0,1,2...
test_set_a = test_set_a.set_index(pd.Index(np.arange(len(test_set_a))))
test_set_b = test_set_b.set_index(pd.Index(np.arange(len(test_set_b))))

In [7]:
del X_train['user_id']
del test_set_a['user_id']
del test_set_b['user_id']

In [8]:
original_y = test_set_a['is_click']
del test_set_a['is_click']
del test_set_b['is_click']

In [2]:
def evaluate(test_set):
    y_pred = cat_model.predict(test_set)
    print("Accuracy:",metrics.accuracy_score(original_y, y_pred))
    print("Recall:",metrics.recall_score(original_y, y_pred))
    print("F-Measure:",metrics.f1_score(original_y, y_pred))
    print("Precision:",metrics.precision_score(original_y, y_pred))

In [7]:
from sklearn import metrics
from catboost import CatBoostClassifier

cat_model = CatBoostClassifier()
cat_model.fit(X_train  , y_train.values.ravel(),verbose=False)
evaluate(test_set_a)

Accuracy: 0.7790582963775665
Recall: 0.7546308043643746
F-Measure: 0.7258416339978949
Precision: 0.6991683564018925


## For each of the users from data set a find it’s closest neighbour in data set b

In [2]:
from sklearn.metrics import pairwise_distances
from sklearn.metrics import pairwise_distances_argmin
import time
import pickle

def dist_dict(above,metric):
    test_set = pd.concat([X_test, y_test], axis=1)
    test_set['user_id']= test_set['user_id'].astype("category").cat.codes

    test_set_a = test_set.loc[test_set['user_clicks']<2] #0-1 clicks
    test_set_b = test_set.loc[test_set['user_clicks'] >= above] #more then above
    
    # change the indexes 0,1,2...
    test_set_a = test_set_a.set_index(pd.Index(np.arange(len(test_set_a))))
    test_set_b = test_set_b.set_index(pd.Index(np.arange(len(test_set_b))))
        
    closest_dict = {}
    index = 1000
    indexes_a = test_set_a.index
    indexes_b = test_set_b.index

    
    for i in range(int(len(test_set_a)/index)+1):
        inserted=test_set_a.iloc[i*index:(i+1)*index]
        close_record_index = pairwise_distances_argmin(inserted,test_set_b,metric=metric)
        for j in range(len(close_record_index)):
            closest_dict[i*index+j] = indexes_b[close_record_index[j]]
        
    
    dict_full_name = metric + 'from' + str(above) +'_dict.pkl'
    with open(dict_full_name, 'wb') as f:
        pickle.dump(closest_dict, f)

    closest_dict

### cosine similarity

In [85]:
dist_dict(2,'cosine')
dist_dict(3,'cosine')
dist_dict(4,'cosine')
dist_dict(5,'cosine')

### jaccard_similarity

In [None]:
dist_dict(2,'hamming')
dist_dict(3,'hamming')
dist_dict(4,'hamming')
dist_dict(5,'hamming')

### euclidean similarity

In [None]:
dist_dict(2,'euclidean')
dist_dict(3,'euclidean')
dist_dict(4,'euclidean')
dist_dict(5,'euclidean')

### manhattan similarity

In [None]:
dist_dict(2,'manhattan')
dist_dict(3,'manhattan')
dist_dict(4,'manhattan')
dist_dict(5,'manhattan')

## use the closes neighbour to improve results

In [9]:
history_without_recs = ['user_clicks',   '61108514445dc5a3d61c81143f57fc213d1feeaf861f90594866145966fdc4f9_c_ctr',
       'de4d57b401ad9cbe913238688e5ab1f7ddcff1bd37538e9d529223dfe9d79576_c_ctr',
       '48509a07538677f82f9e1ade5c0d724fe1fd837343cd6bb0431b80a2520193f3_c_ctr',
       'a7c68f40778fdf5ab08ca67bcf62cc67427723c689fa7e75903ffe984169726b_c_ctr',
       '812df4136a77ca7a4ff72ab9e4a4f2deabeeb21817f6e25718582fd412d8e8e8_c_ctr',
       'f88b356a9face082abc26d4e75b829259cf770cc3fa2401bc152917d5aa66e7d_c_ctr',
       '6ad4d921f45ab9a3de6840e4ec06f625763eec88c8ed104bfd22a576c34ff039_c_ctr',
       'b5610de1f6ffecc890aa0a487b94a60ebc5f3190a6b8df7a773b87059826ab47_c_ctr',
       'bd9124bd238e8ea1d7472beb1f8bed2096e7d2ccaef939e348f80f95b01ba5b_c_ctr',
       '68bf0109c6f8a87bfa96d1ecd9503326ba35cf4a6086988a1d2c8a20fbef00d1_c_ctr',
       '3be1d422a46085e17f4a19d3ed707c866927a6d354a562c45a78f1f03d7c4118_c_ctr',
       '11d40417959631d3d2420e8cd8709893c11cd7a4db737af63e8d56cfa7866f85_c_ctr',
       '1180412af43d78604a73f14d5a7ef83d3d3a74251ce8fb7b232043a511fbf161_c_ctr',
       '4263e28fc51808c7cc9a9540585fb69080542158e6dcc9910d05f8458f59ab49_c_ctr',
       'c513f6ef979bc9ca96e48c44a4e8fe756f4790570949e452e37df21e73567ef2_c_ctr',
       '0c350ae55158adeee57e63c575f9ff1975aeca281cf1e052a5cc3c18ada7c0ad_c_ctr',
       'd7efa2f7a9e1b512fd646d3e89e22e529f2b1e852a2864d654d9a867ca4a4624_c_ctr',
       '8e4b5bcfdfdc849677ee6d7a4b5f5d6c83eaf6619abd87bca4b3106b3722678_c_ctr',
       '0209442e115ad7bc79fd281d91423a86b619e3c711fe574b7cc198d2e3c461c4_c_ctr',
       '28cec6db11ab7d5a0892c361b953500bc4b30cb96906b95fc346f6e0c3cd0e55_c_ctr',
       '8a91bb34721c6489a602f83428cff4adf368331fb8e7435dd8dc993cf482bfd5_c_ctr',
       'a17498f58d835f26f28be6e80da34f8e4d80ab429125db50197a0ba7fe619f49_c_ctr',
       '1cb542228c76558789d114d3cb273a75850cca54ec3ee9a41100f2dc56ee561e_c_ctr',
       'c0738849f57726a1e60cb17d93157b5b6ccf6f3bb7fdf6bbed57d84c9b23b31d_c_ctr',
       'a031eda273ac18861d11324af880859527405b4d475a9ac589bc4c8bfd598893_c_ctr',
       '0bcc639f6c41a8b9ea537cd08d9e8f8d6908c4dd79eeb431126b2781ba57771e_c_ctr',
       '7bc70a705d85bfa59f9784b64151fb3daf051eb1744830baa8d22bbcb32956f3_c_ctr',
       '33445260df1bc49114eeeda3d7e30ea3b4ac318f5e6bb7ad339e6a5a03be9336_c_ctr',
       '4b19c051cd2a88afdf08fff3eb4bfa99adf8d603da2a1c45e0576c1739eaea15_c_ctr',
       'a941e049e9b1a5f8f3e2c1d39ee03082c184e89224e519c0bef6d8694743159a_c_ctr',
       'a0da63f7deb716b7c1ee936a1b68b25dcba9773f0fb4de5f063dc52a053945b1_c_ctr',
       '790fad4744a9da8d301e23c83b386f1baabd229611dbe5265df62a97b5357393_c_ctr',
       '33297ca1e5b9d63741369135c744705be15edc2d9beb1629c1a45b1f61641a59_c_ctr',
       'ba5285161ba6eed0085fb13784ce5c92f70ebc268b94fd66aa1d68a32884204d_c_ctr',
       '516d189fdc3b2d3c12953d7e007b7151ed1538e23112ef5c1b6b2dbf8af7319e_c_ctr',
       '259aa8ef98a8b91de574cd904138ef643240c23080cf24da4793a6f10a43fa9d_c_ctr',
       '0a976f55a9c56ea0081016a41c4ba048732ffbfed98de816184703b7f93c1499_c_ctr',
       '0720c198df96f189a141cec90f859b927a91275264c2cbe16eb3966a58f727f9_c_ctr',
       'cb69f81bd35ddfa40a1d1608edd4b41d644dcf27a99fe3f9e2c094ba50ea54b1_c_ctr',
       '4a8a398e6be840d4f07f10e17b1750ded029c71311c5232bfa36d354d611dba9_c_ctr',
       '1a1d1484d3cbc721070469e555f6a0cc89c03d60181e1f13694014e9249d19b6_c_ctr',
       'f2bd5a01dfff120bb2e4be9b5a74970d991e5f6aca2cddf6680a54c98c7c17d_c_ctr',
       'ca9942acaf5f9ae0a98c58ee0a1830f5c7047c465003fc24ac46df377eab8544_c_ctr',
       '2960f16ec1aa689ae008674ff33d51ef5d9b674bd90d160c34b888e5139e2009_c_ctr',
       '2096303a8868b9fa3a2ed62ce9fa0445269109f0f1750312ed419a3f74f8ac78_c_ctr',
       '19fba0e995b9794fc2c26217bf3b725c2f0d9eeda16719fe75e3ba23ca73bfc4_c_ctr',
       'd90d402744080c11741e5e10c35250c6f0a352d54fc2ec7718c208feddd7a2a2_c_ctr',
       '6fda2b8252b6d5ea9d19934d61b30d7a4a2922108fc1ecb83416001760139a39_c_ctr',
       'b537c6ef04866119578ef5f82075a5240805f9f54733b46639227110fefb413f_c_ctr',
       'b34564f1c4cd1d98dc26aaa4f888e3020656033add4dd0620b26e820493bf5c2_c_ctr',
       '61a78304b3c41ee12b310b29a17756d6d98ec5500f25df5c8d300156f0f23973_c_ctr',
       '16627e06ee5f755933fb2687ad25c3d81cb59cd8f1de7d96258509b1539cb2a8_c_ctr',
       'e2ddc2cd2e8ae62ed9e842999a8f2b74579198b3a293750ec07e36aa10491de2_c_ctr',
       'fe9bbd400bb6cb314531e3462507661401959afc69aae96bc6aec2c213b83bc1_c_ctr',
       'dd73a2f7c7982c61006be12e1bbb3e8c9ea6b6e8baf7cc5e307514015fc2fd23_c_ctr',
       'eab762a03fd979a04cc4706e6536d382bc89d2d1356afcd054a16b2235ecd471_c_ctr',
       '62484e22a6a5ade1ba25cb1b7c55c4b8861de24caddab73c9409742734008b26_c_ctr',
       '309d267f086d5fe5433d5bcc11cf83c14ffb5518dca95fc59b17363bb421e28e_c_ctr',
       '80f189984e5ca70287d13342f6daa0db45cba3c131c4e46dc81360f3a4c4f690_c_ctr',
       'd34a569ab7aaa54dacd715ae64953455d86b768846cd0085ef4e9e7471489b7b_c_ctr',
       '-4228920969204288365_i_clicks', '2149269691410270673_i_clicks',
       '-7630439413240589360_i_clicks', '7731610252078196101_i_clicks',
       '-3659437482324795455_i_clicks', '-8749875857714603232_i_clicks',
       '8677353483444746886_i_clicks', '-3165286775812642602_i_clicks',
       '-6922482799228753992_i_clicks', '-1546233820172493230_i_clicks',
       '3779093080491101819_i_clicks', '6710410937413896675_i_clicks',
       '-8837312109174350409_i_clicks', '-4352693544295965547_i_clicks',
       '6923996694200037766_i_clicks', '-4865872832970702614_i_clicks',
       '8611949271977874056_i_clicks', '-62418374600764546_i_clicks',
       '-4484468456254590098_i_clicks', '4779478477400736725_i_clicks',
       '-6987840845420875926_i_clicks', '-5827419750254943422_i_clicks',
       '101315062624410900_i_clicks', '-8742516677870340136_i_clicks',
       '27834901668906764_i_clicks', '1597082942030427249_i_clicks',
       '3747692144586145998_i_clicks', '6709311094223257746_i_clicks',
       '5531386583286579610_i_clicks', '7164253633056214_i_clicks',
       '-4537147676852755308_i_clicks', '-4798498070973557588_i_clicks',
       '-7956584263689547569_i_clicks', '6915057301283748335_i_clicks',
       '936630936040113192_i_clicks', '7017123631563780887_i_clicks',
       '2839355173630047798_i_clicks', '8951753579110056782_i_clicks',
       '7557519904060801410_i_clicks', '7521853921239608992_i_clicks',
       '-750036118207051322_i_clicks', '4405515281669991437_i_clicks',
       '-6535450805921934152_i_clicks' ]

In [10]:
def evaluate_with_replacement(ls , ts1 , ts2):
    test_set_a_replace1 = ts2.reindex(ls)
    print('evaluation after changing the all row:')
    evaluate(test_set_a_replace1)
    print()
    
    ts1.loc[:, history_without_recs] = ts2.reindex(ls).loc[:, history_without_recs].values
    print('evaluation after replacing the history:')
    evaluate(ts1)
    print()

    ts1['user_recs_v2'] =  ts2.reindex(ls)['user_recs_v2'].values
    print('evaluation after the replacing history and the recs:')
    evaluate(ts1)
    print()

### cosine

In [80]:
#cosine - with neighbors with 2 or more clicks
cosine_similarity = pd.read_pickle("cosine_dict.pkl") 
cosine_similarity = list(cosine_similarity.values())
evaluate_with_replacement(cosine_similarity,test_set_a, test_set_b)

evaluation after changing the all row:
Accuracy: 0.4477010593474272
Recall: 0.8017397234204516
F-Measure: 0.5294635142565391
Precision: 0.39523802078039855



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)


evaluation after replacing the history:
Accuracy: 0.6282572551622827
Recall: 0.958869893428064
F-Measure: 0.6666005165223767
Precision: 0.5108809078196359



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ts1['user_recs_v2'] =  ts2.reindex(ls)['user_recs_v2'].values


evaluation after the replacing history and the recs:
Accuracy: 0.6957395609562708
Recall: 0.8712731540218219
F-Measure: 0.6894111225094821
Precision: 0.5703592581249644



In [None]:
# test set a before changing it
test_set_a = test_set.loc[test_set['user_clicks']<2] #0-1 clicks
del test_set_a ['user_id']
del test_set_a ['is_click']

In [12]:
#cosine - with neighbors with 3 or more clicks
cosine_similarity = pd.read_pickle("cosinefrom3_dict.pkl") 
cosine_similarity = list(cosine_similarity.values())
evaluate_with_replacement(cosine_similarity,test_set_a, test_set_b)

evaluation after changing the all row:
Accuracy: 0.4196884344584482
Recall: 0.8111995686374017
F-Measure: 0.5200505298260686
Precision: 0.38269632910492707

evaluation after replacing the history:
Accuracy: 0.6206571211688164
Recall: 0.9580452296371479
F-Measure: 0.6618950574674862
Precision: 0.5056033544801728

evaluation after the replacing history and the recs:
Accuracy: 0.6858529329510217
Recall: 0.8750555062166963
F-Measure: 0.6834608119406682
Precision: 0.5606956680791391



In [13]:
# test set a before changing it
test_set_a = test_set.loc[test_set['user_clicks']<2] #0-1 clicks
del test_set_a ['user_id']
del test_set_a ['is_click']

In [14]:
#cosine - with neighbors with 4 or more clicks
cosine_similarity = pd.read_pickle("cosinefrom4_dict.pkl") 
cosine_similarity = list(cosine_similarity.values())
evaluate_with_replacement(cosine_similarity,test_set_a, test_set_b)

evaluation after changing the all row:
Accuracy: 0.42534935108439437
Recall: 0.8205087541233189
F-Measure: 0.525342884775992
Precision: 0.38635676281153736



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)


evaluation after replacing the history:
Accuracy: 0.6181585732769086
Recall: 0.9585685739659985
F-Measure: 0.6605467992273667
Precision: 0.5038868904459608



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ts1['user_recs_v2'] =  ts2.reindex(ls)['user_recs_v2'].values


evaluation after the replacing history and the recs:
Accuracy: 0.6895408265184133
Recall: 0.8798369703121035
F-Measure: 0.6871825996482275
Precision: 0.5637422265577369



In [15]:
# test set a before changing it
test_set_a = test_set.loc[test_set['user_clicks']<2] #0-1 clicks
del test_set_a ['user_id']
del test_set_a ['is_click']

In [16]:
#cosine - with neighbors with 5 or more clicks
cosine_similarity = pd.read_pickle("cosinefrom5_dict.pkl") 
cosine_similarity = list(cosine_similarity.values())
evaluate_with_replacement(cosine_similarity,test_set_a, test_set_b)

evaluation after changing the all row:
Accuracy: 0.4271687119109743
Recall: 0.8218488327835575
F-Measure: 0.5265403890947794
Precision: 0.3873551319089139



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)


evaluation after replacing the history:
Accuracy: 0.6187916616726441
Recall: 0.9571967774676478
F-Measure: 0.6605977524769542
Precision: 0.5043261738741712



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ts1['user_recs_v2'] =  ts2.reindex(ls)['user_recs_v2'].values


evaluation after the replacing history and the recs:
Accuracy: 0.6918365402641147
Recall: 0.880748858157828
F-Measure: 0.6889978568260556
Precision: 0.5658127320981932



### jaccard

In [82]:
# test set a before changing it
test_set_a = test_set.loc[test_set['user_clicks']<2] #0-1 clicks
del test_set_a ['user_id']
del test_set_a ['is_click']

In [64]:
jaccard_similarity = pd.read_pickle("jaccard_dict.pkl") 
jaccard_similarity = list(jaccard_similarity.values())
evaluate_with_replacement(jaccard_similarity,test_set_a, test_set_b)

evaluation after changing the all row:
Accuracy: 0.6967875373783379
Recall: 0.8684899137274803
F-Measure: 0.6894648050460159
Precision: 0.5716321162397445



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


evaluation after replacing the history:
Accuracy: 0.6773185325871496
Recall: 0.9538584750063436
F-Measure: 0.6961742909807486
Precision: 0.5481043053524156



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ts1['user_recs_v2'] =  ts2.reindex(ls)['user_recs_v2'].values


evaluation after the replacing history and the recs:
Accuracy: 0.7427325447387588
Recall: 0.8968218726211621
F-Measure: 0.7298846124061024
Precision: 0.6153427638737758



In [25]:
# test set a before changing it
test_set_a = test_set.loc[test_set['user_clicks']<2] #0-1 clicks
del test_set_a ['user_id']
del test_set_a ['is_click']

jaccard_similarity = pd.read_pickle("hamming_from_3_dict.pkl") 
jaccard_similarity = list(jaccard_similarity.values())
evaluate_with_replacement(jaccard_similarity,test_set_a, test_set_b)

evaluation after changing the all row:
Accuracy: 0.42874221316639466
Recall: 0.820921086018777
F-Measure: 0.5269445229133423
Precision: 0.3879996252225241



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)


evaluation after replacing the history:
Accuracy: 0.6141111100866963
Recall: 0.95476243339254
F-Measure: 0.6572829154588976
Precision: 0.5011404027203183



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ts1['user_recs_v2'] =  ts2.reindex(ls)['user_recs_v2'].values


evaluation after the replacing history and the recs:
Accuracy: 0.6931887679054916
Recall: 0.8746907510784065
F-Measure: 0.6884609503481032
Precision: 0.567611236035999



In [26]:
# test set a before changing it
test_set_a = test_set.loc[test_set['user_clicks']<2] #0-1 clicks
del test_set_a ['user_id']
del test_set_a ['is_click']

jaccard_similarity = pd.read_pickle("hamming_from_4_dict.pkl") 
jaccard_similarity = list(jaccard_similarity.values())
evaluate_with_replacement(jaccard_similarity,test_set_a, test_set_b)

evaluation after changing the all row:
Accuracy: 0.4311239777620018
Recall: 0.8186294722151738
F-Measure: 0.527289164010787
Precision: 0.38888847034715524



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)


evaluation after replacing the history:
Accuracy: 0.6294066486574531
Recall: 0.956839951788886
F-Measure: 0.6668177108516958
Precision: 0.5117147909572416



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ts1['user_recs_v2'] =  ts2.reindex(ls)['user_recs_v2'].values


evaluation after the replacing history and the recs:
Accuracy: 0.6937788308762742
Recall: 0.8772281781273789
F-Measure: 0.6894942022256224
Precision: 0.5679485797306801



In [27]:
# test set a before changing it
test_set_a = test_set.loc[test_set['user_clicks']<2] #0-1 clicks
del test_set_a ['user_id']
del test_set_a ['is_click']

jaccard_similarity = pd.read_pickle("hamming_from_5_dict.pkl") 
jaccard_similarity = list(jaccard_similarity.values())
evaluate_with_replacement(jaccard_similarity,test_set_a, test_set_b)

evaluation after changing the all row:
Accuracy: 0.4336563313449441
Recall: 0.8090744734838874
F-Measure: 0.525474443156946
Precision: 0.3890893005590342



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)


evaluation after replacing the history:
Accuracy: 0.6309401977325602
Recall: 0.958457561532606
F-Measure: 0.6681130020948833
Precision: 0.5127777636368265



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ts1['user_recs_v2'] =  ts2.reindex(ls)['user_recs_v2'].values


evaluation after the replacing history and the recs:
Accuracy: 0.7020366392225921
Recall: 0.8844757041360061
F-Measure: 0.6970566179227596
Precision: 0.5751773854213935



### euclidean

In [42]:
# test set a before changing it
test_set_a = test_set.loc[test_set['user_clicks']<2] #0-1 clicks
del test_set_a ['user_id']
del test_set_a ['is_click']

euclidean_similarity = pd.read_pickle("euclidean_dict.pkl") 
euclidean_similarity = list(euclidean_similarity.values())
evaluate_with_replacement(euclidean_similarity,test_set_a, test_set_b)

evaluation after changing the all row:
Accuracy: 0.5390901351920316
Recall: 0.8356857396599848
F-Measure: 0.5842746462280496
Precision: 0.4491504114761574



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


evaluation after replacing the history:
Accuracy: 0.6385372584813869
Recall: 0.958267254503933
F-Measure: 0.6726651341166779
Precision: 0.518215967273008



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_set_a['user_recs_v2'] =  test_set_b.reindex(ls)['user_recs_v2'].values


evaluation after the replacing history and the recs:
Accuracy: 0.7085119656780039
Recall: 0.8683709718345598
F-Measure: 0.6978153445205196
Precision: 0.5832583258325833



In [17]:
# test set a before changing it
test_set_a = test_set.loc[test_set['user_clicks']<2] #0-1 clicks
del test_set_a ['user_id']
del test_set_a ['is_click']

euclidean_similarity = pd.read_pickle("euclideanfrom3_dict.pkl") 
euclidean_similarity = list(euclidean_similarity.values())
evaluate_with_replacement(euclidean_similarity,test_set_a, test_set_b)

evaluation after changing the all row:
Accuracy: 0.42417537163210806
Recall: 0.8222294468409034
F-Measure: 0.5253563620795789
Precision: 0.38599096936059646



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)


evaluation after replacing the history:
Accuracy: 0.6217450497711969
Recall: 0.9578707815275311
F-Measure: 0.6624967779794777
Precision: 0.506354635614463



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ts1['user_recs_v2'] =  ts2.reindex(ls)['user_recs_v2'].values


evaluation after the replacing history and the recs:
Accuracy: 0.690130889489196
Recall: 0.8796863105810708
F-Measure: 0.6875546162761151
Precision: 0.564305116127654



In [18]:
# test set a before changing it
test_set_a = test_set.loc[test_set['user_clicks']<2] #0-1 clicks
del test_set_a ['user_id']
del test_set_a ['is_click']

euclidean_similarity = pd.read_pickle("euclideanfrom4_dict.pkl") 
euclidean_similarity = list(euclidean_similarity.values())
evaluate_with_replacement(euclidean_similarity,test_set_a, test_set_b)

evaluation after changing the all row:
Accuracy: 0.43079514058557605
Recall: 0.8259166455214413
F-Measure: 0.529353618157747
Precision: 0.38949655407098277



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)


evaluation after replacing the history:
Accuracy: 0.6238071969242968
Recall: 0.958061088556204
F-Measure: 0.6637623436019282
Precision: 0.5077812753474572



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ts1['user_recs_v2'] =  ts2.reindex(ls)['user_recs_v2'].values


evaluation after the replacing history and the recs:
Accuracy: 0.6914185789931436
Recall: 0.8787744227353463
F-Measure: 0.6882259972613544
Precision: 0.5655872821455



In [19]:
# test set a before changing it
test_set_a = test_set.loc[test_set['user_clicks']<2] #0-1 clicks
del test_set_a ['user_id']
del test_set_a ['is_click']

euclidean_similarity = pd.read_pickle("euclideanfrom5_dict.pkl") 
euclidean_similarity = list(euclidean_similarity.values())
evaluate_with_replacement(euclidean_similarity,test_set_a, test_set_b)

evaluation after changing the all row:
Accuracy: 0.42796160902796343
Recall: 0.8201677873636133
F-Measure: 0.5263752509535599
Precision: 0.38755062966251885



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)


evaluation after replacing the history:
Accuracy: 0.617052205206691
Recall: 0.958053159096676
F-Measure: 0.6597770381514317
Precision: 0.5031336017889639



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ts1['user_recs_v2'] =  ts2.reindex(ls)['user_recs_v2'].values


evaluation after the replacing history and the recs:
Accuracy: 0.690573436717283
Recall: 0.8782193605683837
F-Measure: 0.6875031037393852
Precision: 0.5648408812729498



### manhattan similarity

In [21]:
# test set a before changing it
test_set_a = test_set.loc[test_set['user_clicks']<2] #0-1 clicks
del test_set_a ['user_id']
del test_set_a ['is_click']

manhattan_similarity = pd.read_pickle("manhattan_from_2_dict.pkl") 
manhattan_similarity = list(manhattan_similarity.values())
evaluate_with_replacement(manhattan_similarity,test_set_a, test_set_b)

evaluation after changing the all row:
Accuracy: 0.544446800598668
Recall: 0.8277642095914742
F-Measure: 0.5848005960550566
Precision: 0.45210089128721276



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)


evaluation after replacing the history:
Accuracy: 0.6364259394140552
Recall: 0.9581958893681807
F-Measure: 0.6713650366822878
Precision: 0.5166951011455033



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ts1['user_recs_v2'] =  ts2.reindex(ls)['user_recs_v2'].values


evaluation after the replacing history and the recs:
Accuracy: 0.7074271103202628
Recall: 0.8697824156305506
F-Measure: 0.6973742768135291
Precision: 0.5820086168474223



In [22]:
# test set a before changing it
test_set_a = test_set.loc[test_set['user_clicks']<2] #0-1 clicks
del test_set_a ['user_id']
del test_set_a ['is_click']

manhattan_similarity = pd.read_pickle("manhattan_from_3_dict.pkl") 
manhattan_similarity = list(manhattan_similarity.values())
evaluate_with_replacement(manhattan_similarity,test_set_a, test_set_b)

evaluation after changing the all row:
Accuracy: 0.42781716652990726
Recall: 0.8176620781527532
F-Measure: 0.5255494169452826
Precision: 0.3872153628935352



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)


evaluation after replacing the history:
Accuracy: 0.6214930437107585
Recall: 0.9569430347627506
F-Measure: 0.6621310962729682
Precision: 0.5061867172229819



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ts1['user_recs_v2'] =  ts2.reindex(ls)['user_recs_v2'].values


evaluation after the replacing history and the recs:
Accuracy: 0.6913202351646798
Recall: 0.8767524105556965
F-Measure: 0.6876630625756035
Precision: 0.5656658157131382



In [23]:
# test set a before changing it
test_set_a = test_set.loc[test_set['user_clicks']<2] #0-1 clicks
del test_set_a ['user_id']
del test_set_a ['is_click']

manhattan_similarity = pd.read_pickle("manhattan_from_4_dict.pkl") 
manhattan_similarity = list(manhattan_similarity.values())
evaluate_with_replacement(manhattan_similarity,test_set_a, test_set_b)

evaluation after changing the all row:
Accuracy: 0.42632049639047415
Recall: 0.8219915630550622
F-Measure: 0.526214799680199
Precision: 0.38697117771564454



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)


evaluation after replacing the history:
Accuracy: 0.6265300916748876
Recall: 0.956625856381629
F-Measure: 0.6650478077876115
Precision: 0.509693909884028



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ts1['user_recs_v2'] =  ts2.reindex(ls)['user_recs_v2'].values


evaluation after the replacing history and the recs:
Accuracy: 0.6914247254824226
Recall: 0.8763242197411825
F-Measure: 0.687630857679732
Precision: 0.5658005887623192



In [24]:
# test set a before changing it
test_set_a = test_set.loc[test_set['user_clicks']<2] #0-1 clicks
del test_set_a ['user_id']
del test_set_a ['is_click']

manhattan_similarity = pd.read_pickle("manhattan_from_5_dict.pkl") 
manhattan_similarity = list(manhattan_similarity.values())
evaluate_with_replacement(manhattan_similarity,test_set_a, test_set_b)

evaluation after changing the all row:
Accuracy: 0.4284287422131664
Recall: 0.8172893935549354
F-Measure: 0.5257023944017566
Precision: 0.38746518001135294



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)


evaluation after replacing the history:
Accuracy: 0.6218249541318237
Recall: 0.9568875285460543
F-Measure: 0.6623143543978661
Precision: 0.5064164974065433



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ts1['user_recs_v2'] =  ts2.reindex(ls)['user_recs_v2'].values


evaluation after the replacing history and the recs:
Accuracy: 0.6888677859423644
Recall: 0.8812959908652627
F-Measure: 0.6870732529062848
Precision: 0.5629980092294755



## The results
### closest neighbors from data of 2 or more clicks

| | Accuracy | Recall | F-measure | Precision | 
|----------:|:----------|:----------|:----------|:----------|
| original data 0-1 clicks | 0.7790582963775665 | 0.7546308043643746 | 0.7258416339978949 | 0.6991683564018925 |
| cosine | | | | |
| all row | 0.4477010593474272 | 0.8017397234204516 | 0.5294635142565391 | 0.39523802078039855 |
| history | 0.6282572551622827 | 0.958869893428064 | 0.6666005165223767 | 0.5108809078196359 |
|  history with recs | 0.6957395609562708 | 0.8712731540218219 | 0.6894111225094821 | 0.5703592581249644 |
| all row from 3 | 0.4196884344584482 | 0.8111995686374017 | 0.5200505298260686 | 0.38269632910492707 |
| history from 3 | 0.6206571211688164 | 0.9580452296371479 | 0.6618950574674862 | 0.5056033544801728 |
| history with recs from 3 | 0.6858529329510217 | 0.8750555062166963 | 0.6834608119406682 | 0.5606956680791391 |
| all row from 4 | 0.42534935108439437 | 0.8205087541233189 | 0.525342884775992 | 0.38635676281153736 |
| history from 4 | 0.6181585732769086 | 0.9585685739659985 | 0.6605467992273667 | 0.5038868904459608 |
| history with recs from 4 | 0.6895408265184133 | 0.8798369703121035 | 0.6871825996482275 | 0.5637422265577369 |
| all row from 5 | 0.4271687119109743 | 0.8218488327835575 | 0.5265403890947794 | 0.3873551319089139 |
| history from 5 | 0.6187916616726441 | 0.9571967774676478 | 0.6605977524769542 | 0.5043261738741712 |
| history with recs from 5 | 0.6918365402641147 | 0.880748858157828 | 0.6889978568260556 | 0.5658127320981932 |
| jaccard | | | | |
| all row | 0.6967875373783379 | 0.8684899137274803 | 0.6894648050460159 | 0.5716321162397445 |
| history | 0.6773185325871496 | 0.9538584750063436 | 0.6961742909807486 | 0.5481043053524156 |
| history with recs | 0.7427325447387588 | 0.8968218726211621 | 0.7298846124061024 | 0.6153427638737758 |
| all row from 3 | 0.42874221316639466 | 0.820921086018777 | 0.5269445229133423 | 0.3879996252225241 |
| history from 3 | 0.6141111100866963 | 0.95476243339254 | 0.6572829154588976 | 0.5011404027203183 |
| history with recs from 3 | 0.6931887679054916 | 0.8746907510784065 | 0.6884609503481032 | 0.567611236035999 |
| all row from 4 | 0.4311239777620018 | 0.8186294722151738 | 0.527289164010787 | 0.38888847034715524 |
| history from 4 | 0.6294066486574531 | 0.956839951788886 | 0.6668177108516958 | 0.5117147909572416 |
| history with recs from 4 | 0.6937788308762742 | 0.8772281781273789 | 0.6894942022256224 | 0.5679485797306801 |
| all row from 5 | 0.4336563313449441 | 0.8090744734838874 | 0.525474443156946 | 0.3890893005590342 |
| history from 5 | 0.6309401977325602 | 0.958457561532606 | 0.6681130020948833 | 0.5127777636368265 |
| history with recs from 5 | 0.7020366392225921 | 0.8844757041360061 | 0.6970566179227596 | 0.5751773854213935 |
| euclidean | | | | |
| all row | 0.5390901351920316 | 0.8356857396599848 | 0.5842746462280496 | 0.4491504114761574 |
| history | 0.6385372584813869 | 0.958267254503933 | 0.6726651341166779 | 0.518215967273008 |
| history with recs | 0.7085119656780039 | 0.8683709718345598 | 0.6978153445205196 | 0.5832583258325833 |
| all row from 3 | 0.42417537163210806 | 0.8222294468409034 | 0.5253563620795789 | 0.38599096936059646 |
| history from 3 | 0.6217450497711969 | 0.9578707815275311 | 0.6624967779794777 | 0.506354635614463 |
| history with recs from 3 | 0.690130889489196 | 0.8796863105810708 | 0.6875546162761151 | 0.564305116127654 |
| all row from 4 | 0.43079514058557605 | 0.8259166455214413 | 0.529353618157747 | 0.38949655407098277 |
| history from 4 | 0.6238071969242968 | 0.958061088556204 | 0.6637623436019282 | 0.5077812753474572 |
| history with recs from 4 | 0.6914185789931436 | 0.8787744227353463 | 0.6882259972613544 | 0.5655872821455 |
| all row from 5 | 0.42796160902796343 | 0.8201677873636133 | 0.5263752509535599 | 0.38755062966251885 |
| history from 5 | 0.617052205206691 | 0.958053159096676 | 0.6597770381514317 | 0.5031336017889639 |
| history with recs from 5 | 0.690573436717283 | 0.8782193605683837 | 0.6875031037393852 | 0.5648408812729498 |
| Manhattan | | | | |
| all row | 0.544446800598668  | 0.8277642095914742 | 0.5848005960550566 | 0.45210089128721276 |
| history | 0.6364259394140552 | 0.9581958893681807 | 0.6713650366822878 | 0.5166951011455033 |
| history with recs | 0.7074271103202628 | 0.8697824156305506 | 0.6973742768135291 | 0.5820086168474223 |
| all row from 3 | 0.42781716652990726 | 0.8176620781527532 | 0.5255494169452826 | 0.3872153628935352 |
| history from 3 | 0.6214930437107585 | 0.9569430347627506 | 0.6621310962729682 | 0.5061867172229819 |
| history with recs from 3 | 0.6913202351646798 | 0.8767524105556965 | 0.6876630625756035 | 0.5656658157131382 |
| all row from 4 | 0.42632049639047415 | 0.8219915630550622 | 0.526214799680199 | 0.38697117771564454 |
| history from 4 | 0.6265300916748876 | 0.956625856381629 | 0.6650478077876115 | 0.509693909884028 |
| history with recs from 4 | 0.6914247254824226 | 0.8763242197411825 | 0.687630857679732 | 0.5658005887623192 |
| all row from 5 | 0.4284287422131664 | 0.8172893935549354 | 0.5257023944017566 | 0.38746518001135294 |
| history from 5 | 0.6218249541318237 | 0.9568875285460543 | 0.6623143543978661 | 0.5064164974065433 |
| history with recs from 5 | 0.6888677859423644 | 0.8812959908652627 | 0.6870732529062848 | 0.5629980092294755 |
