In [60]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import pathlib
import os

In [61]:
lang = "Hindi"
rating_def = "is_vp_succ2"
user_context = "location"
dtype="video"
model_path = f"./train_test_data_models/{user_context}/{dtype}/{lang}/{rating_def}/out/model.txt"
mapping_path = f"./train_test_data_models/{user_context}/{dtype}/{lang}/{rating_def}/user_post_ffm_mapping.csv"
num_nearest_neighbours = 10
num_samples = 12
RANDOM_SEED = 9745

In [17]:
model = open(model_path).readlines()
embs = {}
for s in model:
    x = s.split(': ')
    embs[x[0]] =  list(map(float, x[1].split(' ')))
del model

In [103]:
mapping = pd.read_csv(mapping_path, names=['object', 'mapping'], header=None, index_col='mapping',
                     delimiter = '\t', skiprows=1)
location_mapping = mapping[mapping['object'].str.contains('locationBucket_')]
location_mapping['location'] = mapping['object'].str.split('locationBucket_').str[-1] 
location_mapping['location_user_embs'] = location_mapping.index.map(lambda x: embs['v_' + str(x) + '_0'])
location_mapping['location_post_embs'] = location_mapping.index.map(lambda x: embs['v_' + str(x) + '_1'])
location_mapping['location_bias'] = location_mapping.index.map(lambda x: embs['i_' + str(x)][0])
location_mapping = location_mapping[['location','location_post_embs', 'location_user_embs', 'location_bias']]

  mask |= (ar1 == a)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  location_mapping['location'] = mapping['object'].str.split('locationBucket_').str[-1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  location_mapping['location_user_embs'] = location_mapping.index.map(lambda x: embs['v_' + str(x) + '_0'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a

In [129]:
temp_location_mapping = location_mapping[location_mapping.location.str.contains("maharashtra|bihar|uttar pradesh|punjab|himachal|madhya pradesh|haryana|delhi|rajasthan")]
sampled_locations = temp_location_mapping.sample(n = num_samples, random_state=RANDOM_SEED)
# sampled_locations = location_mapping[location_mapping.location.str.contains("maharashtra")]

In [130]:
location_emb_mat = np.array(location_mapping.location_post_embs.to_list())
res = cosine_similarity(sampled_locations.location_post_embs.to_list(), location_emb_mat)
top_n_indices = [(-cos_sim).argsort()[:num_nearest_neighbours] for cos_sim in res]
top_n_vals = [res[i, inds] for i, inds in enumerate(top_n_indices)]
sampled_locations["similar_locations"] = [location_mapping.iloc[inds].location.to_list() for inds in top_n_indices]
sampled_locations["cosine_similarity"] = top_n_vals

In [131]:
rows_dict_list= []

for i,location in enumerate(sampled_locations.location.to_list()):
    inds = top_n_indices[i]
    similar_locations = location_mapping.iloc[inds].location.to_list()
    similar_locations_scores = top_n_vals[i]
    for n in range(len(similar_locations)):
        row_dict = {
            "Rating Def": rating_def,
            "Language": lang,
        }
        row_dict["location"] = location
        row_dict["Similar location"] = similar_locations[n]
        row_dict["Cosine Similarity"] = similar_locations_scores[n]
        rows_dict_list.append(row_dict)
        
save_df = pd.DataFrame(rows_dict_list)
save_df_path = f"./neighbourhood_embeddings_data/{user_context}/{dtype}/{lang}/{rating_def}"
pathlib.Path(save_df_path).mkdir(parents = True, exist_ok = True)
save_df_path = os.path.join(save_df_path, "similar_locations_samples.csv")
save_df.to_csv(save_df_path, index=False)        

## Scrap Code

In [125]:
sampled_locations.location.values

array(['ghaziabad_bihar', 'gurgaon_haryana', 'bhind_madhya pradesh',
       'south delhi_mexico city', 'sirsa_punjab',
       'faizabad_uttar pradesh', 'ludhiana_uttar pradesh',
       'garhwa_uttar pradesh', 'south west delhi_fujairah',
       'dadra and nagar haveli_maharashtra', 'sitamarhi_bihar',
       'north delhi_sharjah'], dtype=object)

In [120]:
sampled_locations.similar_locations.head(5)

AttributeError: 'DataFrame' object has no attribute 'similar_locations'

In [107]:
location_mapping[location_mapping.location.str.contains("maharashtra")].location.values

array(['agra_uttar pradesh', 'ahmedabad_uttar pradesh',
       'alawal_uttar pradesh', 'aligarh_uttar pradesh',
       'ambedkar nagar_uttar pradesh', 'amethi_uttar pradesh',
       'amritsar_uttar pradesh', 'amroha_uttar pradesh',
       'anshik_uttar pradesh', 'asarfpur_uttar pradesh',
       'ashoknagar_uttar pradesh', 'auraiya_uttar pradesh',
       'azamgarh_uttar pradesh', 'badaun_uttar pradesh',
       'bagpat_uttar pradesh', 'bahera_uttar pradesh',
       'bahraich_uttar pradesh', 'baku_uttar pradesh',
       'balasore_uttar pradesh', 'ballia_uttar pradesh',
       'balrampur_uttar pradesh', 'banda_uttar pradesh',
       'bangalore urban_uttar pradesh', 'barabanki_uttar pradesh',
       'bareilly_uttar pradesh', 'basti_uttar pradesh',
       'belagavi_uttar pradesh', 'bhalau_uttar pradesh',
       'bhanpuer_uttar pradesh', 'bharatpur_uttar pradesh',
       'bheri_uttar pradesh', 'bhind_uttar pradesh',
       'bhojpur_uttar pradesh', 'bhopal_uttar pradesh',
       'bijnor_uttar 

In [112]:
temp_df = sampled_locations[sampled_locations.location.str.contains("pune")]

In [113]:
print(temp_df.similar_locations.values)

[list(['pune_maharashtra', 'solapur_maharashtra', 'ahmednagar_maharashtra', 'satara_maharashtra', 'osmanabad_maharashtra', 'sangli_maharashtra', 'kolhapur_maharashtra', 'aurangabad_maharashtra', 'latur_maharashtra', 'nanded_maharashtra'])]


In [102]:
temp_df.cosine_similarity.values

array([array([1.        , 0.86240904, 0.85792183, 0.85043648, 0.84782158,
       0.84778797, 0.84426956, 0.8428945 , 0.84245285, 0.84217169]),
       array([1.        , 0.90593651, 0.89933396, 0.89873083, 0.89601722,
       0.88971562, 0.88831216, 0.88781327, 0.88622991, 0.88585101])],
      dtype=object)

In [45]:
mapping = pd.read_csv(mapping_path, names=['object', 'mapping'], header=None,
                     delimiter = '\t', skiprows=1)

In [46]:
location_mapping = mapping[mapping['object'].str.contains('locationBucket_')]

In [70]:
sampled_locations.tail(5)

Unnamed: 0_level_0,location,location_post_embs,location_user_embs,location_bias,similar_locations,cosine_similarity
mapping,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
24142696,janakpur_dubai,"[0.101442, 0.0442485, 0.0358052, 0.0361507, 0....","[0.106819, 0.0342476, 0.0656946, 0.0472308, 0....",0.042278,"[janakpur_dubai, kula_manisa, al harth_jazan, ...","[1.0000000000000002, 0.9346347277511485, 0.925..."
24140514,al-muzahmiyya_riyadh province,"[0.231066, 0.296304, 0.234816, 0.254913, 0.444...","[-0.0371273, 0.083598, 0.0243366, 0.169848, -0...",-0.551625,"[al-muzahmiyya_riyadh province, sikar_assam, s...","[0.9999999999999999, 0.7769319609745171, 0.758..."
24142902,kannur_johor,"[-0.013964, 0.00548028, -0.00864858, 0.0049976...","[0.0268046, 0.044769, 0.045428, 0.051317, 0.07...",-0.244242,"[kannur_johor, saran_thimphu, kutch_al batinah...","[1.0000000000000002, 0.9081864210847753, 0.904..."
24140701,anand_kakamega county,"[0.112178, 0.25917, 0.136845, 0.199748, 0.2368...","[0.018624, 0.0265641, 0.0396445, 0.0173614, 0....",-0.252025,"[anand_kakamega county, gorod voronezh_voronez...","[1.0000000000000002, 0.8601395758193444, 0.851..."
24144380,pali_muscat governorate,"[0.0848778, 0.120216, 0.121797, 0.128381, 0.18...","[0.117159, 0.0760136, 0.0689212, 0.079329, 0.0...",0.421416,"[pali_muscat governorate, guliston tumani_sird...","[0.9999999999999998, 0.9153870306717415, 0.912..."


In [47]:
location_mapping['location'] = mapping['object'].str.split('locationBucket_').str[-1] 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  location_mapping['location'] = mapping['object'].str.split('locationBucket_').str[-1]


In [63]:
location_mapping.head(5)

Unnamed: 0_level_0,location,location_post_embs,location_user_embs,location_bias
mapping,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
24140341,(south) umzinyathi dc_kwazulu-natal,"[0.104051, 0.0576361, 0.0281409, 0.0013603, 0....","[0.101274, 0.107041, 0.0831622, 0.0182644, 0.1...",0.127011
24140342,4lm_rajasthan,"[-0.0381735, 0.0284101, 0.0333699, -0.00733359...","[0.0825477, 0.0979836, 0.0456688, 0.113165, 0....",-0.321994
24140343,aakkar_north governorate,"[0.149043, 0.220884, 0.129671, 0.180512, 0.285...","[0.0990175, 0.0315876, 0.0763511, 0.0826484, 0...",0.387779
24140344,abanga-bigne_moyen-ogooue,"[0.0609473, 0.0372103, 0.0723546, -0.00142679,...","[0.0324651, 0.0210004, 0.062029, 0.0591275, 0....",0.039166
24140345,abanoub_assiut governorate,"[0.0337231, 0.0227387, 0.0542926, 0.0696732, 0...","[0.0262316, 0.0597258, 0.0527955, 0.0261947, 0...",0.133525


In [73]:
location_mapping[location_mapping.location.str.contains("uttar")]

Unnamed: 0_level_0,location,location_post_embs,location_user_embs,location_bias
mapping,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
24140381,agra_uttar pradesh,"[0.0163561, 0.0912644, 0.022996, 0.0564003, 0....","[0.308406, -0.0599296, 0.140461, 0.187802, 0.0...",0.107385
24140403,ahmedabad_uttar pradesh,"[0.100228, 0.0710225, 0.0258819, 0.128603, 0.0...","[0.0949965, 0.0854674, 0.0710013, 0.149782, 0....",0.307652
24140528,alawal_uttar pradesh,"[-0.0390414, -0.0515073, -0.0585844, -0.007947...","[0.0409647, 0.0322067, 0.112707, 0.0744967, 0....",0.648933
24140534,aligarh_uttar pradesh,"[0.0176555, 0.105855, 0.0311591, 0.0637617, 0....","[0.0464878, -0.0270681, -0.0128773, 0.309399, ...",0.093520
24140548,almora_uttarakhand,"[0.00882621, 0.0639144, 0.0206279, 0.0381731, ...","[-0.104439, 0.00966363, 0.153871, 0.105629, -0...",-0.069571
...,...,...,...,...
24145927,varanasi_uttar pradesh,"[0.03008, 0.0903881, 0.0177325, 0.0532811, 0.1...","[0.0785108, 0.0488698, -0.110044, 0.0852572, -...",0.218754
24145946,vill_uttar pradesh,"[0.0651642, 0.123794, 0.0320974, 0.0741413, 0....","[0.29733, 0.148037, 0.138092, 0.288412, -0.036...",0.476191
24146058,yamuna nagar_uttar pradesh,"[-0.0252161, 0.0768343, 0.0447366, 0.0367306, ...","[0.0550142, 0.297125, -0.188637, 0.126489, 0.1...",-0.011406
24146160,भेरी_uttar pradesh,"[0.0194763, 0.0757113, 0.017552, 0.042528, 0.1...","[0.231187, -0.046792, 0.0938207, 0.146476, -0....",0.086203


In [21]:
lines[2]

'i_1: 0.81823\n'

In [36]:
lines[-1]

'v_1976264_1: -0.384767 0.0184156 -0.198459 0.02862 0.749588 -0.465176 1.03756 0.755093 -0.237331 -0.0749291 0.257781 0.179921 0.467672 -0.0607599 -0.317543 0.528643 -0.162297 0.133466 0.210119 0.0823327 -0.352756 -0.613152 0.112987 0.0811176 -0.212291 0.0384478 0.189875 -0.0466602 -0.573383 -0.0380357 -0.00423541 0.0257472\n'

In [28]:
for i, l in enumerate(lines):
    if(l[0] == 'v'):
        break

In [29]:
print(i)

1976266


In [19]:
location_mapping.head(5)

Unnamed: 0_level_0,location,location_post_embs,location_user_embs,location_bias
mapping,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
24144028,'amran governorate,"[0.0476274, 0.0985418, 0.0945089, 0.102908, 0....","[0.0590635, 0.123265, 0.139928, 0.220338, 0.16...",0.862394
24144029,abu dhabi,"[0.0118952, 0.101399, 0.0299083, 0.056269, 0.2...","[0.203786, 0.414275, 0.219784, -0.229608, 0.41...",-0.081865
24144030,ad dakhiliyah ‍governorate,"[0.0331937, 0.17813, 0.0750565, 0.119955, 0.29...","[0.0563649, 0.0761893, 0.0458001, 0.0791271, 0...",0.333669
24144031,ad dhahirah governorate,"[0.0164509, 0.101343, 0.0367861, 0.0427484, 0....","[-0.014704, 0.0633957, 0.127622, 0.176291, 0.0...",0.169855
24144032,addis ababa,"[0.182803, 0.252469, 0.182235, 0.209159, 0.412...","[0.138104, 0.196535, 0.0724725, 0.209923, 0.21...",0.394284


In [11]:
lines[i]

'v_0_0: 9.13069e-07 0.0153469 0.0881585 0.0535119 0.0621593 0.0255465 0.00548882 0.0792049 0.0792553 0.109053 0.0447442 0.0606017 0.0969509 0.00403362 0.00623751 0.0618015 0.0783048 0.000898167 0.0447341 0.00779866 0.0487092 0.0801276 0.0687174 0.108556 0.0987245 0.0614782 0.0107298 0.0762944 0.0485357 0.0818097 0.10621 0.0889276\n'

In [14]:
float("9.13069e-07")+2

2.000000913069

In [26]:
df = pd.read_csv("./train_test_data_models/is_vp_succ2/Bengali/user_post_ffm_mapping.csv", delimiter="\t")

In [27]:
df.head(5)

Unnamed: 0,feature_name,mapping
0,1_post_1000023282,1
1,1_post_1000272282,2
2,1_post_1000759972,3
3,1_post_1000867472,4
4,1_post_1000918572,5


In [32]:
model = open(model_path).readlines()
embs = {}
for s in model:
    x = s.split(': ')
    embs[x[0]] =  list(map(float, x[1].split(' ')))

In [35]:
df.head(5).index.map(lambda x: print(x))

0
1
2
3
4


Index([None, None, None, None, None], dtype='object')

In [37]:
model[-1]

'v_1976264_1: -0.384767 0.0184156 -0.198459 0.02862 0.749588 -0.465176 1.03756 0.755093 -0.237331 -0.0749291 0.257781 0.179921 0.467672 -0.0607599 -0.317543 0.528643 -0.162297 0.133466 0.210119 0.0823327 -0.352756 -0.613152 0.112987 0.0811176 -0.212291 0.0384478 0.189875 -0.0466602 -0.573383 -0.0380357 -0.00423541 0.0257472\n'

In [41]:
for i,s in enumerate(model):
    temp = s.split(': ')
    if(temp[0][-1] == '1' and temp[0][0] != 'i'):
        break

In [42]:
i

1976267

In [44]:
model[i]

'v_0_1: 0.0306211 0.00553781 0.0858806 0.0382959 0.0738116 0.0882524 0.115627 0.042625 0.0288227 0.114637 0.0843147 0.087896 0.0760144 0.00848045 0.0736945 0.103221 0.0318178 0.0509173 0.089429 0.0557382 0.0277418 0.0320741 0.0419164 0.0194268 0.0567633 0.104732 0.10608 0.0070662 0.105548 0.058864 0.0602371 0.0372224\n'

In [64]:
mapping = pd.read_csv(user_post_mapping_path, names=['ids', 'mapping'],
                header=None, delimiter = '\t', skiprows=1)

In [65]:
mapping.head(5)

Unnamed: 0,ids,mapping
0,1_post_1000023282,1
1,1_post_1000272282,2
2,1_post_1000759972,3
3,1_post_1000867472,4
4,1_post_1000918572,5


In [66]:
user_mapping = mapping[mapping['ids'].str.contains('user_')]
user_mapping['userId'] = user_mapping['ids'].str.slice(7)

post_mapping = mapping[mapping['ids'].str.contains('post_')]
post_mapping['postId'] = mapping['ids'].str.slice(7)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_mapping['userId'] = user_mapping['ids'].str.slice(7)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  post_mapping['postId'] = mapping['ids'].str.slice(7)


In [67]:
user_mapping.head(5).index

Int64Index([31693, 31694, 31695, 31696, 31697], dtype='int64')

In [68]:
user_mapping.head(5)

Unnamed: 0,ids,mapping,userId
31693,2_user_1000005774,31694,1000005774
31694,2_user_1000007538,31695,1000007538
31695,2_user_1000007814,31696,1000007814
31696,2_user_1000009203,31697,1000009203
31697,2_user_1000009529,31698,1000009529


In [69]:
lines[0]

'bias: -1.68722\n'

In [77]:
for i,l in enumerate(lines):
    temp = l.split(': ')
    if temp[0][-1] == '1' and temp[0][0] == 'v':
        print(temp)
        break

['v_0_1', '0.0306211 0.00553781 0.0858806 0.0382959 0.0738116 0.0882524 0.115627 0.042625 0.0288227 0.114637 0.0843147 0.087896 0.0760144 0.00848045 0.0736945 0.103221 0.0318178 0.0509173 0.089429 0.0557382 0.0277418 0.0320741 0.0419164 0.0194268 0.0567633 0.104732 0.10608 0.0070662 0.105548 0.058864 0.0602371 0.0372224\n']


In [73]:
user_mapping.tail(5)

Unnamed: 0,ids,mapping,userId
1976259,2_user_999989774,1976260,999989774
1976260,2_user_999991513,1976261,999991513
1976261,2_user_999994057,1976262,999994057
1976262,2_user_99999667,1976263,99999667
1976263,2_user_999999891,1976264,999999891


In [74]:
lines[-1]

'v_1976264_1: -0.384767 0.0184156 -0.198459 0.02862 0.749588 -0.465176 1.03756 0.755093 -0.237331 -0.0749291 0.257781 0.179921 0.467672 -0.0607599 -0.317543 0.528643 -0.162297 0.133466 0.210119 0.0823327 -0.352756 -0.613152 0.112987 0.0811176 -0.212291 0.0384478 0.189875 -0.0466602 -0.573383 -0.0380357 -0.00423541 0.0257472\n'

In [75]:
df.shape

(1976265, 2)

In [78]:
i

1976267

In [79]:
len(lines)-i

3952529

In [80]:
len(lines)

5928796

In [84]:
lines[i+3]

'v_2_0: -0.0336824 0.496715 0.270941 -0.0819325 0.561933 0.0507159 0.408108 0.318693 0.232859 0.298464 -0.12911 0.535579 0.135187 0.079769 0.506298 0.235516 -0.339947 0.475495 0.146366 0.248828 -0.148972 0.0708901 0.199863 0.301869 0.252357 0.0395289 0.0422494 0.368209 -0.284703 0.162369 -0.224932 0.333465\n'

In [86]:
lines[i-2]

'i_1976264: -0.101978\n'

In [87]:
post_mapping['embs'] = post_mapping.mapping.map(lambda x: embs['v_' + str(x) + '_0'])
post_mapping['bias'] = post_mapping.mapping.map(lambda x: embs['i_' + str(x)][0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  post_mapping['embs'] = post_mapping.mapping.map(lambda x: embs['v_' + str(x) + '_0'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  post_mapping['bias'] = post_mapping.mapping.map(lambda x: embs['i_' + str(x)][0])


In [90]:
user_mapping['embs'] = user_mapping.mapping.map(lambda x: embs['v_' + str(x) + '_1'])
user_mapping['bias'] = user_mapping.mapping.map(lambda x: embs['i_' + str(x)][0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_mapping['embs'] = user_mapping.mapping.map(lambda x: embs['v_' + str(x) + '_1'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_mapping['bias'] = user_mapping.mapping.map(lambda x: embs['i_' + str(x)][0])


In [91]:
user_mapping.head(5)

Unnamed: 0,ids,mapping,userId,embs,bias
31693,2_user_1000005774,31694,1000005774,"[-0.0151534, 0.048237, 0.133296, 0.0609551, -0...",0.081973
31694,2_user_1000007538,31695,1000007538,"[0.0285963, 0.0662877, -0.216215, -0.121115, 0...",-2.2373
31695,2_user_1000007814,31696,1000007814,"[-0.119122, -0.0127557, 0.0055039, -0.0575958,...",-0.225457
31696,2_user_1000009203,31697,1000009203,"[0.07242, -0.0817003, 0.148537, -0.117717, -0....",-0.637548
31697,2_user_1000009529,31698,1000009529,"[0.0593121, 0.33128, 0.367663, 0.204667, 0.064...",0.281941


In [92]:
post_mapping = post_mapping[['postId', 'embs']]
user_mapping = user_mapping[['userId', 'embs']]

In [93]:
post_mapping.head(5)

Unnamed: 0,postId,embs
0,1000023282,"[0.389045, 0.593008, 0.263784, 0.505996, 0.310..."
1,1000272282,"[-0.0336824, 0.496715, 0.270941, -0.0819325, 0..."
2,1000759972,"[0.43129, 0.660865, 0.770177, 0.20388, 0.03448..."
3,1000867472,"[-0.35692, -0.173044, -0.56379, 0.599986, 0.12..."
4,1000918572,"[0.834869, -0.275304, 0.107529, 0.447371, 0.50..."


In [94]:
post_emb_mat = np.array(post_mapping.embs.to_list()) 

In [96]:
post_emb_mat.shape

(31693, 32)

In [98]:
np.linalg.norm(post_emb_mat[1])

1.6591597586144018

In [100]:
sampled_posts = post_mapping.sample(n = 12)

In [102]:
sampled

Unnamed: 0,postId,embs
11180,3743303972,"[-0.435723, -0.0230965, -0.512326, 0.252693, 0..."
27693,9362945572,"[0.522945, 0.158675, 0.4244, 0.571837, 0.05382..."
1175,1197280872,"[0.0642727, -0.169545, -0.0565719, -0.235322, ..."
29796,9700303772,"[0.12076, 0.465173, -0.0281689, 0.145544, -0.0..."
29351,9624599282,"[-0.00289742, -0.16665, 0.63338, -0.279567, -0..."
18657,5924173272,"[0.799029, -0.022157, -0.577354, 0.699148, -0...."
5684,1879161572,"[0.649723, 0.333849, -0.260854, 0.169189, -0.2..."
6958,3081592472,"[0.0692729, 0.38509, -0.0167813, 0.458627, -0...."
8870,3390776772,"[0.0757729, 0.416675, 0.130022, 0.720366, -0.0..."
22554,7534619972,"[-0.216479, -0.175487, 0.339488, -0.204452, 0...."


In [106]:
res = cosine_similarity(sampled.embs.to_list(), post_emb_mat)

In [107]:
res.shape

(12, 31693)

In [108]:
res[0]

array([-0.27951027,  0.35294845,  0.19703144, ...,  0.02860971,
        0.39252253, -0.01030776])

In [114]:
sampled.iloc[[0,3]]

Unnamed: 0,postId,embs
11180,3743303972,"[-0.435723, -0.0230965, -0.512326, 0.252693, 0..."
29796,9700303772,"[0.12076, 0.465173, -0.0281689, 0.145544, -0.0..."


In [139]:
indices = res.argsort(axis=1)[::-1][:,:10]

In [140]:
res[0][11180]

0.9999999999999998

In [141]:
[res[i, ind] for i, ind in enumerate(indices)]

[array([ 0.08153468, -0.30267704,  0.07624981,  0.22386679, -0.08093571,
         0.03187127,  0.03007138,  0.11425278,  0.10660897,  0.16885942]),
 array([ 0.23617258, -0.21271133,  0.01586302,  0.0644886 , -0.21920018,
         0.19471298, -0.04950575,  0.07408365, -0.18633004, -0.13929134]),
 array([0.13514285, 0.15736032, 0.14229764, 0.39273488, 0.00394741,
        0.11062649, 0.23718837, 0.29787811, 0.17612269, 0.08424734]),
 array([ 0.0089667 ,  0.20829271,  0.21375413,  0.21152667,  0.16970524,
         0.21529055,  0.16787119,  0.28413809, -0.04075838,  0.10559648]),
 array([-0.57228363, -0.33870237, -0.42414385, -0.53693897, -0.38059314,
        -0.27289061, -0.34281853, -0.33620253, -0.54064567, -0.38793982]),
 array([ 0.2561144 ,  0.13249073,  0.18408711,  0.16826056,  0.19570156,
         0.06045068, -0.19465555, -0.28361284,  0.40759904,  0.33858719]),
 array([ 0.21948013,  0.17650395,  0.14685628,  0.29143746,  0.15009248,
         0.18041449, -0.01083408,  0.39016242,  0

In [147]:
top_n_indices = [(-cos_sim).argsort()[:10] for cos_sim in res]

In [152]:
top_n_vals = [res[i, inds] for i, inds in enumerate(top_n_indices)]

In [153]:
top_n_vals

[array([1.        , 0.90194228, 0.88182311, 0.86245753, 0.80963296,
        0.80915742, 0.79410391, 0.7835342 , 0.77960873, 0.77613763]),
 array([1.        , 0.82848737, 0.82235853, 0.80822458, 0.80761802,
        0.79274499, 0.78857584, 0.78804687, 0.78753803, 0.78619891]),
 array([1.        , 0.82296778, 0.8129457 , 0.79872742, 0.7875285 ,
        0.78409823, 0.78183319, 0.77952027, 0.77931888, 0.77603641]),
 array([1.        , 0.90497849, 0.87886105, 0.87726867, 0.87592334,
        0.86274203, 0.855797  , 0.8503195 , 0.84864842, 0.8441641 ]),
 array([1.        , 0.86875931, 0.85718788, 0.8548015 , 0.8515302 ,
        0.84819041, 0.84338882, 0.84115634, 0.83768128, 0.83296636]),
 array([1.        , 0.89209459, 0.85999168, 0.8433774 , 0.83681543,
        0.83592928, 0.83588619, 0.82954342, 0.82363654, 0.81674845]),
 array([1.        , 0.83610381, 0.82750062, 0.82643147, 0.8093739 ,
        0.79199887, 0.79127163, 0.7893327 , 0.78488905, 0.78363522]),
 array([1.        , 0.89874026, 0.

In [156]:
sampledpost_mapping.iloc[top_n_indices[0]].postId.to_list()

['3743303972',
 '3534254082',
 '3000724872',
 '7398499872',
 '1435087872',
 '7247908972',
 '1676473082',
 '7511573872',
 '9014914082',
 '5091299082']

In [158]:
sampled["similar_posts"] = [post_mapping.iloc[inds].postId.to_list() for inds in top_n_indices]
sampled["cosine_similarity"] = top_n_vals

In [159]:
sampled.head(5)

Unnamed: 0,postId,embs,similar_posts,cosine_similarity
11180,3743303972,"[-0.435723, -0.0230965, -0.512326, 0.252693, 0...","[3743303972, 3534254082, 3000724872, 739849987...","[0.9999999999999998, 0.9019422762855831, 0.881..."
27693,9362945572,"[0.522945, 0.158675, 0.4244, 0.571837, 0.05382...","[9362945572, 9870983572, 5339548672, 708799667...","[1.0, 0.8284873721822135, 0.8223585344304476, ..."
1175,1197280872,"[0.0642727, -0.169545, -0.0565719, -0.235322, ...","[1197280872, 7680286872, 3642593772, 913160257...","[1.0, 0.8229677801859888, 0.8129457004639016, ..."
29796,9700303772,"[0.12076, 0.465173, -0.0281689, 0.145544, -0.0...","[9700303772, 3026413082, 5716425972, 502969628...","[1.0, 0.9049784864415444, 0.8788610544776726, ..."
29351,9624599282,"[-0.00289742, -0.16665, 0.63338, -0.279567, -0...","[9624599282, 5124713182, 5899439282, 347887387...","[1.0, 0.8687593095469017, 0.8571878791017087, ..."
