# Labeling by Document Similarity
We take a different approach to label the comments. In this approach, we use labeled comments and find similar comments to label them.

In [2]:
import os
import pandas as pd
import numpy as np
from top2vec import Top2Vec
import sys
import re
from pprint import pprint

  from .autonotebook import tqdm as notebook_tqdm


## Loading Data Sets

In [411]:

DATA_PATH = "C:/Users/doosti/Dropbox (Chapman)/Research/Research Projects/Fitness/Data/"
processed_file="processed_comments_122923.txt"
# use processed_comments_122923_fast.txt if newpreprocessed model is used (instead of newpreprocessed2)
comments_file="merged_comments.csv"
#labeled = "comments_activity_motives.csv"
#sim_labels = "comments_similarity_labels.csv"
labeled = "comments_similarity_labels.csv"

labeled = pd.read_csv(os.path.join(DATA_PATH, labeled))
#sim_labels = pd.read_csv(os.path.join(DATA_PATH, sim_labels))
comment_length=10
with open(os.path.join(DATA_PATH,processed_file),"r", encoding="utf-8") as f:
    processed_docs = f.readlines()
length_include = [len(re.sub("\d+", "", x.strip()).split(','))>5 for x in processed_docs]

comments = pd.read_csv(os.path.join(DATA_PATH, "merged_comments.csv"))
#comments = comments[comments.comment_text.notnull()].copy()

print(f"the total number of comments is {comments.shape[0]+labeled.shape[0]}")

comments2 = pd.DataFrame(data={'comment_text':comments.comment_text.tolist()+labeled.comment_text.tolist()})
comments2 = comments2[comments2.comment_text.notnull()].copy()
comments2['include'] = length_include
comments2['processed'] = processed_docs
print(f"the total number of comments processed is {comments2[comments2.include].shape[0]}")

the total number of comments is 830579
the total number of comments processed is 414616


## Loading the Model

In [317]:
# model_name = "top2vec_lowercase_learn_doc2vec.model"
# model_name = "top2vec_lowercase_newpreprocessed_deep-learn_universal-sentence-encoder.model"
model_name = "top2vec_lowercase_newpreprocessed2_deep-learn_universal-sentence-encoder-multilingual-large.model"
# model_name = "top2vec_lowercase_newpreprocessed2_deep-learn_doc2vec.model"
model = Top2Vec.load(f"E:/{model_name}")

In [410]:
model.document_ids

array([     0,      1,      2, ..., 414613, 414614, 414615], dtype=int64)

In [310]:
# Test the model with queries
queries = {}
queries['query1'] = "I want to lose weight"
queries['query2'] = "I want to be healthy"
queries['query3'] = "I love the yoga community"
queries['query4'] = "I want to be fit"
queries['query5'] = "I want to ease my anxiety"
queries['query6'] = "I want to look good"
queries['query7'] = "I love doing Yoga with other people"

In [318]:
print(f"model: {model_name}")
for k, query in queries.items():
    print(f"query: {query}")
    docus, scores, doc_ids = model.query_documents(query, num_docs=5)
    print(docus)

model: top2vec_lowercase_newpreprocessed2_deep-learn_universal-sentence-encoder-multilingual-large.model
query: I want to lose weight
['want lose weight plz help sir' 'want lose kg lose kg help'
 'want lose weight low body plz help' 'mera weight kg want lose weight'
 'hi bro want lose weight m try not m want weight']
query: I want to be healthy
['want feel good body healthy habit'
 'shoutout want good healthy change body live'
 'wish healthy life fill health joy light'
 'good k wish love health happiness'
 'want well physical mental health acceptance']
query: I love the yoga community
['love yoga yoga luna yoga community thank' 'yay love s yoga s awesome'
 'thankful beautiful yoga community awesome got'
 'absolute fav love yoga cz thank'
 'thank dedication yoga community pleasure practice namaste']
query: I want to be fit
['want feel good body healthy habit'
 'omg want body like haha tell exercise'
 'want well posture exercise slouch want exercise'
 'want lose weight low body plz help'

### Document ID conversion

In [413]:
# converting the old index to the new index
comments2['doc_id'] = np.NaN
comments2.loc[comments2.include,'doc_id'] = np.arange(comments2.include.sum())
print(comments2.head())
comments2.doc_id.describe()
id_conv = comments2.doc_id.to_dict()
for k,v in id_conv.items():
    if np.isnan(v):
        id_conv[k] = None
    else:
        id_conv[k] = int(v)

# convert an array of ids to the new index (if the id is not in the new index, it will be None)
def convert_id_l2s(x):
    # Long version to short version
    return [id_conv.get(i) for i in x if id_conv.get(i) is not None]

# convert new ids to old ids
id_conv2 = comments2[comments2.include].doc_id.to_dict()
id_conv2 = {int(v):k for k,v in id_conv2.items() if v is not None}

def convert_id_s2l(x):
    # Short version to long version
    return [id_conv2.get(i) for i in x]

                                        comment_text  include  \
0  I loved doing this video with my dad I think i...     True   
1             bre is very good at yoga same with Flo    False   
2  Of course I can add 15 more minutes to round o...     True   
3  Thanks for helping round out my routine, was q...     True   
4          First thing tomorrow morning! Thank you 🙏    False   

                                           processed  doc_id  
0                    loved,video,dad,think,th,good\n     0.0  
1                                bre,good,yoga,flo\n     NaN  
2  course,add,minute,round,morning,practice,pulse...     1.0  
3   thank,help,round,routine,stagnant,find,channel\n     2.0  
4                     thing,tomorrow,morning,thank\n     NaN  


In [414]:
# test the conversion
print(convert_id_l2s([0,1,2,3,4,5,6,7,8,9,10]))
print(convert_id_s2l([0,1,2,3,4,5,6,7,8,9,10]))

[0, 1, 2, 3]
[0, 2, 3, 7, 12, 16, 17, 19, 25, 27, 29]


## Label using survey documents

In [354]:
Enjoyment = ["it is fun", "i like to do yoga", "it makes me happy", "it is interesting", "i enjoy yoga", "i find yoga stimulating", "i like the excitement of participation"]
Competence = ["i like engaging in yoga which physically challenge me", "i want to obtain new skills", "i want to improve existing skills", "i like the challenge", "i want to keep up my current skill level", "i like activities which are physically challenging to me", "i want to get better at yoga"]
Appearance = ["i want to look better", "i want to improve my appearance", "i want to be attractive", "i want to improve my body shape"] #"i feel physically unattractive if i do not do yoga"
Fitness = ["i want to be physically fit", "i want to have more energy", "i want to improve my cardiovascular fitness", "i want to maintain my physicall strength to live a healthy life", "i want to maintain my physical health and well-being"]
Social = ["i love the community", "i want to be with my friends", "i like to be with others who are interested in this activity", "i want to meet new people", "my friends want me to", "i enjoy spending time with others doing yoga"]

In [444]:
def get_scores(queries, comment_id = True):
    """
    This function takes a list of queries and returns a dictionary of scores for each query
    Parameters:
    queries (list): a list of queries

    comment_id (bool): if True, the keys of the dictionary are comment ids, otherwise they are model doc ids

    Returns:
    scores (dict): a dictionary of average scores for each query with ids as keys
    """
    scores = {}
    num_queries = len(queries)
    for query in queries:
        docus, scores, doc_ids = model.query_documents(query, num_docs=100000, ef=400000)
        scores_dict = {}
        if comment_id:
            doc_ids = convert_id_s2l(doc_ids)
        for score, doc in zip(scores, doc_ids):
            if doc in scores:
                scores_dict[doc] += score/num_queries
            else:
                scores_dict[doc] = score/num_queries
    return scores_dict

In [445]:
# Get scores for each label
label_scores = {}
label_scores['Enjoyment'] = get_scores(Enjoyment)
label_scores['Competence'] = get_scores(Competence)
label_scores['Appearance'] = get_scores(Appearance)
label_scores['Fitness'] = get_scores(Fitness)
label_scores['Social'] = get_scores(Social)

In [446]:
# propgate the scores to the comments
comments_df = pd.DataFrame(data={'comment_text': comments.comment_text.tolist() + labeled.comment_text.tolist()})
comments_df = comments_df[comments_df.comment_text.notnull()].copy()
comments_df['include'] = length_include
comments_df['processed'] = processed_docs

# fill the column for each label with the similarity scores using the doc ids
for label, scores in label_scores.items():
    print(label, len(scores))
    comments_df[label] = 0.0
    comments_df.loc[list(scores.keys()), label] = list(scores.values())
    
main_cols = ['Competence', 'Fitness', 'Appearance', 'Enjoyment', 'Social']

# get the max score for each comment
comments_df['max_score'] = comments_df[main_cols].max(axis=1)
# get the label for each comment
comments_df['label'] = comments_df[main_cols ].idxmax(axis=1)

pprint(comments_df[main_cols].describe())


Enjoyment 100000
Competence 100000
Appearance 100000
Fitness 100000
Social 100000
          Competence        Fitness     Appearance      Enjoyment  \
count  830479.000000  830479.000000  830479.000000  830479.000000   
mean        0.006620       0.005541       0.007764       0.004531   
std         0.018354       0.015347       0.021463       0.012433   
min         0.000000       0.000000       0.000000       0.000000   
25%         0.000000       0.000000       0.000000       0.000000   
50%         0.000000       0.000000       0.000000       0.000000   
75%         0.000000       0.000000       0.000000       0.000000   
max         0.113910       0.146740       0.142680       0.089487   

              Social  
count  830479.000000  
mean        0.007589  
std         0.021007  
min         0.000000  
25%         0.000000  
50%         0.000000  
75%         0.000000  
max         0.120060  


In [448]:
cut_off = 0.08
#comments_df.loc[comments_df.max_score < cut_off, 'label'] = 'other'
pprint(comments_df[comments_df.max_score > cut_off].label.value_counts())

label
Appearance    9569
Social        8074
Fitness        800
Competence     558
Enjoyment        4
Name: count, dtype: int64


In [478]:
def print_top_label(label,top=20, print_text=True):
    for k,row in comments_df[comments_df.label==label].sort_values(by=label, ascending=False).iloc[:top].iterrows():
        print(f"{row.label} ({row[row.label]:.2f}):")
        if print_text:
            pprint(row.comment_text)
        else:
            pprint(row.processed)
        print('------------------')

# sample social labels
label = "Appearance"
print_top_label(label, top=10)

Appearance (0.49):
'Never seen abs shaped like that. How did the gap in between come there'
------------------
Appearance (0.45):
'what beautiful graceful forms and shapes. absolutely lovely!'
------------------
Appearance (0.44):
'I am so out of shape, lol.. I did it but it was hard! I love it... thank ypu'
------------------
Appearance (0.43):
'A M A Z I N G Allie, more like these please! This was everything and more.'
------------------
Appearance (0.42):
('you look so much better in 2022. Still very slim and much more toned but not '
 'way too thin.')
------------------
Appearance (0.42):
('I always missed you.You illustrated very lucidly & charming manners. You are '
 'very attractive & lovely.')
------------------
Appearance (0.41):
'Kya sach me breast shape me aa jayega'
------------------
Appearance (0.41):
('Oh I LOVE this!!! I’ve always been an all or nothing person and this is such '
 'a beautiful way of looking at it')
------------------
Appearance (0.40):
"This looks incre

## Documents by Keywords

In [467]:
keywords = {}
keywords['Enjoyment'] = ["fun", "happy", "interesting", "excitement"] # "stimulating"
keywords['Competence'] = ["challenge", "skill", "improve", "learn"]
keywords['Appearance'] = ["look", "attractive", "shape"]
keywords['Fitness'] = ["fit", "energy", "cardio", "strength", "health"]
keywords['Social'] = ["community", "friend", "people"] # "others"

In [489]:
key_scores = {}
for label, keys in keywords.items():
    print(label)
    docus, scores, doc_ids = model.search_documents_by_keywords(keys, keywords_neg = ['\n'], num_docs=20000, ef=400000)
    doc_ids = convert_id_s2l(doc_ids)
    key_scores[label] = {doc_id:score for doc_id, score in zip(doc_ids, scores)}
    for key in keys:
        print(f"{comments_df[comments_df.comment_text.str.contains(key)].shape[0]} of {key}")
    print('------------------')

Enjoyment


ValueError: '
' has not been learned by the model so it cannot be searched.

In [469]:
# propgate the scores to the comments
# comments_df = pd.DataFrame(data={'comment_text': comments.comment_text.tolist() + labeled.comment_text.tolist()})
# comments_df = comments_df[comments_df.comment_text.notnull()].copy()
# comments_df['include'] = length_include
# comments_df['processed'] = processed_docs

# fill the column for each label with the similarity scores using the doc ids
for label, scores in key_scores.items():
    print(label, len(scores))
    comments_df[label] = 0.0
    comments_df.loc[list(scores.keys()), label] = list(scores.values())
    
main_cols = ['Competence', 'Fitness', 'Appearance', 'Enjoyment', 'Social']

# get the max score for each comment
comments_df['max_score'] = comments_df[main_cols].max(axis=1)
# get the label for each comment
comments_df['label'] = comments_df[main_cols ].idxmax(axis=1)

pprint(comments_df[main_cols].describe())

Enjoyment 20000
Competence 20000
Appearance 20000
Fitness 20000
Social 20000
          Competence        Fitness     Appearance      Enjoyment  \
count  830479.000000  830479.000000  830479.000000  830479.000000   
mean        0.005563       0.006788       0.004899       0.007238   
std         0.035785       0.043540       0.031625       0.046490   
min         0.000000       0.000000       0.000000       0.000000   
25%         0.000000       0.000000       0.000000       0.000000   
50%         0.000000       0.000000       0.000000       0.000000   
75%         0.000000       0.000000       0.000000       0.000000   
max         0.533307       0.581013       0.530676       0.568918   

              Social  
count  830479.000000  
mean        0.004328  
std         0.028140  
min         0.000000  
25%         0.000000  
50%         0.000000  
75%         0.000000  
max         0.529961  


In [471]:
cut_off = 0.3
comments_df.loc[comments_df.max_score < cut_off, 'label'] = 'other'
pprint(comments_df[comments_df.max_score > cut_off].label.value_counts())

label
Enjoyment     7308
Fitness       4181
Competence     623
Appearance     231
Social         181
Name: count, dtype: int64


In [480]:
print_top_label('Social', top=10, print_text=False)

Social (0.52):
'ye,men,kr,skte,h,kya\n'
------------------
Social (0.51):
't,h,n,k,y,u\n'
------------------
Social (0.47):
't,h,n,k,y,o,u\n'
------------------
Social (0.47):
't,h,n,k,y,o,u\n'
------------------
Social (0.47):
't,h,n,k,y,o,u\n'
------------------
Social (0.47):
't,h,n,k,y,o,u\n'
------------------
Social (0.45):
'grateful,friend,people,life,kindness,share\n'
------------------
Social (0.44):
'day,nice,community,feel,like,people\n'
------------------
Social (0.44):
'nice,community,mean,lot,thank,guy\n'
------------------
Social (0.43):
'help,lot,person,know,stuff,good\n'
------------------


In [493]:
comments2[comments2.processed=='t,h,n,k,y,o,u\n']

Unnamed: 0,comment_text,include,processed,doc_id
299857,T H A N K Y O U,True,"t,h,n,k,y,o,u\n",163174.0
439128,t h a n k y o u !! 🌞,True,"t,h,n,k,y,o,u\n",227755.0
526973,T H A N K Y O U 🙏🏼🥰💜,True,"t,h,n,k,y,o,u\n",274038.0
788706,T h a n k Y o u 🥲,True,"t,h,n,k,y,o,u\n",394592.0


In [508]:
topic_nums, topic_scores, topic_words, word_scores = model.get_documents_topics(np.arange(414542))

In [520]:
dir(model)

590

In [523]:
docus, scores, doc_ids = model.search_documents_by_topic(590,num_docs=109)

In [499]:
topic_words, topic_scores, topic_nums = model.get_topics(591)

In [501]:
topic_words[590]

array(['frm', 'ie', 'hm', 'mmm', 'af', 'bs', 'bt', 'sm', 'ei', 'mm', 'nd',
       'hiii', 'wt', 'ytt', 'tk', 'ca', 'bta', 'fo', 'jk', 'ai', 'nt',
       'aw', 'yt', 'th', 'tks', 'tq', 'thnk', 'hv', 'ce', 'hmmm', 'fwfg',
       'rn', 'btao', 'sa', 'ye', 'ho', 'hmm', 'ngl', 'tnx', 'yr', 'bp',
       'fr', 'bcz', 'lo', 'aa', 'ta', 'ms', 'bht', 'hr', 'je'],
      dtype='<U15')

In [535]:
model.get_topic_sizes()

(array([12466,  9047,  8938,  7968,  5904,  5813,  5284,  5280,  5056,
         4831,  4729,  4486,  4303,  3621,  3571,  3340,  3039,  3001,
         2995,  2972,  2955,  2830,  2800,  2671,  2643,  2395,  2310,
         2300,  2294,  2260,  2255,  2237,  2235,  2208,  2142,  2028,
         2009,  2004,  1958,  1942,  1931,  1904,  1830,  1724,  1722,
         1700,  1691,  1668,  1596,  1579,  1565,  1558,  1546,  1510,
         1495,  1485,  1475,  1475,  1445,  1433,  1430,  1400,  1382,
         1324,  1320,  1309,  1289,  1282,  1281,  1280,  1228,  1227,
         1204,  1196,  1196,  1196,  1187,  1162,  1156,  1154,  1147,
         1142,  1138,  1129,  1113,  1091,  1078,  1076,  1072,  1049,
         1049,  1035,  1002,   999,   980,   980,   979,   975,   971,
          964,   958,   958,   957,   906,   902,   898,   896,   892,
          891,   861,   858,   854,   847,   844,   843,   842,   836,
          835,   818,   814,   805,   801,   782,   778,   775,   772,
      

In [538]:
topic_words, word_scores, topic_nums = model.get_topics()
topics_df = pd.DataFrame(data = topic_words, columns=[f"topic{i+1}" for i in range(50)])
topics_df.index = topic_nums
topics_df.to_csv(os.path.join(DATA_PATH, "topics.csv"))
# topic_words_str = [",".join(topic_word) for topic_word in topic_words]
# topics = pd.DataFrame(data={'topic_words':topic_words_str, 'topic_nums':topic_nums})
# topics.to_csv(os.path.join(DATA_PATH, "topics.csv"), index=False)

## Finding Similarities

In [319]:
pprint(labeled.columns)

Index(['channel_name', 'comment_text', 'habit', 'community', 'progress',
       'Fitness', 'Competence', 'Appearance', 'Enjoyment', 'Social',
       'predicted_label', 'label', 'included'],
      dtype='object')


In [324]:
label_idx = {}
# Competence
comp_idx = labeled[labeled.Competence == 1].index.values + comments.shape[0]
label_idx['competence'] = comp_idx.tolist()
# Fitness
fit_idx = labeled[labeled.Fitness == 1].index.values + comments.shape[0]
label_idx['fitness'] = fit_idx.tolist()
# Appearance
app_idx = labeled[labeled.Appearance == 1].index.values + comments.shape[0]
label_idx['appearance'] = app_idx.tolist()
# Enjoyment
enj_idx = labeled[labeled.Enjoyment == 1].index.values + comments.shape[0]
label_idx['enjoyment'] = enj_idx.tolist()
# Social
#soc_idx = labeled[(labeled.Social == 1)].index.values + comments.shape[0]
#label_idx['social'] = soc_idx.tolist()

## Extra categories
# Appreciation
# label_idx['thanks'] = [325, 1086, 9474, 4650, 7789, 1036, 358,
#                        223814, 316898, 610256, 301454, 212997, 356964, 223083, 294452, 59093, 344656,
#                        315086, 344725, 44930]
# More
# label_idx['more'] = [300363, 371792, 628093, 301235, 44240]
# # Great
# label_idx['great'] = [86589,
#                       102915, 347408, 580749, 103456, 314672, 44240]
# Journey
#label_idx['journey'] = [216508, 217844]

In [275]:
comments_df = pd.DataFrame(data={'comment_text': comments.comment_text.tolist() + labeled.comment_text.tolist()})
comments_df['include'] = length_include
comments_df['processed'] = processed_docs
print(comments_df.shape)

# get similarity scores for each comment
def get_similarity_scores(idx, model, num_docs=100000):
    docs, scores, doc_ids = model.search_documents_by_documents(convert_id_l2s(idx), num_docs=num_docs, ef=400000)
    return scores, convert_id_s2l(doc_ids)

# fill the column for each label with the similarity scores using the doc ids
def fill_column(idx, model, label):
    scores, doc_ids = get_similarity_scores(idx, model)
    comments_df[label] = 0.0
    comments_df.loc[doc_ids, label] = scores

# fill the columns for each label
for label, idx in label_idx.items():
    fill_column(idx, model, label)

# extra for social
soc_idx = (labeled[(labeled.Social == 1)].index.values + comments.shape[0]).tolist()
soc_neg = [2+comments.shape[0], 344659]#, 216508, 344659, 212698, 307772, 301844, 212609, 317443, 215525, 67716, 810299,
           #217844, 102735, 102912, 103336, 16240, 304445,322119,160198] 

docs, scores, doc_ids = model.search_documents_by_documents(doc_ids = convert_id_l2s(soc_idx), doc_ids_neg = convert_id_l2s(soc_neg), num_docs=100000)
comments_df['social'] = 0.0
comments_df.loc[convert_id_s2l(doc_ids), 'social'] = scores

main_cols = ['competence', 'fitness', 'appearance', 'enjoyment', 'social']
extra_cols = [] #['thanks', 'more', 'great', 'journey']

# get the max score for each comment
comments_df['max_score'] = comments_df[main_cols + extra_cols].max(axis=1)
# get the label for each comment
comments_df['label'] = comments_df[main_cols + extra_cols].idxmax(axis=1)

pprint(comments_df.sample(10))


(830479, 3)
                                             comment_text  include  \
321152  Why were my thighs shaking uncontrollably in a...     True   
829509  My word of the day is strong.  It‚Äôs my birth...     True   
24261   Is this flow save if you have 1 finger gap dia...     True   
173481  Thank u!  I needed some TLC!  Thank u!  🧘‍♀️🧘‍...     True   
681035  I am suffering from pcod and I want to loss 15...    False   
238205  I always find day 30 hard without Adriene's re...     True   
162908                     I needed this today; thank you    False   
414752  Thank you. That felt wonderful. Do so love the...     True   
377868        and the funny thing is they never worked...    False   
614101  Learn Yoga & Dance...\n\nhttps://youtube.com/s...    False   

                                                processed  competence  \
321152      thigh,shake,uncontrollably,svasana,pose,end\n    0.000000   
829509  word,day,strong,birthday,set,stage,trip,sun,na...    0.000000  

In [283]:
cut_off = 0.6
comments_df.loc[comments_df.max_score < cut_off, 'label'] = 'other'

In [282]:
pprint(comments_df[comments_df.max_score > cut_off].label.value_counts())

label
enjoyment     27684
competence    17924
social        14611
fitness        7771
appearance        1
Name: count, dtype: int64


In [284]:
for k,row in comments_df[comments_df.label=="social"].sort_values(by='social', ascending=False).iloc[:20].iterrows():
# print social score with two decimal places
    print(f"score:{row.social:.2f}, index:{k}, label index:{k-comments.shape[0]}")
    pprint(row.comment_text)
    # pprint(row.processed)
    # print(length_include[k])
    print('------------------')

score:0.79, index:829245, label index:1765
("It's May, and I started this in January, but I'm still so proud of myself "
 'for finishing and learning to love myself along the way! Thanks for the '
 'great yoga journey Adriene!')
------------------
score:0.78, index:235090, label index:-592390
("Thank you Adriene. I'm feeling so strong, and greatful after todays "
 'practice. Love you. 😌 Thank you yoga community. Have a great rest of the '
 'day.')
------------------
score:0.78, index:180458, label index:-647022
('Amazing. I love this flow! Wonderful practice today.  I am just so loving '
 'the daily yoga, I can feel myself looking forward to the continued journey '
 'after Jan. Thank you, Adriene!')
------------------
score:0.78, index:170043, label index:-657437
('Thank you so much Adriene for sharing this amazing times for whole 30 days '
 'again. I have started yoga with you in Jan 19. \n'
 'I love to find myself and how I feel here on the mat with you and everyone '
 'else joining 

In [22]:
pprint(soc_neg)

[221981, 131228, 62165, 224507, 191522, 207289, 224867, 86560, 73700]


In [285]:
comments2[(comments2.comment_text.str.contains("thank")) & (comments2.include)]

Unnamed: 0,comment_text,include,doc_id
16,Yesterday I was searching for a morning class ...,True,5.0
55,This is one of my favourites! I put it in the ...,True,25.0
73,thank you for always being so present in your ...,True,32.0
79,I was feeling so unmotivated to do my practice...,True,33.0
87,So many morning I started with this routine an...,True,38.0
...,...,...,...
830455,"When you said ""whisper to yourself I am strong...",True,425484.0
830462,This was my favourite of all the 30 Day videos...,True,425488.0
830464,Hi! I knw this is so random and unrelated but ...,True,425490.0
830469,this video really changed my body flow...\nMan...,True,425493.0


In [290]:
# compare the labels to the original labels
labeled['predicted_label'] = comments_df.loc[comments.shape[0]:, 'label'].tolist()
labeled['included'] = comments_df.loc[comments.shape[0]:, 'include'].tolist()

In [295]:
labeled['label'] = labeled.loc[:,['Fitness', 'Competence', 'Appearance', 'Enjoyment', 'Social']].idxmax(axis=1)
labeled.loc[labeled.loc[:,['Fitness', 'Competence', 'Appearance', 'Enjoyment', 'Social']].max(axis=1)<0.5,'label'] = 'other'

#pprint(labeled.label.value_counts())
#pprint(labeled.predicted_label.value_counts())
labeled_included = labeled[labeled.included]

for label in labeled_included.label.unique():
    print(label, labeled_included[labeled_included.label==label].shape[0])
    print(labeled_included[labeled_included.label==label].predicted_label.value_counts())
    print()

Competence 32
predicted_label
other        30
social        1
enjoyment     1
Name: count, dtype: int64

Enjoyment 18
predicted_label
other      15
fitness     3
Name: count, dtype: int64

other 1958
predicted_label
other         1615
enjoyment      131
competence     104
social          56
fitness         52
Name: count, dtype: int64

Appearance 3
predicted_label
other    3
Name: count, dtype: int64

Fitness 20
predicted_label
other         16
competence     4
Name: count, dtype: int64

Social 20
predicted_label
other        18
enjoyment     2
Name: count, dtype: int64



In [298]:
comments_df.loc[list(labeled_included[labeled_included.label=="Social"].index.values + comments.shape[0])]

Unnamed: 0,comment_text,include,processed,competence,fitness,appearance,enjoyment,social,max_score,label
827516,Practiced with more than 400 people today!! Lo...,True,"practice,people,today,lovely,vinyasa,expand,da...",0.0,0.0,0.0,0.0,0.0,0.0,other
827633,Hi Tim! I also will love to keep up with the c...,True,"hi,tim,love,challenge,quarantine,bring,sense,c...",0.0,0.0,0.0,0.0,0.0,0.0,other
827637,I'm glad I could complete this class in the fi...,True,"m,glad,complete,class,hour,upload,incredible,r...",0.584763,0.540039,0.0,0.0,0.0,0.584763,other
827671,I love this community an I'm so proud to be a ...,True,"love,community,m,proud,thank,adriene,help,find...",0.0,0.0,0.0,0.0,0.0,0.0,other
827678,love this! another long one for courage 2020.....,True,"love,long,courage,gather,body,happy,mind,rest,...",0.605427,0.637968,0.0,0.656868,0.0,0.656868,enjoyment
827748,Just hit the refresh button after the practice...,True,"hit,refresh,button,practice,number,view,go,gue...",0.0,0.0,0.0,0.0,0.0,0.0,other
827882,The comment section is such a safe space... ev...,True,"comment,section,safe,space,day,love,energy,lov...",0.0,0.0,0.0,0.0,0.0,0.0,other
827900,Yell if you're coming straight from HOME! Nama...,True,"yell,come,straight,home,namaste,friend,let,con...",0.0,0.0,0.0,0.0,0.0,0.0,other
827940,"‚ÄúTears are words that need to be shed‚Äù, Pa...",True,"äútear,word,need,paul,cohelho,tear,flow,eye,to...",0.0,0.5226,0.0,0.0,0.0,0.5226,other
827974,"""Your precious life, your precious body ..."" A...",True,"precious,life,precious,body,say,move,shift,fee...",0.0,0.0,0.0,0.0,0.0,0.0,other


In [299]:
docs, scores, doc_ids = model.search_documents_by_documents(doc_ids = convert_id_l2s([827671]), num_docs=10, ef=400000)
print(convert_id_s2l(doc_ids))

[515619, 223728, 263519, 169274, 268895, 244393, 280170, 163723, 170568, 166280]


In [304]:
model.query_documents(query = "i love this community i appreciate everyone", num_docs=10, return_documents=True, ef=400000)

(array(['nice community mean lot thank guy',
        'love thank congrat m member beautiful community almost ',
        'm forever grateful community love tomorrow',
        'grateful m keep go give pride love community',
        'extremely grateful community thank put fun day  days',
        'thank adriene thank community send love',
        'definitely love know hard time lot gratitude community share thank',
        'thank dear adriene m glad continual support namaste community',
        'love community help immensely want thank true',
        'thank adriene lot love beautiful community glad share space'],
       dtype=object),
 array([0.7862143 , 0.7063819 , 0.7062247 , 0.7010541 , 0.6820182 ,
        0.6601259 , 0.6581006 , 0.65275234, 0.65250397, 0.6490565 ],
       dtype=float32),
 array([160753, 124158, 147512, 155127,  54461, 125965, 280602,  82565,
        117194, 100663], dtype=int64))