# Labeling by Document Similarity
We take a different approach to label the comments. In this approach, we use labeled comments and find similar comments to label them.

In [2]:
import os
import pandas as pd
import numpy as np
from top2vec import Top2Vec
import sys
import re

  from .autonotebook import tqdm as notebook_tqdm


## Loading Data Sets

In [3]:

DATA_PATH = "C:/Users/doosti/Dropbox (Chapman)/Research/Research Projects/Fitness/Data/"
processed_file="processed_comments_102423.txt"
comments_file="merged_comments.csv"
labeled = "comments_activity_motives.csv"
sim_labels = "comments_similarity_labels.csv"

labeled = pd.read_csv(os.path.join(DATA_PATH, labeled))
sim_labels = pd.read_csv(os.path.join(DATA_PATH, sim_labels))
comment_length=10
with open(os.path.join(DATA_PATH,processed_file),"r", encoding="utf-8") as f:
    processed_docs = f.readlines()
length = [len(re.sub("\d+", "", x.strip()).split(',')) for x in processed_docs]
comments = pd.read_csv(os.path.join(DATA_PATH, "merged_comments.csv"))
comments = comments[comments.comment_text.notnull()].copy()

# comments['processed'] = processed_docs
# comments['length'] = length
# comments['include'] = comments.length > comment_length
# comments = comments[comments.include].copy()

print(f"the total number of comments is {comments.shape[0]+labeled.shape[0]}")

(830479, 1)


## Loading the Model

In [4]:
# model_name = "top2vec_lowercase_learn_doc2vec.model"
model_name = "top2vec_pluslabeled_learn_doc2vec_tfgpu.model"
model = Top2Vec.load(f"E:/{model_name}")

In [5]:
model.document_ids

array([     0,      1,      2, ..., 830476, 830477, 830478], dtype=int64)

## Finding Similarities

In [6]:
from pprint import pprint
pprint(labeled.columns)

Index(['channel_name', 'comment_text', 'habit', 'community', 'progress',
       'Fitness', 'Competence', 'Appearance', 'Enjoyment', 'Social'],
      dtype='object')


In [63]:
label_idx = {}
# Competence
comp_idx = labeled[labeled.Competence == 1].index.values + comments.shape[0]
label_idx['competence'] = comp_idx.tolist()
# Fitness
fit_idx = labeled[labeled.Fitness == 1].index.values + comments.shape[0]
label_idx['fitness'] = fit_idx.tolist()
# Appearance
app_idx = labeled[labeled.Appearance == 1].index.values + comments.shape[0]
label_idx['appearance'] = app_idx.tolist()
# Enjoyment
enj_idx = labeled[labeled.Enjoyment == 1].index.values + comments.shape[0]
label_idx['enjoyment'] = enj_idx.tolist()
# Social
#soc_idx = labeled[(labeled.Social == 1)].index.values + comments.shape[0]
#label_idx['social'] = soc_idx.tolist()
# Appreciation
label_idx['thanks'] = [223814, 316898, 610256, 301454, 212997, 356964, 223083, 294452, 59093]
# More
label_idx['more'] = [300363, 371792, 628093, 301235]
# Great
label_idx['great'] = [347408, 580749]


# comp_idx = labeled.iloc[:200,:][labeled.Competence == 1].index.values + comments.shape[0]
# label_idx['competence'] = comp_idx.tolist()
# # Fitness
# fit_idx = labeled.iloc[:200,:][labeled.Fitness == 1].index.values + comments.shape[0]
# label_idx['fitness'] = fit_idx.tolist()
# # Appearance
# app_idx = labeled.iloc[:200,:][labeled.Appearance == 1].index.values + comments.shape[0]
# label_idx['appearance'] = app_idx.tolist()
# # Enjoyment
# enj_idx = labeled.iloc[:200,:][labeled.Enjoyment == 1].index.values + comments.shape[0]
# label_idx['enjoyment'] = enj_idx.tolist()
# # Social
# soc_idx = labeled.iloc[:200,:][(labeled.Social == 1)].index.values + comments.shape[0]
# label_idx['social'] = soc_idx.tolist()

In [84]:
comments2 = pd.DataFrame(data={'comment_text': comments.comment_text.tolist() + labeled.comment_text.tolist()})
print(comments2.shape)

# get similarity scores for each comment
def get_similarity_scores(idx, model, num_docs=10000):
    docs, scores, doc_ids = model.search_documents_by_documents(idx, num_docs=num_docs, ef=800000)
    return scores, doc_ids

# fill the column for each label with the similarity scores using the doc ids
def fill_column(idx, model, label):
    scores, doc_ids = get_similarity_scores(idx, model)
    comments2[label] = 0.0
    comments2.loc[doc_ids, label] = scores

# fill the columns for each label
for label, idx in label_idx.items():
    fill_column(idx, model, label)

# extra for social
soc_idx = (labeled[(labeled.Social == 1)].index.values + comments.shape[0]).tolist()
soc_neg = [2+comments.shape[0], 216508, 344659, 212698, 307772, 301844, 212609, 317443, 215525, 67716, 810299,
           217844, 102735, 102912, 103336, 16240, 304445,322119,160198] #(labeled.iloc[:350,:][(labeled.Social == 0)].index.values + comments.shape[0]).tolist()
docs, scores, doc_ids = model.search_documents_by_documents(doc_ids = soc_idx, doc_ids_neg = soc_neg, num_docs=10000)
comments2['social'] = 0.0
comments2.loc[doc_ids, 'social'] = scores

main_cols = ['competence', 'fitness', 'appearance', 'enjoyment', 'social']
extra_cols = ['thanks', 'more', 'great']

# get the max score for each comment
comments2['max_score'] = comments2[main_cols + extra_cols].max(axis=1)
# get the label for each comment
comments2['label'] = comments2[main_cols + extra_cols].idxmax(axis=1)

pprint(comments2.sample(10))


(830479, 1)
                                             comment_text  competence  \
29038   I’m a couple years into menopause but I’m goin...    0.000000   
537274  My intention was everything is perfect in this...    0.000000   
653707  Happy Independence day sir\nSuperb workout I l...    0.000000   
360488       Thanks-very grateful for your yoga approach.    0.000000   
601100  It is also my rising sign. Thank you for the c...    0.000000   
497239                   Thanks Tim another great session    0.472505   
707501  Day 1 of starting Pilates and I didn’t realize...    0.000000   
389276                                  That was great! 😊    0.000000   
182200  Day 18!! I do feel like AI 🤖 with all the ting...    0.000000   
294059  The best pilates class for me, as it allows mo...    0.000000   

         fitness  appearance  enjoyment  social  max_score       label  
29038   0.000000         0.0   0.000000     0.0   0.000000  competence  
537274  0.000000         0.0   0.00000

In [68]:
comments2.loc[comments2.max_score < 0.35, 'label'] = 'other'

In [78]:
pprint(comments2.label.value_counts())

label
competence    800643
social          9998
enjoyment       7884
appearance      6230
fitness         5724
Name: count, dtype: int64


In [85]:
for k,row in comments2.sort_values(by='social', ascending=False).iloc[:20].iterrows():
# print social score with two decimal places
    print(f"score:{row.social:.2f}, index:{k}, label index:{k-comments.shape[0]}")
    pprint(row.comment_text)
    print('------------------')

score:0.39, index:610256, label index:-217224
'Great exercise! Thanks'
------------------
score:0.37, index:390843, label index:-436637
'I have patella dislocation,can I do this yoga?'
------------------
score:0.36, index:300368, label index:-527112
'More like this, please!'
------------------
score:0.35, index:371792, label index:-455688
'More like this please!'
------------------
score:0.34, index:271309, label index:-556171
'Great flow class if you don’t have a lot of time.'
------------------
score:0.34, index:676157, label index:-151323
'Can  the gents do this exercise.....?'
------------------
score:0.34, index:347408, label index:-480072
'Great start to the day ❤🙏'
------------------
score:0.34, index:301454, label index:-526026
'Thank you so much! I feel great! ❤❤❤'
------------------
score:0.34, index:537639, label index:-289841
'More video on this, please 😍😍🙏🙏'
------------------
score:0.34, index:665769, label index:-161711
'Thank you for this exercise 🙏'
------------------


In [22]:
pprint(soc_neg)

[221981, 131228, 62165, 224507, 191522, 207289, 224867, 86560, 73700]


In [38]:
# compare the labels to the original labels
labeled['predicted_label'] = comments2.loc[comments.shape[0]:, 'label'].tolist()

In [44]:
labeled['label'] = labeled.loc[:,['Fitness', 'Competence', 'Appearance', 'Enjoyment', 'Social']].idxmax(axis=1)
labeled.loc[labeled.loc[:,['Fitness', 'Competence', 'Appearance', 'Enjoyment', 'Social']].max(axis=1)<0.5,'label'] = 'other'

#pprint(labeled.label.value_counts())
#pprint(labeled.predicted_label.value_counts())

for label in labeled.label.unique():
    print(label, labeled[labeled.label==label].shape[0])
    print(labeled[labeled.label==label].predicted_label.value_counts())
    print()

label
other         2908
Competence      35
Fitness         23
Enjoyment       20
Social          10
Appearance       3
Name: count, dtype: int64
predicted_label
other         2481
competence     202
enjoyment      160
social          88
fitness         68
Name: count, dtype: int64
Competence 35
predicted_label
other         30
enjoyment      4
competence     1
Name: count, dtype: int64

Enjoyment 20
predicted_label
other         16
fitness        2
competence     1
enjoyment      1
Name: count, dtype: int64

other 2908
predicted_label
other         2405
competence     196
enjoyment      155
social          87
fitness         65
Name: count, dtype: int64

Appearance 3
predicted_label
other    3
Name: count, dtype: int64

Fitness 23
predicted_label
other         19
competence     3
fitness        1
Name: count, dtype: int64

Social 10
predicted_label
other         8
competence    1
social        1
Name: count, dtype: int64



Unnamed: 0,channel_name,comment_text,habit,community,progress,Fitness,Competence,Appearance,Enjoyment,Social,predicted_label,label
0,Candace Cabrera,Wow that was way too advanced for me. Need to...,0,0,0,0,1,0,0,0,other,Competence
1,KinoYoga,love this video - it gives me so much motivati...,0,0,0,0,0,0,1,0,other,Enjoyment
2,Yoga With Adriene,This was so great. Thank you Adriene ‚ú®üíï‚ú...,0,0,0,0,0,0,0,0,other,other
3,Boho Beautiful Yoga,"The video is perfect too... Stil camera, sligh...",0,0,0,0,0,0,0,0,other,other
4,Yoga With Adriene,Love having Benji in the videos :) My dogs are...,0,0,0,0,0,0,0,0,other,other
