# Labeling by Document Similarity
We take a different approach to labeling the comments: starting from a small set of manually labeled comments, we use document similarity to find unlabeled comments that resemble them and assign those comments the same label.

In [2]:
import os
import re
import sys
from pprint import pprint

import numpy as np
import pandas as pd
from top2vec import Top2Vec

  from .autonotebook import tqdm as notebook_tqdm


## Loading Data Sets

In [4]:

# Directory that holds every input file read below.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
DATA_PATH = "C:/Users/doosti/Dropbox (Chapman)/Research/Research Projects/Fitness/Data/"
processed_file = "processed_comments_102423.txt"
comments_file = "merged_comments.csv"
# File names get their own variables so that `labeled` / `sim_labels` only ever
# refer to DataFrames.
labeled_file = "comments_activity_motives.csv"
sim_labels_file = "comments_similarity_labels.csv"

# Comments whose processed token count is not greater than this are dropped.
comment_length = 10

labeled = pd.read_csv(os.path.join(DATA_PATH, labeled_file))
sim_labels = pd.read_csv(os.path.join(DATA_PATH, sim_labels_file))

# The processed file is read as one document per line.
with open(os.path.join(DATA_PATH, processed_file), "r", encoding="utf-8") as f:
    processed_docs = f.readlines()

# Length of a document = number of comma-separated pieces after removing digits.
# The pattern is a raw string: "\d" in a normal string literal is an invalid escape.
length = [len(re.sub(r"\d+", "", doc.strip()).split(',')) for doc in processed_docs]

comments = pd.read_csv(os.path.join(DATA_PATH, comments_file))
comments = comments[comments.comment_text.notnull()].copy()
# The two lists are attached positionally, which assumes the processed file has
# exactly one line per non-null comment, in the same order (pandas raises if the
# lengths differ; the ordering itself is not checked — TODO confirm).
comments['processed'] = processed_docs
comments['length'] = length
comments['include'] = comments['length'] > comment_length
comments = comments[comments.include].copy()

## Loading the Model

In [23]:
# Load the saved Top2Vec model from the E: drive.
# NOTE(review): absolute, machine-specific location.
model_name = "top2vec_lowercase_learn_doc2vec.model"
model_path = f"E:/{model_name}"
model = Top2Vec.load(model_path)

### Adding new comments

In [6]:
# Add the lower-cased labeled comments to the model as new documents.
# NOTE(review): re-running this cell presumably adds the same documents a second
# time — reload the model first if the cell has to be executed again.
labeled_texts = labeled["comment_text"].str.lower().tolist()
model.add_documents(labeled_texts)

## Finding Similarities

In [9]:
# List the columns of the labeled data; the capitalised motive columns
# (Fitness, Competence, ...) are the ones used to build the label index below.
# `pprint` is imported in the import cell at the top of the notebook.
pprint(labeled.columns)

Index(['channel_name', 'comment_text', 'habit', 'community', 'progress',
       'Fitness', 'Competence', 'Appearance', 'Enjoyment', 'Social'],
      dtype='object')


In [7]:
# Map each motive label to the model document ids of its labeled examples.
# The labeled comments were added to the model via `model.add_documents`, so the
# id of labeled row i is taken to be comments.shape[0] + i. This assumes the model
# held exactly the rows of `comments` before the labeled ones were added and that
# `labeled` has the default 0..n-1 index — TODO confirm.
LABEL_COLUMNS = {
    'competence': 'Competence',
    'fitness': 'Fitness',
    'appearance': 'Appearance',
    'enjoyment': 'Enjoyment',
    'social': 'Social',
}

doc_id_offset = comments.shape[0]

# One entry per label: the ids of all labeled comments flagged with that motive.
label_idx = {
    name: (labeled.index[labeled[column] == 1].values + doc_id_offset).tolist()
    for name, column in LABEL_COLUMNS.items()
}

In [10]:
# Stack the corpus comments and the labeled comments (in that order) into one
# frame with a fresh 0..n-1 index.
all_comment_texts = comments["comment_text"].tolist() + labeled["comment_text"].tolist()
comments2 = pd.DataFrame({"comment_text": all_comment_texts})
pprint(comments2.shape)

(224978, 1)


In [16]:
def get_similarity_scores(idx, model, num_docs=100000):
    """Search the model for the documents most similar to the documents in ``idx``.

    Parameters
    ----------
    idx : document ids handed to ``model.search_documents_by_documents``.
    model : object exposing ``search_documents_by_documents``.
    num_docs : number of documents requested from the search.

    Returns
    -------
    tuple
        ``(scores, doc_ids)`` as produced by the search.
    """
    search_result = model.search_documents_by_documents(idx, num_docs=num_docs)
    # The search yields three values; the first (the documents) is not needed.
    _documents, similarity, matched_ids = search_result
    return similarity, matched_ids

def fill_column(idx, model, label):
    """Write the similarity-to-``idx`` scores into column ``label`` of ``comments2``.

    Parameters
    ----------
    idx : document ids of the seed comments for this label.
    model : model forwarded to ``get_similarity_scores``.
    label : name of the column to create (or overwrite) in the global ``comments2``.

    Rows that the search does not return keep a score of 0.0.
    NOTE(review): the search presumably does not return the seed documents
    themselves, in which case they also stay at 0.0 — TODO confirm.
    """
    scores, doc_ids = get_similarity_scores(idx, model)
    # Initialise as float: an integer 0 creates an int column, and writing float
    # scores into it makes pandas warn about an incompatible dtype.
    comments2[label] = 0.0
    comments2.loc[doc_ids, label] = scores

# One similarity column per label, scored against that label's seed documents.
for label_name, seed_ids in label_idx.items():
    fill_column(seed_ids, model, label_name)

# Second pass for "social": search again with hard-coded negative examples.
# Only the 10,000 documents returned here are overwritten; every other row keeps
# the social score written by the loop above.
n_corpus = comments.shape[0]
soc_idx = (labeled[labeled.Social == 1].index.values + n_corpus).tolist()
soc_neg = [2 + n_corpus, 131228, 62165, 224507, 191522, 207289, 224867, 86560, 73700]
docs, scores, doc_ids = model.search_documents_by_documents(
    doc_ids=soc_idx,
    doc_ids_neg=soc_neg,
    num_docs=10000,
)
comments2.loc[doc_ids, 'social'] = scores

# Best score per comment, and the label column that produced it.
score_columns = ['competence', 'fitness', 'appearance', 'enjoyment', 'social']
label_scores = comments2[score_columns]
comments2['max_score'] = label_scores.max(axis=1)
comments2['label'] = label_scores.idxmax(axis=1)

pprint(comments2.sample(10))


  comments2.loc[doc_ids, label] = scores
  comments2.loc[doc_ids, label] = scores
  comments2.loc[doc_ids, label] = scores
  comments2.loc[doc_ids, label] = scores
  comments2.loc[doc_ids, label] = scores


                                             comment_text  competence  \
49322   Love and light - 12 days - focus and attention...    0.524602   
71316   This program has helped me in such amazing way...    0.576085   
210658  Day 2 done! It was simple yet on point for me....    0.516641   
30646   Wow Michelle you're back 😁so wonderful, I miss...    0.539986   
156731  I do yoga every morning because of you 😘 Your ...    0.574177   
48907   Does anyone have any tips for doing crow witho...    0.000000   
34434   This flow felt like a warm cosy blanket. The m...    0.591400   
48513   Felt good today❤️. The runner's stretch was th...    0.555443   
204926  Great video, thank you for sharing! You have a...    0.000000   
114704  Great timing because I dont have that energy f...    0.644630   

         fitness  appearance  enjoyment    social  max_score       label  
49322   0.518579    0.000000   0.510547  0.501550   0.524602  competence  
71316   0.581533    0.365782   0.502825  0.557

In [17]:
# Comments whose best similarity score is below 0.6 are relabeled 'other'.
below_threshold = comments2['max_score'] < 0.6
comments2.loc[below_threshold, 'label'] = 'other'

In [18]:
# Number of comments assigned to each label after thresholding.
label_counts = comments2['label'].value_counts()
pprint(label_counts)

label
other         184614
competence     16797
social         10643
fitness         7950
enjoyment       4969
appearance         5
Name: count, dtype: int64


In [19]:
# Inspect the 20 comments with the highest social score. "label index" is the
# row position inside `labeled`; it is negative for comments from the corpus.
top_social = comments2.sort_values(by='social', ascending=False).iloc[:20]
for k, row in top_social.iterrows():
    # social score is printed with two decimal places
    print(f"score:{row.social:.2f}, index:{k}, label index:{k-comments.shape[0]}")
    pprint(row.comment_text)
    print('------------------')

score:0.80, index:223442, label index:1463
("I've been doing my own thing for a while and it was wonderful to come back "
 'to Yoga with Adriene.  This was exactly the practice I needed at this '
 'moment.  Thank you to everyone in the community.  Thank you to Adriene.  '
 'Namaste.')
------------------
score:0.77, index:91537, label index:-130442
('Getting to practice the one a little late today, I could not be any more '
 'grateful and humbled by these lovely acts of love in the form of daily yoga '
 'practice so thank u Adriene for these amazing session and mind opening '
 'experiences much love to u and to everyone practicing Namaste and good night '
 '💕💓💗💖💞❤🙏💓')
------------------
score:0.77, index:62165, label index:-159814
('Thank you @Yoga With Adriene for another beautiful yoga practise. ❤🙏🏽.  You '
 'are so kind and generous with making these videos for us all. i love each '
 "one, you've helped me ground and reset myself today, after a tricky morning. "
 'XX')
--------------

In [22]:
# Print the document ids that were passed as `doc_ids_neg` (negative examples)
# to the social similarity search.
pprint(soc_neg)

[221981, 131228, 62165, 224507, 191522, 207289, 224867, 86560, 73700]


In [38]:
# Attach the similarity-based label to each labeled comment. The labeled comments
# are the rows of comments2 from position comments.shape[0] onward.
first_labeled_row = comments.shape[0]
predicted_for_labeled = comments2.loc[first_labeled_row:, 'label']
labeled['predicted_label'] = predicted_for_labeled.tolist()

In [44]:
# Collapse the five motive flags into a single ground-truth label: the column
# holding the row maximum (the first such column on ties), or 'other' when no
# flag reaches 0.5.
motive_columns = ['Fitness', 'Competence', 'Appearance', 'Enjoyment', 'Social']
motive_flags = labeled.loc[:, motive_columns]
labeled['label'] = motive_flags.idxmax(axis=1)
labeled.loc[motive_flags.max(axis=1) < 0.5, 'label'] = 'other'

# For every ground-truth label, show how its comments were labeled by similarity.
# Ground-truth labels are capitalised ('Competence'); predicted ones are lowercase.
for true_label in labeled.label.unique():
    rows_with_label = labeled[labeled.label == true_label]
    print(true_label, rows_with_label.shape[0])
    print(rows_with_label.predicted_label.value_counts())
    print()

label
other         2908
Competence      35
Fitness         23
Enjoyment       20
Social          10
Appearance       3
Name: count, dtype: int64
predicted_label
other         2481
competence     202
enjoyment      160
social          88
fitness         68
Name: count, dtype: int64
Competence 35
predicted_label
other         30
enjoyment      4
competence     1
Name: count, dtype: int64

Enjoyment 20
predicted_label
other         16
fitness        2
competence     1
enjoyment      1
Name: count, dtype: int64

other 2908
predicted_label
other         2405
competence     196
enjoyment      155
social          87
fitness         65
Name: count, dtype: int64

Appearance 3
predicted_label
other    3
Name: count, dtype: int64

Fitness 23
predicted_label
other         19
competence     3
fitness        1
Name: count, dtype: int64

Social 10
predicted_label
other         8
competence    1
social        1
Name: count, dtype: int64



Unnamed: 0,channel_name,comment_text,habit,community,progress,Fitness,Competence,Appearance,Enjoyment,Social,predicted_label,label
0,Candace Cabrera,Wow that was way too advanced for me. Need to...,0,0,0,0,1,0,0,0,other,Competence
1,KinoYoga,love this video - it gives me so much motivati...,0,0,0,0,0,0,1,0,other,Enjoyment
2,Yoga With Adriene,This was so great. Thank you Adriene ‚ú®üíï‚ú...,0,0,0,0,0,0,0,0,other,other
3,Boho Beautiful Yoga,"The video is perfect too... Stil camera, sligh...",0,0,0,0,0,0,0,0,other,other
4,Yoga With Adriene,Love having Benji in the videos :) My dogs are...,0,0,0,0,0,0,0,0,other,other
