# Labeling by Document Similarity
We take a different approach to label the comments. In this approach, we use labeled comments and find similar comments to label them.

In [2]:
import os
import pandas as pd
import numpy as np
from top2vec import Top2Vec
import sys
import re

  from .autonotebook import tqdm as notebook_tqdm


## Loading Data Sets

In [130]:

DATA_PATH = "C:/Users/doosti/Dropbox (Chapman)/Research/Research Projects/Fitness/Data/"
processed_file="processed_comments_122923.txt"
comments_file="merged_comments.csv"
#labeled = "comments_activity_motives.csv"
#sim_labels = "comments_similarity_labels.csv"
labeled = "comments_similarity_labels.csv"

labeled = pd.read_csv(os.path.join(DATA_PATH, labeled))
#sim_labels = pd.read_csv(os.path.join(DATA_PATH, sim_labels))
comment_length=10
with open(os.path.join(DATA_PATH,processed_file),"r", encoding="utf-8") as f:
    processed_docs = f.readlines()
length_include = [len(re.sub("\d+", "", x.strip()).split(','))>5 for x in processed_docs]

comments = pd.read_csv(os.path.join(DATA_PATH, "merged_comments.csv"))
comments = comments[comments.comment_text.notnull()].copy()

print(f"the total number of comments is {comments.shape[0]+labeled.shape[0]}")

comments2 = pd.DataFrame(data={'comment_text':comments.comment_text.tolist()+labeled.comment_text.tolist()})
comments2 = comments2[length_include].copy()
print(f"the total number of comments processed is {comments2.shape[0]}")

the total number of comments is 830479


## Loading the Model

In [4]:
# model_name = "top2vec_lowercase_learn_doc2vec.model"
model_name = "top2vec_lowercase_newpreprocessed_deep-learn_universal-sentence-encoder.model"
model = Top2Vec.load(f"E:/{model_name}")

In [131]:
model.document_ids

array([     0,      1,      2, ..., 830476, 830477, 830478], dtype=int64)

## Finding Similarities

In [6]:
from pprint import pprint
pprint(labeled.columns)

Index(['channel_name', 'comment_text', 'habit', 'community', 'progress',
       'Fitness', 'Competence', 'Appearance', 'Enjoyment', 'Social'],
      dtype='object')


In [162]:
label_idx = {}
# Competence
comp_idx = labeled[labeled.Competence == 1].index.values + comments.shape[0]
label_idx['competence'] = comp_idx.tolist()
# Fitness
fit_idx = labeled[labeled.Fitness == 1].index.values + comments.shape[0]
label_idx['fitness'] = fit_idx.tolist()
# Appearance
app_idx = labeled[labeled.Appearance == 1].index.values + comments.shape[0]
label_idx['appearance'] = app_idx.tolist()
# Enjoyment
enj_idx = labeled[labeled.Enjoyment == 1].index.values + comments.shape[0]
label_idx['enjoyment'] = enj_idx.tolist()
# Social
#soc_idx = labeled[(labeled.Social == 1)].index.values + comments.shape[0]
#label_idx['social'] = soc_idx.tolist()
# Appreciation
label_idx['thanks'] = [325, 1086, 9474, 4650, 7789, 1036, 358]
                       #223814, 316898, 610256, 301454, 212997, 356964, 223083, 294452, 59093, 344656,
                       #315086, 344725, 44930]
# More
label_idx['more'] = [300363, 371792, 628093, 301235, 44240]
# Great
label_idx['great'] = [86589,
                      102915, 347408, 580749, 103456, 314672, 44240]
# Journey
label_idx['journey'] = [216508, 217844]


# comp_idx = labeled.iloc[:200,:][labeled.Competence == 1].index.values + comments.shape[0]
# label_idx['competence'] = comp_idx.tolist()
# # Fitness
# fit_idx = labeled.iloc[:200,:][labeled.Fitness == 1].index.values + comments.shape[0]
# label_idx['fitness'] = fit_idx.tolist()
# # Appearance
# app_idx = labeled.iloc[:200,:][labeled.Appearance == 1].index.values + comments.shape[0]
# label_idx['appearance'] = app_idx.tolist()
# # Enjoyment
# enj_idx = labeled.iloc[:200,:][labeled.Enjoyment == 1].index.values + comments.shape[0]
# label_idx['enjoyment'] = enj_idx.tolist()
# # Social
# soc_idx = labeled.iloc[:200,:][(labeled.Social == 1)].index.values + comments.shape[0]
# label_idx['social'] = soc_idx.tolist()

In [169]:
comments2 = pd.DataFrame(data={'comment_text': comments.comment_text.tolist() + labeled.comment_text.tolist()})
print(comments2.shape)

# get similarity scores for each comment
def get_similarity_scores(idx, model, num_docs=200000):
    docs, scores, doc_ids = model.search_documents_by_documents(idx, num_docs=num_docs, ef=800000)
    return scores, doc_ids

# fill the column for each label with the similarity scores using the doc ids
def fill_column(idx, model, label):
    scores, doc_ids = get_similarity_scores(idx, model)
    comments2[label] = 0.0
    comments2.loc[doc_ids, label] = scores

# fill the columns for each label
for label, idx in label_idx.items():
    print(f"filling {label} column")
    print(f"the number of comments for {label} is {len(idx)}")
    print(label_idx[label])
    fill_column(idx, model, label)

# extra for social
soc_idx = (labeled[(labeled.Social == 1)].index.values + comments.shape[0]).tolist()
soc_neg = [2+comments.shape[0], 344659]#, 216508, 344659, 212698, 307772, 301844, 212609, 317443, 215525, 67716, 810299,
          # 217844, 102735, 102912, 103336, 16240, 304445,322119,160198] #(labeled.iloc[:350,:][(labeled.Social == 0)].index.values + comments.shape[0]).tolist()
docs, scores, doc_ids = model.search_documents_by_documents(doc_ids = soc_idx, doc_ids_neg = soc_neg, num_docs=100000)
comments2['social'] = 0.0
comments2.loc[doc_ids, 'social'] = scores

main_cols = ['competence', 'fitness', 'appearance', 'enjoyment', 'social']
extra_cols = ['thanks', 'more', 'great', 'journey']

# get the max score for each comment
comments2['max_score'] = comments2[main_cols + extra_cols].max(axis=1)
# get the label for each comment
comments2['label'] = comments2[main_cols + extra_cols].idxmax(axis=1)

pprint(comments2.sample(10))


(830479, 1)
filling competence column
the number of comments for competence is 37
[827480, 827485, 827488, 827493, 827499, 827511, 827515, 827523, 827524, 827528, 827535, 827542, 827546, 827550, 827557, 827577, 827578, 827582, 827583, 827598, 827601, 827610, 827626, 827628, 827635, 827642, 827652, 827675, 827691, 827697, 827708, 827712, 827725, 827735, 827738, 827773, 827795]
filling fitness column
the number of comments for fitness is 23
[827489, 827491, 827502, 827517, 827520, 827524, 827531, 827558, 827570, 827572, 827583, 827606, 827618, 827619, 827656, 827658, 827693, 827699, 827704, 827710, 827719, 827744, 827762]
filling appearance column
the number of comments for appearance is 3
[827487, 827533, 827555]
filling enjoyment column
the number of comments for enjoyment is 20
[827481, 827490, 827494, 827509, 827521, 827529, 827536, 827549, 827571, 827585, 827594, 827605, 827627, 827654, 827705, 827714, 827747, 827784, 827789, 827799]
filling thanks column
the number of comments for 

In [122]:
comments2.loc[comments2.max_score < 0.35, 'label'] = 'other'

In [165]:
pprint(comments2.label.value_counts())

label
competence    277630
thanks        104221
enjoyment      89794
more           82545
fitness        66472
great          57349
journey        56346
appearance     48719
social         47403
Name: count, dtype: int64


In [166]:
for k,row in comments2[comments2.label=="social"].sort_values(by='social', ascending=False).iloc[:20].iterrows():
# print social score with two decimal places
    print(f"score:{row.social:.2f}, index:{k}, label index:{k-comments.shape[0]}")
    pprint(row.comment_text)
    print('------------------')

score:0.58, index:201402, label index:-626078
('Did anyone else not see the livestream today? I missed seeing all the '
 'messages from Yogis around the world!')
------------------
score:0.56, index:315086, label index:-512394
'Namaste Travis,  thank you .'
------------------
score:0.56, index:99851, label index:-727629
'Thank you so much for this Arianna!  Happy New Year!!!'
------------------
score:0.56, index:94345, label index:-733135
'Just what I needed. Thank you Arianna!'
------------------
score:0.56, index:214708, label index:-612772
'Namaste everyone!! HERE WE GO!! ♥️'
------------------
score:0.55, index:277668, label index:-549812
"Wow, it's really amazing! I practiced with 1,800 people ❤️ Namaste everyone 🙏"
------------------
score:0.55, index:245053, label index:-582427
'😀😊Finish the messages Finish the last session day 28'
------------------
score:0.55, index:101140, label index:-726340
'Gratitude Arianna ✊🏾'
------------------
score:0.55, index:307832, label index:-519

In [129]:
comments2.iloc[102915]

comment_text    This was amazing thank you Ali xx
competence                                 0.4713
fitness                                  0.414288
appearance                               0.233232
enjoyment                                0.461796
thanks                                        0.0
more                                          0.0
great                                         0.0
journey                                  0.468692
social                                    0.48728
max_score                                 0.48728
label                                      social
Name: 102915, dtype: object

In [170]:
comments2.iloc[315086]

comment_text    Namaste Travis,  thank you .
competence                          0.538545
fitness                             0.415576
appearance                          0.277644
enjoyment                           0.467641
thanks                                   0.0
more                                     0.0
great                               0.411917
journey                             0.513758
social                              0.564466
max_score                           0.564466
label                                 social
Name: 315086, dtype: object

In [22]:
pprint(soc_neg)

[221981, 131228, 62165, 224507, 191522, 207289, 224867, 86560, 73700]


In [168]:
comments2[comments2.comment_text=="Namaste"]

Unnamed: 0,comment_text,competence,fitness,appearance,enjoyment,thanks,more,great,journey,social,max_score,label
358,Namaste,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,competence
5771,Namaste,0.0,0.0,0.000000,0.0,0.000000,0.165998,0.000000,0.000000,0.0,0.165998,more
6969,Namaste,0.0,0.0,0.000000,0.0,0.218031,0.000000,0.000000,0.000000,0.0,0.218031,thanks
15056,Namaste,0.0,0.0,0.000000,0.0,0.263799,0.000000,0.000000,0.000000,0.0,0.263799,thanks
15996,Namaste,0.0,0.0,0.000000,0.0,0.253855,0.000000,0.000000,0.205305,0.0,0.253855,thanks
...,...,...,...,...,...,...,...,...,...,...,...,...
806262,Namaste,0.0,0.0,0.000000,0.0,0.226488,0.000000,0.192195,0.000000,0.0,0.226488,thanks
824737,Namaste,0.0,0.0,0.165969,0.0,0.093947,0.000000,0.000000,0.000000,0.0,0.165969,appearance
824948,Namaste,0.0,0.0,0.000000,0.0,0.306658,0.000000,0.000000,0.000000,0.0,0.306658,thanks
825580,Namaste,0.0,0.0,0.000000,0.0,0.325408,0.000000,0.000000,0.000000,0.0,0.325408,thanks


In [38]:
# compare the labels to the original labels
labeled['predicted_label'] = comments2.loc[comments.shape[0]:, 'label'].tolist()

In [44]:
labeled['label'] = labeled.loc[:,['Fitness', 'Competence', 'Appearance', 'Enjoyment', 'Social']].idxmax(axis=1)
labeled.loc[labeled.loc[:,['Fitness', 'Competence', 'Appearance', 'Enjoyment', 'Social']].max(axis=1)<0.5,'label'] = 'other'

#pprint(labeled.label.value_counts())
#pprint(labeled.predicted_label.value_counts())

for label in labeled.label.unique():
    print(label, labeled[labeled.label==label].shape[0])
    print(labeled[labeled.label==label].predicted_label.value_counts())
    print()

label
other         2908
Competence      35
Fitness         23
Enjoyment       20
Social          10
Appearance       3
Name: count, dtype: int64
predicted_label
other         2481
competence     202
enjoyment      160
social          88
fitness         68
Name: count, dtype: int64
Competence 35
predicted_label
other         30
enjoyment      4
competence     1
Name: count, dtype: int64

Enjoyment 20
predicted_label
other         16
fitness        2
competence     1
enjoyment      1
Name: count, dtype: int64

other 2908
predicted_label
other         2405
competence     196
enjoyment      155
social          87
fitness         65
Name: count, dtype: int64

Appearance 3
predicted_label
other    3
Name: count, dtype: int64

Fitness 23
predicted_label
other         19
competence     3
fitness        1
Name: count, dtype: int64

Social 10
predicted_label
other         8
competence    1
social        1
Name: count, dtype: int64



Unnamed: 0,channel_name,comment_text,habit,community,progress,Fitness,Competence,Appearance,Enjoyment,Social,predicted_label,label
0,Candace Cabrera,Wow that was way too advanced for me. Need to...,0,0,0,0,1,0,0,0,other,Competence
1,KinoYoga,love this video - it gives me so much motivati...,0,0,0,0,0,0,1,0,other,Enjoyment
2,Yoga With Adriene,This was so great. Thank you Adriene ‚ú®üíï‚ú...,0,0,0,0,0,0,0,0,other,other
3,Boho Beautiful Yoga,"The video is perfect too... Stil camera, sligh...",0,0,0,0,0,0,0,0,other,other
4,Yoga With Adriene,Love having Benji in the videos :) My dogs are...,0,0,0,0,0,0,0,0,other,other
