In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity



# Read the posts and comments CSV files

In [2]:
# Which subreddit's scrape to analyse and the listing order it was scraped in;
# both values are interpolated into the raw CSV file names read further down.
subreddit, scrape_order = "computerscience", "hot"

In [3]:
# Build the path first so the file being read is easy to see and reuse.
posts_csv = f"../data/raw/{subreddit}_{scrape_order}_posts.csv"
df_posts = pd.read_csv(posts_csv)

# Preview the first rows of the posts table.
df_posts.head()

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created
0,spnpvh,"Books about CS (Non-Fiction, Fiction, Biograph...",5,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1,Looking for books related to computer science ...,1644546000.0
1,sp7b3q,Prim's Algorithm produces a Spanning Tree,28,0.93,computerscience,https://www.reddit.com/r/computerscience/comme...,2,I have been studing Algorithms using Stanford ...,1644502000.0
2,spay93,Where to learn Data Structures & Algorithms fu...,11,0.87,computerscience,https://www.reddit.com/r/computerscience/comme...,3,I want to get into coding interviews but first...,1644512000.0
3,spnvd8,Spent a few hours today making this Single Pla...,1,1.0,computerscience,/r/cprogramming/comments/spntf0/spent_a_few_ho...,0,,1644547000.0
4,spml7t,ideas for valentines day coding related gifts?,1,0.67,computerscience,https://www.reddit.com/r/computerscience/comme...,4,anyone have any idea for a cute gift i can mak...,1644543000.0


In [4]:
# Build the path first so the file being read is easy to see and reuse.
comments_csv = f"../data/raw/{subreddit}_{scrape_order}_comments.csv"
df_comments = pd.read_csv(comments_csv)

# Preview the first rows of the comments table.
df_comments.head()

Unnamed: 0,post_id,comment_id,parent_id,comment,up_vote_count,down_vote_count,controversiality,total_awards_received,score,is_locked,is_collapsed,is_submitter,created_utc
0,spnpvh,hwgdcg1,t3_spnpvh,I recently went through The Innovators. It giv...,1,0,0,0,1,False,False,False,1644547000.0
1,sp7b3q,hwdgju6,t3_sp7b3q,two important info to remember:\n\n1. prim's a...,9,0,0,0,9,False,False,False,1644505000.0
2,sp7b3q,hweysv2,t3_sp7b3q,Finally I got it what he is trying to argue. ...,1,0,0,0,1,False,False,True,1644525000.0
3,spay93,hwe4uwe,t3_spay93,There are excellent courses available on Cours...,3,0,0,0,3,False,False,False,1644514000.0
4,spay93,hwfrlx8,t3_spay93,"if you want introductory books, A Common Sense...",2,0,0,0,2,False,False,False,1644538000.0


In [5]:
# Left join on post_id: every post is kept, and a post with several comments
# is repeated once per comment. Columns present in both tables (e.g. `score`)
# receive pandas' default _x / _y suffixes.
df = pd.merge(df_posts, df_comments, how='left', on='post_id')
df.head()

Unnamed: 0,post_id,title,score_x,upvote_ratio,subreddit,url,num_comments,body,created,comment_id,...,comment,up_vote_count,down_vote_count,controversiality,total_awards_received,score_y,is_locked,is_collapsed,is_submitter,created_utc
0,spnpvh,"Books about CS (Non-Fiction, Fiction, Biograph...",5,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1,Looking for books related to computer science ...,1644546000.0,hwgdcg1,...,I recently went through The Innovators. It giv...,1.0,0.0,0.0,0.0,1.0,False,False,False,1644547000.0
1,sp7b3q,Prim's Algorithm produces a Spanning Tree,28,0.93,computerscience,https://www.reddit.com/r/computerscience/comme...,2,I have been studing Algorithms using Stanford ...,1644502000.0,hwdgju6,...,two important info to remember:\n\n1. prim's a...,9.0,0.0,0.0,0.0,9.0,False,False,False,1644505000.0
2,sp7b3q,Prim's Algorithm produces a Spanning Tree,28,0.93,computerscience,https://www.reddit.com/r/computerscience/comme...,2,I have been studing Algorithms using Stanford ...,1644502000.0,hweysv2,...,Finally I got it what he is trying to argue. ...,1.0,0.0,0.0,0.0,1.0,False,False,True,1644525000.0
3,spay93,Where to learn Data Structures & Algorithms fu...,11,0.87,computerscience,https://www.reddit.com/r/computerscience/comme...,3,I want to get into coding interviews but first...,1644512000.0,hwe4uwe,...,There are excellent courses available on Cours...,3.0,0.0,0.0,0.0,3.0,False,False,False,1644514000.0
4,spay93,Where to learn Data Structures & Algorithms fu...,11,0.87,computerscience,https://www.reddit.com/r/computerscience/comme...,3,I want to get into coding interviews but first...,1644512000.0,hwfrlx8,...,"if you want introductory books, A Common Sense...",2.0,0.0,0.0,0.0,2.0,False,False,False,1644538000.0


In [6]:
# After the left merge, a post that matched no comment is left with a missing
# `comment` value, so counting missing comments counts those posts.
n_without_comments = df['comment'].isna().sum()
print("There are {} posts with no comments".format(n_without_comments))
#print("There are {} posts with no content after filtering".format(len(df[df['comment'].str.len() == 0])))

There are 45 posts with no comments


# Load the BERT sentence-embedding model

In [7]:
# Name of the pretrained sentence-embedding model to load.
model_name = 'bert-base-nli-mean-tokens'
model = SentenceTransformer(model_name)

# For each post, compute the similarity between its title and each of its comments

In [8]:
# Score every comment against the title of the post it belongs to.
#
# Scores are stored by row label, so `similarities` ends up with exactly one
# value per row of `df`, in `df`'s own row order. Rows without comment text keep
# the placeholder score 0. Assumes `df` has a unique index, which holds for a
# fresh merge result.
similarity_by_row = pd.Series(0.0, index=df.index)

# sort=False keeps the posts in their order of first appearance in `df`.
for post_id, post_rows in df.groupby('post_id', sort=False):
    # Check every row, not only the first one: a missing comment cannot be
    # encoded, and it must still occupy exactly one slot in the result.
    has_comment = post_rows['comment'].notna().to_numpy()
    if not has_comment.any():
        continue

    # Only the title is used as the topic; `body` is missing for some posts.
    topic = post_rows['title'].iloc[0]
    comments = post_rows.loc[has_comment, 'comment'].to_list()

    # One encode call per post; the topic goes last so it is easy to split off.
    sentence_embeddings = model.encode(comments + [topic])
    similarity = cosine_similarity(sentence_embeddings[-1:], sentence_embeddings[:-1])
    similarity_by_row.loc[post_rows.index[has_comment]] = similarity.ravel()

# Plain array, one score per row of `df`, for the cells that follow.
similarities = similarity_by_row.to_numpy()

In [9]:
# Quick look at the scores; rows that had no comment to compare were given 0.
similarities

array([0.49409187, 0.44498879, 0.33763006, ..., 0.63238215, 0.71785426,
       0.        ])

In [10]:
# Attach the scores as a new column; this requires one score per row of `df`.
df = df.assign(similarity=similarities)

In [11]:
# Inspect the merged frame, which now carries the `similarity` column.
df

Unnamed: 0,post_id,title,score_x,upvote_ratio,subreddit,url,num_comments,body,created,comment_id,...,up_vote_count,down_vote_count,controversiality,total_awards_received,score_y,is_locked,is_collapsed,is_submitter,created_utc,similarity
0,spnpvh,"Books about CS (Non-Fiction, Fiction, Biograph...",5,1.00,computerscience,https://www.reddit.com/r/computerscience/comme...,1,Looking for books related to computer science ...,1.644546e+09,hwgdcg1,...,1.0,0.0,0.0,0.0,1.0,False,False,False,1.644547e+09,0.494092
1,sp7b3q,Prim's Algorithm produces a Spanning Tree,28,0.93,computerscience,https://www.reddit.com/r/computerscience/comme...,2,I have been studing Algorithms using Stanford ...,1.644502e+09,hwdgju6,...,9.0,0.0,0.0,0.0,9.0,False,False,False,1.644505e+09,0.444989
2,sp7b3q,Prim's Algorithm produces a Spanning Tree,28,0.93,computerscience,https://www.reddit.com/r/computerscience/comme...,2,I have been studing Algorithms using Stanford ...,1.644502e+09,hweysv2,...,1.0,0.0,0.0,0.0,1.0,False,False,True,1.644525e+09,0.337630
3,spay93,Where to learn Data Structures & Algorithms fu...,11,0.87,computerscience,https://www.reddit.com/r/computerscience/comme...,3,I want to get into coding interviews but first...,1.644512e+09,hwe4uwe,...,3.0,0.0,0.0,0.0,3.0,False,False,False,1.644514e+09,0.359265
4,spay93,Where to learn Data Structures & Algorithms fu...,11,0.87,computerscience,https://www.reddit.com/r/computerscience/comme...,3,I want to get into coding interviews but first...,1.644512e+09,hwfrlx8,...,2.0,0.0,0.0,0.0,2.0,False,False,False,1.644538e+09,0.625972
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5878,qakzly,Wrong Algorithm ? I found this algorithm on ge...,5,0.73,computerscience,/r/AskComputerScience/comments/qakxks/wrong_al...,5,,1.634557e+09,hh3kgd8,...,0.0,0.0,0.0,0.0,0.0,False,False,False,1.634558e+09,0.746216
5879,qakzly,Wrong Algorithm ? I found this algorithm on ge...,5,0.73,computerscience,/r/AskComputerScience/comments/qakxks/wrong_al...,5,,1.634557e+09,hh3rr6r,...,0.0,0.0,0.0,0.0,0.0,False,False,False,1.634563e+09,0.556039
5880,qakzly,Wrong Algorithm ? I found this algorithm on ge...,5,0.73,computerscience,/r/AskComputerScience/comments/qakxks/wrong_al...,5,,1.634557e+09,hh3l65o,...,0.0,0.0,0.0,0.0,0.0,False,False,True,1.634559e+09,0.632382
5881,qakzly,Wrong Algorithm ? I found this algorithm on ge...,5,0.73,computerscience,/r/AskComputerScience/comments/qakxks/wrong_al...,5,,1.634557e+09,hh3vtqc,...,2.0,0.0,0.0,0.0,2.0,False,False,False,1.634565e+09,0.717854


In [13]:
# Persist the scored table; the row index is not written to the file.
output_csv = "../data/results/relevance_output.csv"
df.to_csv(output_csv, index=False)

Notes: this notebook takes a long time to run. A suggestion from another paper is to use faiss.  
Also, perhaps compare the comments with something other than just the title (e.g. the title plus the post body).

# Sample code below to generate embeddings and cosine similarity

In [6]:
# Embed every post title in a single encode call.
post_titles = df_posts['title'].tolist()
sentence_embeddings = model.encode(post_titles)

In [7]:
# Shape check: one embedding row per encoded title.
sentence_embeddings.shape

(495, 768)

In [8]:
# Raw embedding values for the post titles.
sentence_embeddings

array([[ 0.30967617,  0.1629094 ,  1.3436381 , ..., -0.3773275 ,
        -0.74607503,  0.27187392],
       [ 0.6153924 ,  0.7614516 ,  1.5298136 , ...,  0.42983565,
        -0.63670594,  0.19039328],
       [-0.4176919 ,  0.29487124,  0.78496426, ..., -0.6611189 ,
        -0.96480453,  0.14001265],
       ...,
       [-0.5931168 ,  0.20732588, -0.35109794, ..., -0.72609115,
        -0.33478796,  0.01831246],
       [-0.20924684,  0.60646063,  0.4073564 , ...,  0.45895877,
         0.56324315, -0.05968758],
       [ 0.14121047,  0.04698378,  0.8630955 , ..., -0.47502777,
        -1.184771  ,  0.33819634]], dtype=float32)

In [None]:
# Compare the first title's embedding with those of all the remaining titles.
first_title = sentence_embeddings[:1]   # slicing keeps the single row 2-D
other_titles = sentence_embeddings[1:]
cosine_similarity(first_title, other_titles)