In [2]:
import os, pathlib, sys
from fnmatch import fnmatch
import re
import datetime as dt
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import seaborn as sns
import json
import torch
import tensorflow as tf
import keras
import random
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm


---
# Calculate Content Similarity based on the Average Submission for each Subreddit
---

In [None]:
# All-round model tuned for many use-cases. Trained on a large and diverse dataset of over 1 billion training pairs.
# Max Sequence Length: 	384
model = SentenceTransformer("all-mpnet-base-v2")

In [None]:
# df = pd.read_csv("../../submissions_preprocessed.csv", index_col=0)
df = pd.read_json("../data/submissions_preprocessed.json")
save_path = "submissions_embeddings.json"
df['word_embedding'] = np.nan
df

In [None]:
# def calc_embedding(row):
#     row.word_embedding = model.encode(row.selftext)

def calc_embedding(text):
    # convert_to_tensor False/True
    return model.encode(text, convert_to_tensor=False)
    # return model.encode(text, convert_to_tensor=True)

save_path = "submissions_embeddings.json"
# df.apply(lambda x: calc_embedding(x), axis=1)
df['word_embedding'] = df['selftext'].apply(calc_embedding) 

df.to_json(save_path)

In [None]:
df = pd.read_json("submissions_embeddings.json")
df

# Content Similarity

In [None]:
# df = pd.read_json("submissions_embeddings.json")
unique_subreddits = list(df.subreddit.unique())
columns = unique_subreddits
content_similarity_matrix = pd.DataFrame(columns=columns)
save_path = '../data/content_cosine_similarity_matrix.json'
save_path_csv = '../data/content_cosine_similarity_matrix.csv'


list_of_all_similarities = [] 

for subreddit in unique_subreddits:
    similarities = []
    for subreddit_to_compare in unique_subreddits:
        print('\n---\nNow comparing: ', subreddit)
        print('with: ' + subreddit_to_compare)
        # ---------------------------------------------------------------------------------
        # Special Apply Method to extract the string
        #
        print('Checkpoint: apply')
        source_df = df[df['subreddit'] == subreddit]
        target_df = df[df['subreddit'] == subreddit_to_compare]

        # source_df['avg_embedding'] = source_df.word_embedding.apply(lambda x: np.fromstring(x[1:-1], sep=' '))
        # target_df['avg_embedding'] = target_df.word_embedding.apply(lambda x: np.fromstring(x[1:-1], sep=' '))
        source_df['avg_embedding'] = source_df.word_embedding
        target_df['avg_embedding'] = target_df.word_embedding
        # ---------------------------------------------------------------------------------
        # String to Numpy Array
        #
        print('Checkpoint: to numpy')
        source_temp = source_df['avg_embedding'].to_numpy()
        target_temp = target_df['avg_embedding'].to_numpy()
        # ---------------------------------------------------------------------------------
        # Numpy Array to List
        #
        print('Checkpoint: to list')
        source_temp = source_temp.tolist()
        target_temp = target_temp.tolist()
        # ---------------------------------------------------------------------------------
        # Numpy Array to Tensor
        #
        print('Checkpoint: to tensor')
        source_avg_embeddings = torch.Tensor(source_temp)
        target_avg_embeddings = torch.Tensor(target_temp)
        source_avg_embeddings = torch.mean(source_avg_embeddings, 1)
        target_avg_embeddings = torch.mean(target_avg_embeddings, 1)
        
        # print('Checkpoint: to tensor')
        # sub_tensors = torch.from_numpy(sub_temp)
        # overlap_tensors = torch.from_numpy(overlap_temp)
        
        # Normalize
        # print('Checkpoint: normalize')
        # sub_tensors = torch.nn.functional.normalize(subreddit_avg_embeddings)
        # overlap_tensors = torch.nn.functional.normalize(overlap_avg_embeddings)
        
        print('Checkpoint: to numpy')
        a1 = source_avg_embeddings.numpy()
        a2 = target_avg_embeddings.numpy()
        
        print(a1.shape)
        # print(a2)
        print(a2.shape)
        
        # Convert to numpy array
        if(len(a1) > len(a2)):
            diff = len(a1) - len(a2)
            print("Case 1")
            a2 = np.pad(a2, (0, len(a1) - len(a2)), 'constant')
            # a2 = np.concatenate([a2, np.zeros(len(a1) - len(a2))])
            # zeros = np.zeros((diff, 0))
            # a2 = np.concatenate((a2[0], zeros))
        else:
            diff = len(a2) - len(a1)
            print("Case 2")
            a1 = np.pad(a1, (0, len(a2) - len(a1)), 'constant')
            # a1 = np.concatenate([a1, np.zeros(len(a2) - len(a1))])
            # zeros = np.zeros((diff, 0))
            # a1 = np.concatenate((a1[0], zeros))
        
        print(a1.shape)
        print(a2.shape)
        
        # Back to Tensor
        mean_embedding1 = torch.from_numpy(a1)
        mean_embedding2 = torch.from_numpy(a2)
        
        # Compute cosine-similarities
        cosine_scores = util.cos_sim(mean_embedding1, mean_embedding2)
        # cosine_scores = cosine_similarity(mean_embedding1, mean_embedding2)
        print("COSINE SCORE of the Average Embeddings:")
        print(cosine_scores[0].item())
        similarities.append(cosine_scores[0].item())
        print(" Cosine Similarities\n", similarities)
    list_of_all_similarities.append(similarities)
cosine_matrix = pd.DataFrame(list_of_all_similarities, columns=unique_subreddits)
cosine_matrix.index = unique_subreddits
# cosine_matrix.to_json(save_path)
cosine_matrix.to_csv(save_path_csv)
print("finished")
display(cosine_matrix)

# CSV to JSON

In [None]:
save_path_csv = '../data/content_cosine_similarity_matrix.csv'
save_path_json = '../data/content_cosine_similarity_matrix.json'
df = pd.read_csv(save_path_csv, index_col=0)
df.to_json(save_path_json)

# JSON for Heatmap

In [3]:
similarity_matrix = pd.read_json('../data/content_cosine_similarity_matrix.json')
similarity_matrix

Unnamed: 0,addiction,SMARTRecovery,AtheistTwelveSteppers,secularsobriety,recovery,sobrietyandrecovery,women_in_recovery,ScienceAndKindness,easyway,AdultChildren,...,ROCD,rapecounseling,needadvice,getting_over_it,hardshipmates,MMFB,bulimia,BodyDysmorphia,BodyAcceptance,SelfHate
addiction,1.000000,0.109961,0.150073,0.073335,0.312572,0.115200,0.122039,0.036841,0.017861,0.441631,...,0.589114,0.601336,0.634794,0.471169,0.126937,0.656254,0.537360,0.498807,0.217375,0.241533
SMARTRecovery,0.109961,1.000000,0.569514,0.450607,0.233723,0.633408,0.588695,0.261926,0.101785,0.199240,...,0.144809,0.151862,0.073797,0.188843,0.545894,0.118292,0.117769,0.105331,0.223165,0.315349
AtheistTwelveSteppers,0.150073,0.569514,1.000000,0.346264,0.317173,0.649873,0.720326,0.262693,0.111950,0.274066,...,0.191768,0.208770,0.104094,0.248435,0.713687,0.164147,0.165128,0.149804,0.277193,0.453248
secularsobriety,0.073335,0.450607,0.346264,1.000000,0.147473,0.403951,0.384283,0.538914,0.192432,0.100130,...,0.086036,0.083359,0.038153,0.114701,0.329988,0.065977,0.060927,0.051710,0.114750,0.172656
recovery,0.312572,0.233723,0.317173,0.147473,1.000000,0.277364,0.292747,0.106095,0.046926,0.557396,...,0.354034,0.419492,0.260511,0.523044,0.295769,0.330976,0.358785,0.297432,0.533160,0.580490
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MMFB,0.656254,0.118292,0.164147,0.065977,0.330976,0.124240,0.133552,0.047700,0.021493,0.452434,...,0.632999,0.643847,0.531397,0.481232,0.138166,1.000000,0.567763,0.535459,0.218481,0.250131
bulimia,0.537360,0.117769,0.165128,0.060927,0.358785,0.136451,0.140933,0.033217,0.005219,0.494175,...,0.588854,0.656792,0.435233,0.511510,0.148735,0.567763,1.000000,0.510617,0.250662,0.267869
BodyDysmorphia,0.498807,0.105331,0.149804,0.051710,0.297432,0.104157,0.124004,0.039238,0.011548,0.413059,...,0.561851,0.578554,0.403045,0.439411,0.123202,0.535459,0.510617,1.000000,0.215872,0.246336
BodyAcceptance,0.217375,0.223165,0.277193,0.114750,0.533160,0.217469,0.268473,0.104157,0.071137,0.389009,...,0.250338,0.295218,0.188068,0.368471,0.254574,0.218481,0.250662,0.215872,1.000000,0.457419


In [6]:
# Structure for the JSON file.
data = []

# Get the list of Subreddits (for nodes).
# Index/Rows are the source subreddits.
list_subreddits = similarity_matrix.index.to_list()

for target_subreddit in list_subreddits:
    # get the X overlaps + 1 to remove the overlap with itself.
    overlaps = similarity_matrix[target_subreddit]
    for source_subreddit, value in overlaps.items():
        # print("|from| ", source_subreddit, " | to | ", target_subreddit, " | with ", value)
        # Drop eigen-overlap
        data.append({"row": source_subreddit, "col": target_subreddit, "value": round(value, 4)})

# # # Specify the file path where you want to save the JSON file
file_path = "../data/ContentSimilarityHeatmapData.json"

# # Pandas to JSON workaround
df = pd.DataFrame([data])
df.to_json(file_path, orient='records', lines=True)

In [7]:
# Specify the file path of the JSON file
file_path = "../data/ContentSimilarityHeatmapData.json"

# Read the JSON file
with open(file_path, "r") as json_file:
    data = json.load(json_file)

# Now 'data' contains the contents of the JSON file
print(data)

{'0': {'row': 'addiction', 'col': 'addiction', 'value': 1.0}, '1': {'row': 'SMARTRecovery', 'col': 'addiction', 'value': 0.11}, '2': {'row': 'AtheistTwelveSteppers', 'col': 'addiction', 'value': 0.1501}, '3': {'row': 'secularsobriety', 'col': 'addiction', 'value': 0.0733}, '4': {'row': 'recovery', 'col': 'addiction', 'value': 0.3126}, '5': {'row': 'sobrietyandrecovery', 'col': 'addiction', 'value': 0.1152}, '6': {'row': 'women_in_recovery', 'col': 'addiction', 'value': 0.122}, '7': {'row': 'ScienceAndKindness', 'col': 'addiction', 'value': 0.0368}, '8': {'row': 'easyway', 'col': 'addiction', 'value': 0.0179}, '9': {'row': 'AdultChildren', 'col': 'addiction', 'value': 0.4416}, '10': {'row': 'AlAnon', 'col': 'addiction', 'value': 0.6883}, '11': {'row': 'naranon', 'col': 'addiction', 'value': 0.2051}, '12': {'row': 'BipolarSOs', 'col': 'addiction', 'value': 0.5637}, '13': {'row': 'loveafterporn', 'col': 'addiction', 'value': 0.3802}, '14': {'row': 'cripplingalcoholism', 'col': 'addiction'