In [None]:
import os, pathlib, sys
from fnmatch import fnmatch
import re
import datetime as dt
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import seaborn as sns
import json
import torch
import tensorflow as tf
import keras
import random
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util

---
# Calculate Content Similarity based on the Average Submission for each Subreddit
---

In [None]:
# Sentence-embedding model used for every submission text.
# "all-mpnet-base-v2" is a general-purpose model trained on a large, diverse
# dataset of more than 1 billion training pairs (max sequence length: 384).
model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2")

In [None]:
# Load the preprocessed submissions and add a placeholder `word_embedding`
# column (filled with NaN until the embeddings are computed).
# Alternative input: pd.read_csv("../../submissions_preprocessed.csv", index_col=0)
save_path = "submissions_embeddings.json"
df = pd.read_json("../data/submissions_preprocessed.json").assign(word_embedding=np.nan)
df

In [None]:
def calc_embedding(text):
    """Encode ``text`` with the notebook-level sentence-transformer ``model``.

    Args:
        text: The input handed unchanged to ``model.encode``.

    Returns:
        Whatever ``model.encode`` returns when called with
        ``convert_to_tensor=False`` (set the flag to ``True`` to request a
        tensor instead).
    """
    embedding = model.encode(text, convert_to_tensor=False)
    return embedding

# Compute one embedding per submission text and persist the frame as JSON.
save_path = "submissions_embeddings.json"

# Encode all texts in a single call instead of `Series.apply`: `model.encode`
# accepts a list of sentences and batches them internally, whereas `apply`
# triggers one separate forward pass per row.
embeddings = calc_embedding(df['selftext'].tolist())

# One embedding vector per row, in the same order as `df`.
df['word_embedding'] = list(embeddings)

df.to_json(save_path)

In [None]:
# Read the frame back from "submissions_embeddings.json" so the following
# cells work on the persisted embeddings, then show it.
df = pd.read_json(path_or_buf="submissions_embeddings.json")
df

# Content Similarity

In [None]:
# Pairwise content similarity between subreddits.
#
# Every subreddit is represented by the element-wise mean of its submission
# embeddings (its "average submission"). The resulting matrix holds the cosine
# similarity between each pair of those mean vectors.
#
# Fix: the previous version averaged over dim 1 (the embedding dimension),
# which produced one scalar per submission, and then zero-padded those
# per-submission scalars so subreddits of different sizes could be compared.
# Averaging over dim 0 (the submissions) yields one fixed-length vector per
# subreddit, so no padding is needed and each mean is computed only once.
#
# df = pd.read_json("submissions_embeddings.json")
unique_subreddits = list(df.subreddit.unique())
save_path = '../data/content_cosine_similarity_matrix.json'
save_path_csv = '../data/content_cosine_similarity_matrix.csv'

mean_embeddings = []
for subreddit in unique_subreddits:
    # .loc selection avoids writing into a filtered copy of `df`.
    subreddit_embeddings = df.loc[df['subreddit'] == subreddit, 'word_embedding'].tolist()
    # Assumes every embedding of a subreddit has the same length, so the
    # stacked array is (n_submissions, embedding_dim).
    stacked = torch.from_numpy(np.asarray(subreddit_embeddings, dtype=np.float32))
    # Mean over the submissions -> one vector of length embedding_dim.
    mean_embeddings.append(torch.mean(stacked, 0))

if mean_embeddings:
    mean_embeddings = torch.stack(mean_embeddings)
    # cos_sim on the stacked means returns the full pairwise score matrix.
    cosine_scores = util.cos_sim(mean_embeddings, mean_embeddings)
    # Nested list of Python floats: one row of similarities per subreddit.
    list_of_all_similarities = cosine_scores.tolist()
else:
    # No subreddits in `df`: keep an empty result instead of failing in stack().
    list_of_all_similarities = []

cosine_matrix = pd.DataFrame(list_of_all_similarities, columns=unique_subreddits)
cosine_matrix.index = unique_subreddits
# cosine_matrix.to_json(save_path)
cosine_matrix.to_csv(save_path_csv)
print("finished")
display(cosine_matrix)

# CSV to JSON

In [None]:
# Re-export the content similarity matrix from its CSV file as JSON.
# Note: this rebinds `df` to the similarity matrix read from the CSV.
save_path_json = '../data/content_cosine_similarity_matrix.json'
save_path_csv = '../data/content_cosine_similarity_matrix.csv'
df = pd.read_csv(save_path_csv, index_col=0)
df.to_json(path_or_buf=save_path_json)