In [1]:
import numpy as np
import pandas as pd
import gensim
import ast
from sklearn.metrics.pairwise import cosine_similarity
from IPython.display import display

#Performs Cosine Similarity processing on Word Embeddings model
def process_data():
    """Build an averaged word2vec vector per pattern and save pairwise cosine similarities.

    Reads ``patterns_word2vec.model`` and ``patterns_pos.csv`` from the current
    working directory and writes two files there:

    * ``patterns_with_vectors.csv`` - the pattern table with the added
      ``tokenized_text`` column and a comma-joined ``vector_representation``
      column.
    * ``patterns_cosim_embedding.csv`` - the square pattern-by-pattern cosine
      similarity matrix, with rows and columns labelled by the pattern
      table's index.

    Returns nothing; the results are only persisted to disk.
    """
    #Load word2vec model
    word2vec_model = gensim.models.Word2Vec.load("patterns_word2vec.model")

    #Load preprocessed pattern data
    df_patterns = pd.read_csv("patterns_pos.csv")

    #Function to compute the vector representation of a pattern
    def get_average_word_vector(text_tokens, model, vector_size=None):
        #Take the dimensionality from the model itself so the zero-vector
        #fallback always has the same length as real word vectors; a
        #mismatch would make np.vstack below fail.
        if vector_size is None:
            vector_size = model.wv.vector_size
        vectors = [model.wv[word] for word in text_tokens if word in model.wv]
        if not vectors:
            return np.zeros(vector_size) #Returns 0 vector if no words are found
        return np.mean(vectors, axis=0) #Returns average of word vectors

    #Convert filtered keywords from patterns_pos.csv to a list
    #(each cell is parsed as a Python literal and later unpacked as (word, tag) pairs)
    df_patterns["pos_tags"] = df_patterns["pos_tags"].apply(ast.literal_eval)

    #Extract words only from list for use in vector representation
    df_patterns["tokenized_text"] = df_patterns["pos_tags"].apply(
        lambda tagged: [word for word, tag in tagged]
    )

    #Compute representation of all patterns
    df_patterns["vector_representation"] = df_patterns["tokenized_text"].apply(
        lambda tokens: get_average_word_vector(tokens, word2vec_model)
    )

    #Convert list of vectors into a 2D NumPy array (one row per pattern)
    pattern_vectors = np.vstack(df_patterns["vector_representation"].values)

    #Compute cosine similarity matrix
    cosine_sim_matrix_embeddings = cosine_similarity(pattern_vectors)

    #Convert similarity matrix to DF
    cosine_sim_df_embeddings = pd.DataFrame(
        cosine_sim_matrix_embeddings,
        index=df_patterns.index,
        columns=df_patterns.index,
    )

    #Convert list of vectors to strings and save
    #(done after the matrix is built because this overwrites the numeric vectors)
    df_patterns["vector_representation"] = df_patterns["vector_representation"].apply(
        lambda vec: ','.join(map(str, vec))
    )
    df_patterns.to_csv("patterns_with_vectors.csv", index=False)

    #Save the matrix
    cosine_sim_df_embeddings.to_csv("patterns_cosim_embedding.csv")
    
#Returns top_n (default 5) similar patterns compared to pattern at given index
def get_similar_patterns_embeddings(pattern_index, top_n=5, cosine_sim_df=None, patterns_df=None):
    """Return the ``top_n`` patterns most similar to the pattern at ``pattern_index``.

    Args:
        pattern_index: Positional (0-based) row of the pattern to compare
            against; negative values count from the end.
        top_n: Maximum number of similar patterns to return.
        cosine_sim_df: Square pattern-by-pattern similarity DataFrame. When
            omitted it is loaded from ``patterns_cosim_embedding.csv``, the
            file written by ``process_data``.
        patterns_df: Pattern table with ``name`` and ``url`` columns, in the
            same row order as ``cosine_sim_df``. When omitted it is loaded
            from ``patterns_with_vectors.csv``, also written by
            ``process_data``.

    Returns:
        A DataFrame with columns ``Pattern Name``, ``URL`` and
        ``Cosine Similarity Score``, ordered from most to least similar.

    Raises:
        IndexError: If ``pattern_index`` is outside the similarity matrix.
    """
    #The frames built inside process_data are local to it, so fall back to
    #the CSV files it saves instead of relying on non-existent globals.
    if cosine_sim_df is None:
        cosine_sim_df = pd.read_csv("patterns_cosim_embedding.csv", index_col=0)
    if patterns_df is None:
        patterns_df = pd.read_csv("patterns_with_vectors.csv")

    #Normalise negative positions and reject out-of-range ones
    self_position = range(len(cosine_sim_df))[pattern_index]

    #Get similarity scores for given pattern, keyed by row position.
    #Positions are used rather than labels because a matrix reloaded from CSV
    #has string column labels that do not match the pattern table's index.
    similarity_scores = pd.Series(cosine_sim_df.iloc[self_position].to_numpy())

    #Sort patterns by similarity (excluding self). Self is dropped explicitly:
    #it is not guaranteed to sort first when another pattern ties with it.
    similar_patterns = (
        similarity_scores.drop(self_position)
        .sort_values(ascending=False, kind="mergesort")
        .head(top_n)
    )

    #Formatting results for readability
    matches = patterns_df.iloc[similar_patterns.index.to_numpy()]
    results = pd.DataFrame({
        "Pattern Name": matches["name"].values,
        "URL": matches["url"].values,
        "Cosine Similarity Score": similar_patterns.values,
    })
    return results

Vector representations saved to patterns_with_vectors.csv
Cosine similarity matrix using word embeddings created and saved.
