In [1]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
from IPython.display import display

def _min_max_scale_global(matrix):
    """Rescale every entry of *matrix* to [0, 1] using one global min and max.

    A single min/max pair is used for the whole matrix (rather than one pair
    per column) so that a symmetric similarity matrix stays symmetric and the
    relative order of the scores inside any row or column is preserved.

    NaN entries are ignored when computing the range and stay NaN in the
    result. A matrix whose entries are all equal is mapped to zeros.
    """
    import numpy as np  # local import: the file itself does not import numpy

    values = np.asarray(matrix, dtype=float)
    if values.size == 0:
        return values.copy()

    lowest = np.nanmin(values)
    span = np.nanmax(values) - lowest
    if not span > 0:
        # Constant (or all-NaN) matrix: avoid dividing by zero.
        return values - lowest
    return (values - lowest) / span


def process_data(alpha: float = 0.5) -> None:
    """Build the hybrid similarity matrix and save it to disk.

    Reads the TF-IDF and embedding cosine-similarity matrices from
    ``patterns_cosim_tfidf.csv`` and ``patterns_cosim_embedding.csv`` (first
    CSV column used as the index), min-max scales each matrix to [0, 1], blends
    them as ``alpha * tfidf + (1 - alpha) * embeddings`` and writes the result
    to ``patterns_cosim_hybrid.csv``.

    Args:
        alpha: Weight given to the TF-IDF matrix; the embedding matrix gets
            ``1 - alpha``. Defaults to 0.5 (equal weight).
            TODO: confirm weight accuracy and adjust if needed.

    Raises:
        ValueError: If ``alpha`` is not between 0 and 1.
    """
    if not 0.0 <= alpha <= 1.0:
        raise ValueError(f"alpha must be between 0 and 1, got {alpha!r}")

    # Load similarity matrices
    cosine_sim_tfidf = pd.read_csv("patterns_cosim_tfidf.csv", index_col=0)
    cosine_sim_embeddings = pd.read_csv("patterns_cosim_embedding.csv", index_col=0)

    # Normalize each matrix with a single global min/max. Scaling column by
    # column (what MinMaxScaler does) would put every column on its own scale,
    # which breaks the symmetry of the matrix and distorts row-wise rankings.
    cosine_sim_tfidf_scaled = pd.DataFrame(
        _min_max_scale_global(cosine_sim_tfidf.values),
        index=cosine_sim_tfidf.index,
        columns=cosine_sim_tfidf.columns,
    )
    cosine_sim_embeddings_scaled = pd.DataFrame(
        _min_max_scale_global(cosine_sim_embeddings.values),
        index=cosine_sim_embeddings.index,
        columns=cosine_sim_embeddings.columns,
    )

    # Compute and save hybrid similarity matrix (DataFrames align on labels)
    cosine_sim_hybrid = (alpha * cosine_sim_tfidf_scaled) + ((1 - alpha) * cosine_sim_embeddings_scaled)
    cosine_sim_hybrid.to_csv("patterns_cosim_hybrid.csv")

def get_similar_patterns_hybrid(pattern_index: int, top_n: int = 5) -> pd.DataFrame:
    """Return the ``top_n`` patterns most similar to the one at ``pattern_index``.

    Similarity scores come from ``patterns_cosim_hybrid.csv``; pattern names
    and URLs come from the ``name`` and ``url`` columns of
    ``patterns_pos.csv``. Both files are read on every call.

    Args:
        pattern_index: Positional row number of the query pattern in the
            hybrid similarity matrix.
        top_n: Maximum number of similar patterns to return (default 5).

    Returns:
        A DataFrame with the columns ``Pattern Name``, ``URL`` and
        ``Hybrid Cosine Similarity Score``, ordered from most to least
        similar. The query pattern itself is never included.

    Raises:
        ValueError: If ``top_n`` is negative.
        IndexError: If ``pattern_index`` is outside the similarity matrix.
        KeyError: If a matrix label has no matching row in ``patterns_pos.csv``.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n!r}")

    df_patterns = pd.read_csv("patterns_pos.csv")
    df_patterns.index = df_patterns.index.astype(int)

    cosine_sim_hybrid = pd.read_csv("patterns_cosim_hybrid.csv", index_col=0)
    # Column labels come back from the CSV header as strings; cast both axes
    # to int so they can be used to look up rows of df_patterns.
    cosine_sim_hybrid.index = cosine_sim_hybrid.index.astype(int)
    cosine_sim_hybrid.columns = cosine_sim_hybrid.columns.astype(int)

    # Get similarity scores for the query pattern
    similarity_scores = cosine_sim_hybrid.iloc[pattern_index]
    own_label = cosine_sim_hybrid.index[pattern_index]

    # Remove the query pattern by label instead of assuming it sorts first:
    # when another pattern ties with it, slicing off position 0 could drop a
    # real match and return the query pattern itself. The stable sort keeps
    # tied scores in their original column order.
    similar_patterns = (
        similarity_scores.drop(labels=own_label, errors="ignore")
        .sort_values(ascending=False, kind="mergesort")
        .head(top_n)
    )

    # Format results
    matched = df_patterns.loc[similar_patterns.index]
    results = pd.DataFrame({
        "Pattern Name": matched["name"].values,
        "URL": matched["url"].values,
        "Hybrid Cosine Similarity Score": similar_patterns.values
    })
    return results



Hybrid similarity matrix created and saved as patterns_cosim_hybrid.csv
