In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
import ast
import re

In [2]:
df = pd.read_csv('../data/embedded.csv')

In [3]:
# Function to clean and convert string embeddings to numpy arrays
def clean_and_convert_embeddings(embedding_str):
    # Insert commas between numbers using regular expressions
    cleaned_str = re.sub(r'(?<=\d)\s+(?=[-\d])', ', ', embedding_str)

     # Ensure the string is properly formatted as a list
    if not cleaned_str.startswith('['):
        cleaned_str = '[' + cleaned_str
    if not cleaned_str.endswith(']'):
        cleaned_str = cleaned_str + ']'

    # Convert the cleaned string to a list using ast.literal_eval
    try:
        embedding_list = ast.literal_eval(cleaned_str)
    except (SyntaxError, ValueError) as e:
        print(f"Error parsing embedding: {e}")
        embedding_list = []
    # Convert the list to a numpy array
    return np.array(embedding_list)

In [4]:
df['embedding']

0        [-4.81667295e-02  2.12718606e-01 -1.45317884e-...
1        [-2.95746941e-02  2.09249571e-01  5.49595058e-...
2        [-3.77720259e-02  1.51299909e-01  3.70151289e-...
3        [-3.18633839e-02  7.63608366e-02 -3.26305046e-...
4        [-3.87539826e-02  1.18664742e-01  2.81920545e-...
                               ...                        
86965    [-1.70188937e-02  5.82527965e-02  2.24508382e-...
86966    [-4.70052697e-02  2.03838572e-01  3.63731310e-...
86967    [-1.99257620e-02  2.14281872e-01  5.08851036e-...
86968    [-1.96716227e-02  6.39252067e-02 -1.31108984e-...
86969    [ 1.25902537e-02  4.03796375e-01  1.87184423e-...
Name: embedding, Length: 86970, dtype: object

In [5]:
df['embedding'] = df['embedding'].apply(clean_and_convert_embeddings)

Error parsing embedding: invalid syntax (<unknown>, line 1)


In [6]:
# Drop rows where the embedding column's shape is not (868,)
df = df[df['embedding'].apply(lambda x: x.shape == (868,))]

In [7]:
embeddings = np.array(df['embedding'].tolist())

In [20]:
# Reduce dimensionality using PCA
pca = PCA(n_components=100)  # Choose a number of components that balances memory usage and information retention
reduced_embeddings = pca.fit_transform(embeddings)

In [11]:
# Assuming embeddings is a numpy array of shape (86969, 100)
batch_size = 1000  # Adjust based on your memory constraints
num_articles = embeddings.shape[0]
top_k = 5


In [17]:
len(rec_titles)
len(rec_urls)

11

In [21]:
# Initialize lists to store recommended URLs and titles
rec_urls = []
rec_titles = []


In [23]:
ct = 1
for i in range(num_articles):
    if ct % 100 == 0:
        print(f"------------Processing article {ct} of {num_articles}------------")
    # Compute cosine similarity of the i-th article with all articles
    similarities = cosine_similarity(reduced_embeddings[i:i+1], reduced_embeddings).flatten()
    
    # Get indices of the top-k most similar articles (excluding the article itself)
    top_k_indices = np.argpartition(similarities, -top_k-1)[-top_k-1:]
    top_k_indices = top_k_indices[top_k_indices != i]  # Exclude self-similarity
    
    # Sort the top-k indices by similarity score
    top_k_indices = top_k_indices[np.argsort(similarities[top_k_indices])][::-1]
    
    # Retrieve URLs and titles for the top-k similar articles
    top_k_urls = df.iloc[top_k_indices]['url'].tolist()
    top_k_titles = df.iloc[top_k_indices]['title'].tolist()
    
    # Store the URLs and titles as lists of lists
    rec_urls.append(top_k_urls)
    rec_titles.append(top_k_titles)
    ct += 1

------------Processing article 100 of 86969------------
------------Processing article 200 of 86969------------
------------Processing article 300 of 86969------------
------------Processing article 400 of 86969------------
------------Processing article 500 of 86969------------
------------Processing article 600 of 86969------------
------------Processing article 700 of 86969------------
------------Processing article 800 of 86969------------
------------Processing article 900 of 86969------------
------------Processing article 1000 of 86969------------
------------Processing article 1100 of 86969------------
------------Processing article 1200 of 86969------------
------------Processing article 1300 of 86969------------
------------Processing article 1400 of 86969------------
------------Processing article 1500 of 86969------------
------------Processing article 1600 of 86969------------
------------Processing article 1700 of 86969------------
------------Processing article 1800 of 8

In [24]:
df['rec_url'] = rec_urls
df['rec_title'] = rec_titles

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 86969 entries, 0 to 86969
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   url               86969 non-null  object
 1   title             86969 non-null  object
 2   publication_date  86969 non-null  object
 3   content           86969 non-null  object
 4   platform_id       86969 non-null  object
 5   entities          86969 non-null  object
 6   sentence_vector   86969 non-null  object
 7   content_vector    86969 non-null  object
 8   embedding         86969 non-null  object
 9   rec_url           86969 non-null  object
 10  rec_title         86969 non-null  object
dtypes: object(11)
memory usage: 8.0+ MB


In [26]:
# df.to_csv('../data/recommendations.csv', index=False)