In [2]:
import sqlite3
import pandas as pd
from sentence_transformers import SentenceTransformer

# Compute sentence embeddings for the cleaned expert and public texts and
# store them in the `expert_embeddings` / `public_embeddings` tables of the
# same SQLite database the texts are read from.

# Location of the SQLite database (absolute, machine-specific path).
db_path = "/Users/vesper/Desktop/LSE/Capstone Project/dissertation/arctic_shift/filtered_data/relevant_data.db"

# Load the cleaned expert and public datasets. The connection is closed as
# soon as the reads finish (even if a query fails) so that no handle stays
# open during the long-running encoding step below.
conn = sqlite3.connect(db_path)
try:
    expert_data = pd.read_sql_query("SELECT * FROM cleaned_expert_data", conn)
    public_data = pd.read_sql_query("SELECT * FROM cleaned_public_data", conn)
finally:
    conn.close()

# Ensure the correct columns are accessed
print("Expert Data Columns:", expert_data.columns)
print("Public Data Columns:", public_data.columns)

# Initialize the embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Calculate embeddings for expert and public data.
# Note the text column differs per table: `cleaned_content` vs `cleaned_text`.
print("Calculating embeddings for expert data...")
expert_embeddings = embedding_model.encode(expert_data['cleaned_content'].tolist(), show_progress_bar=True)
print("Calculating embeddings for public data...")
public_embeddings = embedding_model.encode(public_data['cleaned_text'].tolist(), show_progress_bar=True)

# One row per text, one column per embedding dimension.
expert_embeddings_df = pd.DataFrame(expert_embeddings)
public_embeddings_df = pd.DataFrame(public_embeddings)

# Adding 'id' column to match the original data for merging later.
# Both frames carry the default 0..n-1 index, so this assignment lines up
# row-for-row with the texts that were encoded.
expert_embeddings_df['id'] = expert_data['id']
public_embeddings_df['id'] = public_data['id']

# Save to new tables in the database (existing tables are replaced).
# A fresh connection is opened for the writes and is always closed, whether
# or not the writes succeed.
conn = sqlite3.connect(db_path)
try:
    print("Saving expert embeddings to the database...")
    expert_embeddings_df.to_sql('expert_embeddings', conn, if_exists='replace', index=False)
    print("Saving public embeddings to the database...")
    public_embeddings_df.to_sql('public_embeddings', conn, if_exists='replace', index=False)
finally:
    conn.close()

print("Embeddings calculated and saved to the SQLite database.")


Expert Data Columns: Index(['id', 'original_id', 'source', 'title', 'author', 'publication_date',
       'content', 'chunk_id', 'cleaned_content'],
      dtype='object')
Public Data Columns: Index(['id', 'text', 'author', 'created_utc', 'permalink', 'score',
       'cleaned_text'],
      dtype='object')




Calculating embeddings for expert data...


Batches:   0%|          | 0/36 [00:00<?, ?it/s]

Calculating embeddings for public data...


Batches:   0%|          | 0/635 [00:00<?, ?it/s]

Saving expert embeddings to the database...
Saving public embeddings to the database...
Embeddings calculated and saved to the SQLite database.


# Normalize the public scores and weight the public embeddings

In [3]:
import sqlite3
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Weight each public embedding by its post's min-max normalized score and
# store the result in the `weighted_public_embeddings` table.

# Location of the SQLite database (absolute, machine-specific path).
db_path = "/Users/vesper/Desktop/LSE/Capstone Project/dissertation/arctic_shift/filtered_data/relevant_data.db"
conn = sqlite3.connect(db_path)

# try/finally guarantees the connection is closed even if a step below fails.
try:
    # Load the cleaned public dataset and existing embeddings
    public_data = pd.read_sql_query("SELECT * FROM cleaned_public_data", conn)
    public_embeddings_df = pd.read_sql_query("SELECT * FROM public_embeddings", conn)

    # Ensure the correct columns are accessed
    print("Public Data Columns:", public_data.columns)
    print("Public Embeddings Columns:", public_embeddings_df.columns)

    # Pair every embedding row with the score of the SAME id. The two tables
    # are read by independent queries without ORDER BY, and the embeddings
    # table may predate the current cleaned_public_data, so row position alone
    # is not a safe join key.
    if public_embeddings_df['id'].equals(public_data['id']):
        # Same ids in the same order: positional pairing is correct.
        public_scores = public_data['score'].to_numpy()
    else:
        # Look the scores up by id, keeping the embeddings' row order.
        # validate='many_to_one' raises if an id maps to several scores.
        aligned = public_embeddings_df[['id']].merge(
            public_data[['id', 'score']],
            on='id',
            how='left',
            validate='many_to_one',
            indicator=True,
        )
        if (aligned['_merge'] != 'both').any():
            raise ValueError(
                "public_embeddings contains ids that are missing from "
                "cleaned_public_data; regenerate the embeddings first."
            )
        public_scores = aligned['score'].to_numpy()

    # Normalize scores between 0 and 1 (score normalization is only relevant
    # for public data).
    scaler = MinMaxScaler()
    public_scores_normalized = scaler.fit_transform(public_scores.reshape(-1, 1)).flatten()

    # Convert embeddings DataFrame to numpy array (all columns except 'id')
    public_embeddings = public_embeddings_df.drop(columns=['id']).values

    # Adjust embeddings by normalized scores: each row is multiplied by its
    # own scalar weight.
    # NOTE(review): min-max scaling maps the lowest score to 0, so the
    # embedding of the lowest-scored row becomes an all-zero vector — confirm
    # this is intended.
    weighted_public_embeddings = public_embeddings * public_scores_normalized[:, np.newaxis]

    # Save the adjusted embeddings back to the database
    weighted_public_embeddings_df = pd.DataFrame(weighted_public_embeddings)

    # Adding 'id' column to match the original data for merging later
    weighted_public_embeddings_df['id'] = public_embeddings_df['id']

    # Save to a new table in the database (an existing table is replaced)
    print("Saving weighted public embeddings to the database...")
    weighted_public_embeddings_df.to_sql('weighted_public_embeddings', conn, if_exists='replace', index=False)
finally:
    conn.close()

print("Weighted public embeddings calculated and saved to the SQLite database.")


Public Data Columns: Index(['id', 'text', 'author', 'created_utc', 'permalink', 'score',
       'cleaned_text'],
      dtype='object')
Public Embeddings Columns: Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '375', '376', '377', '378', '379', '380', '381', '382', '383', 'id'],
      dtype='object', length=385)
Saving weighted public embeddings to the database...
Weighted public embeddings calculated and saved to the SQLite database.
