In [7]:
import psycopg2

# Replace these values with your own database credentials
db_host = "192.168.1.2"
db_port = 55432
db_name = "lnc"
db_user = "postgres"
db_password = "postgres"

# Establish a connection to the PostgreSQL database
connection = psycopg2.connect(
    host=db_host,
    port=db_port,
    dbname=db_name,
    user=db_user,
    password=db_password
)

In [8]:
cursor = connection.cursor()

In [9]:
cursor.execute("SELECT version();")
db_version = cursor.fetchone()

In [10]:
print("Connected to PostgreSQL database. Version:", db_version)

Connected to PostgreSQL database. Version: ('PostgreSQL 15.2 (Debian 15.2-1.pgdg110+1) on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit',)


In [11]:
from sklearn.preprocessing import normalize
import numpy as np

def normalize_l2(vector):
    float_array = np.array(vector)
    reshaped_array = float_array.reshape(1, -1)
    normalized_array = normalize(reshaped_array, norm='l2')
    normalized_list = normalized_array.tolist()[0]
    
    return normalized_list

In [None]:
import pysolr
from psycopg2 import extras
from datetime import datetime
from tqdm import tqdm


batch_size = 5000

sql = """
    select 
        i.id, feed_id, f.publisher, i.title, i.summary, i."content", 
        i.target_url, i.published_at, i.created_at, i.feed_tags, i.authors, 
        i.paragraph_embeddings_avg
    from item i
    join feed f on f.id = i.feed_id
    where i.paragraph_embeddings_avg is not null
    order by i.created_at asc
    ;
"""

cursor = connection.cursor('lnc_cursor', cursor_factory=psycopg2.extras.RealDictCursor)
cursor.itersize = batch_size 

# Retrieve documents from PostgreSQL
cursor.execute(sql)

total_rows = cursor.rowcount
print(f"Total rows ${total_rows}")
rows_processed = 0
pbar = tqdm(total=total_rows)

# Connect to Solr
solr = pysolr.Solr('http://localhost:8983/solr/lnc', always_commit=False)
               
# Index documents in a streaming fashion
i = 0
solr_documents = []
try: 
    for row in cursor:
        solr_doc = {
            'id': str(row['id']),
            'feed_id': str(row['feed_id']),
            'feed': str(row['publisher']),
            'title': str(row['title']),
            'summary': str(row['summary']),
            'content': str(row['content']),
            'target_url': str(row['target_url']),
            'published_at': datetime.strftime(row['published_at'], '%Y-%m-%dT%H:%M:%SZ'),
            'created_at': datetime.strftime(row['created_at'], '%Y-%m-%dT%H:%M:%SZ'),
            'feed_tags': str(row['feed_tags']),
            'authors': str(row['authors']),
            'par_embeddings_dp': normalize_l2(row['paragraph_embeddings_avg'])
        }
        solr_documents.append(solr_doc)
        
        i = i + 1
        if i % batch_size == 0:
            solr.add(solr_documents, commit=True)
            solr_documents = []
            pbar.update(batch_size)
    solr.add(solr_documents, commit=True)
    
except Exception as e:
    print(f"Error indexing documents to Solr: {e}")

# Commit changes
solr.commit()

# Close connections
cursor.close()
connection.close()


Total rows $-1



0it [25:59, ?it/s][A

5000it [00:20, 248.14it/s][A
5000it [00:31, 248.14it/s][A
10000it [00:41, 241.65it/s][A
10000it [00:53, 241.65it/s][A
15000it [01:02, 240.43it/s][A
15000it [01:13, 240.43it/s][A
20000it [01:24, 234.34it/s][A
20000it [01:36, 234.34it/s][A
25000it [01:45, 233.70it/s][A
25000it [01:58, 233.70it/s][A
30000it [02:11, 217.96it/s][A
30000it [02:28, 217.96it/s][A
35000it [02:30, 230.96it/s][A
35000it [02:48, 230.96it/s][A
40000it [02:51, 233.77it/s][A
40000it [03:08, 233.77it/s][A
45000it [03:11, 240.39it/s][A
45000it [03:28, 240.39it/s][A
50000it [03:31, 241.92it/s][A
50000it [03:48, 241.92it/s][A
55000it [03:52, 240.85it/s][A
55000it [04:08, 240.85it/s][A
60000it [04:12, 243.17it/s][A
60000it [04:28, 243.17it/s][A
65000it [04:32, 244.48it/s][A
65000it [04:48, 244.48it/s][A
70000it [04:53, 242.16it/s][A
70000it [05:08, 242.16it/s][A
75000it [05:14, 242.40it/s][A
75000it [05:28, 242.40it/s][A
80000it [05:34, 244.91it/s][A
80000it [05:48, 2

In [None]:
7