In [5]:
import json
import os
import time

import redis
from pymongo import MongoClient

# Connection settings, named once so the endpoints are easy to spot.
MONGO_URI = 'mongodb://localhost:27017'
MONGO_DB_NAME = 'pinecone'
MONGO_COLLECTION_NAME = 'combined'
REDIS_HOST = '192.168.1.206'
REDIS_PORT = 6379
REDIS_DB = 0

# MongoDB: client -> database -> collection handle used by the cells below.
client = MongoClient(MONGO_URI)
db = client[MONGO_DB_NAME]
collection = db[MONGO_COLLECTION_NAME]

# Redis: client that receives the copy of each document's content.
redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB)

def get_content(lines, line_position):
  """Return the tail of `lines` as one space-separated string.

  Args:
    lines: Sequence of strings (one per line of the file).
    line_position: Index of the first line to include.

  Returns:
    The lines from `line_position` to the end joined with single spaces;
    an empty string when the slice is empty.
  """
  return ' '.join(lines[line_position:])

def get_key_value_pairs(lines, included_keys=('title',)):
  """Extract selected `key: value` pairs from a '---'-delimited front-matter block.

  Scanning starts on the line after the first '---' and stops at the next
  '---' line or at the first line that starts with '#'.

  Args:
    lines: List of strings (one per line of the file).
    included_keys: Keys to keep; every other key is ignored. Defaults to
      only 'title'.

  Returns:
    A `(line_position, key_value_pairs)` tuple. `line_position` starts at 2
    and is incremented once for every line examined after the opening '---'
    (including the line that stops the scan). When the opening '---' is the
    first line of the file this equals the index of the stopping line + 2.
    If there is no '---' line at all, `(2, {})` is returned.
  """
  key_value_pairs = {}
  line_position = 2

  try:
    front_matter_start = lines.index('---') + 1
  except ValueError:
    # No opening delimiter: there is no front matter to parse.
    return line_position, key_value_pairs

  for line in lines[front_matter_start:]:
    line_position += 1
    if line == '---' or line.startswith('#'):
      break
    if ': ' in line:
      # Split on the first ': ' only, so a value that itself contains ': '
      # (e.g. "title: Foo: Bar") is kept whole instead of raising.
      key, value = line.split(': ', 1)
      if key in included_keys:
        key_value_pairs[key.strip()] = value.strip()
  return line_position, key_value_pairs

def write_to_mongo(data, output_json):
  """Insert `data` as a single document into the module-level `collection`.

  Args:
    data: The document to insert.
    output_json: Not used by this function; it is only part of the signature.
  """
  collection.insert_one(data)

def walk_directory(directory_path, total_doc_count, source_label=None):
  """Insert every Markdown file under `directory_path` into MongoDB.

  For each `.md` file the front-matter pairs are extracted with
  `get_key_value_pairs`, the remaining text is added as 'content', and the
  record is tagged with 'source' and 'path' before being written with
  `write_to_mongo`.

  Args:
    directory_path: Root directory to walk recursively.
    total_doc_count: Running count of documents written so far.
    source_label: Value stored under the 'source' key. When omitted, the
      module-level `source` variable is used, as existing callers expect.

  Returns:
    `total_doc_count` plus the number of `.md` files written by this call.
  """
  # Resolve the label once; the global is only read when no label is passed.
  doc_source = source if source_label is None else source_label

  # os.walk's second positional argument is `topdown`, not a label, so only
  # the directory is passed.
  for root, _dirs, files in os.walk(directory_path):
    for file in files:
      if not file.endswith(".md"):
        continue

      file_path = os.path.join(root, file)
      # The context manager closes the handle as soon as the file is read.
      # UTF-8 is stated explicitly so decoding does not depend on the locale.
      with open(file_path, 'r', encoding='utf-8') as handle:
        lines = handle.read().split('\n')

      line_position, key_value_pairs = get_key_value_pairs(lines)

      # Add the body text and provenance fields to the front-matter pairs.
      key_value_pairs['content'] = get_content(lines, line_position)
      key_value_pairs['source'] = doc_source
      key_value_pairs['path'] = file_path

      # JSON rendering of the record, passed through to write_to_mongo.
      output_json = json.dumps(key_value_pairs, indent=4)

      write_to_mongo(key_value_pairs, output_json)
      total_doc_count += 1
  return total_doc_count

# Directories to ingest, each paired with the label stored on its documents.
SOURCE_DIRECTORIES = [
  ("www", '/Users/kevinbutler/Documents/GitHub/www/content/'),
  ("readme_docs", '/Users/kevinbutler/Documents/GitHub/readme-docs/v1.0/'),
]

# Bounds for the verification poll below.
POLL_INTERVAL_SECONDS = 1
POLL_TIMEOUT_SECONDS = 60

expected_total = 0

# walk_directory reads the module-level `source` to tag each document, so the
# loop variable is deliberately named `source`.
for source, source_directory in SOURCE_DIRECTORIES:
  expected_total = walk_directory(source_directory, expected_total)

# Query MongoDB and compare the stored document count with the number written.
query = {}
result = collection.count_documents(query)

# Poll with a pause and a deadline: if the counts can never match (for example
# the collection already held documents), the cell stops instead of spinning.
deadline = time.monotonic() + POLL_TIMEOUT_SECONDS
while result != expected_total and time.monotonic() < deadline:
  print(str(result) + '!=' + str(expected_total))
  time.sleep(POLL_INTERVAL_SECONDS)
  result = collection.count_documents(query)

if result == expected_total:
  print("Job Complete")
else:
  print('Gave up after ' + str(POLL_TIMEOUT_SECONDS) + 's: ' +
        str(result) + '!=' + str(expected_total))

Job Complete


In [6]:
# Fetch every document from the MongoDB collection (empty filter).
query = {}
result = collection.find(query)

# Copy each document into Redis: key is the stringified `_id`,
# value is the stringified `content` field.
for row in result:
    redis_key = str(row['_id'])
    redis_value = str(row['content'])
    redis_client.set(redis_key, redis_value)

In [1]:
# Clear MongoDB
# This cell imports MongoClient and opens its own connection, so it does not
# need any other cell to have run first.
from pymongo import MongoClient
client = MongoClient('mongodb://localhost:27017')
db = client['pinecone']
collection = db['combined']
# The empty filter selects every document, so the whole collection is emptied.
collection.delete_many({})

<pymongo.results.DeleteResult at 0x10a3dfbe0>

In [7]:
# Inspect the keys stored in Redis. The output is a (cursor, keys) pair.
# NOTE(review): this is presumably a single SCAN page, not the full key list;
# confirm against the redis-py docs before relying on it as a complete listing.
redis_client.scan()

(16,
 [b'63efba222b9a121dbfe1e99d',
  b'63efba222b9a121dbfe1e97d',
  b'63efba222b9a121dbfe1ea1e',
  b'63efba222b9a121dbfe1ea0f',
  b'63efba222b9a121dbfe1ea1b',
  b'63efba222b9a121dbfe1e9b5',
  b'63efba222b9a121dbfe1e9b7',
  b'63efba222b9a121dbfe1e9d2',
  b'63efba222b9a121dbfe1e9d8',
  b'63efba222b9a121dbfe1ea3a'])

In [8]:
# Spot-check one stored value; this key is one of those listed by the scan
# output in this notebook. The value comes back as raw bytes.
redis_client.get('63efba222b9a121dbfe1ea1e')

b'Pinecone makes it easy to build high-performance **vector search** applications. It\xe2\x80\x99s a managed, cloud-native vector database with a **simple API** and no infrastructure hassles.  Key benefits of Pinecone:  * Fast: Ultra-low query latency at any scale, even with billions of items. * Fresh: Live index updates when you add, edit, or delete data. * Filtered: Combine vector search with metadata filters for more relevant, faster results. * Fully managed: Easy to start, use, and scale, while we keep things running smoothly and securely.   ## Key concepts  #### Vector search  Unlike traditional search methods that revolve around keywords, it is done by indexing and searching through ML-generated representations of data \xe2\x80\x94 vector embeddings \xe2\x80\x94 to find items most similar to the query.  #### Vector embeddings  [Vector embeddings](https://www.pinecone.io/learn/vector-embeddings/), or \xe2\x80\x9cvectors,\xe2\x80\x9d are sets of floating-point numbers that represen

In [4]:
# Full Reload from scratch (Redis)
# NOTE(review): FLUSHALL presumably clears every database on the Redis server,
# not only db 0 used by this client. Confirm that is intended; `flushdb` may
# be the narrower call that is wanted here.
redis_client.flushall()

# Full Reload from scratch (MongoDB)
# The empty filter selects every document in the collection.
collection.delete_many({})

<pymongo.results.DeleteResult at 0x105ec7af0>