In [1]:
import sys
import os

In [2]:
# Make the Feluda application package (two directories up, in `app/`) importable.
app_dir = os.path.abspath('../../app')
sys.path.append(app_dir)

In [3]:
from core.feluda import Feluda,ComponentType
from core.models.media import MediaType
from core.models.media_factory import ImageFactory
from datetime import datetime

### Initializing Feluda class with config file

We'll use two operators for this example: one for extracting embeddings from text and the other for extracting text from images (newspaper clippings in this example).

In [4]:
feluda = Feluda("find-text-config.yml")
feluda.setup()

# Look up the operator instances by the type names listed in the config.
# The lookup is positional: the first configured operator is used as the text
# vectorizer and the second as the OCR operator.
operator_params = feluda.config.operators.parameters
text_vectorizer_operator = feluda.operators.get()[operator_params[0].type]
ocr_operator = feluda.operators.get()[operator_params[1].type]

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Start the store component only when the config declares a store.
store_config = feluda.config.store
if store_config:
    feluda.start_component(ComponentType.STORE)

In [6]:
# Sample newspaper clippings news1.png ... news7.png, all hosted in one folder.
# The folder name is spelled "clipings" in the URL; keep it exactly as-is.
NEWSPAPER_IMAGE_BASE_URL = "https://github.com/aatmanvaidya/audio-files/raw/main/newspaper-clipings"
NEWSPAPER_IMAGE_COUNT = 7

newspaper_image_links = [
    f"{NEWSPAPER_IMAGE_BASE_URL}/news{index}.png"
    for index in range(1, NEWSPAPER_IMAGE_COUNT + 1)
]

In [7]:
def generate_document(rep, data):
    """Build the document that is indexed for one piece of text.

    Args:
        rep: Value stored under "dataset" (this notebook passes the image file name).
        data: Value stored under "text_vec" (this notebook passes the text embedding).

    Returns:
        dict: The document. "e_kosh_id" is an empty string; "metadata", "text",
        "suggestion" and "lang" are None; "date_added" is the current UTC time
        as a naive datetime.
    """
    created_at = datetime.utcnow()
    return dict(
        e_kosh_id="",
        dataset=rep,
        metadata=None,
        text=None,
        text_vec=data,
        suggestion=None,
        lang=None,
        date_added=created_at,
    )

def store_text(text_embedding,file_name):
    """Wrap an embedding in a document and save it in the configured store.

    Args:
        text_embedding: Embedding to save; placed in the document's "text_vec" field.
        file_name: Source file name; placed in the document's "dataset" field.

    Returns:
        tuple: ("result:", response), where response is whatever the store's
        store() call returned.

    Raises:
        Exception: If feluda.store is falsy, i.e. no store is configured.
    """
    if not feluda.store:
        raise Exception("Store is not Configured")

    document = generate_document(file_name, text_embedding)
    # The store is looked up by the type of the first entity in the store config.
    store_key = feluda.config.store.entities[0].type
    response = feluda.store[store_key].store(MediaType.TEXT, document)
    return ("result:", response)


def search_text(text,operator):
    """Embed a query string and look it up in the configured store.

    Args:
        text: Query text to search for.
        operator: Operator whose run(text) produces the query embedding.

    Returns:
        Whatever the store's find("text", embedding) call returns.

    Raises:
        Exception: If feluda.store is falsy, i.e. no store is configured.
    """
    # The embedding is computed first, before the store availability check.
    query_vector = operator.run(text)
    if not feluda.store:
        raise Exception("Store is not Configured")

    # The store is looked up by the type of the first entity in the store config.
    store_key = feluda.config.store.entities[0].type
    return feluda.store[store_key].find("text", query_vector)

Download each image and extract its text using the detect_text_in_image_tesseract operator (called ocr_operator in this notebook).

In [8]:
# Download every clipping, run OCR on it, and collect [extracted_text, file_name] pairs.

text_from_image = []

for url in newspaper_image_links:
    image_path = ImageFactory.make_from_url_to_path(url)
    local_path = image_path["path"]
    # os.path.basename handles the platform's path separator, unlike split('/').
    text_from_image.append([ocr_operator.run(local_path), os.path.basename(local_path)])

Downloading image from URL
100% [..........................................................................] 1132191 / 1132191
Image downloaded
Downloading image from URL
100% [............................................................................] 966126 / 966126
Image downloaded
Downloading image from URL
100% [............................................................................] 789249 / 789249
Image downloaded
Downloading image from URL
100% [..........................................................................] 2335024 / 2335024
Image downloaded
Downloading image from URL
100% [..........................................................................] 1011769 / 1011769
Image downloaded
Downloading image from URL
100% [..........................................................................] 2064945 / 2064945
Image downloaded
Downloading image from URL
100% [..........................................................................] 1024285 / 1024285
Image dow

Converting the extracted text to vectors using the text_vec_rep_paraphrase_lxml operator (named text_vectorizer_operator in this notebook) and storing them.

In [9]:
# Embed each OCR'd text and save it in Elasticsearch, one document per image.

for image_text, file_name in text_from_image:
    text_embedding = text_vectorizer_operator.run(image_text)
    print(store_text(text_embedding, file_name))

('result:', ObjectApiResponse({'_index': 'text', '_id': 'RgclyJIBiNxwOmDoggGp', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1}))
('result:', ObjectApiResponse({'_index': 'text', '_id': 'RwclyJIBiNxwOmDogwEP', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1, '_primary_term': 1}))
('result:', ObjectApiResponse({'_index': 'text', '_id': 'SAclyJIBiNxwOmDogwF1', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 2, '_primary_term': 1}))
('result:', ObjectApiResponse({'_index': 'text', '_id': 'SQclyJIBiNxwOmDogwHf', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 3, '_primary_term': 1}))
('result:', ObjectApiResponse({'_index': 'text', '_id': 'SgclyJIBiNxwOmDohAFI', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '

### Searching for Newspaper containing our query.

-> In order to test our operator, we pulled a sentence from one of the newspapers and searched for the newspaper that contains it.

In [11]:
# Search query: a sentence taken from one of the newspaper clippings.
# The embedded "\n" line breaks and stray characters are part of the query text
# (presumably OCR output kept verbatim — TODO confirm).
query = "Dalit families papers over WhatsApp to called for a hearing via\ntook turns, mostly girls and some official and got the job +WhatsApp video call two\nwomen, to fetch water from done,” said Mr. Saket."

In [12]:
# Embed the query with the text vectorizer and display the matches from the store.
search_text(text=query, operator=text_vectorizer_operator)

calculation: 1 / (1 + l2norm(params.query_vector, 'text_vec'))


[{'doc_id': 'RgclyJIBiNxwOmDoggGp',
  'dist': 0.1120885,
  'dataset': 'news1.png',
  'e_kosh_id': '',
  'text': None,
  'metadata': None},
 {'doc_id': 'SAclyJIBiNxwOmDogwF1',
  'dist': 0.10283887,
  'dataset': 'news3.png',
  'e_kosh_id': '',
  'text': None,
  'metadata': None},
 {'doc_id': 'TAclyJIBiNxwOmDohQEE',
  'dist': 0.100596815,
  'dataset': 'news7.png',
  'e_kosh_id': '',
  'text': None,
  'metadata': None},
 {'doc_id': 'SwclyJIBiNxwOmDohAG2',
  'dist': 0.099939294,
  'dataset': 'news6.png',
  'e_kosh_id': '',
  'text': None,
  'metadata': None},
 {'doc_id': 'SgclyJIBiNxwOmDohAFI',
  'dist': 0.09927258,
  'dataset': 'news5.png',
  'e_kosh_id': '',
  'text': None,
  'metadata': None},
 {'doc_id': 'SQclyJIBiNxwOmDogwHf',
  'dist': 0.09863854,
  'dataset': 'news4.png',
  'e_kosh_id': '',
  'text': None,
  'metadata': None},
 {'doc_id': 'RwclyJIBiNxwOmDogwEP',
  'dist': 0.09863655,
  'dataset': 'news2.png',
  'e_kosh_id': '',
  'text': None,
  'metadata': None}]