In [22]:
import json

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Both the classifier weights and its tokenizer come from the same Hub checkpoint,
# so the identifier is spelled once and reused.
checkpoint = 'cross-encoder/ms-marco-TinyBERT-L-2'
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# One query is scored against several candidate passages. The tokenizer is given two
# parallel lists (queries, passages), so the query is repeated once per passage
# instead of being pasted by hand for each one.
query = "Who was John of Gaunt's brother, and what was his role in government?"
passages = [
    "External links Richard II's Treasure from the Institute of Historical Research and Royal Holloway, University of London. Richard II's Irish chancery rolls listed by year, translated, published online by CIRCLE. The Peasants' Revolt, BBC Radio 4 discussion with Miri Rubin, Caroline Barron & Alastair Dunn (In Our Time, 16 November 2006) |- 1367 births 1400 deaths 14th-century English monarchs 14th-century murdered monarchs 14th-century English nobility Burials at Westminster Abbey Deaths by starvation Dukes of Cornwall English people of French descent English pretenders to the French throne English Roman Catholics House of Plantagenet Knights of the Garter Medieval child rulers Monarchs who abdicated Peasants' Revolt People from Bordeaux Princes of Wales Prisoners in the Tower of London Peers created by Edward III Children of Edward the Black Prince",
    "References Sources Chronicles (1993) Chronicles of the Revolution, 1397\u20131400: The Reign of Richard II, ed. Chris Given-Wilson. Manchester: Manchester University Press. . Froissart, Jean (1978). Chronicles, ed. Geoffrey Brereton. London: Penguin. . (1977) Historia Vitae et Regni Ricardi Secundi, ed. George B. Stow. Philadelphia: University of Pennsylvania Press. . Knighton, Henry (1995). Knighton's Chronicle 1337\u20131396, ed. G. H. Martin. Oxford: Clarendon Press. . Walsingham, Thomas (1862\u201364). Historia Anglicana 2 vols., ed. Henry Thomas Riley. London: Longman, Roberts, and Green Secondary sources Alexander, Jonathan; Binski, Paul (eds.) (1987). Age of Chivalry, Art in Plantagenet England, 1200\u20131400. London: Royal Academy/Weidenfeld & Nicolson. Levey, Michael (1971). Painting at Court. London: Weidenfeld and Nicolson. External links",
    "John of Gaunt's brother Edmund of Langley was only one year younger, but it has been suggested that this prince was of \"limited ability\", and he took less part in government than Gaunt did. b. It has been speculated that the whole incident surrounding the killing of Wat Tyler was in fact planned in advance by the council, in order to end the rebellion. c. While both England and the Empire supported Pope Urban VI in Rome, the French sided with the Avignon Papacy of Clement VII. d. This \"appeal\"which would give its name to the Lords Appellantwas not an appeal in the modern sense of an application to a higher authority. In medieval common law the appeal was criminal charge, often one of treason.",
]

# padding=True pads every pair to the longest one in the batch so they stack into one
# tensor. truncation=True caps over-long pairs at the tokenizer's configured maximum
# length; without it a passage longer than the encoder accepts would fail in the
# forward pass. The three passages above are short enough that nothing is cut.
features = tokenizer([query] * len(passages), passages,
                     return_tensors='pt', padding=True, truncation=True)

# Inference only: put the model in eval mode and skip gradient tracking for the forward pass.
model.eval()
with torch.no_grad():
    scores = model(**features).logits

# Pull element 0 of each of the three logit rows out as a plain Python float, once,
# and reuse the list for both the per-document report and the ranking below.
ordinals = ("first", "second", "third")
similarity_scores = [scores[i][0].item() for i in range(len(ordinals))]

for ordinal, score in zip(ordinals, similarity_scores):
    print(f"The {ordinal} document has the similarity score:", score)

# Determine the highest similarity score and which document it belongs to
# (on a tie, the earliest document wins).
max_score_index, max_score = max(enumerate(similarity_scores), key=lambda pair: pair[1])

print(f"The document with the highest similarity score is document {max_score_index + 1} with a score of {max_score:.2f}")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


The first document has the similarity score: -6.033543109893799
The second document has the similarity score: -6.910590171813965
The third document has the similarity score: 2.7864651679992676
The document with the highest similarity score is document 3 with a score of 2.79


In [23]:
# open parquet file

import pandas as pd

# Read the Parquet dataset into a DataFrame so it can be inspected in the next cell
df = pd.read_parquet(path="data/a.parquet")

In [24]:
# info() prints its summary as a side effect, so it can run anywhere in the cell.
# tail() only returns a frame, and a notebook renders just the cell's final expression,
# so it has to come last; placed first, its result was silently discarded.
df.info()
df.tail()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442726 entries, 0 to 442725
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   id          442726 non-null  object
 1   title       442726 non-null  object
 2   text        442726 non-null  object
 3   categories  442726 non-null  object
dtypes: object(4)
memory usage: 13.5+ MB


In [25]:
from elasticsearch import Elasticsearch
import pyarrow.parquet as pq

# Create the cluster client. Every node is addressed over http on port 9200, so only
# the hostname differs between entries.
# Replace 'localhost' with the actual IP if that node runs on a different machine.
es = Elasticsearch([
    {'host': node, 'port': 9200, 'scheme': 'http'}
    for node in ('localhost', 'es-node2', 'es-node3', 'es-node4')
])

# Report whether the cluster answered the ping
cluster_reachable = es.ping()
print(
    "Connected to Elasticsearch cluster successfully!"
    if cluster_reachable
    else "Could not connect to Elasticsearch cluster."
)

# Load rows from a Parquet file into an Elasticsearch index
def load_parquet_to_es(file_path, index_name, limit=5):
    """Index the leading rows of a Parquet file into an Elasticsearch index.

    Each row is sent as its own document through the module-level ``es`` client.
    No document id is supplied, so calling this again adds further documents
    rather than overwriting the earlier ones.

    Args:
        file_path: Path of the Parquet file to read.
        index_name: Name of the Elasticsearch index that receives the documents.
        limit: Maximum number of leading rows to index. Defaults to 5, the value
            that used to be hard-coded. Pass ``None`` to index every row.
            Expected to be a non-negative integer otherwise.
    """
    table = pq.read_table(file_path)

    # Cut the Arrow table down *before* converting to pandas, so only the rows that
    # will actually be indexed are materialised as Python objects.
    if limit is not None:
        table = table.slice(0, limit)
    df = table.to_pandas()

    # Index each row in Elasticsearch
    for _, row in df.iterrows():
        es.index(index=index_name, body=row.to_dict())

    # Newly indexed documents only become searchable after a refresh; force one so a
    # search issued right after this call can see them.
    es.indices.refresh(index=index_name)

# Push the sample Parquet file into the scratch index
test_parquet_file = "data/a.parquet"
load_parquet_to_es(test_parquet_file, 'test_index')

# Example search: a match_all query against that index
match_all_query = {"query": {"match_all": {}}}
response = es.search(
    index="test_index",  # Replace with your actual index name
    body=match_all_query,
)

# Pretty-print every returned hit as indented JSON
print("Search Results:")
hits = response['hits']['hits']
for hit in hits:
    print(json.dumps(hit, indent=4))

Connected to Elasticsearch cluster successfully!


  if es.ping():


Search Results:
{
    "_index": "test_index",
    "_type": "_doc",
    "_id": "mfDgr5MBBN8B3xkH97is",
    "_score": 1.0,
    "_ignored": [
        "text.keyword"
    ],
    "_source": {
        "id": "5677646",
        "title": "A Boy's Best Friend",
        "text": "\"A Boy's Best Friend\" is a 1975 science fiction short story by American writer Isaac Asimov. It has been collected in The Complete Robot and first appeared in Boys' Life, March 1975. ==Plot summary== Jimmy\u2019s family is settled on the Moon. Since Jimmy was born on the Moon, he is greatly accustomed to life and dangers on the Moon. Robutt, a robot-dog, was Jimmy\u2019s companion. One day his father decided to bring a real dog from the earth. He hoped that a real dog is better than Robutt. However, Jimmy was not happy to get a real dog because he had become greatly attached to Robutt ==Similarities to other stories== Quoting Asimov himself, \"you may find in it (the story) a distant echo of Robbie\". That story, written

  es.index(index=index_name, body=row.to_dict())
  response = es.search(
