In [3]:
import os
import json
from tqdm import tqdm

def split_json(file_path, chunk_size_mb=200):
    # Calculate the chunk size in bytes
    chunk_size = chunk_size_mb * 1024 * 1024

    # Ensure the file exists
    if not os.path.exists(file_path):
        print(f"File {file_path} not found.")
        return

    # Create a directory for the output chunks
    output_dir = os.path.splitext(file_path)[0] + "_chunks"
    os.makedirs(output_dir, exist_ok=True)

    # Initialize variables for chunking
    current_chunk = []
    current_size = 0
    chunk_index = 0

    # Read and process the NDJSON file line by line
    with open(file_path, "r", encoding="utf-8") as file:
        print("Processing the JSON file line by line...")
        lines = file.readlines()  # Get all lines
        total_lines = len(lines)  # Total number of JSON objects

        # Set up the progress bar
        with tqdm(total=total_lines, desc="Processing lines", unit="line") as pbar:
            for line in lines:
                try:
                    # Parse the JSON object from the current line
                    json_obj = json.loads(line)
                except json.JSONDecodeError as e:
                    print(f"Skipping malformed line: {e}")
                    pbar.update(1)
                    continue

                # Serialize the object to calculate its size
                json_str = json.dumps(json_obj)
                json_size = len(json_str.encode("utf-8"))

                # Check if adding this object exceeds the chunk size
                if current_size + json_size > chunk_size:
                    # Write the current chunk to a file
                    output_file = os.path.join(output_dir, f"chunk_{chunk_index}.json")
                    with open(output_file, "w", encoding="utf-8") as chunk_file:
                        json.dump(current_chunk, chunk_file, indent=4)
                    print(f"Chunk {chunk_index} written to {output_file} ({current_size / (1024 * 1024):.2f} MB)")

                    # Reset for the next chunk
                    current_chunk = []
                    current_size = 0
                    chunk_index += 1

                # Add the object to the current chunk
                current_chunk.append(json_obj)
                current_size += json_size

                # Update the progress bar
                pbar.update(1)

        # Write the last chunk if it contains any remaining data
        if current_chunk:
            output_file = os.path.join(output_dir, f"chunk_{chunk_index}.json")
            with open(output_file, "w", encoding="utf-8") as chunk_file:
                json.dump(current_chunk, chunk_file, indent=4)
            print(f"Chunk {chunk_index} written to {output_file} ({current_size / (1024 * 1024):.2f} MB)")

    print("Splitting completed.")

# Example usage
large_json_file = "data/arxiv-metadata-oai-snapshot.json"  # Replace with the path to your JSON file
split_json(large_json_file)


Processing the JSON file line by line...


Processing lines:   5%|▌         | 139425/2606260 [00:17<31:06, 1321.86line/s] 

Chunk 0 written to data/arxiv-metadata-oai-snapshot_chunks\chunk_0.json (200.00 MB)


Processing lines:  10%|█         | 272151/2606260 [00:37<45:12, 860.41line/s]  

Chunk 1 written to data/arxiv-metadata-oai-snapshot_chunks\chunk_1.json (200.00 MB)


Processing lines:  15%|█▌        | 398938/2606260 [00:57<23:41, 1553.26line/s] 

Chunk 2 written to data/arxiv-metadata-oai-snapshot_chunks\chunk_2.json (200.00 MB)


Processing lines:  20%|█▉        | 519796/2606260 [01:12<24:18, 1430.52line/s] 

Chunk 3 written to data/arxiv-metadata-oai-snapshot_chunks\chunk_3.json (200.00 MB)


Processing lines:  24%|██▍       | 638119/2606260 [01:18<01:12, 27080.27line/s]

Chunk 4 written to data/arxiv-metadata-oai-snapshot_chunks\chunk_4.json (200.00 MB)


Processing lines:  29%|██▉       | 764542/2606260 [01:45<22:28, 1366.23line/s] 

Chunk 5 written to data/arxiv-metadata-oai-snapshot_chunks\chunk_5.json (200.00 MB)


Processing lines:  34%|███▍      | 883679/2606260 [02:01<17:24, 1649.81line/s] 

Chunk 6 written to data/arxiv-metadata-oai-snapshot_chunks\chunk_6.json (200.00 MB)


Processing lines:  38%|███▊      | 992503/2606260 [02:06<00:57, 28052.71line/s]

Chunk 7 written to data/arxiv-metadata-oai-snapshot_chunks\chunk_7.json (200.00 MB)


Processing lines:  43%|████▎     | 1112068/2606260 [02:34<23:47, 1046.97line/s] 

Chunk 8 written to data/arxiv-metadata-oai-snapshot_chunks\chunk_8.json (200.00 MB)


Processing lines:  47%|████▋     | 1228555/2606260 [02:52<21:06, 1088.10line/s] 

Chunk 9 written to data/arxiv-metadata-oai-snapshot_chunks\chunk_9.json (200.00 MB)


Processing lines:  52%|█████▏    | 1342681/2606260 [03:06<12:10, 1729.77line/s] 

Chunk 10 written to data/arxiv-metadata-oai-snapshot_chunks\chunk_10.json (200.00 MB)


Processing lines:  56%|█████▌    | 1453167/2606260 [03:20<12:06, 1586.56line/s] 

Chunk 11 written to data/arxiv-metadata-oai-snapshot_chunks\chunk_11.json (200.00 MB)


Processing lines:  60%|██████    | 1564715/2606260 [03:34<11:29, 1509.98line/s] 

Chunk 12 written to data/arxiv-metadata-oai-snapshot_chunks\chunk_12.json (200.00 MB)


Processing lines:  64%|██████▍   | 1672088/2606260 [04:00<26:14, 593.37line/s]  

Chunk 13 written to data/arxiv-metadata-oai-snapshot_chunks\chunk_13.json (200.00 MB)


Processing lines:  68%|██████▊   | 1784484/2606260 [04:35<32:02, 427.41line/s]  

Chunk 14 written to data/arxiv-metadata-oai-snapshot_chunks\chunk_14.json (200.00 MB)


Processing lines:  73%|███████▎  | 1894287/2606260 [05:00<19:21, 612.91line/s]  

Chunk 15 written to data/arxiv-metadata-oai-snapshot_chunks\chunk_15.json (200.00 MB)


Processing lines:  77%|███████▋  | 1999750/2606260 [05:24<20:16, 498.41line/s]  

Chunk 16 written to data/arxiv-metadata-oai-snapshot_chunks\chunk_16.json (200.00 MB)


Processing lines:  81%|████████  | 2108146/2606260 [06:10<1:09:11, 119.98line/s]

Chunk 17 written to data/arxiv-metadata-oai-snapshot_chunks\chunk_17.json (200.00 MB)


Processing lines:  85%|████████▌ | 2218310/2606260 [06:42<00:59, 6547.44line/s] 

Chunk 18 written to data/arxiv-metadata-oai-snapshot_chunks\chunk_18.json (200.00 MB)


Processing lines:  90%|█████████ | 2349226/2606260 [07:44<29:50, 143.57line/s]  

Chunk 19 written to data/arxiv-metadata-oai-snapshot_chunks\chunk_19.json (200.00 MB)


Processing lines:  96%|█████████▋| 2508599/2606260 [08:35<11:43, 138.81line/s]  

Chunk 20 written to data/arxiv-metadata-oai-snapshot_chunks\chunk_20.json (200.00 MB)


Processing lines: 100%|██████████| 2606260/2606260 [08:49<00:00, 4920.97line/s] 


Chunk 21 written to data/arxiv-metadata-oai-snapshot_chunks\chunk_21.json (122.78 MB)
Splitting completed.


In [13]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

model = AutoModelForSequenceClassification.from_pretrained('cross-encoder/ms-marco-TinyBERT-L-2')
tokenizer = AutoTokenizer.from_pretrained('cross-encoder/ms-marco-TinyBERT-L-2')

features = tokenizer(["Who was John of Gaunt's brother, and what was his role in government?", "Who was John of Gaunt's brother, and what was his role in government?", "Who was John of Gaunt's brother, and what was his role in government?"],
                     ["External links Richard II's Treasure from the Institute of Historical Research and Royal Holloway, University of London. Richard II's Irish chancery rolls listed by year, translated, published online by CIRCLE. The Peasants' Revolt, BBC Radio 4 discussion with Miri Rubin, Caroline Barron & Alastair Dunn (In Our Time, 16 November 2006) |- 1367 births 1400 deaths 14th-century English monarchs 14th-century murdered monarchs 14th-century English nobility Burials at Westminster Abbey Deaths by starvation Dukes of Cornwall English people of French descent English pretenders to the French throne English Roman Catholics House of Plantagenet Knights of the Garter Medieval child rulers Monarchs who abdicated Peasants' Revolt People from Bordeaux Princes of Wales Prisoners in the Tower of London Peers created by Edward III Children of Edward the Black Prince", 
                      "References Sources Chronicles (1993) Chronicles of the Revolution, 1397\u20131400: The Reign of Richard II, ed. Chris Given-Wilson. Manchester: Manchester University Press. . Froissart, Jean (1978). Chronicles, ed. Geoffrey Brereton. London: Penguin. . (1977) Historia Vitae et Regni Ricardi Secundi, ed. George B. Stow. Philadelphia: University of Pennsylvania Press. . Knighton, Henry (1995). Knighton's Chronicle 1337\u20131396, ed. G. H. Martin. Oxford: Clarendon Press. . Walsingham, Thomas (1862\u201364). Historia Anglicana 2 vols., ed. Henry Thomas Riley. London: Longman, Roberts, and Green Secondary sources Alexander, Jonathan; Binski, Paul (eds.) (1987). Age of Chivalry, Art in Plantagenet England, 1200\u20131400. London: Royal Academy/Weidenfeld & Nicolson. Levey, Michael (1971). Painting at Court. London: Weidenfeld and Nicolson. External links", 
                      "John of Gaunt's brother Edmund of Langley was only one year younger, but it has been suggested that this prince was of \"limited ability\", and he took less part in government than Gaunt did. b. It has been speculated that the whole incident surrounding the killing of Wat Tyler was in fact planned in advance by the council, in order to end the rebellion. c. While both England and the Empire supported Pope Urban VI in Rome, the French sided with the Avignon Papacy of Clement VII. d. This \"appeal\"which would give its name to the Lords Appellantwas not an appeal in the modern sense of an application to a higher authority. In medieval common law the appeal was criminal charge, often one of treason."],
                      return_tensors='pt', padding=True)

model.eval()
with torch.no_grad():
    scores = model(**features).logits
    print("The first document has the similarity score:", scores[0][0].item())
    print("The second document has the similarity score:", scores[1][0].item())
    print("The third document has the similarity score:", scores[2][0].item())

# determine the highest similarity score and wich document it belongs to

similarity_scores = [scores[0][0].item(), scores[1][0].item(), scores[2][0].item()]
max_score = max(similarity_scores)
max_score_index = similarity_scores.index(max_score)

print(f"The document with the highest similarity score is document {max_score_index + 1} with a score of {max_score:.2f}")

The first document has the similarity score: -6.033543109893799
The second document has the similarity score: -6.910589694976807
The third document has the similarity score: 2.7864646911621094
The document with the highest similarity score is document 3 with a score of 2.79


In [14]:
# open parquet file

import pandas as pd

# Load the data from the Parquet file

df = pd.read_parquet("C:/Users/linus/Downloads/a.parquet/a.parquet")

In [23]:
df.tail()

Unnamed: 0,id,title,text,categories
442721,73426511,A‘ea‘e (group),A‘ea‘e is a Hawaiian music group composed of K...,"[Hawaiian music, 2022 in music]"
442722,15845764,A∞-operad,In the theory of operads in algebra and algebr...,"[Abstract algebra, Algebraic topology]"
442723,15156877,A♭ (musical note),A (A-flat; also called la bémol) is the ninth ...,[Musical notes]
442724,2994338,A♯ (Axiom),A♯ (pronounced: A sharp) is an object-oriented...,"[Functional languages, Discontinued programmin..."
442725,22858655,A♯1 Roller Rager,"""A#1 Roller Rager"" is a song by American rock ...","[2009 singles, CKY (band) songs, 2009 songs, R..."
