# RGU-IIT CBR Retrieval Simulator

## Import Libraries

In [2]:
!pip install datasets

from tqdm import tqdm
from typing import *
from datasets import load_dataset, Dataset, DatasetDict
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

import numpy as np
import requests
import pandas as pd
import json

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets-2

## CBR system

In [3]:
class Case:
    """One entry of the case base: its texts plus pre-computed embeddings.

    Args:
        index: Identifier of the case (stored as-is).
        question: Question text of the case.
        matching_question_embeddings: Embedding of the question, either as a
            string holding a Python list literal (e.g. ``"[0.1, -0.2]"``) or
            as an already-parsed list / tuple / ``numpy.ndarray``.
        text: Snippet text of the case.
        retrieval_text_embeddings: Embedding of ``text`` (same accepted forms).
        keywords: Keywords of the case.
        retrieval_keywords_embeddings: Embedding of ``keywords`` (same forms).
        answer: Answer text of the case.

    Raises:
        ValueError / SyntaxError: If an embedding string is not a plain literal.
        TypeError: If an embedding is neither text nor a sequence.
    """

    def __init__(self, index, question, matching_question_embeddings, text, retrieval_text_embeddings, keywords, retrieval_keywords_embeddings, answer):
        self.index = index
        self.question = question
        self.matching_question_embeddings = self._parse_embedding(matching_question_embeddings)
        self.text = text
        self.retrieval_text_embeddings = self._parse_embedding(retrieval_text_embeddings)
        self.keywords = keywords
        self.retrieval_keywords_embeddings = self._parse_embedding(retrieval_keywords_embeddings)
        self.answer = answer

    @staticmethod
    def _parse_embedding(raw):
        """Convert a serialized or already-parsed embedding into a numpy array."""
        # Function-scope import so the class does not depend on a new top-level import.
        import ast

        if isinstance(raw, bytes):
            raw = raw.decode()
        if isinstance(raw, str):
            # literal_eval only accepts Python literals; eval() would execute
            # whatever expression happens to be stored in the data file.
            raw = ast.literal_eval(raw)
        elif not isinstance(raw, (list, tuple, np.ndarray)):
            raise TypeError(
                f"embedding must be a string literal or a sequence, got {type(raw).__name__}"
            )
        return np.array(raw)

# Empty module-level list; nothing else in the visible notebook reads or appends to it.
case_database = []

class CbrSystem:
    """Case base that ranks its cases against a query by weighted cosine similarity.

    Attributes:
        cases: The cases passed to the constructor.
        global_matching_question_embeddings: Question embedding of every case,
            in case order.
        global_retrieval_text_embeddings: Snippet-text embedding of every case.
        global_retrieval_keyword_embeddings: Keyword embedding of every case.
    """

    def __init__(self, cases):
        self.cases = cases
        self.global_matching_question_embeddings = []
        self.global_retrieval_text_embeddings = []
        self.global_retrieval_keyword_embeddings = []
        for case in self.cases:
            self.global_matching_question_embeddings.append(case.matching_question_embeddings)
            self.global_retrieval_text_embeddings.append(case.retrieval_text_embeddings)
            self.global_retrieval_keyword_embeddings.append(case.retrieval_keywords_embeddings)

    @staticmethod
    def _similarities(query_embeddings, case_embeddings):
        """Cosine similarity of one query vector against every case vector."""
        query = np.asarray(query_embeddings).reshape(1, -1)
        # One batched call instead of one sklearn call per case.
        return cosine_similarity(query, np.asarray(case_embeddings))[0]

    def retrieve_matches(self, matching_query_embeddings, retrieval_query_embeddings, document_count, question_weight, snippet_weight, keywords_weight):
        """Return the ``document_count`` cases most similar to the query.

        The score of a case is
        ``question_weight * sim(matching query, case question)
        + snippet_weight * sim(retrieval query, case snippet)
        + keywords_weight * sim(retrieval query, case keywords)``;
        a component whose weight is not positive is skipped and contributes 0.

        Args:
            matching_query_embeddings: Query vector compared with case questions.
            retrieval_query_embeddings: Query vector compared with case snippets
                and case keywords.
            document_count: Maximum number of cases to return; values <= 0
                yield an empty result.
            question_weight: Weight of the question similarity.
            snippet_weight: Weight of the snippet similarity.
            keywords_weight: Weight of the keyword similarity.

        Returns:
            ``(results, closest_case_indices)``: the matched cases in descending
            score order and a numpy array with their positions in ``self.cases``.
        """
        case_count = len(self.cases)
        if case_count == 0:
            # Nothing to rank; avoid calling sklearn on empty input.
            return [], np.array([], dtype=np.intp)

        combined_similarities = np.zeros(case_count)
        if question_weight > 0:
            combined_similarities = combined_similarities + question_weight * self._similarities(
                matching_query_embeddings, self.global_matching_question_embeddings)
        if snippet_weight > 0:
            combined_similarities = combined_similarities + snippet_weight * self._similarities(
                retrieval_query_embeddings, self.global_retrieval_text_embeddings)
        if keywords_weight > 0:
            combined_similarities = combined_similarities + keywords_weight * self._similarities(
                retrieval_query_embeddings, self.global_retrieval_keyword_embeddings)

        # Reverse first, then take a prefix: the former `[-document_count:]`
        # slice returned *every* case when document_count was 0.
        ranked_indices = np.argsort(combined_similarities)[::-1]
        closest_case_indices = ranked_indices[:max(document_count, 0)]

        results = [self.cases[index] for index in closest_case_indices]
        return results, closest_case_indices

## Load assets from pre-built sources

In [5]:
# Case base: raw texts plus one serialized embedding column per
# (field, embedding model, purpose) combination.
df = pd.read_csv("resources/main.csv")

# Embedding columns are named "<field>_<model>_bert_<purpose>_embeddings".
# The nesting order below reproduces the column order used so far:
# field -> purpose -> model.
selected_columns = ['question', 'answer', 'snippet', 'keywords'] + [
    f'{field}_{model}_bert_{purpose}_embeddings'
    for field in ('question', 'answer', 'keyword', 'snippet')
    for purpose in ('matching', 'retrieval')
    for model in ('normal', 'legal', 'angle')
]

# Keep only the needed columns and one row per distinct question.
main_df = df[selected_columns].drop_duplicates(subset='question').reset_index(drop=True)
print(len(main_df))

# Held-out questions that the simulator is run against.
df_test = pd.read_csv("main_test.csv")
print(len(df_test))

2084
35


In [6]:
def _build_cases(frame, model):
    """Build one Case per row of ``frame`` using the embeddings of ``model``.

    Args:
        frame: DataFrame with 'question', 'snippet', 'keywords', 'answer' and
            the '<field>_<model>_bert_<purpose>_embeddings' columns.
        model: Embedding model prefix: 'normal', 'legal' or 'angle'.

    Returns:
        List of Case objects in row order; each case's index is the row label.
    """
    question_column = f'question_{model}_bert_matching_embeddings'
    snippet_column = f'snippet_{model}_bert_retrieval_embeddings'
    keyword_column = f'keyword_{model}_bert_retrieval_embeddings'
    # total= lets tqdm show a real progress bar and ETA instead of a bare counter.
    rows = tqdm(frame.iterrows(), total=len(frame), desc=f'{model}_bert cases')
    return [
        Case(index, row['question'], row[question_column], row['snippet'], row[snippet_column], row['keywords'], row[keyword_column], row['answer'])
        for index, row in rows
    ]


cases_normal_bert = _build_cases(main_df, 'normal')
cases_legal_bert = _build_cases(main_df, 'legal')
cases_angle_bert = _build_cases(main_df, 'angle')

# One retrieval system per embedding model.
normal_bert_cbr_system = CbrSystem(cases_normal_bert)
legal_bert_cbr_system = CbrSystem(cases_legal_bert)
angle_bert_cbr_system = CbrSystem(cases_angle_bert)

print('\n Cases loaded into the systems')

2084it [01:30, 22.90it/s]


 Cases loaded into the systems





## Algorithms

In [7]:
# Pipeline 2 - RAG on question
def execute_pipeline_2(cbr_system, question, question_matching_embeddings, question_retrieval_embeddings, k):
    """Rank cases by question similarity only (weights 1, 0, 0).

    Args:
        cbr_system: Object exposing ``retrieve_matches``.
        question: Not used by this function; kept so all pipelines share a signature.
        question_matching_embeddings: String holding a list literal of floats.
        question_retrieval_embeddings: String holding a list literal of floats.
        k: Number of case indexes to request.

    Returns:
        The case indexes returned by ``cbr_system.retrieve_matches``.
    """
    import ast  # function-scope import keeps this function self-contained

    # literal_eval parses literals only; eval() would run arbitrary code found in the data.
    matching = np.array(ast.literal_eval(question_matching_embeddings))
    retrieval = np.array(ast.literal_eval(question_retrieval_embeddings))
    _, indexes = cbr_system.retrieve_matches(matching, retrieval, k, 1, 0, 0)
    return indexes

# Pipeline 4 - RAG on snippet
def execute_pipeline_4(cbr_system, question, question_matching_embeddings, question_retrieval_embeddings, k):
    """Rank cases by snippet similarity only (weights 0, 1, 0).

    Args:
        cbr_system: Object exposing ``retrieve_matches``.
        question: Not used by this function; kept so all pipelines share a signature.
        question_matching_embeddings: String holding a list literal of floats.
        question_retrieval_embeddings: String holding a list literal of floats.
        k: Number of case indexes to request.

    Returns:
        The case indexes returned by ``cbr_system.retrieve_matches``.
    """
    import ast  # function-scope import keeps this function self-contained

    # literal_eval parses literals only; eval() would run arbitrary code found in the data.
    matching = np.array(ast.literal_eval(question_matching_embeddings))
    retrieval = np.array(ast.literal_eval(question_retrieval_embeddings))
    _, indexes = cbr_system.retrieve_matches(matching, retrieval, k, 0, 1, 0)
    return indexes

# Pipeline 6 - RAG on keywords / entities
def execute_pipeline_6(cbr_system, question, question_matching_embeddings, question_retrieval_embeddings, k):
    """Rank cases by keyword similarity only (weights 0, 0, 1).

    Args:
        cbr_system: Object exposing ``retrieve_matches``.
        question: Not used by this function; kept so all pipelines share a signature.
        question_matching_embeddings: String holding a list literal of floats.
        question_retrieval_embeddings: String holding a list literal of floats.
        k: Number of case indexes to request.

    Returns:
        The case indexes returned by ``cbr_system.retrieve_matches``.
    """
    import ast  # function-scope import keeps this function self-contained

    # literal_eval parses literals only; eval() would run arbitrary code found in the data.
    matching = np.array(ast.literal_eval(question_matching_embeddings))
    retrieval = np.array(ast.literal_eval(question_retrieval_embeddings))
    _, indexes = cbr_system.retrieve_matches(matching, retrieval, k, 0, 0, 1)
    return indexes

# Pipeline X - RAG on question / snippet / keywords with dynamic weights (w1, w2, w3) [used for pipelines 8, 10, 12]
def execute_pipeline_x(cbr_system, question, question_matching_embeddings, question_retrieval_embeddings, k, w1, w2, w3):
    """Rank cases by a weighted mix of question, snippet and keyword similarity.

    Args:
        cbr_system: Object exposing ``retrieve_matches``.
        question: Not used by this function; kept so all pipelines share a signature.
        question_matching_embeddings: String holding a list literal of floats.
        question_retrieval_embeddings: String holding a list literal of floats.
        k: Number of case indexes to request.
        w1: Weight passed as the question weight.
        w2: Weight passed as the snippet weight.
        w3: Weight passed as the keywords weight.

    Returns:
        The case indexes returned by ``cbr_system.retrieve_matches``.
    """
    import ast  # function-scope import keeps this function self-contained

    # literal_eval parses literals only; eval() would run arbitrary code found in the data.
    matching = np.array(ast.literal_eval(question_matching_embeddings))
    retrieval = np.array(ast.literal_eval(question_retrieval_embeddings))
    _, indexes = cbr_system.retrieve_matches(matching, retrieval, k, w1, w2, w3)
    return indexes

## Run simulator

In [8]:
# Number of case indexes requested from every pipeline for each test question.
# 2084 equals the main_df length printed when the assets were loaded, so each
# pipeline returns a ranking of the whole case base.
# NOTE(review): presumably this should track len(main_df) — confirm before changing the data.
k = 2084

In [None]:
# Retrieval system per embedding model. The keys double as the prefix of the
# df_test embedding columns and of the output columns.
cbr_systems = {
    'normal_bert': normal_bert_cbr_system,
    'legal_bert': legal_bert_cbr_system,
    'angle_bert': angle_bert_cbr_system,
}

# Pipelines that rank on a single similarity source.
single_source_pipelines = {
    2: execute_pipeline_2,
    4: execute_pipeline_4,
    6: execute_pipeline_6,
}

# Pipelines that mix the sources: (question, snippet, keywords) weights.
weighted_pipelines = {
    8: (0.2, 0.4, 0.4),
    10: (0.3, 0.4, 0.3),
    12: (0.25, 0.4, 0.35),
}

# Collect one dict per test question and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the frame on every row.
records = []
for index, row in df_test.iterrows():
    record = row.to_dict()
    record['case_index'] = index

    for model, cbr_system in cbr_systems.items():
        question = row['question']
        matching = row[f'question_{model}_matching_embeddings']
        retrieval = row[f'question_{model}_retrieval_embeddings']

        for number, pipeline in single_source_pipelines.items():
            record[f'{model}_pipeline_{number}_case_indexes'] = pipeline(cbr_system, question, matching, retrieval, k)
        for number, (w1, w2, w3) in weighted_pipelines.items():
            record[f'{model}_pipeline_{number}_case_indexes'] = execute_pipeline_x(cbr_system, question, matching, retrieval, k, w1, w2, w3)

    records.append(record)

new_df = pd.DataFrame(records, index=df_test.index)
# The frame used to start from an empty DataFrame with an 'index' column, which
# therefore came first in the output. Keep that column (NaN when df_test has no
# such column) so the resulting schema is unchanged.
new_df = new_df.reindex(columns=['index'] + [column for column in new_df.columns if column != 'index'])

In [10]:
# Show the assembled frame: the test rows plus one ranked case-index column per model and pipeline.
new_df

Unnamed: 0,index,question,answer,original_texts,question_normal_bert_matching_embeddings,question_legal_bert_matching_embeddings,question_angle_bert_matching_embeddings,question_normal_bert_retrieval_embeddings,question_legal_bert_retrieval_embeddings,question_angle_bert_retrieval_embeddings,...,legal_bert_pipeline_6_case_indexes,legal_bert_pipeline_8_case_indexes,legal_bert_pipeline_10_case_indexes,legal_bert_pipeline_12_case_indexes,angle_bert_pipeline_2_case_indexes,angle_bert_pipeline_4_case_indexes,angle_bert_pipeline_6_case_indexes,angle_bert_pipeline_8_case_indexes,angle_bert_pipeline_10_case_indexes,angle_bert_pipeline_12_case_indexes
0,,In the case Econ Holdings Pty Ltd v Sims Lockw...,In the case Econ Holdings Pty Ltd v Sims Lockw...,[114 169],"[0.10423540323972702, -0.1665179282426834, -0....","[0.0027042031288146973, -0.12522736191749573, ...","[-0.3710246682167053, -0.30695846676826477, -0...","[0.09509215503931046, -0.2021857500076294, -0....","[-0.0023545799776911736, -0.10421659797430038,...","[-0.4049617350101471, -0.05314958840608597, -0...",...,"[640, 1451, 741, 1611, 1311, 599, 1897, 1803, ...","[106, 761, 242, 1603, 1311, 1389, 599, 1803, 6...","[106, 1603, 761, 1311, 242, 1800, 1950, 810, 2...","[106, 761, 1603, 242, 1311, 1389, 1950, 258, 5...","[1591, 1259, 264, 1085, 1717, 1926, 1850, 2001...","[1389, 861, 1484, 738, 867, 326, 2029, 2047, 5...","[1897, 640, 1145, 1950, 597, 425, 114, 1213, 4...","[1145, 1950, 640, 597, 1845, 1897, 2047, 1389,...","[1145, 1950, 640, 597, 1845, 1075, 264, 1259, ...","[1145, 1950, 640, 597, 1845, 1075, 2047, 114, ..."
1,,Considering the balance of public interest and...,The details in Events 38832023 and 3860136 inc...,[270 347],"[-0.019197409972548485, -0.0016128727002069354...","[-0.12920181453227997, 0.08838242292404175, 0....","[0.624129056930542, -0.3914183974266052, -0.04...","[0.02651604823768139, -0.013784815557301044, -...","[-0.05422603711485863, 0.1209280863404274, -0....","[0.36398348212242126, -0.16158992052078247, -0...",...,"[20, 1920, 540, 495, 1606, 1528, 1451, 726, 88...","[1920, 1606, 1430, 660, 1451, 495, 934, 1191, ...","[1430, 1606, 1920, 660, 180, 1191, 1451, 934, ...","[1430, 1606, 1920, 660, 1451, 180, 1191, 934, ...","[270, 933, 1291, 1534, 1633, 1243, 366, 501, 3...","[347, 976, 369, 307, 1743, 628, 380, 677, 933,...","[1822, 347, 837, 883, 2007, 687, 371, 206, 123...","[347, 369, 933, 976, 837, 1720, 206, 307, 1648...","[347, 369, 933, 976, 1720, 837, 270, 307, 206,...","[347, 369, 933, 976, 837, 1720, 307, 206, 270,..."
2,,In two separate cases heard in the Supreme Cou...,In the case of Mekhail v Hana; Mekail v Hana; ...,[ 388 1662],"[0.02596280165016651, -0.07136604189872742, -0...","[-0.032598309218883514, 0.283461332321167, 0.0...","[-0.322487473487854, -0.10779520124197006, -0....","[0.06910744309425354, -0.13112026453018188, -0...","[-0.0018813080387189984, 0.2632799446582794, 0...","[-0.3880188763141632, 0.0675685852766037, -0.7...",...,"[124, 1444, 1451, 1417, 741, 551, 744, 1357, 1...","[488, 124, 81, 1367, 805, 1950, 22, 1444, 1897...","[488, 1367, 81, 124, 805, 22, 1950, 1741, 1897...","[488, 81, 124, 1367, 805, 22, 1950, 1444, 1741...","[1850, 591, 334, 1833, 580, 72, 688, 1978, 113...","[867, 1145, 1781, 1144, 1235, 2047, 1263, 1259...","[1145, 1605, 1844, 1213, 2047, 1614, 1519, 113...","[1145, 2047, 1138, 1259, 275, 1844, 1484, 1748...","[1145, 2047, 1138, 1259, 275, 1845, 1263, 1166...","[1145, 2047, 1138, 1259, 275, 1845, 1166, 254,..."
3,,"In the context of the Federal Court Rules, how...","According to the judgment of Gordon J, an amen...",[1881 1881],"[0.05399247631430626, 0.03763565793633461, -0....","[-0.0940106064081192, 0.031954631209373474, 0....","[-0.08438273519277573, -0.07412683963775635, -...","[0.0569213330745697, -0.042861513793468475, -0...","[-0.09847217798233032, 0.03212566673755646, -0...","[-0.20830093324184418, 0.040662266314029694, -...",...,"[1451, 1831, 714, 1417, 1315, 1126, 1858, 794,...","[1881, 810, 1305, 1451, 563, 989, 1126, 1858, ...","[1881, 810, 1305, 563, 989, 488, 176, 1858, 18...","[1881, 810, 1305, 563, 989, 1451, 488, 1126, 1...","[1881, 342, 1907, 563, 1311, 1394, 369, 1304, ...","[1747, 761, 1881, 810, 1781, 342, 563, 1600, 5...","[2026, 1981, 1605, 126, 28, 342, 810, 1537, 41...","[1881, 342, 810, 2026, 1747, 563, 1981, 56, 14...","[1881, 342, 810, 1747, 563, 1394, 761, 2026, 1...","[1881, 342, 810, 1747, 563, 2026, 56, 1403, 19..."
4,,What are the specific models of Hiller UH-12 S...,"For the Hiller UH-12 Series Helicopters, the A...",[475 762],"[0.027825091034173965, 0.30563709139823914, 0....","[-0.19913476705551147, 0.2747645676136017, 0.3...","[0.2879544496536255, 0.6913779973983765, 0.732...","[-0.005272657610476017, 0.23753677308559418, 0...","[-0.2159091681241989, 0.2196102738380432, 0.19...","[0.2321493923664093, 0.7232281565666199, 0.550...",...,"[66, 475, 1018, 1707, 536, 128, 236, 1471, 121...","[1018, 905, 1707, 1486, 1466, 66, 1422, 475, 1...","[905, 1018, 1486, 1810, 1422, 1466, 1928, 1276...","[1018, 905, 1486, 1466, 1422, 1707, 1928, 1810...","[593, 1018, 882, 665, 1486, 1560, 1335, 1267, ...","[475, 1422, 1267, 2041, 1192, 593, 1229, 1270,...","[475, 2063, 1486, 1996, 1540, 1192, 905, 1707,...","[475, 1422, 1486, 2063, 1267, 1192, 905, 1707,...","[1422, 1486, 1267, 475, 2063, 593, 905, 1192, ...","[1422, 1486, 475, 1267, 2063, 1192, 593, 905, ..."
5,,How does the sentencing structure differ under...,The old form of Section 44 of the Crimes (Sent...,[ 566 1968],"[-0.086128830909729, -0.10069213807582855, -0....","[-0.08524221926927567, 0.03350608050823212, 0....","[-0.1289568543434143, -0.28314706683158875, -0...","[-0.01946346089243889, -0.12746095657348633, -...","[-0.07383330911397934, 0.0236778873950243, -0....","[-0.17468424141407013, -0.15482939779758453, -...",...,"[1906, 1968, 566, 228, 790, 1009, 452, 1451, 7...","[566, 1968, 1009, 1333, 1163, 779, 857, 1451, ...","[566, 1968, 1163, 1009, 1333, 779, 857, 1430, ...","[566, 1968, 1009, 1163, 1333, 779, 857, 1451, ...","[566, 709, 1493, 1163, 257, 1089, 589, 720, 89...","[566, 1163, 1493, 1889, 834, 228, 608, 1162, 1...","[1968, 566, 228, 364, 1465, 878, 1163, 1561, 8...","[566, 1163, 228, 1968, 1493, 364, 132, 790, 11...","[566, 1163, 1493, 228, 1968, 364, 132, 290, 11...","[566, 1163, 228, 1493, 1968, 364, 132, 790, 11..."
6,,What are the specific requirements for Boeing ...,"For Boeing 747 Series Aeroplanes, the Airworth...",[1466 1486],"[0.15021291375160217, 0.2642771303653717, 0.06...","[0.17781013250350952, 0.28141024708747864, 0.2...","[0.6273146867752075, 0.4227374792098999, 0.174...","[0.09855014085769653, 0.22745323181152344, 0.1...","[0.1508861631155014, 0.25192752480506897, 0.13...","[0.4547281265258789, 0.5376385450363159, 0.047...",...,"[475, 1018, 1707, 236, 1059, 1486, 128, 66, 12...","[1486, 1018, 1267, 1707, 1059, 475, 1276, 1466...","[1486, 1267, 1276, 1018, 1059, 1466, 905, 1810...","[1486, 1267, 1018, 1276, 1059, 1707, 1466, 475...","[1267, 974, 1192, 1236, 1486, 378, 1176, 1495,...","[1495, 1486, 475, 1540, 1401, 1761, 1896, 1707...","[1267, 1486, 475, 1540, 1516, 1896, 1707, 1508...","[1486, 1267, 475, 1540, 1495, 1707, 1072, 1761...","[1486, 1267, 475, 1540, 1495, 1707, 1072, 1761...","[1486, 1267, 475, 1540, 1495, 1707, 1072, 1761..."
7,,What are the key differences in the applicabil...,The Airworthiness Directive (AD) for Fokker F2...,[1508 1928],"[-0.00017284211935475469, 0.25689345598220825,...","[-0.10534936189651489, 0.05295746773481369, 0....","[0.42332953214645386, 0.29551276564598083, 0.1...","[-0.015311392024159431, 0.1417405903339386, 0....","[-0.11512213945388794, 0.07270382344722748, 0....","[0.17148461937904358, 0.33494460582733154, 0.0...",...,"[66, 1018, 236, 1059, 475, 536, 1707, 855, 148...","[1018, 1059, 66, 1707, 1236, 1486, 1928, 475, ...","[1018, 1059, 1236, 1486, 1707, 1928, 66, 1124,...","[1018, 1059, 1236, 1707, 66, 1486, 1928, 1124,...","[1284, 1540, 1192, 1236, 1466, 1508, 1495, 193...","[1508, 1540, 236, 1495, 199, 1401, 1018, 557, ...","[1540, 236, 1508, 1236, 1018, 1896, 1707, 1637...","[1540, 1508, 236, 1018, 1637, 1401, 1707, 1072...","[1540, 1508, 236, 1018, 1072, 1637, 475, 1284,...","[1540, 1508, 236, 1018, 1072, 1637, 1707, 1401..."
8,,In the judgments made by Commissioner Susan Di...,Both Commissioner Susan Dixon and Commissioner...,[897 907],"[0.025714773684740067, -0.043090883642435074, ...","[0.03737148270010948, 0.30224883556365967, 0.1...","[0.0023732660338282585, -0.1447940170764923, -...","[0.05698379501700401, -0.07684947550296783, 0....","[0.03528956323862076, 0.28450819849967957, 0.0...","[-0.12175329774618149, 0.027633829042315483, -...",...,"[885, 1451, 1139, 1444, 1202, 20, 2005, 1105, ...","[1968, 1451, 63, 488, 1139, 940, 885, 1191, 13...","[1968, 63, 488, 1451, 1088, 1961, 1191, 1897, ...","[1968, 63, 1451, 488, 1088, 1191, 1961, 1139, ...","[658, 1311, 1675, 994, 1914, 372, 903, 97, 196...","[761, 1781, 1856, 1821, 1747, 1891, 677, 1145,...","[1288, 651, 907, 954, 1794, 1556, 390, 1139, 2...","[907, 1856, 1145, 1556, 1587, 342, 1981, 390, ...","[907, 1856, 1145, 342, 1311, 1587, 1556, 372, ...","[907, 1856, 1145, 342, 1587, 1556, 1311, 1981,..."
9,,In the two separate development application ca...,"In the first case, Carlton Crescent Developmen...",[944 961],"[-0.2238706797361374, -0.007589583285152912, -...","[0.00920430850237608, 0.2334461510181427, 0.13...","[0.16463066637516022, 0.037474364042282104, -0...","[-0.11909535527229309, -0.041183918714523315, ...","[-0.03520782291889191, 0.12481970340013504, 0....","[0.008392482995986938, 0.16243430972099304, -0...",...,"[885, 635, 949, 643, 940, 1103, 1856, 1451, 18...","[1088, 1230, 889, 1451, 806, 892, 1191, 63, 33...","[1088, 1230, 889, 806, 892, 1451, 63, 1191, 32...","[1088, 1230, 889, 806, 1451, 892, 63, 1191, 33...","[1274, 119, 1171, 155, 1144, 944, 1523, 98, 19...","[2082, 1306, 1798, 931, 898, 1051, 70, 155, 11...","[889, 1271, 1171, 1145, 254, 1428, 109, 118, 1...","[1171, 944, 961, 2082, 155, 1306, 1145, 889, 6...","[1171, 944, 961, 155, 2082, 1306, 1845, 931, 3...","[1171, 944, 961, 155, 2082, 1306, 651, 1845, 1..."


In [11]:
# Authenticate against the Hugging Face Hub via the CLI.
# NOTE(review): the token is written in clear text on this line, so it is saved
# with the notebook — consider reading it from an environment variable instead.
!huggingface-cli login --token XXX
# Replace the row labels with a fresh 0..n-1 index. This mutates new_df in place,
# so re-running earlier display cells shows the modified frame.
new_df.reset_index(drop=True, inplace=True)
# Wrap the frame as a DatasetDict with a single split and upload it to the Hub.
output_dataset = DatasetDict({'XXX': Dataset.from_pandas(new_df)})
output_dataset.push_to_hub('XXX')