In [24]:
from datasets import load_dataset

In [42]:
# Function to print sample rows with flexible column handling
def print_sample(dataset, dataset_name, num_samples=3):
    print(f"--- Sample from {dataset_name} ---")
    for i in range(num_samples):
        print(f"Sample {i + 1}:")
        
        # Check and print the 'text' or 'judgement' column if they exist
        if 'text' in dataset[i]:
            text_sample = dataset[i]['text'][:250] if isinstance(dataset[i]['text'], str) else dataset[i]['text']
            print("Text:", text_sample, "...")
        elif 'judgement' in dataset[i]:
            judgement_sample = dataset[i]['judgement'][:250] if isinstance(dataset[i]['judgement'], str) else dataset[i]['judgement']
            print("Judgement:", judgement_sample, "...")
        
        # Print 'summary' column if it exists, or use 'labels' if 'summary' is absent
        if 'summary' in dataset[i]:
            summary_sample = dataset[i]['summary'][:250] if isinstance(dataset[i]['summary'], str) else dataset[i]['summary']
            print("Summary:", summary_sample, "...")
        elif 'labels' in dataset[i]:
            labels_sample = dataset[i]['labels']
            if isinstance(labels_sample, list):
                print("Labels:", labels_sample[:250], "...")
            else:
                print("Labels:", labels_sample, "...")
        
        # Print 'dataset_name' if available
        if 'dataset_name' in dataset[i]:
            print("Dataset Name:", dataset[i]['dataset_name'])

        print()





In [26]:
# code for length checking the 'text' column of a dataset
def check_text_lengths(dataset, name):
    lengths = [len(text) for text in dataset['text']]  # Adjust key if needed
    print(f"{name} - Min length: {min(lengths)}, Max length: {max(lengths)}, Mean length: {sum(lengths) / len(lengths)}")
    return lengths

In [27]:
# Code for length checking the 'labels' column of a dataset
def check_labels_lengths(dataset, name):
    lengths = []
    for label in dataset['labels']:
        if isinstance(label, (list, str)):  # If label is a list or string, we can compute its length
            lengths.append(len(label))
        else:
            lengths.append(1)  # If it's an int or scalar, treat the length as 1
    
    # Print the min, max, and mean length of the labels
    print(f"{name} - Min length: {min(lengths)}, Max length: {max(lengths)}, Mean length: {sum(lengths) / len(lengths)}")
    
    return lengths


In [28]:
# Check for missing data
def check_missing_data(dataset, name):
    missing = [i for i, text in enumerate(dataset['text']) if not text]  # Adjust key if needed
    print(f"{name} - Missing entries: {len(missing)}")
    if missing:
        print(f"Indices with missing entries: {missing}")

| Dataset          | Source                        | Sub-domain | Task Type                     | Classes |
|------------------|-------------------------------|------------|-------------------------------|---------|
| ECtHR (Task A)    | Chalkidis et al. (2019)        | ECHR       | Multi-label classification     | 10+1    |
| ECtHR (Task B)    | Chalkidis et al. (2021a)       | ECHR       | Multi-label classification     | 10+1    |
| SCOTUS           | Spaeth et al. (2020)           | US Law     | Multi-class classification     | 14      |
| EUR-LEX          | Chalkidis et al. (2021b)       | EU Law     | Multi-label classification     | 100     |
| LEDGAR           | Tuggener et al. (2020)         | Contracts  | Multi-class classification     | 100     |
| UNFAIR-ToS       | Lippi et al. (2019)            | Contracts  | Multi-label classification     | 8+1     |
| CaseHOLD         | Zheng et al. (2021)            | US Law     | Multiple choice QA             | n/a     |


In [29]:
# Load the training and testing data

In [30]:
# Dataset 1: Joel Niklaus Legal Case Document Summarization (Train and Test)
ds1_train = load_dataset("joelniklaus/legal_case_document_summarization", split='train')
ds1_train = ds1_train.remove_columns(['dataset_name'])
ds1_train = ds1_train.rename_column('judgement', 'text')
ds1_train = ds1_train.rename_column('summary', 'labels')

ds1_test = load_dataset("joelniklaus/legal_case_document_summarization", split='test')
ds1_test = ds1_test.remove_columns(['dataset_name'])
ds1_test = ds1_test.rename_column('judgement', 'text')
ds1_test = ds1_test.rename_column('summary', 'labels')

# Dataset 2: Manasvi Kalyan Legal Documents Summary (Only 50 rows)
ds2 = load_dataset("manasvikalyan/legal-documents-summary")
ds2 = ds2['data']
ds2 = ds2.remove_columns(['summary_a2'])
ds2 = ds2.rename_column('summary_a1', 'labels')
ds2 = ds2.rename_column('judgement', 'text')

Repo card metadata block was not found. Setting CardData to empty.
Repo card metadata block was not found. Setting CardData to empty.


In [None]:
# ds3: LexGLUE - Case_hold (might not be suitable for summarization)
ds3_train = load_dataset("coastalcph/lex_glue", "case_hold", split='train')
ds3_train = ds3_train.rename_column('label', 'labels')
ds3_test = load_dataset("coastalcph/lex_glue", "case_hold", split='test')
ds3_test = ds3_test.rename_column('label', 'labels')

# ds4: LexGLUE - ecthr_a
ds4_train = load_dataset("coastalcph/lex_glue", "ecthr_a", split='train')
ds4_test = load_dataset("coastalcph/lex_glue", "ecthr_a", split='test')

# ds5: LexGLUE - ecthr_b
ds5_train = load_dataset("coastalcph/lex_glue", "ecthr_b", split='train')
ds5_test = load_dataset("coastalcph/lex_glue", "ecthr_b", split='test')

# ds6: LexGLUE - eurlex
ds6_train = load_dataset("coastalcph/lex_glue", "eurlex", split='train')
ds6_test = load_dataset("coastalcph/lex_glue", "eurlex", split='test')

# ds7: LexGLUE - ledgar
ds7_train = load_dataset("coastalcph/lex_glue", "ledgar", split='train')
ds7_train = ds7_train.rename_column('label', 'labels')
ds7_test = load_dataset("coastalcph/lex_glue", "ledgar", split='test')
ds7_test = ds7_test.rename_column('label', 'labels')

# ds8: LexGLUE - scotus
ds8_train = load_dataset("coastalcph/lex_glue", "scotus", split='train')
ds8_train = ds8_train.rename_column('label', 'labels')
ds8_test = load_dataset("coastalcph/lex_glue", "scotus", split='test')
ds8_test = ds8_test.rename_column('label', 'labels')

In [31]:
# ds9: AjayMukundS/Legal_Text_Summarization-llama2
ds9_train = load_dataset("AjayMukundS/Legal_Text_Summarization-llama2", split='train')
ds9_test = load_dataset("AjayMukundS/Legal_Text_Summarization-llama2", split='test')

Generating train split: 100%|████████████| 7773/7773 [00:00<00:00, 9076.83 examples/s]
Generating test split: 100%|███████████████| 200/200 [00:00<00:00, 5113.91 examples/s]


Repo card metadata block was not found. Setting CardData to empty.
Repo card metadata block was not found. Setting CardData to empty.


In [32]:
# Print the structures of the datasets
print(ds1_train)
print(ds2)

Dataset({
    features: ['text', 'labels'],
    num_rows: 7773
})
Dataset({
    features: ['text', 'labels'],
    num_rows: 50
})


In [None]:
print(ds3_train)
print(ds4_train)
print(ds5_train)
print(ds6_train)
print(ds7_train)
print(ds8_train)

In [33]:
print(ds9_train)

Dataset({
    features: ['judgement', 'dataset_name', 'summary', 'text'],
    num_rows: 7773
})


In [34]:
# Print samples from the datasets
print_sample(ds1_train, "ds1_train")
print_sample(ds2, "ds2")

--- Sample from ds1_train ---
Sample 1:
Text: Appeal No. LXVI of 1949.
Appeal from the High Court of judicature, Bombay, in a reference under section 66 of the Indian Income tax Act, 1022.
K.M. Munshi (N. P. Nathvani, with him), for the appel lant. ' M.C. Setalvad, Attorney General for India (H. ...
Labels: The charge created in respect of municipal property tax by section 212 of the City of Bombay Municipal Act, 1888, is an "annual charge not being a capital charge" within the mean ing of section 9 (1) (iv) of the Indian Income tax Act, 199.2, and the amount of such charge should therefore be deducted in computing the income from such property for the purposes of section 9 of the Indian Income tax Act.
The charge in respect of urban immoveable property tax created by the Bombay Finance Act, 1939 is similar in character and the amount of such charge should also be deducted.
The expression "capital charge" in s.9(1) (iv) means a charge created for a capital sum,that is to say, a charge 

In [None]:
# print_sample(ds3_train, "ds3_train")
print_sample(ds4_train, "ds4_train")
print_sample(ds5_train, "ds5_train")
print_sample(ds6_train, "ds6_train")
print_sample(ds7_train, "ds7_train")
print_sample(ds8_train, "ds8_train")

In [43]:
print_sample(ds9_train, "ds9_train")

--- Sample from ds9_train ---
Sample 1:
Text: <s>[INST] Summarize the following judgement: Appeal No. LXVI of 1949.
Appeal from the High Court of judicature, Bombay, in a reference under section 66 of the Indian Income tax Act, 1022.
K.M. Munshi (N. P. Nathvani, with him), for the appel lant. ' M.C. Setalvad, Attorney General for India (H. J. Umrigar, with him), for the respondent. 1950.
May 26.
The judgment of the Court was delivered by MEHR CHAND MAHAJAN J.
This is an appeal against a judgment of the High Court of Judicature at Bombay in an income tax matter and it raises the question whether munici pal property tax and urban immoveable property tax payable under the relevant Bombay Acts are allowable deductions under section 9 (1) (iv) of the Indian Income tax Act.
The assessee company is an investment company deriving its income from properties in the city of Bombay.
For the assessment year 1940 41 the net income of the assessee under the head "property" was computed by the Income 

In [None]:
print(ds1_train.column_names)
print(ds2.column_names)

In [None]:
print(ds3_train.column_names)
print(ds4_train.column_names)
print(ds5_train.column_names)
print(ds6_train.column_names)
print(ds7_train.column_names)
print(ds8_train.column_names)

In [36]:
print(ds9_train.column_names)

['judgement', 'dataset_name', 'summary', 'text']


In [39]:
# Check the lengths of the text columns of the datasets
ds1_text_lengths = check_text_lengths(ds1_train, "ds1_train")
ds2_text_lengths = check_text_lengths(ds2, "ds2")

ds1_train - Min length: 593, Max length: 808119, Mean length: 30381.683648526952
ds2 - Min length: 16809, Max length: 90849, Mean length: 30566.04


In [None]:
ds4_text_lengths = check_text_lengths(ds4_train, "ds4_train")
ds5_text_lengths = check_text_lengths(ds5_train, "ds5_train")
ds6_text_lengths = check_text_lengths(ds6_train, "ds6_train")
ds7_text_lengths = check_text_lengths(ds7_train, "ds7_train")
ds8_text_lengths = check_text_lengths(ds8_train, "ds8_train")

In [40]:
ds9_text_lengths = check_text_lengths(ds9_train, "ds9_train")

ds9_train - Min length: 1444, Max length: 966647, Mean length: 35458.08413739869


In [None]:
# Check the lengths of the labels columns of the datasets
ds1_labels_lengths = check_labels_lengths(ds1_train, "ds1_train")
ds2_labels_lengths = check_labels_lengths(ds2, "ds2")

In [None]:
ds4_labels_lengths = check_labels_lengths(ds4_train, "ds4_train")
ds5_labels_lengths = check_labels_lengths(ds5_train, "ds5_train")
ds6_labels_lengths = check_labels_lengths(ds6_train, "ds6_train")
ds7_labels_lengths = check_labels_lengths(ds7_train, "ds7_train")
ds8_labels_lengths = check_labels_lengths(ds8_train, "ds8_train")

In [68]:
# ds9_labels_lengths = check_labels_lengths(ds9_train, "ds9_train")

In [None]:
check_missing_data(ds1_train, "ds1_train")
check_missing_data(ds2, "ds2")

In [None]:
check_missing_data(ds4_train, "ds4_train")
check_missing_data(ds5_train, "ds5_train")
check_missing_data(ds6_train, "ds6_train")
check_missing_data(ds7_train, "ds7_train")
check_missing_data(ds8_train, "ds8_train")

In [44]:
check_missing_data(ds9_train, "ds9_train")

ds9_train - Missing entries: 0


In [60]:
def analyze_column_types(dataset, name):
    column_types = {}

    # Helper function to analyze the types within a list, including nested lists
    def analyze_list_types(lst):
        element_types = {}
        for item in lst:
            item_type = type(item).__name__
            # If it's a list, check the types within the list
            if isinstance(item, list):
                nested_list_element_types = analyze_list_types(item)
                list_type_str = f"List[{', '.join(list(nested_list_element_types.keys()))}]"
                element_types[list_type_str] = element_types.get(list_type_str, 0) + 1
            else:
                element_types[item_type] = element_types.get(item_type, 0) + 1
        return element_types

    # Iterate through all columns in the dataset
    for column_name in dataset.column_names:
        column_data = dataset[column_name]
        column_type_summary = {}

        # Analyze the types within each column
        for entry in column_data:
            entry_type = type(entry).__name__
            if isinstance(entry, list):
                # Handle lists, including nested lists
                list_element_types = analyze_list_types(entry)
                list_type_str = f"List[{', '.join(list(list_element_types.keys()))}]"
                column_type_summary[list_type_str] = column_type_summary.get(list_type_str, 0) + 1
            else:
                column_type_summary[entry_type] = column_type_summary.get(entry_type, 0) + 1

        column_types[column_name] = column_type_summary

    # Print the analysis for each column
    print(f"--- {name} Column Types and Data Structures Analysis ---")
    for column_name, type_summary in column_types.items():
        print(f"\nColumn: {column_name}")
        for entry_type, count in type_summary.items():
            print(f"  {entry_type}: {count} entries")

    return 0


In [62]:
# Analyze the types of 'text' and 'labels' columns in different datasets
analyze_column_types(ds1_train, "ds1_train")
analyze_column_types(ds2, "ds2")

--- ds1_train Column Types and Data Structures Analysis ---

Column: text
  str: 7773 entries

Column: labels
  str: 7773 entries
--- ds2 Column Types and Data Structures Analysis ---

Column: text
  str: 50 entries

Column: labels
  str: 50 entries


0

In [None]:
analyze_column_types(ds4_train, "ds4_train")
analyze_column_types(ds5_train, "ds5_train")
analyze_column_types(ds6_train, "ds6_train")
analyze_column_types(ds7_train, "ds7_train")
analyze_column_types(ds8_train, "ds8_train")

In [64]:
analyze_column_types(ds9_train, "ds9_train")

--- ds9_train Column Types and Data Structures Analysis ---

Column: judgement
  str: 7773 entries

Column: dataset_name
  str: 7773 entries

Column: summary
  str: 7773 entries

Column: text
  str: 7773 entries


0

### Dataset Compatibility Analysis

Based on the types of `text` and `labels` in the various datasets, here is the analysis of which datasets can be combined:

#### 1. **ds1_train** and **ds2**
- **Text**: Both datasets have `str` as the type for the `text` column, making them compatible for combining.
- **Labels**: Both datasets have `str` as the type for the `labels` column. While it's unusual to use `str` for labels, these two datasets are compatible due to the matching types.
- **Conclusion**: These datasets can be combined directly.

#### 2. **ds4_train** and **ds5_train**
- **Text**: Both datasets have `list of ['str']` as the type for the `text` column, making them compatible.
- **Labels**: Both datasets have `list of ['int']` as the dominant type for the `labels` column, though **ds4_train** has 914 entries with `list of []` and **ds5_train** has 134 entries with `list of []`. These empty lists in the labels will need to be handled.
- **Conclusion**: These datasets can be combined, but the empty lists in the `labels` column should be addressed.

#### 3. **ds6_train**
- **Text**: The `text` column is `str`, so it is compatible with **ds1_train**, **ds2**, **ds7_train**, and **ds8_train**.
- **Labels**: Most entries in the `labels` column are `list of ['int']`, but there are 4 entries with `list of []`. These empty label lists need to be handled before combining with other datasets.
- **Conclusion**: This dataset can be combined with **ds4_train** and **ds5_train** (after handling empty lists), but not directly with **ds1_train** or **ds2** unless the `str` labels are converted to `int`.

#### 4. **ds7_train** and **ds8_train**
- **Text**: Both datasets have `str` as the type for the `text` column, making them compatible for combining.
- **Labels**: Both datasets have `int` as the type for the `labels` column, so they are compatible.
- **Conclusion**: These datasets can be combined directly.

### Recommendations:
- **Combine these datasets:**
  - **ds1_train** + **ds2**: Same `str` type for text and labels.
  - **ds7_train** + **ds8_train**: Same `str` text and `int` labels.
  - **ds4_train** + **ds5_train**: Same `list of ['str']` text and `list of ['int']` labels (after handling empty label lists).

- **Handle empty lists** in the `labels` columns of **ds4_train**, **ds5_train**, and **ds6_train** before combining them with other datasets.

- **Consider converting `labels`** in **ds1_train** and **ds2** from `str` to `int` if combining them with **ds6_train**, **ds7_train**, or **ds8_train**.


In [67]:
# modified to print more of the actual samples extracted from ds1, ds2, and ds9
def print_sample(dataset, dataset_name, num_samples=3):
    print(f"--- Sample from {dataset_name} ---")
    for i in range(num_samples):
        print(f"Sample {i + 1}:")
        
        # Check and print the 'text' or 'judgement' column if they exist
        if 'text' in dataset[i]:
            text_sample = dataset[i]['text'][:10000] if isinstance(dataset[i]['text'], str) else dataset[i]['text']
            print("Text:", text_sample, "...")
        elif 'judgement' in dataset[i]:
            judgement_sample = dataset[i]['judgement'][:10000] if isinstance(dataset[i]['judgement'], str) else dataset[i]['judgement']
            print("Judgement:", judgement_sample, "...")
        
        # Print 'summary' column if it exists, or use 'labels' if 'summary' is absent
        if 'summary' in dataset[i]:
            summary_sample = dataset[i]['summary'][:10000] if isinstance(dataset[i]['summary'], str) else dataset[i]['summary']
            print("Summary:", summary_sample, "...")
        elif 'labels' in dataset[i]:
            labels_sample = dataset[i]['labels']
            if isinstance(labels_sample, list):
                print("Labels:", labels_sample[:10000], "...")
            else:
                print("Labels:", labels_sample, "...")
        
        # Print 'dataset_name' if available
        if 'dataset_name' in dataset[i]:
            print("Dataset Name:", dataset[i]['dataset_name'])

        print()

In [66]:
# Further Analysis of ds1, ds2, and ds9
print(ds1_train.column_names)
print(ds2.column_names)
print(ds9_train.column_names)

['text', 'labels']
['text', 'labels']
['judgement', 'dataset_name', 'summary', 'text']


In [71]:
# Print samples from the datasets
print_sample(ds1_train, "ds1_train")
print_sample(ds2, "ds2")

--- Sample from ds1_train ---
Sample 1:
Text: Appeal No. LXVI of 1949.
Appeal from the High Court of judicature, Bombay, in a reference under section 66 of the Indian Income tax Act, 1022.
K.M. Munshi (N. P. Nathvani, with him), for the appel lant. ' M.C. Setalvad, Attorney General for India (H. J. Umrigar, with him), for the respondent. 1950.
May 26.
The judgment of the Court was delivered by MEHR CHAND MAHAJAN J.
This is an appeal against a judgment of the High Court of Judicature at Bombay in an income tax matter and it raises the question whether munici pal property tax and urban immoveable property tax payable under the relevant Bombay Acts are allowable deductions under section 9 (1) (iv) of the Indian Income tax Act.
The assessee company is an investment company deriving its income from properties in the city of Bombay.
For the assessment year 1940 41 the net income of the assessee under the head "property" was computed by the Income tax Officer in the sum of Rs. 6,21,764 after 

In [77]:
print_sample(ds9_train, "ds9_train")

--- Sample from ds9_train ---
Sample 1:
Text: <s>[INST] Summarize the following judgement: Appeal No. LXVI of 1949.
Appeal from the High Court of judicature, Bombay, in a reference under section 66 of the Indian Income tax Act, 1022.
K.M. Munshi (N. P. Nathvani, with him), for the appel lant. ' M.C. Setalvad, Attorney General for India (H. J. Umrigar, with him), for the respondent. 1950.
May 26.
The judgment of the Court was delivered by MEHR CHAND MAHAJAN J.
This is an appeal against a judgment of the High Court of Judicature at Bombay in an income tax matter and it raises the question whether munici pal property tax and urban immoveable property tax payable under the relevant Bombay Acts are allowable deductions under section 9 (1) (iv) of the Indian Income tax Act.
The assessee company is an investment company deriving its income from properties in the city of Bombay.
For the assessment year 1940 41 the net income of the assessee under the head "property" was computed by the Income 

In [75]:
# Updated function to handle both pandas DataFrames and other Dataset objects (e.g., from Hugging Face)
def compare_text_samples(ds1, ds2, ds9, sample_index=0):
    """
    Compare a text entry from each dataset and print the text for analysis.
    
    Args:
    - ds1 (Dataset or pd.DataFrame): First dataset with a 'text' column.
    - ds2 (Dataset or pd.DataFrame): Second dataset with a 'text' column.
    - ds9 (Dataset or pd.DataFrame): Third dataset with a 'text' column (legal dataset).
    - sample_index (int): The index of the sample to compare (default is 0).
    
    Returns:
    None: Prints out text samples from each dataset.
    """

    def get_text_from_dataset(dataset, index, column_name='text'):
        """Helper function to extract text from different types of datasets"""
        if isinstance(dataset, pd.DataFrame):
            # If it's a pandas DataFrame, extract the text from the specified column
            return dataset.loc[index, column_name] if column_name in dataset.columns else f"No '{column_name}' column in dataset"
        elif hasattr(dataset, 'column_names') and column_name in dataset.column_names:
            # If it's a Dataset object (like Hugging Face), extract the text
            return dataset[index][column_name]
        else:
            return f"No '{column_name}' column in dataset"
    
    # Extract text from each dataset
    ds1_text = get_text_from_dataset(ds1, sample_index, 'text')
    ds2_text = get_text_from_dataset(ds2, sample_index, 'text')
    ds9_text = get_text_from_dataset(ds9, sample_index, 'text')
    
    # Print the sample texts for comparison
    print(f"Sample from ds1 (Index {sample_index}):\n")
    print(ds1_text)
    print("\n" + "="*80 + "\n")
    
    print(f"Sample from ds2 (Index {sample_index}):\n")
    print(ds2_text)
    print("\n" + "="*80 + "\n")
    
    print(f"Sample from ds9 (Index {sample_index}):\n")
    print(ds9_text)
    print("\n" + "="*80 + "\n")

# Usage example (replace ds1_train, ds2, and ds9_train with your actual datasets):
# compare_text_samples(ds1_train, ds2, ds9_train, sample_index=8)


In [76]:
compare_text_samples(ds1_train, ds2, ds9_train, 8)

Sample from ds1 (Index 8):

Appeal No. 198 of 1954.
Appeal from the judgment and order dated October 16, 1952, of the former Nagpur High Court in Misc.
; No. 1231 of 1951.
M. section K. Sastri, for the appellant.
H. L. Khaskalam, B. K. B. Naidu and I. N. Shroff, for the respondent.
64 502 1960.
November 18.
The Judgment of the Court was delivered by IMAM, J.
This is an appeal from the judgment of the Nagpur High Court dismissing the appellants petition under articles 226 and 227 of the Constitution of India.
The High Court certified under article 132(1) of the Constitution that the case involved a substantial question of law as to the interpretation of the Constitution.
Hence the present appeal.
The appellant was the Ruler of the State of Baster.
After the passing of the Indian Independence Act, 1947, the appellant executed an Instrument of Accession to the Dominion of India on August 14, 1947.
Thereafter, he entered into an agreement with the Dominion of India popularly known as "The 