# REGEX

In [1]:
import warnings
warnings.filterwarnings("ignore")
import re
import pandas as pd
import Levenshtein
import pyarrow.parquet as pq
import datasets
from difflib import SequenceMatcher

## SupremeCourtOfIsrael Dataset

In [3]:
# Hugging Face - SupremeCourtOfIsrael (7 Min)

# Load the dataset package
SupremeCourtOfIsrael = datasets.load_dataset('LevMuchnik/SupremeCourtOfIsrael')

# Convert the datasets to Dataframe
Hugging_Face_df = pd.DataFrame.from_dict(SupremeCourtOfIsrael['train'])

OR

In [2]:
# Load the dataset package locally (30 Sec)
Hugging_Face_df = pq.read_table(source='./SupremeCourtOfIsrael/cases_all.parquet').to_pandas()

In [3]:
# Remove rows with empty "text" column
Hugging_Face_df = Hugging_Face_df[Hugging_Face_df['text'].isna() == False]
print("Len:", len(Hugging_Face_df))

# Replace newline characters with space
Hugging_Face_df['text']= Hugging_Face_df['text'].apply(lambda text: text.replace("\n", " "))

Len: 750841


In [4]:
Hugging_Face_df[['Id', 'text']]

Unnamed: 0,Id,text
0,1,בבית המשפט העליון ...
1,159588,בבית המשפט העל...
2,160618,בבית המשפט העליו...
3,168038,בבית המשפט העל...
4,168411,בבית המשפט העליו...
...,...,...
751189,743252,בבית המשפט העליו...
751190,743254,בבית המשפט העליון ...
751191,743242,בבית המשפט העליון ...
751192,743258,בבית המשפט העליון ...


## Legal Clauses

In [5]:
# Define the list of Basic Laws
basic_laws = [
    'חוק-יסוד הכנסת',
    'חוק-יסוד מקרקעי ישראל',
    'חוק-יסוד נשיא המדינה',
    'חוק-יסוד משק המדינה',
    'חוק-יסוד הצבא',
    'חוק-יסוד ירושלים בירת ישראל',
    'חוק-יסוד השפיטה',
    'חוק-יסוד מבקר המדינה',
    'חוק-יסוד כבוד האדם וחירותו',
    'חוק-יסוד חופש העיסוק',
    'חוק-יסוד הממשלה',
    'חוק-יסוד משאל עם',
    'חוק-יסוד ישראל מדינת הלאום של העם היהודי'
]

# Prepare the regex pattern for Basic Laws
formats = [re.escape(law.split(' ')[1]) for law in basic_laws]  # Get the part after 'חוק-יסוד'
basic_laws_pattern = r'\bחוק-יסוד\s*[-: ]?\s*(?:' + '|'.join(formats) + r')\b'

# Define regex of legal clauses
legal_clauses_pattern = r'(?:תקנה|תקנות|סעיף|חוק|הלכה|פסיקה|פסיקת|צו|פקודה|פקודת|כלל|כללי|כללים|הוראה|הוראות)\s[א-ת\s–"\',()\[\]\-]*?(?:-|:|\s)?(?:ת[א-ת]{1,2}\s?\d{4}|\d{4})'

# Combine the patterns into one pattern that will match both legal clauses and basic Laws
combined_pattern = rf'{legal_clauses_pattern}|{basic_laws_pattern}'

In [6]:
def find_legal_text(text):
    # Find all legal text matches using the compiled combined pattern
    legal_matches = re.findall(combined_pattern, text, re.VERBOSE)
    
    # Filter out matches longer than 30 words
    legal_matches = [match.strip() for match in legal_matches if len(match.split()) <= 30]
    
    # Remove duplicates while preserving order
    seen = set()
    unique_matches = []
    for match in legal_matches:
        if all(Levenshtein.distance(match, existing) > 2 for existing in seen):
            unique_matches.append(match)
            seen.add(match)
    
    return unique_matches

In [7]:
df = Hugging_Face_df[['Id', 'text']]

In [8]:
# Apply find_legal_text function to the 'text' column and store the result in a new column (4 minutes)
Hugging_Face_df['Legal_Clauses_Found'] = Hugging_Face_df['text'].apply(find_legal_text)

In [9]:
Hugging_Face_df[['Id', 'text', 'Legal_Clauses_Found']]

Unnamed: 0,Id,text,Legal_Clauses_Found
0,1,בבית המשפט העליון ...,[]
1,159588,בבית המשפט העל...,[]
2,160618,בבית המשפט העליו...,[]
3,168038,בבית המשפט העל...,[]
4,168411,בבית המשפט העליו...,[]
...,...,...,...
751189,743252,בבית המשפט העליו...,"[חוק סדר הדין הפלילי (סמכויות אכיפה – מעצרים),..."
751190,743254,בבית המשפט העליון ...,[חוק סדר הדין הפלילי (סמכויות אכיפה – מעצרים...
751191,743242,בבית המשפט העליון ...,[חוק סדר הדין הפלילי (סמכויות אכיפה – מעצרים...
751192,743258,בבית המשפט העליון ...,"[חוק סדר הדין הפלילי (סמכויות אכיפה – מעצרים),..."


## Precedents

In [10]:
# Define initial letters that might precede the prefixes
initial_letters = r'(?:\b(?:ב|וב|ה|וה)\b)?'  # Optional initial letters

additional_prefixes = [
    "אב\"ע", "א\"ת", "את\"פ", "אמ\"ץ", "פ\"ר", "אפ\"ח", "א\"פ", "ב\"ל", "וח\"ק",
    "בק\"מ", "ת\"ת", "ביד\"מ", "בדמ\"ש", "בע\"ק", "בפ\"מ", "עה\"פ", "בה\"ן", "בה\"פ",
    "בפ\"ת", "בש\"ע", "בת\"ת", "בב\"נ", "בע\"א", "בר\"ש", "בר\"ע", "שב\"ד",
    "שנ\"א", "גמ\"ר", "דמ\"ר", "דמ\"ש", "דנ\"א", "דנג\"ץ", "דנ\"מ", "דנ\"פ",
    "ד\"ט", "הס\"ת", "המ\"ע", "ה\"כ", "ה\"ת", "ה\"ט", "ה\"נ", "ה\"פ", "הפ\"ב",
    "הד\"פ", "ה\"ד", "ת\"ט", "תה\"ן", "ו\"ע", "ח\"א", "חב\"ר", "חע\"מ", "חע\"ק",
    "ח\"ד", "ח\"נ", "חס\"מ", "י\"ס", "כ\"צ", "מק\"מ", "מ\"י", "מי\"ב", "מ\"מ",
    "מ\"ת", "מ\"ח", "נע\"ד", "ס\"ע", "ס\"ק", "סק\"כ", "פ\"ל", "עמ\"א", "ע\"א",
    "עב\"ז", "ע\"ב", "עב\"ל", "עח\"ר", "ע\"נ", "ער\"מ", "עמ\"ח", "על\"ע", "עמ\"נ",
    "ע\"מ", "עמ\"מ", "עש\"מ", "עמ\"ש", "ענ\"א", "ענ\"פ", "ענמ\"ש", "עס\"ק", "ע\"ע",
    "עב\"י", "עמל\"ע", "עמש\"מ", "עמר\"מ", "ער\"פ", "ע\"ר", "עמ\"פ", "עש\"ר", "ע\"ו",
    "על\"ח", "עק\"נ", "עק\"פ", "עע\"מ", "עעת\"א", "ע\"פ", "עפ\"א", "עפ\"ג", "עפ\"ר",
    "עפ\"ת", "עפס\"פ", "עפ\"ס", "עש\"א", "ע\"ש", "עש\"ת", "ע\"ח", "עב\"פ", "עא\"פ",
    "עח\"ע", "עע\"ר", "עפ\"ע", "עה\"ג", "עמ\"י", "עמ\"ת", "עכ\"ב", "עק\"מ", "עח\"ק",
    "עפ\"מ", "עפ\"ן", "בג\"ץ", "עג\"ר", "עת\"מ", "עת\"א", "פק\"ח", "פר\"ק", "פ\"ה",
    "פש\"ר", "צ\"א", "צ\"ה", "צ\"ח", "מ\"כ", "צ\"ו", "ק\"פ", "ק\"ג", "רע\"ס", "רע\"א",
    "רע\"מ", "רמ\"ש", "רצ\"פ", "רע\"צ", "רע\"ו", "רע\"ב", "רעת\"א", "רע\"פ", "רע\"ש",
    "רת\"ק", "ש\"ש", "ש", "ש\"ע", "ת\"ד", "נ\"ב", "תמ\"ק", "תמ\"ר", "תנ\"ג", "ת\"ק",
    "ת\"ב", "סב\"א", "גז\"ז", "ח\"ש", "תג\"א", "ת\"ח", "תנ\"ז", "תע\"א", "ת\"צ", "ת\"מ",
    "תא\"מ", "תא\"ח", "תא\"ק", "ת\"א", "תה\"ג", "תה\"ס", "ע\"ל", "תל\"א", "תל\"ב",
    "תל\"פ", "תמ\"ש", "ת\"ע", "ת\"פ", "תפ\"ח", "ת\"ג", "תת\"ח", "תת\"ע", "תו\"ח", "תו\"ב",
    "המ\"ש", "הע\"ז", "ש\"מ", "שע\"מ", "בש\"א", "ר\"ע", "ראו", "למשל", "בת.פ.", "ת.א."
]

# Join the prefixes into a regex pattern, ensuring word boundaries
prefix_pattern = r'\b(?:' + '|'.join(re.escape(prefix) for prefix in additional_prefixes) + r')\b'

# Pattern for numbers (1-8 digits), joined or separated by dashes or slashes
number_pattern = r'\b\d{1,8}(?:[-/]\d{1,8})*\b'

# Combine the patterns
full_pattern = initial_letters + prefix_pattern + r'[ ,.:;!?]*' + number_pattern

In [11]:
# Find legal precedents within the "text" column
def find_legal_precedents(text):
    pattern = full_pattern

    # Replace newline characters with space
    text = text.replace("\n", " ")

    precedents_matches = re.findall(pattern, text)
    
    # Filter matches to remove those with length exceeding too much characters
    filtered_matches = [match for match in precedents_matches if len(match) <= 25]

    # Remove duplicate legal clauses
    seen = set()
    unique_precedents_matches = []
    
    for match in filtered_matches:
        # Remove all spaces and compare
        cleaned_match = re.sub(r'\s+', '', match)
        if cleaned_match not in seen:
            seen.add(cleaned_match)
            unique_precedents_matches.append(match)
    
    return unique_precedents_matches

In [12]:
# Apply find_legal_precedents function to the 'text' column and store the result in a new column (9 min)
Hugging_Face_df['Precedents_Found'] = Hugging_Face_df['text'].apply(find_legal_precedents)

In [13]:
Hugging_Face_df[Hugging_Face_df['Precedents_Found'].apply(len) > 0].sort_values(by=['Id'])[['Id', 'Precedents_Found']]

Unnamed: 0,Id,Precedents_Found
0,1,"[בג""ץ 5856/03]"
11,3,"[ע""פ 7470/00]"
14,4,"[ע""א 3906/99]"
21,5,"[ע""א 4950/01]"
42,7,"[בש""א 10273/01]"
...,...,...
749889,743265,"[בג""ץ 5321/22]"
749950,743266,"[בג""ץ 5358/22]"
749958,743267,"[בג""ץ 5360/22]"
750957,743268,"[ע""א 6053/22]"


## Filter Hugging_Face_df

legal clauses

In [23]:
# Save only rows with legal clauses found in the "text" column
filtered_df = Hugging_Face_df[Hugging_Face_df['Legal_Clauses_Found'].apply(len) > 0]

In [30]:
# Save filtered_df to parquet file
parquet = filtered_df[['Id', 'text', 'Legal_Clauses_Found']]
parquet.to_parquet('Hugging_Face_df_Legal_Clauses.parquet', engine='pyarrow', compression='snappy')

precedents

In [12]:
# Save only rows with precedents found in the "text" column
filtered_df = Hugging_Face_df[Hugging_Face_df['Precedents_Found'].apply(len) > 0]

In [14]:
# Save filtered_df to parquet file
parquet = filtered_df[['Id', 'text', 'Precedents_Found']]
parquet.to_parquet('Hugging_Face_df_Precedents.parquet', engine='pyarrow', compression='snappy')

## Comparison

### Manually tagged documents

In [16]:
# Import Manually tagged documents data
manual_tag_df = pd.read_excel('./Fine Tuning Utilities/Manual Document Tagging.xlsx').sort_values(by=['Id'])
manual_tag_df

Unnamed: 0,Id,CaseDesc,Pages,Year,legal_clauses,precedents
0,27,,11,1993,[בסעיף 11 לתוספת לחוק מס קניה (סחורות ושירותים...,"[ע""א 2512/93, ה""פ 652/90, בה""פ 652/96, ר""ע 483..."
1,28,"רע""פ 2996/13",16,2013,"[חוק העונשין (תיקון מס\' 119) (עבירות המתה), ת...","[רע""פ 2996/13, רע""פ 4845/13, רע""פ 6926/13, גמ""..."
2,43,"בג""ץ 3254/96",10,1996,"[תקנה 122 לתקנות סדר הדין האזרחי, התשמ""ד1984]","[רע""א 3254/96, רע""א 7329/96, ע""א 2271/90]"
3,58,"בג""ץ 5379/95",14,1995,"[סעיף 33 לחוק המכר תשכ""ח1968], [סעיף 34 לחוק ה...","[רע""א 95, בע""א 606/93, ת""ק 4469, בע""א 782/86, ..."
4,111,"בג""ץ 3792/95",29,1995,"[סעיף 3א לחוק יסודות התקציב, התשמ""ה 1985], [סע...","[בג""ץ 3792/95, בג""ץ 59/88, בג""ץ 847/94, בג""ץ 3..."
...,...,...,...,...,...,...
247,733933,"ע""פ 1810/22",14,2022,"[סעיף 199(א)(2) לחוק העונשין, התשל""ז-1977], [ח...","[ע""פ 1810/22, ע""פ 309/22, ע""פ 1187/22, ע""פ 274..."
248,734750,"ע""פ 6168/20",22,2020,"[סעיף 329(א)(1)-(2) לחוק העונשין, התשל""ז-1977]...","[ע""פ 6168/20, ת""פ 23020-01-16, ע""פ 951/80, ע""פ..."
249,734803,"ע""א 2733/19",22,2019,"[סעיף 3(ב) לחוק החוזים (חלק כללי), התשל""ג-1973...","[ע""א 2733/19, ע""א 4933/17, ע""א 5927/98, עע""מ 7..."
250,738846,"ע""א 1479/18",40,2018,"[תקנה 69 לתקנות ההוצאה לפועל, התש""ם-1979], [סע...","[ע""א 1479/18, ת""א 38141/08, בע""א 6894/15, בע""א..."


Extract the legal clauses from manual_tag_df

In [17]:
# Regex pattern to match entire brackets including nested ones
pattern = r'\[[^\[\]]*(?:\[[^\[\]]*\])*[^\[\]]*\]'

# Function to split clauses while preserving nested brackets
def split_clauses(clause_str):
    # Find all matches based on the pattern
    matches = re.findall(pattern, clause_str)
    return matches

# Apply the function to the column
manual_tag_df['legal_clauses'] = manual_tag_df['legal_clauses'].apply(split_clauses)
manual_tag_df['precedents'] = manual_tag_df['precedents'].apply(split_clauses)

# Function to remove leading '[' and trailing ']'
def clean_clause(clause):
    return clause[1:-1] if clause.startswith('[') and clause.endswith(']') else clause

# Apply the cleaning function to each item in the lists
manual_tag_df['legal_clauses'] = manual_tag_df['legal_clauses'].apply(lambda lst: [clean_clause(item) for item in lst])
manual_tag_df['precedents'] = manual_tag_df['precedents'].apply(lambda lst: [clean_clause(item) for item in lst])


# Function to split string inside the list by comma (only for precedents)
def split_list_of_precedents(precedents_list):
    if precedents_list:
        # Split the first string in the list by commas
        return [item.strip() for item in precedents_list[0].split(',')]
    return precedents_list

# Apply the function to split the string in the list
manual_tag_df['precedents'] = manual_tag_df['precedents'].apply(split_list_of_precedents)

In [18]:
# Filter Hugging_Face_df by the id of the manually tagged documents
filtered_Hugging_Face_df = Hugging_Face_df[Hugging_Face_df['Id'].isin(manual_tag_df['Id'])][['Id', 'text']]

In [19]:
# Merge manual_tag_df with filtered_Hugging_Face_df by 'Id'
merged_df = pd.merge(manual_tag_df, Hugging_Face_df, on='Id')

### Legal_Clauses

In [20]:
# Function to calculate similarity ratio
def is_similar(clause1, clause2, threshold=0.7):
    ratio = SequenceMatcher(None, clause1, clause2).ratio()
    return ratio >= threshold

# Function to compare and calculate the match percentage
def calculate_match(row):
    # Remove spaces from clauses
    manual_clauses = [clause.replace(" ", "") for clause in row['legal_clauses']]
    regex_clauses = [clause.replace(" ", "") for clause in row['Legal_Clauses_Found']]

    # Find common clauses with at least 90% similarity
    common_clauses = set()
    for manual_clause in manual_clauses:
        for regex_clause in regex_clauses:
            if is_similar(manual_clause, regex_clause):
                common_clauses.add(manual_clause)
                break
    
    # Calculate match percentage
    if len(manual_clauses) > 0:
        match_percentage = len(common_clauses) / len(manual_clauses) * 100
    else:
        match_percentage = 0
    
    return match_percentage

In [21]:
# Apply the function to each row
merged_df['Match_Percentage'] = merged_df.apply(calculate_match, axis=1)

# Calculate the overall average of Match_Percentage
overall_average = merged_df['Match_Percentage'].mean()

# Output the results
print(f"Overall Average Match Percentage on manually tagged data: {overall_average:.2f}%")
merged_df[['Id', 'Match_Percentage']].sort_values(by=['Match_Percentage'])

Overall Average Match Percentage on manually tagged data: 93.24%


Unnamed: 0,Id,Match_Percentage
110,401233,0.0
157,481444,0.0
203,594854,0.0
45,206071,0.0
199,571940,50.0
...,...,...
103,382044,100.0
104,386931,100.0
105,393867,100.0
139,447764,100.0


### Precedents

In [22]:
# Function to calculate similarity ratio
def is_similar(clause1, clause2, threshold=0.7):
    ratio = SequenceMatcher(None, clause1, clause2).ratio()
    return ratio >= threshold

# Function to compare and calculate the match percentage
def calculate_match(row):
    # Remove spaces from clauses
    manual_clauses = [clause.replace(" ", "") for clause in row['precedents']]
    regex_clauses = [clause.replace(" ", "") for clause in row['Precedents_Found']]

    # Find common clauses with at least 90% similarity
    common_clauses = set()
    for manual_clause in manual_clauses:
        for regex_clause in regex_clauses:
            if is_similar(manual_clause, regex_clause):
                common_clauses.add(manual_clause)
                break
    
    # Calculate match percentage
    if len(manual_clauses) > 0:
        match_percentage = len(common_clauses) / len(manual_clauses) * 100
    else:
        match_percentage = 0
    
    return match_percentage

In [23]:
# Apply the function to each row
merged_df['Match_Percentage'] = merged_df.apply(calculate_match, axis=1)

# Calculate the overall average of Match_Percentage
overall_average = merged_df['Match_Percentage'].mean()

# Output the results
print(f"Overall Average Match Percentage on manually tagged data: {overall_average:.2f}%")
merged_df[['Id', 'Match_Percentage']].sort_values(by=['Match_Percentage'])

Overall Average Match Percentage on manually tagged data: 95.24%


Unnamed: 0,Id,Match_Percentage
161,490877,0.0
186,514492,0.0
70,303375,0.0
184,512627,0.0
58,259639,0.0
...,...,...
92,363152,100.0
93,363886,100.0
94,369338,100.0
96,373618,100.0
