# Extracting Outcomes of Decisions

- The following code will extract the outcomes of decisions from the processed text. 
- This involves identifying and isolating specific sections of the text that pertain to the decisions and their outcomes.

In [3]:
import pandas as pd
import regex as re

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
df = pd.read_csv(r'D:\Proyectos\db_comp\corpus.csv')

unique_values_count = df['Filename'].nunique()

print("Number of cases:", unique_values_count)

Number of cases: 716


In [3]:
# For the purpose of this project, we will work only with case law written in English.

from langdetect import detect, LangDetectException

# Assuming df is already loaded and Text column is present
df['Text'] = df['Text'].fillna('')

# Function to detect if the text is in English
def is_english(text):
    try:
        return detect(text) == 'en'
    except LangDetectException:
        return False

# Apply the language detection function and classify as 'si' for English, 'no' otherwise
df['Classification'] = df['Text'].apply(lambda x: 'si' if is_english(x) else 'no')

In [4]:
en_case_law = df[df['Classification'] == 'si']
unique_values_count = en_case_law['Filename'].nunique()

print("Number of cases:", unique_values_count)

Number of cases: 677


In [5]:
en_case_law.drop('Classification', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  en_case_law.drop('Classification', axis=1, inplace=True)


In [6]:
# Keeping the rows that potentially contain outcome of the case law

pattern1 = r'D\s*E\s*C\s*I\s*S\s*I\s*O\s*N\s*:\s*'

# Use regex to check if either pattern appears in each row of the 'text' column
# Assign 'si' if either pattern is found, otherwise 'no'
en_case_law['Classification'] = en_case_law['Text'].apply(lambda x: 'si' if re.search(pattern1, x, flags=re.IGNORECASE) else 'no')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  en_case_law['Classification'] = en_case_law['Text'].apply(lambda x: 'si' if re.search(pattern1, x, flags=re.IGNORECASE) else 'no')


In [7]:
indices_to_keep = []

# Iterate over each unique filename
for filename in en_case_law['Filename'].unique():
    # Find the first occurrence of 'si' for the current filename
    first_si_index = en_case_law[(en_case_law['Filename'] == filename) & (en_case_law['Classification'] == 'si')].index.min()
    if pd.notnull(first_si_index):
        # Keep all rows after the first 'si' occurrence within the current filename
        indices_to_keep.extend(en_case_law.index[(en_case_law['Filename'] == filename) & (en_case_law.index >= first_si_index)].tolist())

# Filter the DataFrame using the indices to keep
final_decision = en_case_law.loc[indices_to_keep]

unique_values_count = final_decision['Filename'].nunique()

print("Number of unique values:", unique_values_count)

Number of unique values: 536


In [8]:
final_decision = final_decision.groupby('Filename')['Text'].apply(' '.join).reset_index()
final_decision['Text'] = final_decision['Text'].str.replace("\n", " ")

final_decision

Unnamed: 0,Filename,Text
0,1998_football_world_cup.pdf,EN Official Journal of the European Communitie...
1,ab_inbev_beer_trade_restrictions.pdf,EN EN 57 13.2.3. Duration (254) The overal...
2,abg_oil_companies_operating_in_the_netherlands...,NoL117/12 OfficialJournaloftheEuropeanCommunit...
3,abi.pdf,13.2.87 OfficialJournaloftheEuropeanCommunitie...
4,adalat.pdf,NoL201/58fENl OfficialJournaloftheEuropeanComm...
...,...,...
531,zanussi.pdf,NoL322/40 OfficialJournaloftheEuropeanCommunit...
532,zenimax.pdf,EN 42 EN 13.4.7. Reduction of the fine in vi...
533,zeramontedison_and_ihinkensstähler.pdf,4.11.93 OfficialJournaloftheEuropeanCommunitie...
534,zinc_phosphate.pdf,"(359) On the basis of the above, the Commissio..."


In [9]:
cell_value = final_decision.at[1, 'Text']
cell_value

"EN EN 57  13.2.3.  Duration   (254) The overall duration of the infringement, as set out in Section 9, amounts to 2822  days. Therefore, for the purpose of the calculation of the fine, the amount determined  in Recital (251) and (253), should be multiplied by 7.72 to take account of the  duration of the  infringement.   13.2.4.  Aggravating and mitigating  circumstances   (255) The Commission considers  that no aggravating or mitigating circumstances apply in  this case.  13.2.5.  Deterrence  multiplier   (256) Point 30 of the Guidelines on Fines provides for the possibility of increasing the fine  to ensure that fines have a sufficiently deterrent effect in the case of undertakings  which have a particularly large turnover beyond the sales of goods and services to  which the infringement  relates.   (257) Given that the value of sales to be taken into account in this case amounts to less than  [0-1%] of the total turnover generated by AB InBev during the 2018 financial year  (in othe

In [10]:
non_final_decisions = en_case_law[~en_case_law['Filename'].isin(final_decision['Filename'])]
non_final_decisions = non_final_decisions.groupby('Filename')['Text'].apply(' '.join).reset_index()
non_final_decisions['Text'] = non_final_decisions['Text'].str.replace("\n", " ")

In [11]:
final_decision.to_csv(r'D:\Proyectos\db_comp\en_final_decisions.csv')
non_final_decisions.to_csv(r'D:\Proyectos\db_comp\en_non_final_decisions.csv')

In [4]:
final_decision =pd.read_csv(r'D:\Proyectos\db_comp\en_final_decisions.csv')

In [5]:
pattern = r'D\s*E\s*C\s*I\s*S\s*I\s*O\s*N\s*:\s*'

# Function to extract text after pattern
def extract_text_after_pattern(text):
    match = re.search(pattern, text)
    if match:
        return text[match.end():]  # Extract text after the match
    else:
        return text

final_decision['Text'] = final_decision['Text'].apply(lambda x: extract_text_after_pattern(x))
final_decision

Unnamed: 0.1,Unnamed: 0,Filename,Text
0,0,1998_football_world_cup.pdf,Article 1 The Comité français d'organisation d...
1,1,ab_inbev_beer_trade_restrictions.pdf,"Article 1 Anheuser -Busch InBev NV/SA, InBev..."
2,2,abg_oil_companies_operating_in_the_netherlands...,Article 1 Thecontributions madebytheotheroilco...
3,3,abi.pdf,"Article1 Onthebasisofthefactsinitspossession,t..."
4,4,adalat.pdf,Article 1 Theprohibition ontheexportation toot...
...,...,...,...
531,531,zanussi.pdf,"Article1 Ontheinformationatitsdisposal,theComm..."
532,532,zenimax.pdf,"Article 1 ZeniMax Media Inc., ZeniMax Europ..."
533,533,zeramontedison_and_ihinkensstähler.pdf,"Article1 FarmoplantSpA,Milan(subsequentlyAgrim..."
534,534,zinc_phosphate.pdf,Article 1 Britannia Alloys & Chemicals Limited...


In [6]:
# First Article

# Define patterns
pattern1 = r'A\s*R\s*T\s*I\s*C\s*L\s*E\s*1\s*'
pattern2 = r'A\s*R\s*T\s*I\s*C\s*L\s*E\s*2\s*'

# Extract text using regex
def extract_text(row):
    match1 = re.search(pattern1, row['Text'], flags=re.IGNORECASE)
    match2 = re.search(pattern2, row['Text'], flags=re.IGNORECASE)

    if match1:
        start_index = match1.end()
    else:
        return ""

    if match2:
        end_index = match2.start()
    else:
        end_index = len(row['Text'])

    return row['Text'][start_index:end_index]

final_decision['Article 1'] = final_decision.apply(extract_text, axis=1)

In [7]:
# Second Article

# Define patterns
pattern1 = r'A\s*R\s*T\s*I\s*C\s*L\s*E\s*2\s*'
pattern2 = r'A\s*R\s*T\s*I\s*C\s*L\s*E\s*3\s*'

# Extract text using regex
def extract_text(row):
    match1 = re.search(pattern1, row['Text'], flags=re.IGNORECASE)
    match2 = re.search(pattern2, row['Text'], flags=re.IGNORECASE)

    if match1:
        start_index = match1.end()
    else:
        return ""

    if match2:
        end_index = match2.start()
    else:
        end_index = len(row['Text'])

    return row['Text'][start_index:end_index]

final_decision['Article 2'] = final_decision.apply(extract_text, axis=1)

In [8]:
# Third Article

# Define patterns
pattern1 = r'A\s*R\s*T\s*I\s*C\s*L\s*E\s*3\s*'
pattern2 = r'A\s*R\s*T\s*I\s*C\s*L\s*E\s*4\s*'

# Extract text using regex
def extract_text(row):
    match1 = re.search(pattern1, row['Text'], flags=re.IGNORECASE)
    match2 = re.search(pattern2, row['Text'], flags=re.IGNORECASE)

    if match1:
        start_index = match1.end()
    else:
        return ""

    if match2:
        end_index = match2.start()
    else:
        end_index = len(row['Text'])

    return row['Text'][start_index:end_index]

final_decision['Article 3'] = final_decision.apply(extract_text, axis=1)

In [10]:
final_decision.to_csv(r'D:\Proyectos\db_comp\en_outcomes.csv', index = False)