In [20]:
import pandas as pd
df_sdc = pd.read_csv(r"D:\Proyectos\winter_camp\sdc_raw_2009_2024.csv")

# "Expediente" is the alphanumeric code that identifies a case/proceeding.
# The token 'ccd' marks proceedings on the repression of unfair competition
# (Represión de la Competencia Desleal); only those rows are kept.
# .copy() detaches the filtered rows from the raw frame, so the column
# assignments below write to an independent DataFrame rather than to a slice
# (avoids SettingWithCopyWarning and ambiguous writes).
df_sdc = df_sdc[df_sdc['Expediente'].str.contains('ccd', case=False, na=False)].copy()
df_sdc['Expediente'] = df_sdc['Expediente'].str.upper()
# regex=False: '/' is a literal character, independent of the pandas version default.
df_sdc['Número de Resolución'] = df_sdc['Número de Resolución'].str.replace('/', '-', regex=False)
# Plain column assignment replaces the column, so it takes the datetime64 dtype.
# Writing through .loc[:, col] sets values in place and can leave the column
# as object dtype.
df_sdc['Fecha de Resolución'] = pd.to_datetime(df_sdc['Fecha de Resolución'])
# Count how many resolutions remain in the dataset after filtering
row_count_df2 = df_sdc.shape[0]
print(f'Número de resoluciones de la Sala de Defensa de la Competencia: {row_count_df2}')
df_sdc

Número de resoluciones de la Sala de Defensa de la Competencia: 1350


Unnamed: 0,Número de Resolución,Fecha de Resolución,Expediente,Partes,Sumilla,Enlace
8,1-2019-SDC,2019-01-08 00:00:00,269-2017/CCD,CERRADURAS NACIONALES S.A.C GRUPO FORTE S.A.C.,Se declara la NULIDAD de la Resolución 5 del 1...,http://servicio.indecopi.gob.pe/buscadorResolu...
9,1-2018-SDC,2018-01-10 00:00:00,000001-2016/CCD-INDECOPI-PIU,PROCEDIMIENTO INICIADO DE OFICIO NEO MOTORS S....,Se CONFIRMA la Resolución 377-2017/INDECOPI-PI...,http://servicio.indecopi.gob.pe/buscadorResolu...
10,1-2023-SDC,2023-01-03 00:00:00,1-2022/CCD-INDECOPI-PIU (CUADERNO CAUTELAR),SUE HELLEN BRUN LOZADA HUMBERTO MANUEL FERNAND...,Se declara IMPROCEDENTE el recurso de apelació...,http://servicio.indecopi.gob.pe/buscadorResolu...
21,2-2022-SDC,2022-01-06 00:00:00,261-2020/CCD,PROCEDIMIENTO INICIADO DE OFICIO ERASMO BORIS ...,Se CONFIRMA la Resolución 0118-2021/CCD-INDECO...,http://servicio.indecopi.gob.pe/buscadorResolu...
23,2-2019-SDC,2019-01-08 00:00:00,10-2015/CCD-CUS,SILVIA BANESSA SALAZAR AHUANARI,Se declara improcedente el recurso de reconsid...,http://servicio.indecopi.gob.pe/buscadorResolu...
...,...,...,...,...,...,...
14206,2261-2010-SC1,2010-08-11 00:00:00,10-2008/CCD-INDECOPI-PIU (ACUMULADOS),COMISIÓN DE LA OFICINA REGIONAL DEL INDECOPI D...,Se MODIFICA la Resolución 200-2010/INDECOPI-PI...,http://servicio.indecopi.gob.pe/buscadorResolu...
14209,2262-2010-SC1,2010-08-11 00:00:00,34-2009/CCD-INDECOPI-CUS,COMISIÓN DE LA OFICINA REGIONAL DEL INDECOPI D...,Se CONFIRMA la Resolución 025-2010/INDECOPI-CU...,http://servicio.indecopi.gob.pe/buscadorResolu...
14302,2296-2010-SC1,2010-08-16 00:00:00,16-2009/CCD,EMPRESA PERIODÍSTICA NACIONAL S.A. CONTACTA PE...,Se CONFIRMA la Resolución 212-2009/CCD-INDECOP...,http://servicio.indecopi.gob.pe/buscadorResolu...
14348,2313-2010-SC1,2010-08-17 00:00:00,39-2009/CCD-INDECOPI-CUS,PROCEDIMIENTO DE OFICIO OPERACIONES ARCOS DORA...,Se MODIFICA la Resolución 028-2010/INDECOPI-CU...,http://servicio.indecopi.gob.pe/buscadorResolu...


**1) Creación del corpus**

1. Descargar los PDFs
2. Extraer el texto de los PDFs

In [21]:
import os
import requests
from requests.exceptions import ChunkedEncodingError

# Directory where the downloaded resolution PDFs are stored
download_dir = 'D:/Proyectos/winter_camp/decisiones_sdc'
os.makedirs(download_dir, exist_ok=True)

# Seconds to wait for the server (connect / each read) before giving up;
# without a timeout a stalled connection would block the loop forever.
REQUEST_TIMEOUT = 60

# Log of failed downloads (one dict per failure)
failed_downloads = []

# Download the document linked from every row of the DataFrame
for index, row in df_sdc.iterrows():
    name = row['Número de Resolución']
    link = row['Enlace']
    file_name = f"{name}.pdf"
    file_path = os.path.join(download_dir, file_name)
    # Stream into a ".part" file and rename only after the last chunk arrived,
    # so an interrupted transfer never leaves a truncated "*.pdf" behind.
    partial_path = file_path + '.part'

    try:
        # The context manager releases the streamed connection in every case.
        with requests.get(link, stream=True, timeout=REQUEST_TIMEOUT) as response:
            if response.status_code == 200:
                with open(partial_path, 'wb') as file:
                    for chunk in response.iter_content(chunk_size=8192):
                        file.write(chunk)

                os.replace(partial_path, file_path)
                print(f"Downloaded: {file_name}")
            else:
                print(f"Failed to download: {name} from {link} (Status code {response.status_code})")
                failed_downloads.append({'Número de Resolución': name, 'Enlace': link, 'Observation': f'Failed - Status code {response.status_code}'})

    except ChunkedEncodingError:
        print(f"Failed to download: {name} from {link} due to ChunkedEncodingError")
        failed_downloads.append({'Número de Resolución': name, 'Enlace': link, 'Observation': 'Failed - ChunkedEncodingError'})

    except Exception as e:
        # Best-effort loop: record the failure and continue with the next row.
        print(f"Failed to download: {name} from {link} due to {e}")
        failed_downloads.append({'Número de Resolución': name, 'Enlace': link, 'Observation': f'Failed - {e}'})

    finally:
        # Only present when the transfer did not complete.
        if os.path.exists(partial_path):
            os.remove(partial_path)

# Convert failed downloads list to DataFrame and save to CSV
failed_downloads_df = pd.DataFrame(failed_downloads)
failed_downloads_df.to_csv('failed_downloads.csv', index=False)

Downloaded: 1-2019-SDC.pdf
Failed to download: 1-2018-SDC from http://servicio.indecopi.gob.pe/buscadorResoluciones/getDoc?docID=workspace://SpacesStore/ef8014c8-afb6-4963-bb16-d91c3d093be0 due to ChunkedEncodingError
Downloaded: 1-2023-SDC.pdf
Downloaded: 2-2022-SDC.pdf
Downloaded: 2-2019-SDC.pdf
Downloaded: 2-2024-SDC.pdf
Downloaded: 3-2019-SDC.pdf
Downloaded: 3-2020-SDC.pdf
Downloaded: 3-2009-SC1.pdf
Downloaded: 3-2024-SDC.pdf
Downloaded: 4-2016-SDC.pdf
Downloaded: 4-2019-SDC.pdf
Downloaded: 4-2022-SDC.pdf
Failed to download: 4-2009-SC1 from http://servicio.indecopi.gob.pe/buscadorResoluciones/getDoc?docID=workspace://SpacesStore/c0453845-cccc-4d0d-babb-966fe9c25eca due to ChunkedEncodingError
Downloaded: 4-2024-SDC.pdf
Downloaded: 5-2016-SDC.pdf
Downloaded: 5-2017-SDC.pdf
Downloaded: 5-2022-SDC.pdf
Downloaded: 5-2009-SC1.pdf
Downloaded: 5-2024-SDC.pdf
Downloaded: 6-2016-SDC.pdf
Downloaded: 6-2019-SDC.pdf
Downloaded: 6-2024-SDC.pdf
Downloaded: 7-2015-SC1.pdf
Downloaded: 7-2020-SDC.p

In [23]:
import fitz
import pytesseract
from PIL import Image
import pandas as pd
from unidecode import unidecode
import pdfrw
import PyPDF2
import os

In [24]:
# Tell pytesseract where the Tesseract executable lives (machine-specific Windows path).
pytesseract.pytesseract.tesseract_cmd = r"D:\Programas\tesseract\tesseract.exe"

def extract_images_from_pdf(pdf_path):
    """Open the PDF at `pdf_path` with PyMuPDF and return the document object.

    Despite the name, no images are extracted here: the caller receives the
    open ``fitz`` document and is responsible for closing it.
    """
    return fitz.open(pdf_path)

def ocr_image(image_path):
    """Run Spanish-language Tesseract OCR on an image file, then delete the file.

    Parameters:
    - image_path (str): Path of the image to read. It is treated as a
      temporary file and is removed before this function returns or raises.

    Returns:
    - str: Text recognised by pytesseract (lang='spa').
    """
    try:
        # The context manager closes the file handle before the deletion
        # below; on Windows a file that is still open cannot be removed.
        with Image.open(image_path) as image:
            return pytesseract.image_to_string(image, lang='spa')
    finally:
        # Cleanup runs even when OCR raises, so temporary images do not pile up.
        try:
            os.remove(image_path)
        except FileNotFoundError:
            pass

def ocr_pdf(pdf_path):
    """OCR every page of a PDF and return the recognised text, one item per page.

    Each page is rendered to a pixmap, saved as a temporary PNG in the current
    working directory and passed to `ocr_image`, which deletes the PNG.

    Parameters:
    - pdf_path (str): Path of the PDF to process.

    Returns:
    - list: One OCR text string per page, in page order.
    """
    doc = extract_images_from_pdf(pdf_path)
    texts = []

    try:
        for page_num in range(doc.page_count):
            page = doc[page_num]
            pix = page.get_pixmap()

            image_path = f"temp_image_page_{page_num + 1}.png"

            # Build a PIL image from the raw pixmap samples and write it to disk
            # NOTE(review): "RGB" assumes the pixmap has no alpha channel — confirm
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            img.save(image_path)

            texts.append(ocr_image(image_path))
    finally:
        # Always release the document, even if rendering or OCR of a page fails.
        doc.close()

    return texts

In [25]:
def text_pdf(pdf_path):
    """Extract the embedded text of each page of a PDF with PyPDF2.

    Parameters:
    - pdf_path (str): Path of the PDF to read.

    Returns:
    - list: The result of `extract_text()` for every page, in page order,
            or None when the file cannot be opened or parsed (the error
            message is printed).
    """
    try:
        with open(pdf_path, 'rb') as handle:
            reader = PyPDF2.PdfReader(handle)
            # Pages must be read while the underlying file is still open.
            return [reader.pages[i].extract_text() for i in range(len(reader.pages))]
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

In [26]:
def has_empty_strings(lst):
    """
    Collect the blank entries of a list of strings.

    Parameters:
    - lst (list): List of strings (or None entries) to inspect; may itself be None.

    Returns:
    - list: [None] when `lst` is None; otherwise the entries of `lst` that are
            None or contain only whitespace. The result is an empty list when
            every entry has content, so it can be used directly as a
            truthiness flag.
    """
    if lst is None:
        return [None]

    blanks = []
    for item in lst:
        if item is None or not item.strip():
            blanks.append(item)
    return blanks

In [27]:
def process_pdfs_in_folder(folder_path):
    """Extract the per-page text of every ``.pdf`` file in `folder_path`.

    Each PDF is first read with `text_pdf`. When that fails (None) or any page
    comes back blank, the whole file is processed with `ocr_pdf` instead.

    Parameters:
    - folder_path (str): Directory containing the PDF files.

    Returns:
    - pandas.DataFrame: One row per page with columns
      'Número de Resolución' (the PDF file name, extension included) and
      'Text' (the text of that page).
    """
    data = {'Número de Resolución': [], 'Text': []}

    # Sorted so the row order does not depend on the directory listing order.
    pdf_files = sorted(f for f in os.listdir(folder_path) if f.endswith(".pdf"))

    for pdf_file in pdf_files:
        pdf_path = os.path.join(folder_path, pdf_file)
        texts = text_pdf(pdf_path)

        # A non-empty result means at least one page has no usable text
        # (or the file could not be parsed): fall back to OCR for the file.
        # Otherwise the text already extracted is reused instead of parsing
        # the PDF a second time.
        if has_empty_strings(texts):
            texts = ocr_pdf(pdf_path)

        # Repeat the file name once per page so both columns stay aligned
        data['Número de Resolución'].extend([pdf_file] * len(texts))
        data['Text'].extend(texts)

    return pd.DataFrame(data)

In [31]:
# Raw string: in a normal literal "\P", "\w" and "\d" are invalid escape
# sequences that only work by accident and raise warnings on recent Pythons.
download_dir = r'D:\Proyectos\winter_camp\decisiones_sdc'

corpus_sdc = process_pdfs_in_folder(download_dir)
# Strip only a trailing ".pdf": the dot is escaped and the pattern anchored, so
# the result does not depend on the pandas default for `regex` and nothing in
# the middle of a name is touched.
corpus_sdc['Número de Resolución'] = corpus_sdc['Número de Resolución'].str.replace(r'\.pdf$', '', regex=True)
corpus_sdc.to_csv(r'D:\Proyectos\winter_camp\decisiones_sdc\corpus_sdc.csv', index=False)

An error occurred: Could not read Boolean object
An error occurred: EOF marker not found
An error occurred: Could not read Boolean object
An error occurred: Could not read Boolean object


MuPDF error: format error: object out of range (61 0 R); xref size 60
MuPDF error: format error: non-page object in page tree
MuPDF error: format error: object out of range (84 0 R); xref size 60
MuPDF error: format error: object out of range (744 0 R); xref size 60
MuPDF error: format error: object out of range (744 0 R); xref size 60
MuPDF error: format error: object out of range (744 0 R); xref size 60
MuPDF error: format error: object out of range (744 0 R); xref size 60
MuPDF error: format error: object out of range (744 0 R); xref size 60
MuPDF error: format error: object out of range (744 0 R); xref size 60
MuPDF error: format error: object out of range (744 0 R); xref size 60
MuPDF error: format error: object out of range (744 0 R); xref size 60
MuPDF error: format error: object out of range (744 0 R); xref size 60
MuPDF error: format error: object out of range (744 0 R); xref size 60
MuPDF error: format error: object out of range (744 0 R); xref size 60
MuPDF error: format err

An error occurred: EOF marker not found


MuPDF error: format error: object out of range (748 0 R); xref size 60
MuPDF error: format error: object out of range (748 0 R); xref size 60
MuPDF error: format error: object out of range (748 0 R); xref size 60
MuPDF error: format error: object out of range (748 0 R); xref size 60
MuPDF error: format error: object out of range (748 0 R); xref size 60
MuPDF error: format error: object out of range (748 0 R); xref size 60
MuPDF error: format error: object out of range (748 0 R); xref size 60
MuPDF error: format error: object out of range (748 0 R); xref size 60
MuPDF error: format error: object out of range (748 0 R); xref size 60
MuPDF error: format error: object out of range (748 0 R); xref size 60
MuPDF error: format error: object out of range (748 0 R); xref size 60
MuPDF error: format error: object out of range (748 0 R); xref size 60
MuPDF error: format error: object out of range (748 0 R); xref size 60
MuPDF error: format error: object out of range (748 0 R); xref size 60
MuPDF 

An error occurred: EOF marker not found


MuPDF error: format error: object out of range (31 0 R); xref size 31
MuPDF error: format error: object out of range (31 0 R); xref size 31
MuPDF error: format error: object out of range (31 0 R); xref size 31
MuPDF error: format error: object out of range (31 0 R); xref size 31


An error occurred: Could not read Boolean object
An error occurred: Could not read Boolean object
An error occurred: Could not read Boolean object
An error occurred: EOF marker not found


**2) Clasificando la información**

In [40]:
import regex as re
import pandas as pd
from unidecode import unidecode


In [41]:
# Reload the corpus CSV from disk so this section can run without repeating the extraction.
corpus_sdc = pd.read_csv(r'D:\Proyectos\winter_camp\decisiones_sdc\corpus_sdc.csv')

In [42]:
def clean_text(text):
    """Normalise a text value for phrase matching.

    Missing values become ''. Anything else is converted to str,
    transliterated to ASCII with `unidecode` (which removes accents), and
    stripped of every character that is not an ASCII letter, a digit or
    whitespace.
    """
    if pd.isna(text):
        return ''
    ascii_text = unidecode(str(text))
    return re.sub(r'[^a-zA-Z0-9\s]', '', ascii_text)

# Store the normalised version of every 'Text' value in a new 'Cleaned_Text' column.
corpus_sdc['Cleaned_Text'] = corpus_sdc['Text'].apply(clean_text)

In [43]:
def find_closest_match(phrases, text):
    """Return the longest occurrence of any of `phrases` found in `text`.

    Matching ignores case. Words inside a phrase may be separated by any run
    of whitespace (spaces, tabs, line breaks), so a phrase split across two
    lines of extracted PDF text is still found.

    Parameters:
    - phrases (iterable of str): Phrases to look for.
    - text (str): Text to search.

    Returns:
    - str or None: The matched text as it appears in `text`; '' when `text`
      contains words but none of the phrases; None when nothing matches at all.
      Callers can rely on truthiness to detect a hit.
    """
    # Escape each word of a phrase separately and join the words with \s+
    # so the amount/kind of whitespace between them does not matter.
    phrase_alternatives = '|'.join(
        r'\s+'.join(re.escape(word) for word in phrase.split())
        for phrase in phrases
    )
    # Second alternative consumes ordinary words one at a time, which keeps
    # a phrase from starting in the middle of a longer word.
    pattern = re.compile(f"({phrase_alternatives})" + r"|(\b\w+\b)", flags=re.IGNORECASE)
    matches = pattern.findall(text)
    if not matches:
        return None
    # Each match is (phrase_hit, word_hit); pick the longest phrase hit.
    return max(matches, key=lambda groups: len(groups[0]) if groups[0] else 0)[0]

def remove_rows_after_phrases(df, phrases):
    """Drop each row whose 'Cleaned_Text' contains one of `phrases`, plus the
    rows that follow it for the same 'Número de Resolución'.

    Rows are scanned in order. A hit starts a removal run for that resolution;
    the run ends at the first later row that has no hit and belongs to a
    different resolution.

    Parameters:
    - df (pandas.DataFrame): Needs 'Número de Resolución' and 'Cleaned_Text' columns.
    - phrases (list of str): Phrases passed to `find_closest_match`.

    Returns:
    - pandas.DataFrame: `df` without the collected rows (dropped by index label).
    """
    doomed = []
    active_resolution = None

    for idx, record in df.iterrows():
        if find_closest_match(phrases, record['Cleaned_Text']):
            # Phrase found: this row goes, and a removal run (re)starts here.
            active_resolution = record['Número de Resolución']
            doomed.append(idx)
            continue

        if active_resolution is not None and record['Número de Resolución'] == active_resolution:
            # Still inside the resolution where the phrase was found.
            doomed.append(idx)
            continue

        # Different resolution (or no run in progress): stop removing.
        active_resolution = None

    return df.drop(doomed)

# Phrases that mark where trimming starts within a resolution
# (matched against 'Cleaned_Text', ignoring case)
phrases_to_search = ["voto singular", "voto en discordia"]

# Drop the row where a phrase appears and the following rows of that same resolution
corpus_sdc = remove_rows_after_phrases(corpus_sdc, phrases_to_search)

In [44]:
# Display the filtered corpus to inspect the result of the trimming step.
corpus_sdc

Unnamed: 0,Número de Resolución,Text,Cleaned_Text
0,1-2019-SDC,TRIBUNAL DE DEFENSA DE LA COMPETENCIA \nY DE ...,TRIBUNAL DE DEFENSA DE LA COMPETENCIA \nY DE ...
1,1-2019-SDC,TRIBUNAL DE DEFENSA DE LA COMPETENCIA \nY DE ...,TRIBUNAL DE DEFENSA DE LA COMPETENCIA \nY DE ...
2,1-2019-SDC,TRIBUNAL DE DEFENSA DE LA COMPETENCIA \nY DE ...,TRIBUNAL DE DEFENSA DE LA COMPETENCIA \nY DE ...
3,1-2019-SDC,TRIBUNAL DE DEFENSA DE LA COMPETENCIA \nY DE ...,TRIBUNAL DE DEFENSA DE LA COMPETENCIA \nY DE ...
4,1-2019-SDC,TRIBUNAL DE DEFENSA DE LA COMPETENCIA \nY DE ...,TRIBUNAL DE DEFENSA DE LA COMPETENCIA \nY DE ...
...,...,...,...
25390,990-2011-SC1,TRIBUNAL DE DEFENSA DE LA COMPETENCIA \nY DE ...,TRIBUNAL DE DEFENSA DE LA COMPETENCIA \nY DE ...
25391,990-2011-SC1,TRIBUNAL DE DEFENSA DE LA COMPETENCIA \nY DE ...,TRIBUNAL DE DEFENSA DE LA COMPETENCIA \nY DE ...
25392,990-2011-SC1,TRIBUNAL DE DEFENSA DE LA COMPETENCIA \nY DE ...,TRIBUNAL DE DEFENSA DE LA COMPETENCIA \nY DE ...
25393,990-2011-SC1,TRIBUNAL DE DEFENSA DE LA COMPETENCIA \nY DE ...,TRIBUNAL DE DEFENSA DE LA COMPETENCIA \nY DE ...


In [45]:
corpus_sdc = corpus_sdc.drop(columns=['Cleaned_Text'])
# Fill missing values BEFORE casting: astype(str) turns NaN into the literal
# text 'nan', after which fillna('') has nothing left to fill.
corpus_sdc['Text'] = corpus_sdc['Text'].fillna('').astype(str)
# One row per resolution: join the texts of its rows, in row order, with spaces.
corpus_sdc = corpus_sdc.groupby('Número de Resolución')['Text'].apply(' '.join).reset_index()
# Flatten line breaks; regex=False treats "\n" as a literal on every pandas version.
corpus_sdc['Text'] = corpus_sdc['Text'].str.replace("\n", " ", regex=False)

In [46]:
def extract_last_n_words(text, n=800):
    """Return the last `n` whitespace-separated words of `text`.

    Parameters:
    - text: Value to process; anything that is not a str yields ''.
    - n (int): Maximum number of trailing words to keep (default 800).

    Returns:
    - str: The last `n` words joined by single spaces; the whole text
      (whitespace-normalised) when it has fewer than `n` words; '' when
      `text` is not a string or `n` is not positive.
    """
    # words[-0:] would be the whole list and a negative n would slice from the
    # front, so non-positive n is handled explicitly.
    if not isinstance(text, str) or n <= 0:
        return ''

    # Split on any whitespace, keep the tail, and re-join into one string
    return ' '.join(text.split()[-n:])


# Apply the function to the 'Text' column, keeping the last 800 words of each value
corpus_sdc['Sección Resolutiva'] = corpus_sdc['Text'].apply(lambda x: extract_last_n_words(x, 800))

In [47]:
# Drop the full 'Text' column; the remaining columns are kept as they are.
corpus_sdc = corpus_sdc.drop(columns=['Text'])


In [52]:
# Write the resulting frame to CSV, without the index column.
corpus_sdc.to_csv(r'D:\Proyectos\winter_camp\decisiones_sdc\corpus_sdc_por_clasificar.csv', index=False)