In [2]:
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [21]:
# Reading the procedural parts dataset 

df_parties = pd.read_csv(r'C:\Users\PC\Daniel Chen Dropbox\Alexis Malca\Peru_Justice\02_Data\08_CEJ_Web\data_cleaned\DF_procedural_parts_2020.csv')

In [22]:
# Keeping only the parties that are legal entities. This classification was done by the former guys
juridica_df = df_parties[df_parties['Tipo dePersona'] == 'JURIDICA']

# Dropping unnecesary columns

juridica_df = juridica_df.drop(columns=['Apellido Paterno /Razón Social', 'Tipo dePersona', 'Nombres']).reset_index(drop=True)
juridica_df


Unnamed: 0,Expediente N°:,Parte,ApellidoMaterno
0,00001-2020-0-0102-JP-CI-01,DEMANDADO,\t\t\t\t\t\t\t\t\t\t\t\t\t\tEMPRESA CONSTRUCCI...
1,00002-2020-0-0107-JP-CI-01,DEMANDANTE,\t\t\t\t\t\t\t\t\t\t\t\t\t\tCAJA MUNICIPAL DE ...
2,00002-2020-17-0107-JP-CI-01,DEMANDANTE,\t\t\t\t\t\t\t\t\t\t\t\t\t\tCAJA MUNICIPAL DE ...
3,00003-2020-0-0107-JP-CI-01,DEMANDANTE,\t\t\t\t\t\t\t\t\t\t\t\t\t\tCAJA MUNICIPAL DE ...
4,00004-2020-0-0102-JP-CI-01,DEMANDANTE,\t\t\t\t\t\t\t\t\t\t\t\t\t\tCOOPERATIVA DE AHO...
...,...,...,...
76970,00001-2020-0-3301-JR-LA-01,DEMANDADO,\t\t\t\t\t\t\t\t\t\t\t\t\t\tMUNICIPALIDAD DE V...
76971,00001-2020-0-3301-JR-LA-01,DEMANDADO,\t\t\t\t\t\t\t\t\t\t\t\t\t\tPROCURADOR DE LA M...
76972,00002-2020-0-3301-JR-LA-01,DEMANDADO,\t\t\t\t\t\t\t\t\t\t\t\t\t\tPESQUERA TECNOLOGI...
76973,00003-2020-0-3301-JR-LA-01,DEMANDADO,\t\t\t\t\t\t\t\t\t\t\t\t\t\tMUNICIPALIDAD DIST...


In [23]:
# The name of the legal entity was stored in the column "ApellidoMaterno", which means "Last name from mom" in Spanish. 
# This could be due a mistake in the original dataset, but there is no problem.
# We have to clean them

juridica_df['ApellidoMaterno'] = juridica_df['ApellidoMaterno'].str.replace('\t', '')


In [7]:
# This is only useful with the 2022 csv 
# Then we have to keep only the case file ID from the column "Expediente N°:" 

pattern = r'\\([^\\]+)$'

juridica_df['File_ID'] = juridica_df['Expediente N°:'].str.extract(pattern)

# Now we can drop the column 

juridica_df = juridica_df.drop(columns=['Expediente N°:'])

In [24]:
# THERE ARE PARTIES THAT WERE PREVIOSLY IDENTIFIED, WE WILL EXCLUDE THEM SINCE THEY ARE STATE INSTITUIONS (E.G. PROSECUTORS)
observations_to_exclude = ["MINISTERIO PUBLICO", "PROCURADOR PUBLICO", "FISCALIA", 'CENTRO EMERGENCIA MUJER', 'BENEFICENCIA PUBLICA']

mask_no_exclusions = ~juridica_df['Parte'].str.upper().isin(observations_to_exclude)

juridica_df = juridica_df[mask_no_exclusions].copy()

In [25]:
juridica_df['Parte'].value_counts()

Parte
DEMANDADO                               42555
DEMANDANTE                              25627
TERCERO                                  1379
SOLICITANTE                               756
ABOGADO                                   517
REPRESENTANTE                             476
CITACION                                  301
EMPLAZADO                                 235
INSTITUCIONES DE PROTECCION AL MENOR      224
APODERADO                                 150
AGRAVIADO                                  98
PERITO                                     96
QUEJADO                                    91
DENUNCIANTE                                48
DENUNCIADO                                 42
LITIS CONSORTE                             39
QUEJOSO                                    37
MENOR                                      36
REPRESENTANTE LEGAL                        18
POR DEFINIR                                17
CURADOR                                    12
MARTILLERO                  

In [133]:
column_name = "ApellidoMaterno"

# List of keywords
keywords = ["municipalidad", "SAC", "S.A.C.", 'S A C', "S.A.A.", "SAA", "S.A.", 'S A', 'SA', 'E.I.R.L.', 'EIRL', 'SRL', 'S.R.L.',
            "poder judicial", "ministerio", "BENEFICICIENCIA", 'PROCURADOR', 'DISTRITAL', 'PROVINCIAL', 'DERRAMA', 'JUZGADO', 'REGISTRO NACIONAL',
            "sociedad anonima", 'gobierno regional', "AFP", 'fideicomiso', 'COMISARIA', 'SINDICATO', 'BANCO DE CREDITO DEL PERU',
            'RED DE SALUD', 'CAJA MUNICIPAL', 'SUPERINTENDENCIA', 'COOPERATIVA', 'BANCO DE LA NACION', 'CONSORCIO', 'COMUNIDAD CAMPESINA',
            'CEMENTERIO', 'ASOCIACION', 'EMPRESA INDIVIDUAL', 'BANCO BBVA PERU', 'BANCO INTERAMERICANO DE FINANZAS', 'PARROQUIA', 'JUZGADO',
            'TEODORO', 'EMERGENCIA', 'FISCAL', 'BENEFICENCIA', 'SUCESION', 'SUCESORES', 'DEFENSOR PUBLICO', 'UGEL', 'REGIONAL', 'JUSTICIA',
            'PREVENC', 'EJECUTORA', 'FPCEDVCMEIGFPP', 'CEM', 'UNIDAD', 'ONP', 'PNP', 'BENEFICIENCIA', 'POLICIA', 'HOSPITAL', 'RENIEC', 'GOBIERNO',
            'NACIONAL', 'EJERCITO', 'BBVA', 'INTERBANK', 'PROPIETARIOS']

selected_rows = []


# Iterate over rows in the specified column
for index, value in juridica_df[column_name].items():
    try:
        # Check if the row value contains any keyword
        if not any(keyword.lower() in str(value).lower() for keyword in keywords):
            # Display the result for the current row
            selected_rows.append(juridica_df.loc[index].to_dict())

    except TypeError as e:
        print(f"An error occurred for row {index} in column '{column_name}': {e}")
        # Handle the error as needed or just continue to the next row

    except Exception as e:
        print(f"An unexpected error occurred for row {index} in column '{column_name}': {e}")
        # Handle the error as needed or just continue to the next row

selected_rows_df = pd.DataFrame(selected_rows)


In [134]:
selected_rows_df

Unnamed: 0,Expediente N°:,Parte,ApellidoMaterno
0,00006-2020-0-0107-JP-CI-02,DEMANDADO,AUGUSTO CUBAS VALDEZ
1,00038-2020-0-0102-JP-FC-01,SOLICITANTE,MINI CENTRAL DE NOTIFICACIONES DE TAMBOGRANDE ...
2,00002-2020-0-0102-JP-LA-01,DEMANDADO,NEGOCIOS OLTI EMP IND DE RESP LTADA
3,00002-2020-0-0107-JP-LA-02,DEMANDADO,ESTACION DE SERVICIOS JAVIMARK
4,00005-2020-0-0106-JP-LA-01,DEMANDADO,ANB TELECOMUNICACIONES
...,...,...,...
4949,00002-2020-0-3301-JR-CI-01,DEMANDADO,JENNYFER CONTRERAS MIRANDA
4950,00002-2020-0-3301-JR-CI-01,DEMANDADO,JHAIRO EVAIR CONTRERAS MIRANDA
4951,00002-2020-0-3301-JR-CI-01,DEMANDADO,KORAYMA ASUCENA CONTRERAS MIRANDA
4952,00002-2020-0-3301-JR-CI-01,DEMANDADO,LEYDI VIVIANA CONTRERAS MIRANDA


In [136]:
# Here, I'm just gambling that a foreign firm will appear in the data.

selected_block = selected_rows_df.iloc[2500, 2]
selected_block

'UNISON MARINE CORP  '

In [138]:
selected_rows_df.iloc[2500]

Expediente N°:     01261-2020-0-1809-JP-CI-02
Parte                               DEMANDADO
ApellidoMaterno          UNISON MARINE CORP  
Name: 2500, dtype: object

In [140]:
observation_to_include = "01261-2020-0-1809-JP-CI-02"

mask_include_observation = juridica_df['Expediente N°:'] == observation_to_include

new_dataset = juridica_df[mask_include_observation].copy()

new_dataset

Unnamed: 0,Expediente N°:,Parte,ApellidoMaterno
35068,01261-2020-0-1809-JP-CI-02,DEMANDADO,EMERALD SHIPPING HK CO LTD
35069,01261-2020-0-1809-JP-CI-02,DEMANDADO,UNISON MARINE CORP
35070,01261-2020-0-1809-JP-CI-02,DEMANDANTE,SEABOARD OVERSEAS PERU SA
