## Repse data validation

### Step 1. Load required packages

In [1]:
import pandas as pd
import pandasql as ps
from pathlib import Path

### Step 2 - Get sample data into dataframes

In [2]:
workFolder = '/content/drive/Shareddrives/Esperanto/Supports/REPSE'
df1 = pd.read_excel(Path(workFolder,'1738172642.xlsx'))

In [3]:
print(df1.head(10))

             RFC                NombreComercial  \
0  AAAM900204640  ALVARADO ARMENTA MIGUEL ANGEL   
1  AAAM900204640  ALVARADO ARMENTA MIGUEL ANGEL   
2  AAAM900204640  ALVARADO ARMENTA MIGUEL ANGEL   
3  AAAM900204640  ALVARADO ARMENTA MIGUEL ANGEL   
4  AAAM900204640  ALVARADO ARMENTA MIGUEL ANGEL   
5  AAAM900204640  ALVARADO ARMENTA MIGUEL ANGEL   
6  AAAM900204640  ALVARADO ARMENTA MIGUEL ANGEL   
7  AACB850212JL9    AYALA CUEVAS BRENDA JOCELYN   
8  AACB850212JL9    AYALA CUEVAS BRENDA JOCELYN   
9  AACB850212JL9    AYALA CUEVAS BRENDA JOCELYN   

                     RazonSocial                   Email cveplantilla  \
0  ALVARADO ARMENTA MIGUEL ANGEL     MAAR_60@HOTMAIL.COM      pFisica   
1  ALVARADO ARMENTA MIGUEL ANGEL     MAAR_60@HOTMAIL.COM      pFisica   
2  ALVARADO ARMENTA MIGUEL ANGEL     MAAR_60@HOTMAIL.COM      pFisica   
3  ALVARADO ARMENTA MIGUEL ANGEL     MAAR_60@HOTMAIL.COM      pFisica   
4  ALVARADO ARMENTA MIGUEL ANGEL     MAAR_60@HOTMAIL.COM      pFisica   


In [4]:
sql_query = '''
SELECT DISTINCT RFC,NombreComercial,RazonSocial,Email,cveplantilla,
Cargado,Rechazado,DocumentosRequeridos,DocumentosCargados,DocumentosFaltantes,DocumentosFaltantes
FROM df1
'''
repsedf = ps.sqldf(sql_query)
print(len(repsedf.index), ' records.')
repsedf['NombreComercial'] = repsedf['NombreComercial'].str.upper()
repsedf['RazonSocial'] = repsedf['RazonSocial'].str.upper()
print(repsedf.head(10))
repsedf.to_csv(Path(workFolder, 'listado_repse.csv'), encoding='utf-8')

1730  records.
             RFC                                  NombreComercial  \
0  AAAM900204640                    ALVARADO ARMENTA MIGUEL ANGEL   
1  AACB850212JL9                      AYALA CUEVAS BRENDA JOCELYN   
2  AACT421003NZ8                 \tMARIA TERESA ALFARO CABANILLAS   
3   AAE980220TL5                  AGENCIA ADUANAL ESQUER LUKEN SC   
4  AAGL740303BB9                        ALVAREZ GOMEZ LUIS DANIEL   
5  AAHJ900125497               JESSICA MONERRAT ALVAREZ HERNANDEZ   
6   AAI940623QCA                                 AAIRESA SA DE CV   
7   AAM090224BC2  COMPAÑIA ALBORADA DE AMÉRICA S. DE R.L. DE C.V.   
8   AAM090224BC2  COMPAÑIA ALBORADA DE AMÉRICA S. DE R.L. DE C.V.   
9  AANV811216BL8                      ARAIZA NAVA VICTOR EMMANUEL   

                                       RazonSocial  \
0                    ALVARADO ARMENTA MIGUEL ANGEL   
1                      AYALA CUEVAS BRENDA JOCELYN   
2                 \tMARIA TERESA ALFARO CABANILLAS   
3        

In [5]:
df2 = pd.read_excel(Path(workFolder,'Proveedor con tema de repse retenidos del sistema anterior.xlsx'))
df2['NOMBRE'] = df2['NOMBRE'].str.upper()
sql_query = '''
SELECT DISTINCT NOMBRE FROM df2
'''
suppliersDf = ps.sqldf(sql_query)
print(len(suppliersDf.index), ' records.')
suppliersDf.to_csv(Path(workFolder, 'proveedores.csv'), encoding='utf-8')
print(suppliersDf.head(10))

202  records.
                                              NOMBRE
0                 JUAN PABLO MIGUEL VALDIVIA VAZQUEZ
1                              VERONICA PRADO LLAMAS
2  SISTEMAS DE OPERACIONES INDUSTRIALES Y SUMINIS...
3                       JESUS ABRAHAM PORTILLO LOPEZ
4  SERVICIOS INTEGRALES Y ELECTRICOS DE SALTILLO ...
5                                  SAMUEL VEGA MURAD
6                               DANIEL ZAMORA ROMERO
7                       MILTON CARLOS RIVERA OBREGON
8                                    SIC DE VICTORIA
9      SANCHEZ CONSTRUCTORA DE TABASCO S DE RL DE CV


### Step 3 - Read enhanced data

In [41]:
repseDf = pd.read_csv(Path(workFolder,'listado_repse.csv'), encoding='utf-8')
suppliersDf = pd.read_csv(Path(workFolder,'proveedores.csv'), encoding='utf-8')

In [42]:
def removeCommasAndDots(tmpString):
    newString = tmpString.replace('.','').replace(',','')
    return newString

def removeAccents(tmpString):
    newString = tmpString.replace('Á','A').replace('É','E').replace('Í','I').replace('Ó','O').replace('Ú','U')
    return newString

def removeDoubleSpaces(tmpString):
    newString = tmpString.replace('  ',' ')
    return newString

repseDf['CampoValidacion'] = None
repseDf['AptoValidacion'] = True
for index, row in repseDf.iterrows():
    tmpString = str(row['NombreComercial']).strip()
    if len(tmpString)>1:
        tmpString = removeCommasAndDots(tmpString)
        tmpString = removeAccents(tmpString)
        tmpString = removeDoubleSpaces(tmpString)
        repseDf.loc[index, 'CampoValidacion'] = tmpString
        pass
    else:
        repseDf.loc[index, 'AptoValidacion'] = False
repseDf.to_csv(Path(workFolder, 'listado_repse_mejorado.csv'), encoding='utf-8')

In [40]:
print(repseDf.head(10))

   Unnamed: 0            RFC                                  NombreComercial  \
0           0  AAAM900204640                    ALVARADO ARMENTA MIGUEL ANGEL   
1           1  AACB850212JL9                      AYALA CUEVAS BRENDA JOCELYN   
2           2  AACT421003NZ8                 \tMARIA TERESA ALFARO CABANILLAS   
3           3   AAE980220TL5                  AGENCIA ADUANAL ESQUER LUKEN SC   
4           4  AAGL740303BB9                        ALVAREZ GOMEZ LUIS DANIEL   
5           5  AAHJ900125497               JESSICA MONERRAT ALVAREZ HERNANDEZ   
6           6   AAI940623QCA                                 AAIRESA SA DE CV   
7           7   AAM090224BC2  COMPAÑIA ALBORADA DE AMÉRICA S. DE R.L. DE C.V.   
8           8   AAM090224BC2  COMPAÑIA ALBORADA DE AMÉRICA S. DE R.L. DE C.V.   
9           9  AANV811216BL8                      ARAIZA NAVA VICTOR EMMANUEL   

                                       RazonSocial  \
0                    ALVARADO ARMENTA MIGUEL ANGEL   


In [43]:
suppliersDf['CampoValidacion'] = None
suppliersDf['AptoValidacion'] = True
for index, row in suppliersDf.iterrows():
    tmpString = str(row['NOMBRE']).strip()
    if len(tmpString)>1:
        tmpString = removeCommasAndDots(tmpString)
        tmpString = removeAccents(tmpString)
        tmpString = removeDoubleSpaces(tmpString)
        suppliersDf.loc[index, 'CampoValidacion'] = tmpString
        pass
    else:
        suppliersDf.loc[index, 'AptoValidacion'] = False
suppliersDf.to_csv(Path(workFolder, 'proveedores_mejorado.csv'), encoding='utf-8')

In [44]:
print(suppliersDf.head(10))

   Unnamed: 0                                             NOMBRE  \
0           0                 JUAN PABLO MIGUEL VALDIVIA VAZQUEZ   
1           1                              VERONICA PRADO LLAMAS   
2           2  SISTEMAS DE OPERACIONES INDUSTRIALES Y SUMINIS...   
3           3                       JESUS ABRAHAM PORTILLO LOPEZ   
4           4  SERVICIOS INTEGRALES Y ELECTRICOS DE SALTILLO ...   
5           5                                  SAMUEL VEGA MURAD   
6           6                               DANIEL ZAMORA ROMERO   
7           7                       MILTON CARLOS RIVERA OBREGON   
8           8                                    SIC DE VICTORIA   
9           9      SANCHEZ CONSTRUCTORA DE TABASCO S DE RL DE CV   

                                     CampoValidacion  AptoValidacion  
0                 JUAN PABLO MIGUEL VALDIVIA VAZQUEZ            True  
1                              VERONICA PRADO LLAMAS            True  
2  SISTEMAS DE OPERACIONES INDUSTRIALE