# Experiencia

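En este notebook vamos a unir los datos de experiencia de todos los funcionarios públicos de 2002 a 2018. La intención es tomar en cuenta solamente la experiencia en el sector privado. **Nota:** el código actual no aplica el filtro por sector (`SECTOR == "PRIVADO"`), por lo que la tabla generada incluye la experiencia de todos los sectores.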

In [1]:
import os
import glob
import hashlib
from os.path import join
import numpy as np
import pandas as pd

In [2]:
# Root directory of the Declaranet data. It can be overridden with the
# DECLARANET_DATA environment variable so the notebook is not tied to a single
# machine; the original absolute path remains the default.
DATA = os.environ.get("DECLARANET_DATA") or "/home/rdora/declaranet/data"
# File-name suffixes of the per-year tables. GENERAL and EXP are combined with
# the year to build file names (for example "2002" + GENERAL); ECONO and PUESTO
# are not used in the cells shown in this notebook.
GENERAL = "_generalesEncargo_DBP.csv"
EXP = "_experiencia_DBP.csv"
ECONO = "_participaciones_econo_DBP.csv"
PUESTO = "_puesto_cargo_com_DBP.csv"

In [7]:
# Column names taken from the `datos` notebook.

# Columns read (via `usecols`) from each year's general-information table.
COLS_GENERAL = [
    "ACUSE",
    "NOMBRE",
    "TIPO_DECLARACION",
    "FECHA_ENVIO",
    "PUESTO",
    "DEPENDENCIA",
    "AREA_ADSCRIPCION",
    "HONORARIOS",
    "MAXIMO_GRADO_ESTUDIOS",
]

# Columns read (via `usecols`) from each year's experience table.
# NOTE(review): "AREA_O_UNIDAD_ADMINSITRATIVA" is kept exactly as written;
# presumably it mirrors the spelling of the CSV header itself — confirm
# before "correcting" it.
COLS_EXP = [
    "ACUSE",
    "SECTOR",
    "INSTITUCION_EMPRESA",
    "AREA_O_UNIDAD_ADMINSITRATIVA",
    "PUESTO",
    "INGRESO_EGRESO",
]

In [108]:
# Project years handled by the loop further down: every year after the 2002
# baseline, i.e. 2003 through 2018 inclusive.
years = range(2003, 2018 + 1)

In [114]:
# Load the 2002 tables; 2002 is the baseline year that the later years are
# compared against.
dir_2002 = join(DATA, "2002")
path_general = join(dir_2002, "2002" + GENERAL)
path_exp = join(dir_2002, "2002" + EXP)
# ACUSE is the key the two tables are merged on. The experience table is cast
# to text before the merge, so ACUSE is read as text in both files to keep the
# same dtype on each side instead of relying on pandas' type inference.
df_exp = pd.read_csv(path_exp, usecols=COLS_EXP,
                     dtype={"ACUSE": str}, encoding="utf-8")
df_general = pd.read_csv(path_general, usecols=COLS_GENERAL,
                         dtype={"ACUSE": str}, encoding="utf-8")
# Both column lists contain PUESTO; rename the general-table one so the two
# stay distinguishable after the merge.
df_general = df_general.rename(columns={"PUESTO": "PUESTO_ACTUAL"})

In [115]:
# Build a content hash for every experience record.
# Every column is cast to text first so the fields can be concatenated.
df_exp = df_exp.astype(str)
# The hash covers every column after the first one, joined with no separator.
# NOTE(review): this assumes ACUSE is the first column of the loaded table —
# confirm against the CSV layout.
hashed_fields = df_exp.iloc[:, 1:]
df_exp["hash"] = hashed_fields.apply(
    lambda fields: hashlib.md5("".join(fields).encode()).hexdigest(),
    axis=1,
)

In [116]:
# Set of record hashes seen so far; it starts with the 2002 records.
hashes = set(df_exp["hash"])

In [117]:
# Attach the general information to each 2002 experience record. The left
# join keeps every experience row, even one whose ACUSE has no general row.
df = df_exp.merge(df_general, how="left", on="ACUSE")
# Apply to the baseline year the same check the later years get in the loop:
# a missing NOMBRE after the join is used as the signal that some experience
# rows found no matching general row.
if df.NOMBRE.isna().any():
    print("This year has non matching acuses: 2002")

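Continuamos con los demás años (2003 a 2018): de cada año solo se conservan los registros de experiencia cuyo hash no apareció en un año anterior.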

In [118]:
# Process the remaining years in order. `hashes` accumulates the hash of every
# experience record seen so far, so a record that reappears in a later year is
# kept only the first time it shows up.
dfs = [df]
for year in years:
    print(year)
    year_dir = join(DATA, str(year))
    path_general = join(year_dir, str(year) + GENERAL)
    path_exp = join(year_dir, str(year) + EXP)
    # ACUSE is the merge key: read it as text in both tables so both sides of
    # the join share one dtype regardless of what pandas would infer.
    df_exp_next = pd.read_csv(path_exp, usecols=COLS_EXP,
                              dtype={"ACUSE": str}, encoding="utf-8")
    df_general_next = pd.read_csv(path_general, usecols=COLS_GENERAL,
                                  dtype={"ACUSE": str}, encoding="utf-8")
    # Both column lists contain PUESTO; rename the general-table one so the
    # two stay distinguishable after the merge.
    df_general_next = df_general_next.rename(columns={"PUESTO": "PUESTO_ACTUAL"})
    # NOTE(review): the sector filter
    #   df_exp_next[df_exp_next.SECTOR == "PRIVADO"]
    # is not applied, so records from every sector are kept — confirm whether
    # the private-sector restriction should be enabled.
    # Hash every column after the first one (same recipe as the 2002 cell, so
    # hashes are comparable across years).
    df_exp_next = df_exp_next.astype(str)
    df_exp_next["hash"] = df_exp_next.iloc[:, 1:].apply(
        lambda x: hashlib.md5("".join(x).encode()).hexdigest(), axis=1)
    hashes_next = set(df_exp_next["hash"])
    # Drop records already seen in an earlier year. Testing membership in
    # `hashes` directly selects the same rows as intersecting it with this
    # year's hashes first, without building the intermediate set.
    df_exp_next = df_exp_next[~df_exp_next["hash"].isin(hashes)]
    df_next = pd.merge(df_exp_next, df_general_next, how="left", on="ACUSE")
    # A missing NOMBRE after the left join is used as the signal that some
    # experience rows found no matching general row.
    if df_next.NOMBRE.isna().any():
        print(f"This year has non matching acuses: {year}")
    dfs.append(df_next)
    hashes |= hashes_next
# ignore_index gives the combined table a fresh 0..n-1 index instead of
# repeating each yearly frame's own row labels.
df = pd.concat(dfs, ignore_index=True)

2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018


In [119]:
# Keep only the first row for each record hash; any later row carrying a hash
# that already appeared is discarded.
df = df[~df.duplicated(subset="hash")]

### Limpieza de los datos

In [120]:
# Remove rows whose NOMBRE is one of these placeholder values (presumably
# test declarations — "prueba" is Spanish for "test").
test_names = ["PRUEBA", "PRUEBA PRUEBA PRUEBA"]
df = df[~df.NOMBRE.isin(test_names)]

In [121]:
# Display the (rows, columns) of the table as it stands at this point.
df.shape

(1810431, 15)

In [124]:
# Write the combined experience table to <DATA>/tables/experience.csv without
# the DataFrame index.
tables_dir = join(DATA, "tables")
# Create the output folder when it is missing so the export does not fail on
# a data directory that has no "tables" subfolder yet.
os.makedirs(tables_dir, exist_ok=True)
df.to_csv(join(tables_dir, "experience.csv"), index=False)