In [1]:
# Switch for the `if PROCESS_SAV:` cell further down: when True, that cell calls
# process_files(files); when False, the call is skipped.
PROCESS_SAV = False

In [2]:
import os
# Make the parent directory the working directory (presumably so the relative
# 'data/...' paths defined below resolve -- confirm against the repo layout).
# NOTE(review): this is not idempotent; every re-run of the cell moves one more
# level up, so run it exactly once per kernel session.
os.chdir('..')
# Enable IPython's autoreload extension in mode 2.
%load_ext autoreload
%autoreload 2

In [3]:
import numpy as np, pandas as pd, pickle
import savReaderWriter as spss
from tableschema_spss import Storage
from pathlib import Path
from tqdm import tqdm_notebook, tnrange
# Let pandas display up to 100 columns before truncating wide frames.
pd.set_option('display.max_columns', 100)

In [4]:
# Data locations, relative to the current working directory.
# RAW: scanned below for sub-directories that each contain a .sav file.
RAW = Path('data/raw/encuesta_relaciones_sociales/')
# INT: output directory of process_files() (per-file pickles, full_dict.pkl, columns.txt).
INT = Path('data/interim/encuesta_relaciones_sociales/')
# PROC: not referenced by the cells visible here -- presumably used later in the notebook.
PROC = Path('data/processed/encuesta_relaciones_sociales/')

In [5]:
# For every sub-directory of RAW, keep the first entry whose suffix is '.sav'
# (a sub-directory without any .sav file raises IndexError).
files = [
    [entry for entry in folder.iterdir() if entry.suffix == '.sav'][0]
    for folder in RAW.iterdir()
    if folder.is_dir()
]
files[:3]

[PosixPath('data/raw/encuesta_relaciones_sociales/503-Modulo764/08_CRS02_CAP400.sav'),
 PosixPath('data/raw/encuesta_relaciones_sociales/503-Modulo762/06_CRS02_CAP200.sav'),
 PosixPath('data/raw/encuesta_relaciones_sociales/503-Modulo768/12_CRS03_CAP300.sav')]

# Read files

In [6]:
def read_file(f):
    """Load one SPSS ``.sav`` file into a DataFrame with labelled categorical columns.

    Parameters
    ----------
    f : path of the ``.sav`` file
        Passed unchanged to ``Storage.read``, ``Storage.describe`` and
        ``spss.SavReader``.

    Returns
    -------
    (df, col_dict) : tuple
        ``df`` contains the rows returned by ``Storage.read``, one column per
        described field. Every column that has value labels in the file is
        replaced by a ``pd.Categorical`` of those labels, with the categories
        ordered by their numeric code.
        ``col_dict`` maps each field ``name`` to its ``title``.
    """
    storage = Storage()
    data = storage.read(f)
    fields = storage.describe(f)['fields']
    col_dict = {e['name']: e['title'] for e in fields}
    df = pd.DataFrame(data, columns=[e['name'] for e in fields])

    # Variable names and labels are decoded from bytes as latin-1; the value
    # codes are cast to int so they can be used as mapping keys below.
    with spss.SavReader(f, returnHeader=True) as reader:
        label_info = {k.decode('latin-1'): {int(kk): vv.decode('latin-1') for kk, vv in v.items()}
                      for k, v in reader.valueLabels.items()}

    for n, col in df.items():
        if n not in label_info:
            continue
        labels = label_info[n]

        # Hard fixes
        # One source label has two answer options fused into a single string
        # under code 5; split it back into codes 5 and 6.
        if labels.get(5) == 'No supieron cómo ayudarme| 6. Otro (Especifique)':
            labels[5] = 'No supieron cómo ayudarme'
            labels[6] = 'Otro (Especifique)'
        # /Hard fixes

        # Order categories by the codes that actually exist. Walking
        # min_code .. min_code + len - 1 would raise KeyError as soon as the
        # codes have a gap (e.g. 1, 2, 3, 9).
        lbl_values = [labels[code] for code in sorted(labels)]
        df[n] = pd.Categorical(col.map(labels), lbl_values)

    return df, col_dict

def get_columns(file):
    """Return the variable labels from the header of ``file`` as a list of str.

    The labels are read from ``SavHeaderReader.varLabels`` and decoded from
    bytes using latin-1, in the order the header yields them.
    """
    with spss.SavHeaderReader(file) as sav_header:
        raw_labels = sav_header.varLabels

    decoded = []
    for raw in raw_labels.values():
        decoded.append(raw.decode('latin-1'))
    return decoded

def process_files(files):
    """Convert each ``.sav`` file to a pickle under ``INT`` and write two summaries.

    For every path in ``files`` the DataFrame returned by ``read_file`` is
    pickled to ``INT/<stem>.pkl``. Afterwards two more files are written:

    * ``INT/full_dict.pkl`` -- the merge of all per-file ``name -> title``
      dictionaries (on duplicate names the later file wins);
    * ``INT/columns.txt`` -- for each file, a dashed header carrying the file
      stem followed by the labels returned by ``get_columns``.

    Parameters
    ----------
    files : iterable of pathlib.Path
        The ``.sav`` files to process. Must contain at least one entry.

    Raises
    ------
    IndexError
        If ``files`` is empty (nothing is written in that case).
    """
    files = list(files)  # iterated twice below; materialise generators once
    if not files:
        raise IndexError('process_files() needs at least one .sav file')

    INT.mkdir(parents=True, exist_ok=True)

    final_dict = {}
    for f in tqdm_notebook(files):
        df, col_dict = read_file(f)
        final_dict.update(col_dict)
        df.to_pickle(INT / (f.stem+'.pkl'))

    # Context managers make sure both outputs are flushed and closed.
    with (INT / 'full_dict.pkl').open('wb') as fh:
        pickle.dump(final_dict, fh)

    rule = '-'*80
    lines = []
    for e in files:
        lines.extend([rule, e.stem, rule])
        lines.extend(get_columns(e))
    # Labels contain non-ASCII characters, so pin the encoding instead of
    # relying on the platform default.
    with (INT / 'columns.txt').open('w', encoding='utf-8') as fh:
        fh.write('\n'.join(lines))
        

In [7]:
# Only run the .sav conversion when the PROCESS_SAV flag is set; otherwise the
# files already present under INT are used by the cells that follow.
if PROCESS_SAV:
    process_files(files)

In [10]:
# Load the merged name -> title dictionary written by process_files().
# The file must already exist (i.e. process_files() ran at least once).
# A `with` block closes the handle instead of leaving it to the garbage collector.
with (INT / 'full_dict.pkl').open('rb') as fh:
    final_dict = pickle.load(fh)