# Setup

In [1]:
import copy
import json
import os
import re

import pandas as pd
import pyarrow
from sklearn.preprocessing import LabelEncoder

In [2]:
def create_dir(dir):
    """Create directory ``dir`` (and any missing parents) if it does not exist.

    Calling this on a directory that already exists is a no-op.

    Raises:
        FileExistsError: if ``dir`` exists but is not a directory, so that the
            problem surfaces here rather than on a later write into it.
    """
    # The parameter keeps its original name (which shadows the builtin) so
    # that keyword callers keep working.
    os.makedirs(dir, exist_ok=True)

In [3]:
def import_dict(metadatapath):
    """Load the JSON document stored at ``metadatapath`` and return it.

    The file is decoded as UTF-8, matching the encoding ``export_json`` uses
    when writing, so non-ASCII text round-trips regardless of the platform's
    default encoding.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    with open(metadatapath, encoding='utf-8') as myfile:
        return json.load(myfile)

In [4]:
def extract_dict_cat(indep_dict):
    """Return the entries of ``indep_dict`` whose ``'type'`` is ``'Categorical'``.

    The result is a new dict; its values are the same objects as in the input.
    """
    selected = {}
    for name, details in indep_dict.items():
        if details['type'] == 'Categorical':
            selected[name] = details
    return selected

def extract_dict_cont(indep_dict):
    """Return the entries of ``indep_dict`` whose ``'type'`` is ``'Continuous'``.

    The result is a new dict; its values are the same objects as in the input.
    """
    selected = {}
    for name, details in indep_dict.items():
        if details['type'] == 'Continuous':
            selected[name] = details
    return selected

In [5]:
def sort_cols(df_indep, indep_dict):
    """Return ``df_indep`` with its columns reordered by metadata type.

    Columns are sorted in descending order of ``indep_dict[column]['type']``
    (so ``'Continuous'`` columns come before ``'Categorical'`` ones). The sort
    is stable: columns of the same type keep their original relative order.

    Args:
        df_indep: DataFrame whose column names are all keys of ``indep_dict``.
        indep_dict: metadata mapping each column name to a dict with a
            ``'type'`` entry.

    Raises:
        KeyError: if a column has no entry in ``indep_dict``.
    """
    # Look the type up in the metadata that was passed in, not in a
    # notebook-level global, so the function works wherever it is called.
    sorted_cols = sorted(df_indep.columns,
                         key=lambda attr: indep_dict[attr]['type'],
                         reverse=True)
    return df_indep[sorted_cols]

In [6]:
def indep_info(df_indep, indep_dict):
    """Build a per-column summary table for ``df_indep``.

    Returns a DataFrame with one row per column of ``df_indep`` and the
    columns ``'variable'`` (column name), ``'type'`` (taken from
    ``indep_dict[name]['type']``), ``'min'`` and ``'max'``.
    """
    summary = pd.DataFrame({'variable': df_indep.columns})
    summary['type'] = summary['variable'].map(lambda name: indep_dict[name]['type'])
    # agg() yields one row of minima and one row of maxima, in column order.
    lows, highs = df_indep.agg(['min', 'max']).values.tolist()
    summary['min'] = lows
    summary['max'] = highs
    return summary

def count_info(df_info):
    """Count how many variables of each type ``df_info`` lists.

    Returns a DataFrame with the columns ``'type'`` and ``'count'``, sorted by
    ``'type'`` in descending order and re-indexed from 0.
    """
    per_type = df_info.groupby('type').count().reset_index()
    per_type = per_type[['type', 'variable']].rename(columns={'variable': 'count'})
    return per_type.sort_values('type', ascending=False, ignore_index=True)

In [7]:
def export_json(dictfile, jsonfile):
    """Serialise ``dictfile`` as JSON into the file at path ``jsonfile``.

    The file is written as UTF-8 with a 4-space indent, and non-ASCII
    characters are written as-is rather than escaped. An existing file is
    overwritten.
    """
    with open(jsonfile, mode='w', encoding='utf-8') as handle:
        json.dump(dictfile, handle, indent=4, ensure_ascii=False)

def export_txt(string, txtfile):
    """Write ``string`` to the file at path ``txtfile``, replacing its content.

    The file is written as UTF-8, the same encoding ``export_json`` uses, and
    is closed even if the write fails.
    """
    with open(txtfile, 'w', encoding='utf-8') as f:
        f.write(string)

In [8]:
@pd.api.extensions.register_dataframe_accessor("data")
class Data:
    """Pair a DataFrame with its attribute metadata and encode both in place.

    ``metadata`` maps each attribute name to a dict with a ``'type'`` entry
    (``'Categorical'`` or ``'Continuous'``) and a ``'values'`` entry, itself a
    dict keyed by value code. Both ``dataset`` and ``metadata`` are modified
    in place by the ``encode*`` methods.
    """

    # Matches "niu" or "universe" as a standalone word, case-insensitively.
    _NIU_PATTERN = re.compile(r'(^|[^\w])(niu|universe)([^\w]|$)', re.IGNORECASE)

    def __init__(self, pandas_obj, indep_dict=None):
        """Store the DataFrame and its metadata.

        ``indep_dict`` is optional because pandas builds a registered
        accessor (``DataFrame.data``) by passing the DataFrame only; in that
        case the metadata starts out empty.
        """
        self.dataset = pandas_obj
        self.metadata = {} if indep_dict is None else indep_dict

    @staticmethod
    def _as_int(key):
        """Return ``int(key)``, or ``None`` when ``key`` is not integer-like."""
        try:
            return int(key)
        except (TypeError, ValueError):
            return None

    def _has_niu_code(self, values):
        """Tell whether a non-positive code in ``values`` is described as NIU."""
        for key, label in values.items():
            code = self._as_int(key)
            if (code is not None and code <= 0 and isinstance(label, str)
                    and self._NIU_PATTERN.search(label)):
                return True
        return False

    def encodecat(self):
        """Label-encode every categorical attribute and re-key its metadata.

        For each attribute of type ``'Categorical'`` the dataset column is
        replaced by its ``LabelEncoder`` codes, and the keys of the
        attribute's ``'values'`` dict are replaced by the matching codes.
        Keys that are not integers, or whose value does not occur in the
        column, receive the negative codes -1, -2, ... in order of appearance.

        Returns:
            The names of the attributes whose value keys changed, one per
            line (an empty string if none changed).
        """
        changed = []
        for attr, info in self.metadata.items():
            if info['type'] != 'Categorical':
                continue
            le = LabelEncoder()
            le.fit(self.dataset[attr])
            # NOTE(review): int8 only represents codes 0..127; an attribute
            # with more distinct values would wrap around - confirm none does.
            self.dataset[attr] = list(le.transform(self.dataset[attr]).astype('int8'))
            old_values = info['values']
            newkeys = []
            unseen = 0
            for strval in old_values:
                try:
                    # transform() returns a 1-element array; take the element
                    # explicitly, since int() on an array is deprecated.
                    newkeys.append(int(le.transform([int(strval)])[0]))
                except ValueError:  # non-integer key or previously unseen label
                    unseen -= 1
                    newkeys.append(unseen)
            # Compare by string form: the old keys are typically strings
            # (JSON object keys) while the new codes are ints, so comparing
            # the raw lists would report every attribute as changed.
            if [str(key) for key in old_values] != [str(key) for key in newkeys]:
                changed.append(attr)
            info['values'] = dict(zip(newkeys, old_values.values()))
        return "\n".join(changed)

    def encodecont(self):
        """Zero out negative values of continuous attributes with an NIU code.

        A continuous attribute is affected when one of its non-positive value
        codes has a description containing "niu" or "universe" as a word. For
        such an attribute every negative value in the dataset column is
        replaced by 0, all non-positive codes are removed from its ``'values'``
        metadata, and the single entry ``'0': 'NIU'`` is put in their place.

        Returns:
            The names of the affected attributes, one per line (an empty
            string if there are none).
        """
        flagged = []
        for attr, info in self.metadata.items():
            if info['type'] != 'Continuous':
                continue
            values = info['values']
            if not self._has_niu_code(values):
                continue
            flagged.append(attr)
            self.dataset[attr] = self.dataset[attr].apply(lambda v: 0 if v < 0 else v)
            # Iterate over a snapshot of the keys: entries are deleted while
            # looping, which is not allowed on the live dict view.
            for key in list(values):
                code = self._as_int(key)
                if code is not None and code <= 0:
                    del values[key]
            values['0'] = 'NIU'
        return "\n".join(flagged)

# Given Information

In [9]:
# Year tag interpolated into the input and output file names used below.
year = 20

In [10]:
# Metadata of the attributes, loaded from the JSON file for this year.
indep_dict = import_dict(metadatapath=f"../metadata/full/meta-indep-{year}.json")
# Columns removed when the processed dataset is built.
# NOTE(review): presumably the dependent attributes - confirm.
dep_attrs = ['GRP', 'DIR', 'PUB']

In [11]:
# Class-related columns; all are removed when the processed dataset is built,
# after which 'class' alone is re-attached as the last column.
class_attrs = ['class_orig','code_orig','code','class']

# Created Directories

In [12]:
# Make sure every output directory exists before anything is written to it.
for out_dir in ("../encode", "../metadata/new", "../processed", "../info"):
    create_dir(out_dir)

# Exported Useful Information

In [13]:
# Split the loaded metadata by attribute type and write each part next to the
# full metadata file.
export_json(extract_dict_cat(indep_dict), f"../metadata/full/meta-indep-cat-{year}.json")
export_json(extract_dict_cont(indep_dict), f"../metadata/full/meta-indep-cont-{year}.json")

In [14]:
# Write a CSV copy of the original dataset if there is none yet. The dataset
# is read here because no DataFrame has been loaded at this point of the run.
if not os.path.isfile(f"../original/pppub{year}.csv"):
    pd.read_feather(f"../original/pppub{year}.feature").to_csv(
        f"../original/pppub{year}.csv", index=False)

# Encoded Dataset and Dictionary

In [15]:
# Load the original dataset.
df = pd.read_feather(f"../original/pppub{year}.feature")
# The encoders rewrite the nested 'values' dicts in place, so hand them a deep
# copy of the metadata; a shallow copy would share those dicts with
# indep_dict and silently modify it.
data_obj = Data(df.copy(), copy.deepcopy(indep_dict))

In [16]:
# Encode the dataset and metadata in place. Each call returns a
# newline-separated list of attribute names, written to text files at the end.
cat_var_change = data_obj.encodecat()
cont_var_nonpos = data_obj.encodecont()

In [17]:
# Shorter names for the encoded dataset and metadata. These refer to the
# objects held by data_obj; they are not copies.
df_enc = data_obj.dataset
indep_dict_enc = data_obj.metadata

# Processed Dataset

In [18]:
# Drop 'COV', the dep_attrs columns and the class_attrs columns, order the
# remaining columns with sort_cols, then re-attach 'class' as the last column.
df_proc_enc = df_enc.drop(columns=['COV']+dep_attrs+class_attrs)
df_proc_enc = sort_cols(df_proc_enc, indep_dict_enc).join(df_enc['class'])

In [19]:
# Summarise every processed column except 'class' (variable, type, min, max).
# At this point the index of the summary starts at 0.
df_proc_info = indep_info(df_proc_enc.loc[:, df_proc_enc.columns != 'class'],
                          indep_dict_enc)

In [20]:
# Number of summarised variables per attribute type.
df_count_info = count_info(df_proc_info)

# Exported Results

In [21]:
# Write the encoded dataset in Feather and CSV form, together with the
# categorical part of the encoded metadata.
# NOTE(review): the continuous metadata changed by encodecont() is not
# exported here - confirm that this is intended.
df_enc.to_feather(f"../encode/pppub{year}enc.feature")
df_enc.to_csv(f"../encode/pppub{year}enc.csv", index=False)
export_json(extract_dict_cat(indep_dict_enc), f"../metadata/new/meta-indep-cat-{year}-enc.json")

In [22]:
# Write the processed dataset twice: once with a header row and once without
# (the "noh" file).
df_proc_enc.to_csv(f"../processed/proc{year}enc.csv", header=True, index=False)
df_proc_enc.to_csv(f"../processed/proc{year}encnoh.csv", header=False, index=False)

In [23]:
# Shift the summary index so that it starts at 1; it is written out as the
# "id" column of the info files.
df_proc_info.index = df_proc_info.index + 1
# Summary with and without a header row, then the per-type counts.
df_proc_info.to_csv(f"../info/proc{year}info.csv", index_label="id")
df_proc_info.to_csv(f"../info/proc{year}infonoh.csv", index_label="id", header=False)
df_count_info.to_csv(f"../info/proc{year}countinfo.csv", header=True, index=False)

In [24]:
# Write the attribute lists returned by the two encoders, one name per line.
export_txt(cat_var_change, f"../metadata/new/cat-change-{year}.txt")
export_txt(cont_var_nonpos, f"../metadata/new/cont-nonpos-{year}.txt")