In [1]:
import json
import os
import warnings
from typing import Dict, Optional, Tuple

import joblib
import numpy as np
import pandas as pd
import yaml

warnings.filterwarnings("ignore")

In [2]:
config_path = '../config/params.yml'
# Read the config inside a context manager so the file handle is closed
# deterministically instead of being left open until garbage collection.
# NOTE(review): FullLoader is kept to avoid changing how params.yml is parsed;
# if the file only holds plain scalars/lists/dicts, consider yaml.safe_load.
with open(config_path) as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

# Per-stage sections of the config used by the cells below.
preproc = config['preprocessing']
training = config['train']
evaluate = config['evaluate']


# check columns with train
# The JSON at 'unique_values_path' is the file written by
# save_unique_train_data (feature name -> list of unique values).
column_sequence_path = preproc['unique_values_path']
with open(column_sequence_path) as json_file:
    column_sequence = json.load(json_file)

# Import

In [3]:
# Load the dataset to score; the path comes from the 'evaluate' config section.
predict_path = evaluate['predict_path']
data_test = pd.read_csv(predict_path)
# Preview the first four rows.
data_test.head(4)

Unnamed: 0,child_id,data_year,child_date,child_age,child_enrolment_date,child_months_enrolment,child_grant,child_years_in_programme,child_height,child_observe_attentive,...,obs_cooking_4,obs_cooking_5,obs_cooking_6,obs_heating_1,obs_heating_2,obs_heating_3,obs_heating_4,obs_heating_5,obs_heating_6,obs_heating_7
0,ID_0I0999N6S,2021.0,2021-09-20,57.0,,,Yes,2nd year in programme,108.0,Almost always,...,,,,,,,,,,
1,ID_GQ6ONJ4FP,2021.0,2021-10-21,54.0,2021-01-10,9.0,Yes,1st year in the programme,105.0,Almost always,...,,,,,,,,,,
2,ID_YZ76CVRW3,2021.0,2021-05-17,57.0,,,Yes,,101.5,Often,...,,,,,,,,,,
3,ID_BNINCRXH8,2022.0,2022-09-09,59.334702,,,,3rd year in programme,,Almost always,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


# Preprocessing

In [4]:
def replace_values(data: pd.DataFrame, map_change_columns: dict) -> pd.DataFrame:
    """
    Replace values in dataset columns according to per-column mappings.

    Every key of ``map_change_columns`` names one column, or several columns
    separated by commas (whitespace around each name is ignored). Each named
    column is passed through ``Series.map`` with the associated mapping, so
    with a plain dict mapping any value missing from it becomes NaN.

    The frame is modified in place and the same object is returned.

    :param data: dataset
    :param map_change_columns: dict of "col" / "col1, col2" keys to value mappings
    :return: the same dataset object with the mapped columns
    :raises KeyError: if a named column is not present in ``data``
    """
    for cols, map_values in map_change_columns.items():
        # Split on the bare comma and strip each name, so "a, b", "a,b" and
        # "a ,b" all address the same two columns.
        for col in (name.strip() for name in cols.split(',')):
            if not col:
                # Tolerate a stray trailing/double comma in the config key.
                continue
            data[col] = data[col].map(map_values)
    return data

def obs_hygiene_rang(obs_col: str) -> Optional[int]:
    """
    Convert a hygiene observation value into an ordinal rank.

    The value is converted with ``str()`` and searched for digit characters.
    Groups are tried in the order below and the first match wins:

    * contains ``0``                 -> 0
    * contains ``1``, ``2`` or ``8`` -> 3
    * contains ``3``, ``4`` or ``5`` -> 2
    * contains ``6`` or ``7``        -> 1 (this also covers ``97``)

    NOTE(review): matching is by substring, so e.g. "10" or "1.0" rank as 0
    because they contain "0" - confirm the raw values never look like that.

    :param obs_col: feature value (any object; it is stringified)
    :return: rank of the value, or None when none of the digits above occurs
             (for example for a missing value, whose text is "nan")
    """
    text = str(obs_col)
    # (digits, rank) pairs in priority order - the order is part of the logic.
    digit_groups = (
        ("0", 0),
        ("128", 3),
        ("345", 2),
        ("67", 1),
    )
    for digits, rank in digit_groups:
        if any(digit in text for digit in digits):
            return rank
    return None

def filna(data: pd.DataFrame) -> pd.DataFrame:
    """
    Fill missing values in every column of the dataset.

    * Numeric (non-boolean) columns: missing values become -99.
    * Other columns with exactly two distinct non-null values: rows equal to
      the first non-null value become 1, the other value becomes 0, and
      missing values become -99.
    * All remaining columns: missing values become the string "No_info".

    The frame is modified in place and the same object is returned.

    NOTE(review): in the two-value case the value encoded as 1 is whichever
    appears first in this particular frame, and the branch itself depends on
    how many distinct values this frame has - so train and test data may be
    encoded differently. Left unchanged because a fitted model may rely on it.

    :param data: dataset
    :return: the same dataset object with missing values filled
    """
    is_numeric = pd.api.types.is_numeric_dtype
    is_bool = pd.api.types.is_bool_dtype

    for col in data.columns:
        dtype = data[col].dtype

        # Detect numeric columns by kind rather than by a fixed list of dtype
        # names, so float32 / int16 / uint8 / nullable ints are also treated
        # as numeric. Booleans are excluded to keep their previous handling.
        if is_numeric(dtype) and not is_bool(dtype):
            data[col] = data[col].fillna(-99)
            continue

        if data[col].nunique() == 2:
            # Two distinct values: express them as 1/0. The comparison only
            # covers non-null rows, so after index alignment on assignment the
            # originally missing rows are NaN and get -99 below.
            non_null = data[col].dropna()
            data[col] = (non_null == non_null.unique()[0]).astype(int)
            data[col] = data[col].fillna(-99)
        else:
            data[col] = data[col].fillna("No_info")
    return data


def transform_types(data: pd.DataFrame, change_type_columns: dict) -> pd.DataFrame:
    """
    Cast the given columns to the requested dtypes.

    :param data: dataset
    :param change_type_columns: dict of column name -> target dtype
    :return: a new dataset with the listed columns converted; a failed
             conversion raises instead of being skipped
    """
    converted = data.astype(dtype=change_type_columns, errors="raise")
    return converted


def drop_cols(data: pd.DataFrame, drop_cols: list) -> pd.DataFrame:
    """
    Remove the listed columns from the dataset.

    :param data: dataset
    :param drop_cols: names of the columns to remove; each must exist
    :return: a new dataset without those columns
    """
    remaining = data.drop(labels=drop_cols, axis="columns")
    return remaining


def check_columns_evaluate(data: pd.DataFrame, unique_values_path: str) -> pd.DataFrame:
    """
    Check that the dataset has exactly the stored feature set and reorder it.

    The expected features are the keys of the JSON object stored at
    ``unique_values_path``; their order in the file defines the column order
    of the returned frame.

    :param data: test dataset
    :param unique_values_path: path to the JSON file with the train features
    :return: test dataset with columns in the stored order
    :raises AssertionError: if the dataset columns and the stored features
        differ as sets
    """
    with open(unique_values_path) as json_file:
        unique_values = json.load(json_file)

    # Materialise the keys: a plain list is a well-defined DataFrame indexer,
    # unlike a dict view.
    column_sequence = list(unique_values)

    # Raised explicitly (rather than via `assert`) so the check is not
    # stripped when Python runs with -O; exception type and message are kept.
    if set(column_sequence) != set(data.columns):
        raise AssertionError("Разные признаки")
    return data[column_sequence]


def save_unique_train_data(
    data: pd.DataFrame, drop_cols: list, target_column: str, unique_values_path: str
) -> None:
    """
    Save a JSON dictionary of features and their unique values.

    Columns listed in ``drop_cols`` and the target column are excluded
    (names that are not present in ``data`` are ignored). The input frame is
    not modified.

    :param data: dataset
    :param drop_cols: names of columns to exclude (any sequence of names, or
        a single name)
    :param target_column: name of the target variable, also excluded
    :param unique_values_path: path of the JSON file to write
    :return: None
    """
    # Accept tuples / Index objects as well as lists; a lone string is treated
    # as a single column name rather than being split into characters.
    if isinstance(drop_cols, str):
        excluded = [drop_cols]
    else:
        excluded = list(drop_cols)
    excluded.append(target_column)

    unique_df = data.drop(columns=excluded, errors="ignore")
    # Dictionary of unique values per feature (the original comment says it
    # is intended for display in a UI).
    dict_unique = {key: unique_df[key].unique().tolist() for key in unique_df.columns}
    with open(unique_values_path, "w") as file:
        json.dump(dict_unique, file)

In [5]:
def pipeline_preprocess(data: pd.DataFrame, flg_evaluate: bool = True, **kwargs) -> pd.DataFrame:
    """
    Data preprocessing pipeline.

    Steps, in order: drop configured columns; either validate/reorder the
    columns against the stored train features (evaluate mode) or store the
    train features and their unique values (train mode); rank the hygiene
    observation columns; collapse infrequent ``id_dc_best`` values to 'Rare';
    map values; fill missing values; cast object columns to ``category``.

    Required keyword arguments (read from ``kwargs``):
    ``drop_cols``, ``unique_values_path``, ``dc_freq``, ``map_change_columns``
    and, when ``flg_evaluate`` is False, ``target_column``.

    :param data: dataset
    :param flg_evaluate: True to validate against stored train features,
        False to save the train features instead
    :return: preprocessed dataset
    """

    # drop columns
    data = data.drop(kwargs["drop_cols"], axis=1, errors="ignore")
    # Either check that the dataset matches the train features,
    # or save the train features with their unique values.
    if flg_evaluate:
        data = check_columns_evaluate(
            data=data, unique_values_path=kwargs["unique_values_path"]
        )
    else:
        # The callee's parameter is named `drop_cols`; passing it under any
        # other keyword raises TypeError.
        save_unique_train_data(
            data=data,
            drop_cols=kwargs["drop_cols"],
            target_column=kwargs["target_column"],
            unique_values_path=kwargs["unique_values_path"],
        )

    # Convert hygiene observation values into ranks.
    data['obs_toilet'] = data.apply(lambda x: obs_hygiene_rang(x['obs_toilet']),axis=1)
    data['obs_handwashing'] = data.apply(lambda x: obs_hygiene_rang(x['obs_handwashing']),axis=1)
    # Values of id_dc_best outside the configured frequent list become 'Rare'.
    # The list is taken from this call's kwargs so the function does not
    # depend on a notebook-level global.
    frequent_dc = kwargs["dc_freq"]
    data['id_dc_best'] = ['Rare' if x not in frequent_dc else x for x in data['id_dc_best']]

    # replace values
    data = replace_values(data=data, map_change_columns=kwargs["map_change_columns"])

    # fillna
    data = filna(data)

    # change category types
    dict_category = {key: "category" for key in data.select_dtypes(["object"]).columns}
    data = transform_types(data=data, change_type_columns=dict_category)
    return data

In [6]:
# Show the first raw test row as a {column name: value} dict.
dict(data_test.iloc[0])

{'child_id': 'ID_0I0999N6S',
 'data_year': 2021.0,
 'child_date': '2021-09-20',
 'child_age': 57.0,
 'child_enrolment_date': nan,
 'child_months_enrolment': nan,
 'child_grant': 'Yes',
 'child_years_in_programme': '2nd year in programme',
 'child_height': 108.0,
 'child_observe_attentive': 'Almost always',
 'child_observe_concentrated': 'Often',
 'child_observe_diligent': 'Almost always',
 'child_observe_interested': 'Almost always',
 'child_observe_total': 11.0,
 'child_gender': 'Female',
 'child_dob': '2016-12-08',
 'child_zha': 0.03815224,
 'child_stunted': 'Normal',
 'child_attends': nan,
 'child_attendance': nan,
 'child_languages': nan,
 'child_age_group': '50-59 months',
 'id_mn_best': 'LIM354',
 'prov_best': 'LIMPOPO',
 'id_dc_best': 'DC35',
 'dc_best': 'CAPRICORN',
 'mn_best': 'POLOKWANE',
 'ward_best': 8.0,
 'id_enumerator': 1732.0,
 'id_facility': 1878.0,
 'pqa_date': nan,
 'pqa_class_age': nan,
 'pqa_class_age_1': nan,
 'pqa_class_age_2': nan,
 'pqa_class_age_3': nan,
 'pqa

In [7]:
# Preprocess the test set in evaluate mode (flg_evaluate defaults to True),
# passing the 'preprocessing' config section as keyword arguments.
data_proc_test = pipeline_preprocess(data=data_test, **preproc)

In [8]:
# Preview the first four preprocessed rows.
data_proc_test.head(4)

Unnamed: 0,data_year,child_age,child_months_enrolment,child_grant,child_years_in_programme,child_height,child_observe_attentive,child_observe_concentrated,child_observe_diligent,child_observe_interested,...,phase_natemis,quintile,language_child,facility_type,sef_ind,elp_ind,gps_ind,pre_covid,quintile_used,ses_cat
0,2021.0,57.0,-99.0,Yes,2.0,108.0,3,2,3,3,...,PRIMARY SCHOOL,3.0,Sepedi,No_info,1,1.0,1.0,1,1.0,2.0
1,2021.0,54.0,9.0,Yes,1.0,105.0,3,3,2,3,...,PRIMARY SCHOOL,1.0,isiZulu,No_info,1,1.0,1.0,1,1.0,1.0
2,2021.0,57.0,-99.0,Yes,-99.0,101.5,2,2,2,2,...,No_info,1.0,isiZulu,ECD Centre,1,1.0,1.0,1,1.0,2.0
3,2022.0,59.334702,-99.0,No_info,3.0,-99.0,3,3,3,2,...,No_info,-99.0,No_info,No_info,1,-99.0,1.0,1,-99.0,1.0


# Evaluate

In [9]:
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code - only load model files from a trusted location.
model = joblib.load(training['model_path'])
# Predict on the feature columns only. Dropping an already-present 'target'
# column keeps this cell safe to re-run: the prediction is written back into
# the same frame and must not be fed to the model as a feature.
features_test = data_proc_test.drop(columns=['target'], errors='ignore')
data_proc_test['target'] = model.predict(features_test)

In [10]:
# Preview the first rows, including the predicted 'target' column.
data_proc_test.head()

Unnamed: 0,data_year,child_age,child_months_enrolment,child_grant,child_years_in_programme,child_height,child_observe_attentive,child_observe_concentrated,child_observe_diligent,child_observe_interested,...,quintile,language_child,facility_type,sef_ind,elp_ind,gps_ind,pre_covid,quintile_used,ses_cat,target
0,2021.0,57.0,-99.0,Yes,2.0,108.0,3,2,3,3,...,3.0,Sepedi,No_info,1,1.0,1.0,1,1.0,2.0,55.003346
1,2021.0,54.0,9.0,Yes,1.0,105.0,3,3,2,3,...,1.0,isiZulu,No_info,1,1.0,1.0,1,1.0,1.0,46.495744
2,2021.0,57.0,-99.0,Yes,-99.0,101.5,2,2,2,2,...,1.0,isiZulu,ECD Centre,1,1.0,1.0,1,1.0,2.0,45.85697
3,2022.0,59.334702,-99.0,No_info,3.0,-99.0,3,3,3,2,...,-99.0,No_info,No_info,1,-99.0,1.0,1,-99.0,1.0,64.232675
4,2021.0,54.0,8.0,Yes,1.0,103.5,2,2,2,3,...,1.0,Afrikaans,No_info,1,1.0,1.0,1,1.0,0.0,42.067758
