In [1]:
from utils import base_configs, deps, tr_va_te_split
from utils.helpers import dir_helpers, rw_csv_helpers, feature_distr_helpers

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import sys
import import_ipynb
import pandas as pd
import numpy as np

In [3]:
## HELPERS ##
# Drop rows with any negative values in numeric columns
def drop_neg(df, verbose = 0):
    """Return the rows of ``df`` that have no negative value in any numeric column.

    Only the columns picked by ``select_dtypes(include=[np.number])`` are
    inspected; all other columns are ignored by the filter but kept in the
    result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified; a filtered frame is returned.
    verbose : int, default 0
        ``> 0`` prints the shape after filtering.
        ``> 1`` additionally displays up to three randomly sampled rows.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` without any negative numeric value.
    """
    numeric_part = df.select_dtypes(include=[np.number])
    has_negative = (numeric_part < 0).any(axis=1)
    df = df[~has_negative]

    if(verbose > 0):
        print(f"After cleaned size: {df.shape}")
    if(verbose > 1):
        # DataFrame.sample(3) raises ValueError when fewer than three rows
        # remain, so cap the sample size at the number of surviving rows.
        # `display` is the notebook-provided (IPython) rich display function.
        display(df.sample(min(3, len(df))))

    return df

def cols_encode_basic(df, cols_to_encode_val, verbose = 0, existing_val = 2, new_val = 0):
    """Replace one value with another in the given columns and return the result.

    With the defaults, every ``2`` in ``cols_to_encode_val`` becomes ``0``;
    all other values and all other columns are left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied first, so the caller's frame is not modified
        (same convention as ``encode_cols``).
    cols_to_encode_val : list of str
        Columns in which the replacement is applied.
    verbose : int, default 0
        ``> 0`` prints the encoding rule, the affected columns and the shape.
    existing_val : scalar, default 2
        Value to look for.
    new_val : scalar, default 0
        Value written in place of ``existing_val``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the replacement applied.
    """
    # Work on a copy: assigning into `df` directly would silently change the
    # caller's frame (and can trigger SettingWithCopyWarning on slices).
    df = df.copy()

    # Replace existing_val with new_val in all specified columns
    df[cols_to_encode_val] = df[cols_to_encode_val].replace(existing_val, new_val)

    if(verbose > 0):
        # ANSI escape codes for bold text in the printed summary.
        BOLD, RESET = "\033[1m", "\033[0m"
        print(f"{BOLD} Enoding rule in provided columns:{RESET} [Existing value = {existing_val} => to => New value = {new_val}]")
        print(f"{BOLD} Columns being encoded:{RESET} {cols_to_encode_val}")
        print(f"{BOLD} Data shape:{RESET} {df.shape}")

    return df

def encode_cols(df, col_name, label_list, verbose = 0):
    """Expand one coded column into 0/1 indicator columns.

    For every ``code -> label`` pair in ``label_list`` a column named
    ``f"{col_name}_{label}"`` is added, holding 1 (int8) where ``col_name``
    equals ``code`` and 0 elsewhere. The original ``col_name`` column is then
    removed. The input frame is left untouched; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col_name``.
    col_name : str
        Column to expand.
    label_list : dict
        Mapping of code value -> label used as the new column suffix.
    verbose : int, default 0
        ``> 0`` prints the created column names and the resulting shape.

    Returns
    -------
    pandas.DataFrame
    """
    # One indicator series per code, keyed by its target column name.
    indicator_cols = {
        f"{col_name}_{label}": (df[col_name] == code).astype("int8")
        for code, label in label_list.items()
    }

    # assign() builds a new frame, so the caller's frame is never modified.
    encoded = df.assign(**indicator_cols).drop(columns=[col_name])

    if verbose > 0:
        created = [f"{col_name}_{label}" for label in label_list.values()]
        print(f"Encoded '{col_name}' into {len(created)} columns: {created}")
        print(f"Data shape: {encoded.shape}")

    return encoded

def encode_cols_wrapper(df, verbose = 0, cols_to_map = None):
    """Apply ``encode_cols`` to every mapped column that exists in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode.
    verbose : int, default 0
        ``> 0`` prints the initial shape; the value is also forwarded to
        ``encode_cols``.
    cols_to_map : dict, optional
        Mapping of column name -> ``{code: label}`` dict. When omitted, the
        built-in mapping for ``GeneralHealth``, ``smokeStat`` and ``eCigUse``
        is used.

    Returns
    -------
    pandas.DataFrame
        The encoded frame. Mapped columns that are not present in ``df`` are
        skipped; if none is present, ``df`` itself is returned unchanged.
    """
    if cols_to_map is None:
        cols_to_map = {
            "GeneralHealth": {
                1: "Excellent",
                2: "VeryGood",
                3: "Good",
                4: "Fair",
                5: "Poor",
            },
            "smokeStat": {
                1: "Current",
                2: "Former",
                3: "Never",
            },
            "eCigUse": {
                1: "Current",
                2: "Former",
                3: "Never",
            },
        }

    if(verbose > 0):
        print(f"Initial data shape: {df.shape}")

    for col_name, code_to_label in cols_to_map.items():
        if col_name in df.columns:
            # encode_cols returns a new frame and leaves its input untouched,
            # so no extra copy is needed here.
            df = encode_cols(df, col_name, code_to_label, verbose = verbose)

    return df


### 0 Data load

In [4]:
# Relative paths to the column-filtered input CSVs.
# Only the combined HINTS 6+7 file is loaded below; hints6_data and hints7_data
# are not used in the cells visible here.
hints6_data = "ip/1_colFiltered/hints6_colFiltered.csv"
hints7_data = "ip/1_colFiltered/hints7_colFiltered.csv"
hints6_7_data = "ip/1_colFiltered/hints6_7_union_colFiltered.csv"
# read_csv_file is a project helper; presumably it returns a pandas DataFrame
# (df_orig is used as one downstream) — TODO confirm in utils.helpers.rw_csv_helpers.
df_orig = rw_csv_helpers.read_csv_file(hints6_7_data, verbose = 1)

Loaded: /home/ppanta/puru_proj/proj_v0/hints6_v0/ip/1_colFiltered/hints6_7_union_colFiltered.csv
────────────────────────────────────────────────────────────────────────────────
Shape: (13530, 18)
────────────────────────────────────────────────────────────────────────────────
All columns: ['FreqGoProvider', 'GeneralHealth', 'Deaf', 'MedConditions_Diabetes', 'MedConditions_HighBP', 'MedConditions_HeartCondition', 'MedConditions_LungDisease', 'MedConditions_Depression', 'AverageTimeSitting', 'EverHadCancer', 'Age', 'BirthGender', 'BMI', 'smokeStat', 'PHQ4', 'WeeklyMinutesModerateExercise', 'eCigUse', 'AvgDrinksPerWeek']


### 1 Clean data

In [5]:
# Keep only the rows without a negative value in any numeric column.
# drop_neg receives a copy, so df_orig stays available unchanged.
# NOTE(review): presumably negative values are missing/invalid-response codes
# in this dataset — confirm against the data dictionary.
df_cleaned = drop_neg(df_orig.copy(), verbose=1)

After cleaned size: (10581, 18)


In [6]:
# Columns in which the value 2 is recoded to 0 by cols_encode_basic.
# Mostly medical-condition flags, but the list also includes BirthGender.
# NOTE(review): presumably these are 1/2 coded binary answers (e.g. yes/no) —
# confirm against the data dictionary.
med_cols = [
    'Deaf',
    'MedConditions_Diabetes',
    'MedConditions_HighBP',
    'MedConditions_HeartCondition',
    'MedConditions_LungDisease',
    'MedConditions_Depression',
    'EverHadCancer',
    'BirthGender'
]
# Third positional argument is verbose (1 = print the encoding summary).
df_cleaned_encoded1 = cols_encode_basic(df_cleaned.copy(), med_cols, 1)

[1m Enoding rule in provided columns:[0m [Existing value = 2 => to => New value = 0]
[1m Columns being encoded:[0m ['Deaf', 'MedConditions_Diabetes', 'MedConditions_HighBP', 'MedConditions_HeartCondition', 'MedConditions_LungDisease', 'MedConditions_Depression', 'EverHadCancer', 'BirthGender']
[1m Data shape:[0m (10581, 18)


In [7]:
# All column names in the loaded data (matches the "All columns" list printed by the load cell):
# FreqGoProvider, GeneralHealth, Deaf, MedConditions_Diabetes, MedConditions_HighBP,
# MedConditions_HeartCondition, MedConditions_LungDisease, MedConditions_Depression,
# AverageTimeSitting, EverHadCancer, Age, BirthGender, BMI,
# smokeStat, PHQ4, WeeklyMinutesModerateExercise, eCigUse, AvgDrinksPerWeek
# NOTE: AverageSleepNight was listed here previously, but it is not among the 18 columns of the loaded file.

In [8]:
# Expand GeneralHealth, smokeStat and eCigUse into 0/1 indicator columns
# (one per category) and drop the original coded columns.
df_cleaned_encoded1_encoded2 = encode_cols_wrapper(df_cleaned_encoded1.copy(), verbose = 1)

Initial data shape: (10581, 18)
Encoded 'GeneralHealth' into 5 columns: ['GeneralHealth_Excellent', 'GeneralHealth_VeryGood', 'GeneralHealth_Good', 'GeneralHealth_Fair', 'GeneralHealth_Poor']
Data shape: (10581, 22)
Encoded 'smokeStat' into 3 columns: ['smokeStat_Current', 'smokeStat_Former', 'smokeStat_Never']
Data shape: (10581, 24)
Encoded 'eCigUse' into 3 columns: ['eCigUse_Current', 'eCigUse_Former', 'eCigUse_Never']
Data shape: (10581, 26)


### 3 Save data

In [9]:
# Relative output paths for the cleaned + encoded CSVs.
# Only the combined HINTS 6+7 path is written below; the hints6/hints7 paths
# are not used in the cells visible here.
hints6_cleaned_encoded = "ip/3_cleanedEncoded/hints6_cleaned_encoded.csv"
hints7_cleaned_encoded = "ip/3_cleanedEncoded/hints7_cleaned_encoded.csv"
hints6_7_cleaned_encoded = "ip/3_cleanedEncoded/hints6_7_cleaned_encoded.csv"
# write_csv_file is a project helper; presumably it returns the path it wrote
# to (hence csv_path) — TODO confirm in utils.helpers.rw_csv_helpers.
csv_path = rw_csv_helpers.write_csv_file(hints6_7_cleaned_encoded, df_cleaned_encoded1_encoded2, verbose = 1)

Saved: /home/ppanta/puru_proj/proj_v0/hints6_v0/ip/3_cleanedEncoded/hints6_7_cleaned_encoded.csv
shape: (10581, 26)
