In [6]:
import pandas as pd
import csv
import numpy as np
from tqdm import tqdm

def load_mapping(file_path, input_columns):
    """Build per-column lookup dictionaries from a mapping CSV.

    The CSV at ``file_path`` is read with its first column as the index.
    For every mapping column whose name also appears in ``input_columns``,
    the missing cells are dropped and the remainder is turned into a
    ``{index label: cell value}`` dictionary.

    Args:
        file_path: Path of the mapping CSV file.
        input_columns: Collection of column names to keep; any mapping
            column not contained in it is skipped.

    Returns:
        A dict mapping each retained column name to its lookup dictionary.
    """
    table = pd.read_csv(file_path, index_col=0)
    # The progress bar still walks every mapping column, including the
    # ones that end up being skipped by the membership filter.
    return {
        name: table[name].dropna().to_dict()
        for name in tqdm(table.columns, desc="Loading file")
        if name in input_columns
    }

def should_remove(row, mapping_dicts):
    """Tell whether a row holds a value that is mapped to "Unknown".

    Args:
        row: Mapping-like object indexable by column name (e.g. one row of
            a DataFrame).
        mapping_dicts: Dict of column name -> lookup dictionary.

    Returns:
        True as soon as one column's value looks up to the string
        "Unknown" in that column's dictionary, otherwise False. Values
        absent from a dictionary never match.
    """
    for column, code_to_label in mapping_dicts.items():
        label = code_to_label.get(row[column])
        if label == "Unknown":
            return True
    return False

def replace_codes(df, mapping_dicts):
    """Substitute values in the mapped columns of ``df``.

    Each column named in ``mapping_dicts`` that exists in ``df`` is passed
    through ``Series.replace`` with that column's dictionary and written
    back. Columns missing from ``df`` are skipped.

    Args:
        df: DataFrame to update; it is modified in place.
        mapping_dicts: Dict of column name -> replacement dictionary.

    Returns:
        The same ``df`` object that was passed in.
    """
    progress = tqdm(mapping_dicts.items(), desc="Replacing integers with strings")
    for column, code_to_label in progress:
        if column not in df.columns:
            continue
        translated = df[column].replace(code_to_label)
        df[column] = translated
    return df

# Read the input data; its column names decide which mappings are loaded.
input_data_file_path = input("Enter the path to the dataset CSV file: ")
df = pd.read_csv(input_data_file_path)

# Load mapping dictionaries from a CSV file (one dictionary per column
# that the dataset and the mapping file have in common).
mapping_file_path = input("Enter the path to the mapping CSV file: ")
mapping_dicts = load_mapping(mapping_file_path, df.columns)

# Filter out rows in which any mapped column holds a value whose mapping
# is the string "Unknown".
print("Dataset filtering started")
unknown_rows = df.apply(lambda row: should_remove(row, mapping_dicts), axis=1)
# Take an explicit copy of the filtered frame: replace_codes() below assigns
# columns on it, and assigning to the result of a boolean-mask selection can
# make pandas emit SettingWithCopyWarning (depending on the pandas version).
df = df[~unknown_rows].copy()
print("Dataset filtering completed")

# Save integer data (filtered rows, values still in their original coded form)
df.to_csv('int_dataset.csv', index=False)
print("Integer dataset file saved")

# Replace numerical codes with string descriptions
df = replace_codes(df, mapping_dicts)

# Save string data file
df.to_csv('str_dataset.csv', index=False)
print("String dataset file saved")

Enter the path to the dataset CSV file: /Users/islambabaev/Google Drive/Programming/HARA Automation/usdb_analysis/combined_data.csv
Enter the path to the mapping CSV file: /Users/islambabaev/Google Drive/Programming/HARA Automation/usdb_analysis/mapping.csv


Loading file: 100%|██████████| 24/24 [00:00<00:00, 4302.59it/s]


Dataset filtering started
Dataset filtering completed


Replacing integers with strings:  10%|▉         | 2/21 [00:00<00:01, 16.74it/s]

Integer dataset file saved


Replacing integers with strings: 100%|██████████| 21/21 [00:00<00:00, 29.26it/s]


String dataset file saved
