## Preprocessing raw data
In this example, we preprocess an example tabular dataset. The dataset is the Wine Poland dataset, which contains information about wines on the Polish market.

In [None]:
# Set the current working directory and import packages
import os
from pathlib import Path
# Switch to the parent of the directory the notebook was started in. This has
# to run before the `configs` imports below -- presumably `configs` lives in
# that parent directory and is resolved through the working directory
# (TODO confirm against the repository layout).
os.chdir(Path().cwd().parent)

import json
import ast
import pandas as pd
import numpy as np
from configs.directory import config_directory
from configs.carte_datalist import carte_datalist

In [None]:
# Define necessary functions

def _drop_high_null(data, proportion=0.5):
    """Return ``data`` without the columns that are mostly missing.

    A column is removed when its number of null entries is strictly greater
    than ``int(len(data) * proportion)``. The input frame is left untouched.
    """
    missing_per_column = data.isnull().sum()
    max_missing = int(len(data) * proportion)
    too_sparse = missing_per_column[missing_per_column > max_missing].index
    return data.drop(columns=list(too_sparse))

def _drop_single_unique(data):
    """Return ``data`` without columns holding exactly one distinct value.

    Missing entries are not counted as a value, so a column that is entirely
    null (zero distinct values) is kept.
    """
    distinct_counts = data.nunique()
    constant_cols = distinct_counts[distinct_counts == 1].index.tolist()
    return data.drop(columns=constant_cols)

In [None]:
# basic info
task = "regression"
target_name = "price"
entity_name = "name"
repeated = False

# preprocess
# Keep only rows with a known target and renumber the index from zero.
data.dropna(subset=target_name, inplace=True)
data.reset_index(drop=True, inplace=True)
# The target is replaced by its base-10 logarithm.
data[target_name] = np.log10(data[target_name])
# Cast these columns to strings.
for column in ("vegan", "natural", "vintage"):
    data[column] = data[column].astype(str)
# Keep the first four characters of the vintage. Entries that were missing
# became the string "nan" in the cast above; turn them back into real NaN.
vintage = data["vintage"].str[:4]
data["vintage"] = vintage.mask(vintage == "nan", np.nan)
# Scale the volume by a factor of 1000.
data["volume"] = data["volume"].mul(1000)
# Remove mostly-missing columns first, then single-valued ones.
data = _drop_single_unique(_drop_high_null(data))


In [None]:
# basic info
task = "classification"
target_name = "popularity"
entity_name = "track"
repeated = False

# preprocess
# Keep only rows with a known target and renumber the index from zero.
data.dropna(subset=target_name, inplace=True)
data.reset_index(drop=True, inplace=True)
# Remove mostly-missing columns first, then single-valued ones.
data = _drop_single_unique(_drop_high_null(data))
data.drop(columns=["uri"], inplace=True)
# Cast these columns to strings.
for column in ("time_signature", "sections", "key"):
    data[column] = data[column].astype("str")
data["duration_ms"] = data["duration_ms"].astype("float")
# Replace the 1/0 mode codes with labels; any other value maps to NaN.
mode_labels = {1: "Major", 0: "Minor"}
data["mode"] = data["mode"].map(mode_labels)