In [None]:
!pip install transtab
import transtab

Collecting transtab
  Downloading transtab-0.0.5-py3-none-any.whl (29 kB)
Collecting loguru (from transtab)
  Downloading loguru-0.7.2-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting openml>=0.10.0 (from transtab)
  Downloading openml-0.14.2.tar.gz (144 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m144.5/144.5 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting liac-arff>=2.4.0 (from openml>=0.10.0->transtab)
  Downloading liac-arff-2.5.0.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting xmltodict (from openml>=0.10.0->transtab)
  Downloading xmltodict-0.13.0-py2.py3-none-any.whl (10.0 kB)
Collect



In [None]:
! pip install OpenAI



In [None]:
! pip install tiktoken



In [None]:
import openai
import tiktoken

In [None]:
! pip install sglang


Collecting sglang
  Using cached sglang-0.1.14-py3-none-any.whl (127 kB)
Installing collected packages: sglang
Successfully installed sglang-0.1.14


In [None]:
import sglang

In [None]:
!pip install openml



In [None]:
!pip install loguru



In [None]:
! pip install bigframes

Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets>=7.7.1->bigframes)
  Downloading jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jedi
Successfully installed jedi-0.19.1


In [None]:
import os
import pdb

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
import openml
from loguru import logger

from sglang import function, system, user, assistant, gen, set_default_backend, OpenAI
from collections import namedtuple


In [None]:
class AutomaticDataConfig:
    """Heuristically sorts the columns of a DataFrame into categorical, numerical and binary groups."""

    def __init__(self):
        pass

    def automatic_dataset_config(self, df):
        """
        Classify every column of `df` by its number of distinct values and dtype.

        Rules, applied per column in this order:
          * exactly two distinct non-null values equal to {0, 1} -> 'bin'
          * exactly two distinct non-null values otherwise       -> 'cat'
          * object dtype, or fewer than 10 distinct values       -> 'cat'
          * anything else                                        -> 'num'

        A message is printed for each column that contains missing values.

        @param df: pandas DataFrame to analyse
        @return: dict with keys 'cat', 'num' and 'bin', each a list of column names
        """
        categorical, numerical, binary = [], [], []

        for name in df.columns:
            series = df[name]
            missing = series.isnull().sum()
            distinct = series.nunique()

            if distinct == 2:
                # Only a genuine 0/1 indicator counts as binary; any other
                # two-valued column is treated as categorical.
                is_indicator = set(series.dropna().unique()) == {0, 1}
                bucket = binary if is_indicator else categorical
            elif series.dtype == 'object' or distinct < 10:
                bucket = categorical
            else:
                bucket = numerical
            bucket.append(name)

            if missing > 0:
                print(f"Column '{name}' has {missing} missing values.")

        return {'cat': categorical, 'num': numerical, 'bin': binary}


In [None]:
class dataCleaner:
    """Helpers for inspecting, type-casting and cleaning pandas DataFrames."""

    def __init__(self):
        # Stateless helper: there is nothing to initialise.
        pass

    def inspect_columns(self, data: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
        '''
        Inspect columns of a pandas dataframe.

        Rows are sampled with `iloc[0]`, `iloc[-1]` and `sample(1)`, so the
        frame is expected to contain at least one row.

        @param data: pandas DataFrame. The dataframe to inspect columns of
        @return numerical_result: pandas DataFrame. Contains aggregated metrics for all numerical columns
            * becomes None if there are no such columns
        @return object_result: pandas DataFrame. Contains aggregated metrics for all object (string) columns
            * becomes None if there are no such columns
        '''
        df = data.copy()

        df_numerical = df.select_dtypes(include=[np.number])
        if df_numerical.shape[1] > 0:
            numerical_result = pd.DataFrame({
                'unique_count': df_numerical.nunique(),
                'has_null': df_numerical.isna().any(),
                'null_pct': df_numerical.isnull().mean() * 100,
                'zero_count': (df_numerical == 0).sum(),
                'positive_pct': (df_numerical > 0).mean() * 100,
                'negative_pct': (df_numerical < 0).mean() * 100,
                'mean': df_numerical.mean(),
                'median': df_numerical.median(),
                'std_dev': df_numerical.std(),
                'skewness': df_numerical.skew(),
                'kurtosis': df_numerical.kurt(),
                'min': df_numerical.min(),
                'max': df_numerical.max(),
                '1st_quantile': df_numerical.quantile(0.25),
                '2nd_quantile': df_numerical.quantile(0.50),
                '3rd_quantile': df_numerical.quantile(0.75),
                '1st_row': df_numerical.iloc[0],
                'random_row': df_numerical.sample(1).iloc[0],
                'last_row': df_numerical.iloc[-1],
                'dtype': df_numerical.dtypes
            }).round(3)
        else:
            # Without this branch the name stayed unbound and the final
            # `return` raised UnboundLocalError for frames with no numeric columns.
            numerical_result = None

        df_object = df.select_dtypes(include=[object])
        if df_object.shape[1] > 0:
            object_result = pd.DataFrame({
                'unique_count': df_object.nunique(),
                'binary': df_object.nunique() == 2,
                'has_null': df_object.isna().any(),
                'null_pct': df_object.isnull().mean() * 100,
                'random_row': df_object.sample(1).iloc[0],
                'most_common_class': df_object.apply(lambda x: x.mode().iat[0]),
                'least_common_class': df_object.apply(lambda x: x.value_counts().idxmin()),
                '1st_row': df_object.iloc[0],
                'last_row': df_object.iloc[-1],
                'dtype': df_object.dtypes
            }).round(3)
        else:
            object_result = None

        return numerical_result, object_result

    def confirm_and_cast_data_types(self, df, dataset_config):
        '''
        Cast the columns of `df` according to `dataset_config`.

        Columns listed under 'cat' and 'bin' become pandas 'category' columns,
        columns listed under 'num' become 'float'. The casts are assigned back
        onto `df`, so the frame passed in is modified.

        @param df: pandas DataFrame whose columns are cast
        @param dataset_config: dict with the keys 'cat', 'num' and 'bin', each a list of column names
        @return df: the same DataFrame object, after casting
        @return transformed_columns: dict mapping each cast column name to 'category' or 'float'
        '''
        transformed_columns = {}

        for col in dataset_config['cat'] + dataset_config['bin']:
            df[col] = df[col].astype('category')
            transformed_columns[col] = 'category'

        for col in dataset_config['num']:
            df[col] = df[col].astype('float')
            transformed_columns[col] = 'float'

        return df, transformed_columns

    def cleanData(self,
                  data: pd.DataFrame,
                  drop_null: bool = True,
                  drop_dup: bool = True,
                  replace_column_names: dict = None,
                  filter: callable = None) -> pd.DataFrame:
        '''
        Apply the enabled cleaning steps in a fixed order: drop nulls, drop
        duplicates, custom filter, rename columns.

        @param data: the pandas dataframe to be cleaned
        @param drop_null: whether or not to drop null rows in the dataframe
        @param drop_dup: whether or not to drop duplicate rows in the dataframe
        @param replace_column_names: dictionary of key, value where key is the name to be replaced
            and value is the corresponding new column name; None or empty means no renaming
        @param filter: optional callable that receives the dataframe and returns the filtered
            dataframe; a non-callable value is ignored
        @return: pd.Dataframe containing cleaned data
        '''
        if drop_null:
            data = data.dropna()
        if drop_dup:
            data = data.drop_duplicates()
        if filter is not None and callable(filter):
            data = filter(data)
        if replace_column_names:
            data = data.rename(columns=replace_column_names)
        return data

In [None]:
import pandas as pd
import os

class DatasetTransformer:
    """Holds a tabular dataset and uses an LLM (through SGLang) to give its columns descriptive names."""

    def __init__(self, csv_url, dataset_description=""):
        """
        Initialize the DatasetTransformer with a dataset and optional dataset description.

        @param csv_url: either an already loaded pandas DataFrame, or a path / URL
            (str or os.PathLike) that is read with `pd.read_csv`
        @param dataset_description: free-text description of the dataset, passed to the
            model when column names are generated
        @raise RuntimeError: if the OPENAI_API_KEY environment variable is not set
        """
        self.csv_url = csv_url
        self.dataset_description = dataset_description

        if isinstance(csv_url, (str, os.PathLike)):
            self.data = pd.read_csv(csv_url)
        else:
            self.data = csv_url

        self.set_default_backend_for_SGLang()

    def set_default_backend_for_SGLang(self, model_name="gpt-3.5-turbo"):
        """
        Set up the environment for using SGLang for generating meaningful names.

        The API key is read from the OPENAI_API_KEY environment variable and is
        never stored in the notebook itself.

        @param model_name: name of the OpenAI model used as the default SGLang backend
        @raise RuntimeError: if OPENAI_API_KEY is missing or empty
        """
        if not os.environ.get('OPENAI_API_KEY'):
            raise RuntimeError(
                "OPENAI_API_KEY is not set. Export it in the environment "
                "before creating a DatasetTransformer."
            )
        set_default_backend(OpenAI(model_name))

    def show_head(self):
        """
        Display the first 5 rows of the dataset.
        """
        return self.data.head()

    def transform_column_names(self, generation_example):
        """
        Use SGLang to generate meaningful column names based on the dataset description.

        One generation is requested per column. The result replaces `self.data`
        with a renamed copy, so a DataFrame passed to the constructor keeps its
        original column names.

        @param generation_example: example text showing the model what a good
            column description looks like
        @return: dict mapping each original column name to its generated name
        """
        @function
        def multi_turn_inference(s, dataset_statement, generation_example, columns):
            s += system("You are a helpful assistant. Now you need to help me to understand the dataset "
                        "and transform column names to meaningful names with dataset description. "
                        + "The following is example of a medical dataset: "
                        + generation_example + "Now I will provide to you the data_description is: "
                        + dataset_statement)
            s += user("Now Create same mapping for the dataset.")
            for i, col in enumerate(columns):
                s += user(f'{col}:')
                s += assistant(gen(f'answer: {i}', max_tokens=50))

        original_columns = list(self.data.columns)
        state = multi_turn_inference.run(self.dataset_description, generation_example, original_columns)
        transformed_column_names = [state[f'answer: {i}'] for i in range(len(original_columns))]
        rename_dict = dict(zip(original_columns, transformed_column_names))
        self.data = self.data.rename(columns=rename_dict)
        return rename_dict

    def check_missing_values(self):
        """
        Check and print columns that contain NaN values.

        @return: list of the column names that contain at least one NaN (empty if none)
        """
        nan_counts = self.data.isna().sum()
        nan_columns = nan_counts[nan_counts > 0].index.tolist()
        if nan_columns:
            print('There are NaN values in columns:', nan_columns)
        return nan_columns

    def identify_categorical_columns(self, threshold=10):
        """
        Identify candidate categorical columns based on a threshold for unique values.

        @param threshold: a column is reported when it has fewer distinct values than this
        @return: dict mapping each such column name to the array of its distinct values
        """
        need_transform_columns = {}
        for col in self.data.columns:
            distinct_values = self.data[col].unique()
            if len(distinct_values) < threshold:
                need_transform_columns[col] = distinct_values
        return need_transform_columns

    def get_transformed_data(self):
        """
        Return the transformed dataset.
        """
        return self.data


In [None]:
def load_data(csv_file_path, dataset_config=None, encode_cat=False, seed=123,
              nrows=1000, target_col_index=9,
              dataset_description=None, generation_example=None):
    """
    Load a CSV file, clean it, rename its columns with DatasetTransformer and
    split it into train / validation / test sets.

    @param csv_file_path: path of the CSV file to read
    @param dataset_config: optional dict with the keys 'cat', 'num' and 'bin' listing the
        original column names of each type; detected with AutomaticDataConfig when None
    @param encode_cat: accepted for interface compatibility; not used by this function
    @param seed: random state used for both splits
    @param nrows: number of rows read from the CSV file
    @param target_col_index: position of the target column among the renamed columns
    @param dataset_description: dataset description given to the model; defaults to the
        diabetes dataset description
    @param generation_example: example column descriptions given to the model; defaults
        to the diabetes dataset examples
    @return: tuple (all_data_list, train_data_list, val_data_list, test_data_list,
        cat_cols, num_cols, bin_cols). Each data list holds one (X, y) pair. The column
        lists use the renamed feature names and never contain the target column.
    """
    if dataset_description is None:
        dataset_description = "The Diabetes Health Indicators Dataset contains healthcare statistics and lifestyle survey information about people in general along with their diagnosis of diabetes. The 35 features consist of some demographics, lab test results, and answers to survey questions for each patient. The target variable for classification is whether a patient has diabetes, is pre-diabetic, or healthy." # TODO
    if generation_example is None:
        generation_example = "GenHlth is the results of this question: of Would you say that in general your health is: scale 1-5 1 = excellent 2 = very good 3 = good 4 = fair 5 = poor \
                          MentHlth is the results of this question: Now thinking about your mental health, which includes stress, depression, and problems with emotions, for how many days during the past 30 days was your mental health not good? scale 1-30 days \
                          PhysHlth is the results of this question: Now thinking about your physical health, which includes physical illness and injury, for how many days during the past 30 days was your physical health not good? scale 1-30 days" # TODO

    cleaner = dataCleaner()
    dataframe = pd.read_csv(csv_file_path, nrows=nrows)

    # Clean the data
    cleaned_df = cleaner.cleanData(dataframe)

    # Detect column types only when the caller did not supply a configuration
    if dataset_config is None:
        dataset_config = AutomaticDataConfig().automatic_dataset_config(cleaned_df)

    # Cast data types based on the configuration
    cleaned_df, _ = cleaner.confirm_and_cast_data_types(cleaned_df, dataset_config)

    # Remember the names before renaming so the type lists can be translated afterwards
    original_columns = list(cleaned_df.columns)

    # Generate meaningful column names
    transformer = DatasetTransformer(cleaned_df, dataset_description=dataset_description)
    transformer.transform_column_names(generation_example)
    transformed_data = transformer.get_transformed_data()
    print(transformed_data.columns)

    # Renaming keeps the column order, so old and new names pair up by position
    rename_map = dict(zip(original_columns, transformed_data.columns))

    # Split the data into train, validation, and test sets
    target_col_name = transformed_data.columns[target_col_index]
    X = transformed_data.drop(columns=[target_col_name])
    y = transformed_data[target_col_name]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed, stratify=y)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=seed)

    def _feature_names(original_names):
        # Translate to the renamed columns, drop the target, de-duplicate, keep order.
        renamed = (rename_map.get(name, name) for name in original_names)
        return list(dict.fromkeys(name for name in renamed if name != target_col_name))

    all_data_list = [(X, y)]
    train_data_list = [(X_train, y_train)]
    val_data_list = [(X_val, y_val)]
    test_data_list = [(X_test, y_test)]

    # Return all the datasets and unique lists of column types
    return (
        all_data_list, train_data_list, val_data_list, test_data_list,
        _feature_names(dataset_config['cat']),
        _feature_names(dataset_config['num']),
        _feature_names(dataset_config['bin'])
    )

In [None]:
# Step 1: load "merged_final.csv" through load_data and unpack its seven return
# values: the full / train / validation / test data lists followed by the
# categorical, numerical and binary column-name lists.

allset, trainset, valset, testset, cat_cols, num_cols, bin_cols  = load_data("merged_final.csv")


Index(['ID: Unique identifier for each patient',
       'Gender: Gender of the patient (Male, Female)',
       'AGE: Age of the patient in years',
       'Hypertension: Whether the patient has been diagnosed with hypertension (Yes, No)',
       'Heart Disease: Whether the patient has been diagnosed with heart disease (Yes, No)',
       'Smoking History: The patient's history of smoking (Current smoker, Former smoker, Never smoked)',
       'BMI: Body Mass Index (BMI) calculated from the patient's height and weight readings',
       'HbA1c: Hemoglobin A1c (HbA1c) level of the patient, a measure of blood sugar control over the past 2-3 months',
       'Blood Glucose Level: Fasting blood glucose level of the patient',
       'Class: The health classification of the patient (Diabetes, Pre-diabetic, Healthy)',
       'Number of Patients: Total number of patients in the dataset',
       'Urea: Urea levels in the patient's blood, a measure of kidney function',
       'Cr: Creatinine levels in

In [None]:
# Step 2: build the transtab classifier from the three column-name lists
# returned by load_data in step 1.
model = transtab.build_classifier(cat_cols, num_cols, bin_cols)



In [None]:
# Keyword arguments forwarded to transtab.train.
training_arguments = dict(
    num_epoch=30,
    eval_metric='val_loss',
    eval_less_is_better=True,
    output_dir='./checkpoint',
    batch_size=128,
    lr=1e-4,
    weight_decay=1e-4,
)

# Train on the training split and evaluate on the validation split.
transtab.train(model, trainset, valset, **training_arguments)

# Persist the trained model.
model.save('./ckpt/pretrained')

Epoch:   0%|          | 0/30 [00:00<?, ?it/s]

epoch: 0, test val_loss: 0.277496
epoch: 0, train loss: 2.1608, lr: 0.000100, spent: 4.3 secs
epoch: 1, test val_loss: 0.229584
epoch: 1, train loss: 1.5983, lr: 0.000100, spent: 11.6 secs
epoch: 2, test val_loss: 0.227721
epoch: 2, train loss: 1.5541, lr: 0.000100, spent: 16.0 secs
epoch: 3, test val_loss: 0.227811
EarlyStopping counter: 1 out of 5
epoch: 3, train loss: 1.5750, lr: 0.000100, spent: 21.2 secs
epoch: 4, test val_loss: 0.227575
epoch: 4, train loss: 1.5369, lr: 0.000100, spent: 25.6 secs
epoch: 5, test val_loss: 0.229422
EarlyStopping counter: 1 out of 5
epoch: 5, train loss: 1.5229, lr: 0.000100, spent: 29.9 secs
epoch: 6, test val_loss: 0.232997
EarlyStopping counter: 2 out of 5
epoch: 6, train loss: 1.5300, lr: 0.000100, spent: 35.0 secs
epoch: 7, test val_loss: 0.236953
EarlyStopping counter: 3 out of 5
epoch: 7, train loss: 1.5594, lr: 0.000100, spent: 41.3 secs
epoch: 8, test val_loss: 0.239223
EarlyStopping counter: 4 out of 5
epoch: 8, train loss: 1.4998, lr: 0.0

[32m2024-04-06 18:47:02.430[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m136[0m - [1mload best at last from ./checkpoint[0m
[32m2024-04-06 18:47:02.449[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36msave_model[0m:[36m247[0m - [1msaving model checkpoint to ./checkpoint[0m


epoch: 9, test val_loss: 0.238302
EarlyStopping counter: 5 out of 5
early stopped


[32m2024-04-06 18:47:02.639[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m141[0m - [1mtraining complete, cost 51.2 secs.[0m


In [None]:
# The test split is a list holding one (features, labels) pair; unpack it and
# run the trained model on the held-out features.
x_test, y_test = testset[0]
ypred = transtab.predict(model, x_test, y_test)

In [None]:
# Score the test-set predictions against the held-out labels with the 'auc' metric.
transtab.evaluate(ypred, y_test, metric='auc')

[0.671875]