# Data prep: zero-one (min-max) normalization and regeneration of the original data from the saved mins and maxs.


## normalization

In [1]:
import pandas as pd

def zero_one_normalization(df):
    """
    Applies 0-1 (min-max) normalization to each column of a pandas DataFrame.

    Every column is rescaled as ``(x - min) / (max - min)``, so its smallest
    value maps to 0 and its largest to 1. A constant column (max == min) has a
    zero range; it is mapped to 0 everywhere instead of producing NaN from a
    0 / 0 division. The inverse ``normalized * (max - min) + min`` still
    recovers the original constant for such a column.

    Args:
        df: A pandas DataFrame whose columns support subtraction and division.

    Returns:
        A tuple ``(normalized_df, mins, maxs)``: the normalized DataFrame, a
        Series of per-column minimum values, and a Series of per-column
        maximum values.
    """

    # Per-column extremes; these are also returned so the scaling can be undone.
    mins = df.min()
    maxs = df.max()

    # Guard against division by zero: use 1 as the divisor wherever the range is 0.
    ranges = maxs - mins
    safe_ranges = ranges.where(ranges != 0, 1)

    # Apply 0-1 normalization
    normalized_df = (df - mins) / safe_ranges

    return normalized_df, mins, maxs

def regenerate_data(normalized_df, mins, maxs):
    """
    Undoes 0-1 normalization.

    Args:
        normalized_df: DataFrame of normalized values.
        mins: Per-column minimum values used for the normalization.
        maxs: Per-column maximum values used for the normalization.

    Returns:
        The DataFrame ``normalized_df * (maxs - mins) + mins``.
    """
    column_ranges = maxs - mins
    rescaled = normalized_df * column_ranges
    return rescaled + mins

In [2]:
# File locations used by the cells below.
input_csv = "cal_dataframe.csv"                  # raw data that gets read
normalized_csv = "cal_dataframe_normalized.csv"  # normalized data that gets written
mins_csv = "cal_dataframe_mins.csv"              # per-column minimums
maxs_csv = "cal_dataframe_maxs.csv"              # per-column maximums

In [None]:
# Load the raw (not yet normalized) data and display it.
df = pd.read_csv(filepath_or_buffer=input_csv)
df

In [None]:
# Normalize every column, then round to 4 decimals so each value has at most
# four fractional digits.
normalized_df, mins, maxs = zero_one_normalization(df)
normalized_df = normalized_df.round(decimals=4)

# Shuffle the rows. The fixed random_state makes the shuffle, and therefore the
# CSV written below, reproducible across "Restart & Run All".
normalized_df = normalized_df.sample(frac=1, random_state=42)


# Save the normalized data, plus the per-column mins and maxs needed to undo
# the normalization, to new CSV files.
# NOTE(review): with index=False the mins/maxs files contain the values only;
# the column labels held in the Series index are not written -- confirm that
# whatever reads these files relies on column order alone.
normalized_df.to_csv(normalized_csv, index=False)
mins.to_csv(mins_csv, index=False)
maxs.to_csv(maxs_csv, index=False)

normalized_df

In [None]:
# Verification: map the normalized values back to the original scale.
# normalized_df was rounded and shuffled above, so the result matches the raw
# data only approximately and in a different row order.
original_df = regenerate_data(normalized_df=normalized_df, mins=mins, maxs=maxs)
original_df


In [6]:
import csv

def csv_to_sentences(input_file, output_file):
    """
    Converts every data row of a numeric CSV file into one "sentence" row.

    A row holding the values v0, v1, ... is written as the single-cell row
    ``"Column 0: v0, Column 1: v1, ... "`` (note the trailing space). Columns
    are labelled by position, not by header name, and every value is written
    with exactly four decimals.

    Args:
        input_file (str): Path to a CSV file with a header row and numeric cells.
        output_file (str): Path of the CSV file to write: one sentence per row,
            no header.

    Raises:
        ValueError: If a cell cannot be parsed as a float.
    """

    # newline='' on both files is what the csv module expects for correct
    # line-ending handling.
    with open(input_file, 'r', newline='') as csvfile, open(output_file, 'w', newline='') as outfile:
        reader = csv.DictReader(csvfile)
        writer = csv.writer(outfile)

        for row in reader:
            # Fixed-point formatting always yields four decimals. Padding
            # str(round(x, 4)) to six characters only did so for values in
            # [0, 10): 12.5 became "12.500", -0.5 became "-0.500", and a value
            # printed in scientific notation was corrupted by the zero padding.
            sentence = ", ".join(
                f"Column {index}: {float(value):.4f}"
                for index, value in enumerate(row.values())
            ) + " "
            writer.writerow([sentence])



In [7]:
# Turn the normalized CSV into a file of "Column i: value" sentences.
output_csv = "cal_dataframe_result.csv"
csv_to_sentences(input_file=normalized_csv, output_file=output_csv)

In [8]:
def read_csv_into_list(filename):
    """
    Reads the first column of a CSV file into a list, skipping the header row.

    The first row is treated as a header and discarded. Blank lines are
    skipped, and an empty file yields an empty list.

    Args:
        filename (str): The path to the CSV file.

    Returns:
        list: The first field of every non-empty data row, in file order.
    """

    with open(filename, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile)
        # Skip the header row; the default keeps an empty file from raising
        # StopIteration.
        next(reader, None)
        # csv.reader yields [] for a blank line, which has no first field.
        return [row[0] for row in reader if row]

# Building a 4-digit-number-friendly tokenizer based on the `bert-base-uncased` model
https://huggingface.co/learn/nlp-course/chapter6/2

In [11]:
output_txt = "4_digit_integer_others.txt"

# Vocabulary entries, one per line of the output file:
#   "1000".."9999", then "0000".."0999" (zero-padded to four digits),
#   then the literal pieces that appear in the generated sentences.
# '[UNK]' is not part of the list.
vocab_entries = [str(i) for i in range(1000, int(1e4))]
vocab_entries += [str(i).zfill(4) for i in range(int(1e3))]
vocab_entries += ['Column', ':', ',', ' ', '.']

with open(output_txt, "w") as f:
    f.writelines(entry + "\n" for entry in vocab_entries)

# Keep the entries exactly as written. Reading the file back as a CSV with a
# header row dropped the first entry ("1000") and turned the "," line into an
# empty string, because the file is plain text with neither a header nor
# CSV-quoted fields.
data_list = list(vocab_entries)

def get_training_corpus(batch_size=500, corpus=None):
    """
    Yields the training corpus in consecutive batches.

    Args:
        batch_size (int): Maximum number of samples per batch. Defaults to 500.
        corpus (list | None): The samples to batch. When None, the notebook's
            module-level ``data_list`` is used.

    Yields:
        list: Consecutive slices of the corpus, each holding up to
        ``batch_size`` samples; the last one may be shorter.

    Raises:
        ValueError: If ``batch_size`` is not positive. Because this is a
            generator, the error surfaces when iteration starts.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    samples_source = data_list if corpus is None else corpus
    for start_idx in range(0, len(samples_source), batch_size):
        yield samples_source[start_idx : start_idx + batch_size]

In [None]:
# Load the stock tokenizer from a local copy of the bert-base-uncased
# checkpoint stored next to this notebook.
from transformers import AutoTokenizer

old_tokenizer = AutoTokenizer.from_pretrained("./bert-base-uncased")


In [None]:
# Pick rows 2 through 8 of the generated sentence file as tokenizer test
# inputs. The file has no header row; each row holds one sentence such as
# "Column 0: 3.2705, Column 1: 52.0, ... ".
example = pd.read_csv("cal_dataframe_result.csv", header=None).iloc[list(range(2, 9))]
print(example)

In [None]:
# Show how the stock tokenizer splits each example sentence.
for row_label, sentence_row in example.iterrows():
    print(old_tokenizer.tokenize(sentence_row[0]))


The tokenization above is bad: I need the numbers to be cleanly separated from the text. Let us train a new tokenizer.

Some other references that I don't follow:
https://github.com/huggingface/tokenizers/blob/main/bindings/python/examples/train_bert_wordpiece.py
https://discuss.huggingface.co/t/dealing-with-decimal-and-fractions/23377/2

In [None]:
from collections import Counter

# Build a second tokenizer from the same local checkpoint, register every line
# of the vocabulary file as an added token, then re-tokenize the example
# sentences to compare against the stock tokenizer.
vocab_file = "4_digit_integer_others.txt"
# NOTE(review): additional_tokens is not used anywhere in this cell.
additional_tokens = ['.', ' ', 'Column', ':']

# NOTE(review): strip() removes all surrounding whitespace, so a line that
# consists of a single space is read back as an empty string -- confirm this
# is intended.
with open(vocab_file, 'r') as f:
    vocab_list = [line.strip() for line in f]


# NOTE(review): vocab_file is forwarded to from_pretrained as a keyword
# argument -- confirm whether it overrides the checkpoint's own vocabulary or
# is ignored.
new_tokenizer = AutoTokenizer.from_pretrained("./bert-base-uncased", vocab_file=vocab_file)
######
# https://stackoverflow.com/questions/60914793/argument-never-split-not-working-on-bert-tokenizer
######

# Register every vocabulary line as an added token.
new_tokenizer.add_tokens((vocab_list))
# Presumably meant to keep the added tokens from being split further (see the
# link above) -- TODO confirm this attribute has an effect on the tokenizer
# class that AutoTokenizer returns.
new_tokenizer.never_split = vocab_list

# Print the tokens the new tokenizer produces for each example sentence.
for index, row in example.iterrows():
    tk = new_tokenizer.tokenize(row[0])
    print(tk)


In [None]:
# Save the customized tokenizer under "the_tokenizer_the" so it can be reloaded later.
new_tokenizer.save_pretrained("the_tokenizer_the")