In [1]:
import re, gc, pickle, json
from typing import List, Tuple, Dict, Callable, Any, Optional, Union

import torch
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as matp
import matplotlib.gridspec as gridspec
from tqdm.auto import tqdm
from transformers import AutoTokenizer
from wordcloud import WordCloud, STOPWORDS

import eda
import configuration as configuration
from preprocessing import jsonl_to_json, load_all_types_dataset, stratified_kfold, check_null, null2str, sequence_length
%matplotlib inline

In [3]:
""" LCS Test """


gen_ngram = ['i', 'am', 'a', 'girl']
ref_ngram = ['i', 'am', 'a', 'girl']

def cal_longest_common_sequence(gen: Optional[List[str]] = None, ref: Optional[List[str]] = None) -> int:
    """ Length of the longest common subsequence (tokens in the same order, not
    necessarily adjacent) between a generated and a reference token list.

    Args:
        gen: generated tokens; when omitted, the module-level `gen_ngram` is used
        ref: reference tokens; when omitted, the module-level `ref_ngram` is used

    Returns:
        number of tokens in the longest common subsequence, 0 if either list is empty
    """
    # Fall back to the sample lists above so the original zero-argument call keeps working.
    gen = gen_ngram if gen is None else gen
    ref = ref_ngram if ref is None else ref

    rows, cols = len(gen) + 1, len(ref) + 1

    # dp[y][x] holds the answer for gen[:y] vs ref[:x]; row 0 / column 0 are the empty prefixes.
    dp = [[0] * cols for _ in range(rows)]
    for y in range(1, rows):
        for x in range(1, cols):
            if gen[y-1] == ref[x-1]:
                dp[y][x] = dp[y-1][x-1] + 1
            else:
                dp[y][x] = max(dp[y-1][x], dp[y][x-1])

    # Table values never decrease along a row or column, so the last cell is the maximum.
    return dp[-1][-1]

cal_longest_common_sequence()

In [2]:
# Read the CSV at the path below into `df`; the bare `df` on the last line makes the notebook display it.
df = pd.read_csv('./data_folder/commerce/amazon/meta_grocery_gourmet_food_asin_db.csv')
df

In [6]:
# Number of rows per distinct value of the `sub_category` column.
df.sub_category.value_counts()

In [2]:
""" Merge Two different categories of dataset in amazon review data """

# Load the two category files, then tag every row with the category it came from.
beauty_df, fashion_df = load_all_types_dataset('./data_folder/amazon_review/beauty.json'), load_all_types_dataset('./data_folder/amazon_review/fashion.json')
beauty_df['domain'], fashion_df['domain'] = 'beauty', 'fashion'
# Stack both frames row-wise and renumber the index from 0.
df = pd.concat([beauty_df, fashion_df], axis=0).reset_index(drop=True)
# stratified_kfold is imported from the project's `preprocessing` module;
# presumably it assigns folds stratified on the 'rating' column - TODO confirm there.
df = stratified_kfold(df, 'rating', configuration.CFG)
# Write the result to train.csv in the current working directory, without the index column.
df.to_csv('train.csv', index=False, encoding='utf-8')

In [2]:
# Load train.csv through the extension-dispatching loader and display the frame.
df = load_all_types_dataset('./data_folder/commerce/amazon/train.csv')
df

In [3]:
""" Null Checker Function """

# null2str is imported from the project's `preprocessing` module;
# presumably it replaces null entries with strings - TODO confirm there.
df = null2str(df)
# The value returned by check_null is shown as the cell output.
check_null(df)

In [1]:
# Store the per-row result of sequence_length for the 'text' column in a new 'length' column.
df['length'] = sequence_length(configuration.CFG, df['text'])

In [2]:
""" configuration for preprocessing and eda """


class CFG:
    # Notebook-local settings holder: every value is a class attribute, so the class
    # itself is passed around without being instantiated (e.g. `sequence_length(CFG, ...)`).
    seed = 42  # not referenced by any code visible in this notebook
    tokenizer_name = 'microsoft/deberta-v3-large'  # later, remove this line
    # Evaluated while the class body executes, i.e. the tokenizer is loaded as soon as this cell runs.
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    max_len = 512  # passed to encode_plus as max_length by tokenizing()
    split_ratio = 0.2  # not referenced by any code visible in this notebook
    n_folds = 5  # not referenced by any code visible in this notebook

In [3]:
""" Helper Function for preprocessing """

def group_texts(cfg: configuration.CFG, sequences: Dict) -> Dict:
    """ Concatenate every feature's token lists and re-split them into fixed-size chunks.
    Intended for use with a HF Dataset object (batched map), to deal with instances
    that are longer than the maximum input length of the model.

    Steps:
        1) Flatten the list of lists stored under each key into one long list
        2) If the total is at least one chunk long, drop the remainder that does not fill a chunk
           (a total shorter than one chunk is kept as a single short chunk)
        3) Slice every flattened list into consecutive chunks of the chunk size

    Args:
        cfg: configuration object; the chunk size is read from `cfg.max_seq`, falling back to
             `cfg.max_len` when `max_seq` is absent (the CFG class defined in this notebook and
             `tokenizing` only use `max_len`)
        sequences: mapping of feature name -> list of token lists; the total length is measured
                   on the first key

    Returns:
        mapping with the same keys, each holding a list of chunks; {} for an empty mapping
    """
    if not sequences:
        return {}

    chunk_size = getattr(cfg, 'max_seq', None)
    if chunk_size is None:
        chunk_size = cfg.max_len

    # Flatten in a single pass; `sum(lists, [])` copies the accumulator on every addition.
    flattened = {
        key: [token for seq in seqs for token in seq]
        for key, seqs in sequences.items()
    }
    total_length = len(next(iter(flattened.values())))
    if total_length >= chunk_size:
        total_length = (total_length // chunk_size) * chunk_size
    return {
        key: [tokens[start: start + chunk_size] for start in range(0, total_length, chunk_size)]
        for key, tokens in flattened.items()
    }

def tokenizing(cfg: configuration.CFG, text: str, padding: Union[bool, str] = 'max_length') -> Any:
    """ Preprocess text for LLM Input, for common batch system

    The text is encoded without special tokens and without truncation, so the result
    can be longer than `cfg.max_len`; shorter inputs are padded according to `padding`.

    Args:
        cfg: configuration.CFG, must expose `tokenizer` (Huggingface tokenizer) and `max_len`
        text: text from dataframe or any other dataset, please pass str type
        padding: padding option forwarded to the tokenizer, default 'max_length';
                 pass False if you want to use smart batching

    Returns:
        the tokenizer's output object with every field converted to a torch.Tensor
    """
    inputs = cfg.tokenizer.encode_plus(
        text,
        max_length=cfg.max_len,
        padding=padding,
        truncation=False,
        return_tensors=None,
        add_special_tokens=False,  # later, we will add ourselves
    )
    # Values are replaced in place so the tokenizer's own container type is what gets returned.
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v)
    return inputs

def adjust_sequences(sequences: List, max_len: int):
    """ Shrink a group of token sequences so that together they fit into `max_len`,
    similar to the dynamic padding concept.

    While the combined length exceeds `max_len`, one token at a time is taken away from the
    currently longest sequence (the first one on ties). Once any cutting was needed, every
    sequence is sliced to its remaining budget minus one, i.e. each sequence gives up one
    extra position compared to the referenced original implementation.

    Args:
        sequences: list of tokenized sequences (input_ids), e.g.
                   [[1,2,3,4,5,6], [1,2,3,4,5,6], ... , [1,2,3,4,5]]
        max_len: total number of tokens allowed across all sequences

    Returns:
        (sequences, length_of_seqs). When nothing has to be cut, the inputs are returned
        untouched together with their lengths. Otherwise `length_of_seqs` holds the per-sequence
        budgets, and each returned sequence is one token shorter than its budget (never negative).

    Reference:
         https://github.com/louis-she/ai4code/blob/master/ai4code/utils.py#L70
    """
    length_of_seqs = [len(seq) for seq in sequences]
    cut_off = sum(length_of_seqs) - max_len
    if cut_off <= 0:
        return sequences, length_of_seqs

    for _ in range(cut_off):
        max_index = length_of_seqs.index(max(length_of_seqs))
        length_of_seqs[max_index] -= 1

    # Clamp at 0: a budget of 0 would otherwise become the slice [:-1], which keeps
    # almost the whole sequence instead of emptying it.
    sequences = [seq[:max(budget - 1, 0)] for seq, budget in zip(sequences, length_of_seqs)]
    return sequences, length_of_seqs

def subsequent_tokenizing(cfg: configuration.CFG, text: str) -> Any:
    """ Encode a text into raw token ids with no padding, no truncation and no special tokens.
    The untouched ids are meant to be shortened afterwards (see adjust_sequences) so that
    several texts can share one model input.

    Args:
        cfg: configuration.CFG, must expose `tokenizer` (Huggingface tokenizer)
        text: text from dataframe or any other dataset, please pass str type

    Returns:
        the `input_ids` entry of the tokenizer output

    Reference:
        https://www.kaggle.com/competitions/AI4Code/discussion/343714
        https://github.com/louis-she/ai4code/blob/master/tests/test_utils.py#L6
    """
    encode_options = dict(
        padding=False,
        truncation=False,
        return_tensors=None,
        add_special_tokens=False,  # special tokens are not wanted on a sub-sequence
    )
    encoded = cfg.tokenizer.encode_plus(text, **encode_options)
    return encoded['input_ids']


def find_index(x: np.ndarray, value: np.ndarray) -> int:
    """ Find the position of the first element of `x` that equals `value`.

    Args:
        x: array holding all elements to search through
        value: element whose index is wanted

    Returns:
        index (along the first axis) of the first match, as a plain Python int

    Raises:
        ValueError: if no element of `x` equals `value`
    """
    matches = np.where(np.asarray(x) == value)[0]
    if matches.size == 0:
        raise ValueError(f'{value!r} was not found in the given array')
    # Index into the match array first: calling int() on the array itself only works for
    # exactly one match and is deprecated by NumPy for arrays with ndim > 0.
    return int(matches[0])


def subsequent_decode(cfg: configuration.CFG, token_list: List) -> Any:
    """ Turn token ids back into text with the configured tokenizer, e.g. to build
    a prompt from the output of subsequent_tokenizing & adjust_sequences.

    Args:
        cfg: configuration.CFG, must expose `tokenizer` (Huggingface tokenizer)
        token_list: token ids to decode

    Returns:
        whatever `cfg.tokenizer.decode` returns for `token_list`
    """
    return cfg.tokenizer.decode(token_list)


def sequence_length(cfg: configuration.CFG, text_list: List) -> List:
    """ Count the tokens of every text, for inspecting length statistics.

    Each text is run through `tokenizing`; the number of 1-entries in its attention
    mask is recorded as a plain Python number. Progress is shown with tqdm.

    Args:
        cfg: configuration forwarded to `tokenizing`
        text_list: iterable of texts

    Returns:
        list with one token count per text, in input order
    """
    return [
        torch.eq(tokenizing(cfg, sample)['attention_mask'], 1).sum().item()  # scalar, not tensor
        for sample in tqdm(text_list)
    ]

def check_null(df: pd.DataFrame) -> pd.Series:
    """ Count the null entries in every column of the dataframe.

    Returns:
        Series indexed by column name holding the number of nulls in that column
    """
    null_mask = df.isnull()
    null_counts = null_mask.sum()
    return null_counts

def no_char(text):
    """ Remove stand-alone single ASCII letters from the text.

    Three passes, each replacing the match with one space:
        1) a single letter surrounded by whitespace
        2) a single letter at the very start of the text, followed by whitespace
        3) a single letter at the very end of the text, preceded by whitespace

    Args:
        text: input string

    Returns:
        the text with those single letters replaced by a space
    """
    text = re.sub(r"\s+[a-zA-Z]\s+", " ", text)
    # `^` must stay unescaped to anchor at the start of the text; the previous `\^`
    # only matched a literal caret character, so leading letters were never removed.
    text = re.sub(r"^[a-zA-Z]\s+", " ", text)
    text = re.sub(r"\s+[a-zA-Z]$", " ", text)
    return text


def no_multi_spaces(text):
    """ Collapse every run of whitespace (spaces, tabs, newlines) into a single space.

    Args:
        text: input string

    Returns:
        the text with each whitespace run replaced by one space
    """
    # The former `flags=re.I` was dropped: case-insensitivity has no effect on `\s+`.
    return re.sub(r"\s+", " ", text)


def underscore_to_space(text: str):
    """ Replace every underscore and every hyphen in the text with a space.

    Args:
        text: input string

    Returns:
        the text with `_` and `-` turned into spaces
    """
    for separator in ("_", "-"):
        text = text.replace(separator, " ")
    return text


def preprocess_text(source):
    """ Replace special characters with spaces and lower-case the text.

    The input is converted with str() first. Every non-word character becomes a space,
    then a lower-case `b` followed by whitespace at the very start is removed
    (presumably the prefix left over from stringified bytes - confirm), and finally
    the text is lower-cased.
    """
    cleaned = str(source)
    substitutions = (
        (r'\W', ' '),
        (r'^b\s+', ''),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.lower()


def cleaning_words(text: str) -> str:
    """ Run the full cleaning pipeline over one text.

    Order matters: separators to spaces, single letters removed, special characters
    stripped and text lower-cased, then whitespace runs collapsed.
    """
    cleaning_steps = (underscore_to_space, no_char, preprocess_text, no_multi_spaces)
    cleaned = text
    for step in cleaning_steps:
        cleaned = step(cleaned)
    return cleaned


def load_pkl(filepath: str) -> Any:
    """ Read a pickle file and return the object stored in it.

    Examples:
        filepath = './dataset_class/data_folder/train.pkl'
    """
    # NOTE: unpickling can execute arbitrary code - only use on files you trust.
    with open(f'{filepath}', 'rb') as handle:
        return pickle.load(handle)


def load_json(filepath: str) -> pd.DataFrame:
    """ Read a json file into a dataframe via pandas.

    Examples:
        filepath = './dataset_class/data_folder/train.json'
    """
    return pd.read_json(filepath)


def load_parquet(filepath: str) -> pd.DataFrame:
    """ Read a parquet file into a dataframe via pandas.

    Examples:
        filepath = './dataset_class/data_folder/train.parquet'
    """
    return pd.read_parquet(filepath)


def load_csv(filepath: str) -> pd.DataFrame:
    """ Read a csv file into a dataframe via pandas.

    Examples:
        filepath = './dataset_class/data_folder/train.csv'
    """
    return pd.read_csv(filepath)


def load_all_types_dataset(path: str) -> pd.DataFrame:
    """ Load a single dataset file, picking the loader from the file extension.

    Args:
        path: path of the file in your local directory

    Returns:
        the loaded data: a dataframe for json, csv and parquet files, and whatever
        object was pickled for pkl files

    Raises:
        ValueError: if the extension is none of pkl, json, parquet, csv
                    (previously None was returned silently in that case)

    Examples:
        load_all_types_dataset('./data_folder/squad2/train.json')
        load_all_types_dataset('./data_folder/yahoo_qa/test.csv')
        load_all_types_dataset('./data_folder/yelp_review/train_0.parquet')
    """
    # Text after the last dot, lower-cased so that e.g. 'train.CSV' is recognised as well.
    file_type = path.rsplit('.', 1)[-1].lower()
    if file_type == 'pkl':
        return load_pkl(path)
    if file_type == 'json':
        return load_json(path)
    if file_type == 'parquet':
        return load_parquet(path)
    if file_type == 'csv':
        return load_csv(path)

    raise ValueError(
        f'Cannot infer a supported file type from path: {path!r} '
        '(expected one of .pkl, .json, .parquet, .csv)'
    )

In [17]:
""" Load Dataset """

# Path pieces: root folder, dataset sub-folder, and file name inside it.
DATA_PATH = './data_folder/'
DATA_NAME = 'yelp_review'
TYPE = 'train.parquet'

# Resolves to './data_folder/yelp_review/train.parquet'; the loader is chosen from the extension.
df = load_all_types_dataset(f'{DATA_PATH}{DATA_NAME}/{TYPE}')
df

In [18]:
""" cleaning text data in dataframe """

# Run every entry of the 'text' column through cleaning_words and overwrite the column.
df['text'] = df['text'].apply(cleaning_words)
# Display the frame together with the cleaned text stored under index label 1 as a spot check.
df, df.text[1]

In [47]:
""" Amazon Review Dataset """

""" Load Dataset """

# Path pieces: root folder, dataset sub-folder, and file name inside it.
DATA_PATH = './data_folder/'
DATA_NAME = 'amazon_review'
TYPE = 'beauty.json'

# Resolves to './data_folder/amazon_review/beauty.json'; the loader is chosen from the extension.
df = load_all_types_dataset(f'{DATA_PATH}{DATA_NAME}/{TYPE}')
df

In [27]:
""" calculate length of each text data, check statistics value of train text data """

# Token count of every text, computed with the notebook-local CFG.
lengths = sequence_length(CFG, df['text'])

# One (format template, value) pair per reported statistic, printed in this order.
length_stats = (
    ('Max Length of Sentence : {}', np.max(lengths)),
    ('Min Length of Sentence : {}', np.min(lengths)),
    ('Mean Length of Sentence : {:.2f}', np.mean(lengths)),
    ('Std Length of Sentence : {:.2f}', np.std(lengths)),
    ('Median Length of Sentence : {}', np.median(lengths)),
    ('Q1 Length of Sentence : {}', np.percentile(lengths, 25)),
    ('Q3 Length of Sentence : {}', np.percentile(lengths, 75)),
)

print('------------- Length Statistic Info -------------')
for template, stat_value in length_stats:
    print(template.format(stat_value))

In [28]:
""" Box plot of length of text data"""

# Switch seaborn to its 'dark' style and open a new 15x10 inch figure for the plot below.
sns.set_style(style='dark')
plt.figure(figsize=(15,10))

# One box for the token counts in `lengths`, labelled 'count'; showmeans=True also marks the mean.
plt.boxplot(lengths, labels=['count'], showmeans=True) 

In [29]:
""" log scale hist plot """

# Switch seaborn to its 'dark' style and open a new 15x10 inch figure for the plot below.
sns.set_style(style='dark')
plt.figure(figsize=(15,10))

# 30-bin histogram of the token counts; the y axis is log-scaled so sparsely filled bins stay visible.
plt.hist(lengths, bins=30, alpha=0.5, color='blue', label='tokens')
plt.yscale('log')
plt.title("Log-Histplot of Text length", fontsize=20)
plt.xlabel("length of tokens", fontsize=16)
plt.ylabel("number of texts", fontsize=16)

In [31]:
""" word cloud plot """

# Join every entry of df['text'] into one space-separated string and build an 800x600 word cloud from it.
cloud = WordCloud(width=800, height=600).generate(" ".join(df['text']))
plt.figure(figsize=(15,10))
# Render the cloud as an image and hide the axes.
plt.imshow(cloud)
plt.axis('off') 

In [33]:
""" plot for rating distribution """

# Number of rows per distinct value of the 'label' column; displayed as the cell output.
rating_count = df['label'].value_counts()
rating_count

In [35]:
""" co-relation between length of review and rating """

# Build a two-column frame (label, token length) and compute its correlation matrix.
# `lengths` must come from the same `df`, in the same row order, for the pairing to be meaningful.
labels = df.label.to_list()
corr_df = pd.DataFrame()
corr_df['label'], corr_df['length'] = labels, lengths
measure_corr = corr_df.corr() 

# Annotated heatmap of the correlation matrix, coefficients shown with two decimals.
plt.figure(figsize=(10,10))
sns.heatmap(
    measure_corr, 
    xticklabels=measure_corr.columns, 
    yticklabels=measure_corr.columns, 
    square=True,
    annot=True, 
    cmap="coolwarm",
    fmt=".2f"
)

In [48]:
def jsonl_to_json(jsonl_file: str, json_file: str) -> None:
    """ Convert jsonl file to json file

    Every non-blank line of the input is parsed as one JSON value; the values are
    written to the output file as a single JSON array (UTF-8, indented by 4 spaces,
    non-ASCII characters kept as-is).

    Args:
        jsonl_file: input jsonl file path
        json_file: output json file path, which is converted from jsonl file

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON

    Examples:
        jsonl_to_json('./data_folder/amazon_review/beauty.jsonl', './data_folder/amazon_review/beauty.json')

    """
    with open(jsonl_file, 'r', encoding='utf-8') as f:
        # Iterate the file lazily and skip blank lines (e.g. a trailing empty line),
        # which would otherwise make json.loads fail.
        json_data = [json.loads(line) for line in f if line.strip()]

    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(json_data, f, ensure_ascii=False, indent=4)

# Convert the beauty reviews from JSON Lines into one JSON array file next to the source.
jsonl_to_json('./data_folder/amazon_review/beauty.jsonl', './data_folder/amazon_review/beauty.json')

In [4]:
""" Load Amazon Review Dataset """

# Path pieces: root folder, dataset sub-folder, and file name inside it.
DATA_PATH = './data_folder/'
DATA_NAME = 'amazon_review'
TYPE = 'beauty.json'

# Resolves to './data_folder/amazon_review/beauty.json'; the loader is chosen from the extension.
df = load_all_types_dataset(f'{DATA_PATH}{DATA_NAME}/{TYPE}')
df

In [11]:
""" calculate length of each text data, check statistics value of train text data """

# Token count of every text, computed with the notebook-local CFG.
lengths = sequence_length(CFG, df['text'])
# Helper from the project's `eda` module; presumably prints the same summary
# statistics as the manual print cell earlier in this notebook - TODO confirm.
eda.print_length_stats_of_text(lengths)

In [12]:
""" Box plot of length of text data """

# Helper from the project's `eda` module; presumably the packaged version of the manual box plot cell - TODO confirm.
eda.token_length_box_plot(lengths)

In [13]:
""" log scale hist plot """

# Helper from the project's `eda` module; presumably the packaged version of the manual log-scale histogram cell - TODO confirm.
eda.log_scale_token_length_plot(lengths)

We need to support longer input sequences, because every model has a limited maximum input length
(e.g. LLaMA, DeBERTa, BigBird, Longformer, and so on).