In [1]:
import torch
import re, gc, glob, io, tokenize, markdown
import sentencepiece
import numpy as np
import pandas as pd
from transformers import AutoModel, AutoConfig, AutoTokenizer
from tqdm.auto import tqdm

In [2]:
class CFG:
    model = 'microsoft/deberta-v3-large'
    tokenizer = AutoTokenizer.from_pretrained(model)
    max_len = 4098

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
""" Dataset Utils Function """

def markdown_to_text(markdown_string: str) -> str:
    """
    Converts a markdown string to plaintext by beautifulsoup
    md -> html -> string
    Args:
        markdown_string: str, markdown string
    Example:
        markdown_to_text(md.loc['63a93277', 'source'])
        => md == pd.DataFrame filtered by cell_type == 'markdown'
    Reference:
        https://gist.github.com/lorey/eb15a7f3338f959a78cc3661fbc255fe
    """
    try:
        html = markdown.markdown(markdown_string)
        html = re.sub(r'<pre>(.*?)</pre>', ' ', html)  # remove code snippets
        html = re.sub(r'<code>(.*?)</code >', ' ', html)  # remove code snippets
        soup = BeautifulSoup(html, "html.parser")  # extract text
        text = ''.join(soup.findAll(text=True)).strip()  # extract text
        if len(text) == 0:
            text = markdown_string
            if text[0] == "!" and text[1] == "[":
                for m in range(2, len(text)):
                    if text[m] == "]":
                        text = 'embedded ' + text[2:m] + ' image'
                        break
            elif '<img src' in markdown_string or '.png' in markdown_string or 'gif' in markdown_string or '.jpg' in markdown_string:
                text = 'embedded image'
    except:
        text = markdown_string
    return text

def code_tokenizer(code: str) -> str:
    """
    Tokenize code text by python built-in tokenizer for code scanning
    Args:
        code: str, code text
    Example:
        code = code.loc['3a6623e3','source']
        code_text = tokenize.generate_tokens(io.StringIO(code).readline)
        ' '.join([tok.string for tok in code_text if tok.type==1 or tok.type==2 or tok.type==3 or tok.type==60])
    Reference:
        https://docs.python.org/3/library/tokenize.html
        https://www.kaggle.com/code/haithamaliryan/ai4code-extract-all-functions-variables-names/notebook
    """
    try:
        code_text = tokenize.generate_tokens(io.StringIO(code).readline)
        code_str = ' '.join([tok.string for tok in code_text if tok.type == 1 or tok.type == 2 or tok.type == 3 or tok.type == 60])
        if len(code_str) == 0:
            code_str = "unknown"
    except:
        code_str = code
    return code_str

def tokenizing(cfg: CFG, text: str) -> any:
    """
    Preprocess text for CLIP
    Args:
        cfg: configuration.CFG, needed to load tokenizer from Huggingface AutoTokenizer
        text: text from dataframe or any other dataset, please pass str type
    """
    inputs = cfg.tokenizer(
        text,
        max_length=cfg.max_len,
        padding='max_length',
        truncation=False,
        return_tensors=None,
        add_special_tokens=True,
    )
    for k, v in inputs.items():
        inputs[k] = torch.as_tensor(v)
    return inputs

def sequence_length(cfg: CFG, text_list: list) -> list:
    """ Get sequence length of all text data for checking statistics value """
    length_list = []
    for text in tqdm(text_list):
        tmp_text = tokenizing(cfg, text)['attention_mask']
        length_list.append(tmp_text[tmp_text == 1].shape[0])
    return length_list

def group_kfold(df: pd.DataFrame, cfg: CFG) -> pd.DataFrame:
    """ GroupKFold """
    fold = GroupKFold(
        n_splits=cfg.n_folds,
    )
    df['fold'] = -1
    for num, (tx, vx) in enumerate(fold.split(X=df, y=df['pct_rank'], groups=df['ancestor_id'])):
        df.loc[vx, "fold"] = int(num)
    return df

def check_null(df: pd.DataFrame) -> pd.Series:
    """ check if input dataframe has null type object...etc """
    return df.isnull().sum()

def drop_columns(df: pd.DataFrame, drop_list: list[int]) -> pd.DataFrame:
    """
    drop columns by drop_list
    Args:
        drop_list: list type, element is column index in dataframe which do you want to drop
    example:
        drop_list = [0, 1, 2]
    """
    tmp_index = [i for i in range(len(df.columns))]
    for idx in drop_list.sort():
        tmp_index.remove(idx)
    pass

In [15]:
"""
ancestor_id, parent_id 모두 지워도 될 것 같다. 어차피 test dataset에는 등장 안해서 context로 사용할 수 없음
"""
df = pd.read_csv(
    'all_train_df.csv',
    keep_default_na=False
)
df

  df = pd.read_csv(


Unnamed: 0,id,cell_id,cell_type,source,rank,pct_rank,fold
0,00001756c60be8,1862f0a6,code,# This Python 3 environment comes with many he...,0,0.0,0
1,00001756c60be8,2a9e43d6,code,import numpy as np\nimport pandas as pd\nimpor...,2,0.034483,0
2,00001756c60be8,038b763d,code,import warnings\nwarnings.filterwarnings('igno...,4,0.068966,0
3,00001756c60be8,2eefe0ef,code,matplotlib.rcParams.update({'font.size': 14}),6,0.103448,0
4,00001756c60be8,0beab1cd,code,"def evaluate_preds(train_true_values, train_pr...",8,0.137931,0
...,...,...,...,...,...,...,...
6370645,fffe1d764579d5,0d770d6b,markdown,## REMOVING THE OUTLIERS,43,0.597222,4
6370646,fffe1d764579d5,d45ddc62,markdown,### DIMENSIONALITY CURSE,33,0.458333,4
6370647,fffe1d764579d5,1a63248d,markdown,# BANGALORE HOUSE PRICE PREDICTION,0,0.0,4
6370648,fffe1d764579d5,a8ffc8b4,markdown,* We have achieved 75.2% accuracy in predictin...,69,0.958333,4


In [16]:
df.isnull().sum()

id           0
cell_id      0
cell_type    0
source       0
rank         0
pct_rank     0
fold         0
dtype: int64

In [12]:
# def tokenizing(text: str) -> dict:
#     inputs = tokenizer.encode_plus(
#         text,
#         max_length=512,
#         padding='max_length',
#         truncation=True,
#         return_tensors=None,
#         add_special_tokens=False,
#     )
#     return inputs

# def sequence_length(text_list: list) -> list:
#     length_list = []
#     for text in text_list:
#         tmp_text = tokenizing(text)['attention_mask']
#         length_list.append(tmp_text.count(1))
#     return length_list

In [17]:
"""
1) Apply markdown_to_text, code_tokenizer (o)
2) Check sequence length (issue, too long) 
3) Add Rank Feature, ancestor_id (o)
    => kaggle notebook에서 만들어서 업로드
4) Apply Cross Validation (o)
    => drop ancestor, parent feature 
5) Convert DataFrame shape to dictionary shape (issue)
    - A: 0, A: 1, A: 2..... 
    - A: [0, 1, 2, 3, ....]
    - groupby function 사용해서 리스트 형태로 묶어주기
"""
""" Apply code & markdown tokenizing by custom function """
for i in tqdm(range(len(df))):
    if df.iloc[i, 2] == 'markdown':
        df.iloc[i, 3] = markdown_to_text(df.iloc[i, 3])
    else:
        df.iloc[i, 3] = code_tokenizer(df.iloc[i, 3])
df

  0%|          | 0/6370650 [00:00<?, ?it/s]

Unnamed: 0,id,cell_id,cell_type,source,rank,pct_rank,fold
0,00001756c60be8,1862f0a6,code,# This Python 3 environment comes with many he...,0,0.0,0
1,00001756c60be8,2a9e43d6,code,import numpy as np import pandas as pd import ...,2,0.034483,0
2,00001756c60be8,038b763d,code,import warnings warnings filterwarnings 'ignore',4,0.068966,0
3,00001756c60be8,2eefe0ef,code,matplotlib rcParams update 'font.size' 14,6,0.103448,0
4,00001756c60be8,0beab1cd,code,def evaluate_preds train_true_values train_pre...,8,0.137931,0
...,...,...,...,...,...,...,...
6370645,fffe1d764579d5,0d770d6b,markdown,## REMOVING THE OUTLIERS,43,0.597222,4
6370646,fffe1d764579d5,d45ddc62,markdown,### DIMENSIONALITY CURSE,33,0.458333,4
6370647,fffe1d764579d5,1a63248d,markdown,# BANGALORE HOUSE PRICE PREDICTION,0,0.0,4
6370648,fffe1d764579d5,a8ffc8b4,markdown,* We have achieved 75.2% accuracy in predictin...,69,0.958333,4


In [4]:
df = pd.read_csv(
    'tmp_final_train_df.csv',
    keep_default_na=False
)
df

  df = pd.read_csv(


Unnamed: 0,id,cell_id,cell_type,source,rank,pct_rank,fold
0,00001756c60be8,1862f0a6,code,# This Python 3 environment comes with many he...,0,0.0,0
1,00001756c60be8,2a9e43d6,code,import numpy as np import pandas as pd import ...,2,0.034483,0
2,00001756c60be8,038b763d,code,import warnings warnings filterwarnings 'ignore',4,0.068966,0
3,00001756c60be8,2eefe0ef,code,matplotlib rcParams update 'font.size' 14,6,0.103448,0
4,00001756c60be8,0beab1cd,code,def evaluate_preds train_true_values train_pre...,8,0.137931,0
...,...,...,...,...,...,...,...
6370646,fffe1d764579d5,0d770d6b,markdown,## REMOVING THE OUTLIERS,43,0.597222,4
6370647,fffe1d764579d5,d45ddc62,markdown,### DIMENSIONALITY CURSE,33,0.458333,4
6370648,fffe1d764579d5,1a63248d,markdown,# BANGALORE HOUSE PRICE PREDICTION,0,0.0,4
6370649,fffe1d764579d5,a8ffc8b4,markdown,* We have achieved 75.2% accuracy in predictin...,69,0.958333,4


In [5]:
df.isnull().sum()

id           0
cell_id      0
cell_type    0
source       0
rank         0
pct_rank     0
fold         0
dtype: int64

In [None]:
""" Check Sequence Length Statistic """
sentence_list = df.source.to_list()
length = sequence_length(CFG, sentence_list)
print(f'mean sequence length: {sum(sequence_length) / 6370651}')
print(f'max sequence length: {max(length)}')
print(f'min sequence length: {min(length)}')

In [14]:
test = torch.tensor([1,1,1,1,1,0,0])
test[test == 1].shape[0]

5