In [67]:
import sentencepiece
import numpy as np
import pandas as pd
from transformers import AutoModel, AutoConfig, AutoTokenizer

In [66]:
class CFG:
    model = 'microsoft/deberta-v3-large'
    tokenizer = AutoTokenizer.from_pretrained(model)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [101]:
""" Dataset Utils Function """

def markdown_to_text(markdown_string: str) -> str:
    """
    Converts a markdown string to plaintext by beautifulsoup
    md -> html -> string
    Args:
        markdown_string: str, markdown string
    Example:
        markdown_to_text(md.loc['63a93277', 'source'])
        => md == pd.DataFrame filtered by cell_type == 'markdown'
    Reference:
        https://gist.github.com/lorey/eb15a7f3338f959a78cc3661fbc255fe
    """
    try:
        html = markdown.markdown(markdown_string)
        html = re.sub(r'<pre>(.*?)</pre>', ' ', html)  # remove code snippets
        html = re.sub(r'<code>(.*?)</code >', ' ', html)  # remove code snippets
        soup = BeautifulSoup(html, "html.parser")  # extract text
        text = ''.join(soup.findAll(text=True)).strip()  # extract text
        if len(text) == 0:
            text = markdown_string
            if text[0] == "!" and text[1] == "[":
                for m in range(2, len(text)):
                    if text[m] == "]":
                        text = 'embedded ' + text[2:m] + ' image'
                        break
            elif '<img src' in markdown_string or '.png' in markdown_string or 'gif' in markdown_string or '.jpg' in markdown_string:
                text = 'embedded image'
    except:
        text = markdown_string
    return text

def code_tokenizer(code: str) -> str:
    """
    Tokenize code text by python built-in tokenizer for code scanning
    Args:
        code: str, code text
    Example:
        code = code.loc['3a6623e3','source']
        code_text = tokenize.generate_tokens(io.StringIO(code).readline)
        ' '.join([tok.string for tok in code_text if tok.type==1 or tok.type==2 or tok.type==3 or tok.type==60])
    Reference:
        https://docs.python.org/3/library/tokenize.html
        https://www.kaggle.com/code/haithamaliryan/ai4code-extract-all-functions-variables-names/notebook
    """
    try:
        code_text = tokenize.generate_tokens(io.StringIO(code).readline)
        code_str = ' '.join([tok.string for tok in code_text if tok.type == 1 or tok.type == 2 or tok.type == 3 or tok.type == 60])
        if len(code_str) == 0:
            code_str = "unknown"
    except:
        code_str = code
    return code_str

def tokenizing(cfg: CFG, text: str) -> any:
    """
    Preprocess text for CLIP
    Args:
        cfg: configuration.CFG, needed to load tokenizer from Huggingface AutoTokenizer
        text: text from dataframe or any other dataset, please pass str type
    """
    inputs = cfg.tokenizer(
        text,
        max_length=cfg.max_len,
        padding='max_length',
        truncation=True,
        return_tensors=None,
        add_special_tokens=True,
    )
    for k, v in inputs.items():
        inputs[k] = torch.as_tensor(v)
    return inputs

def sequence_length(cfg: CFG, text_list: list) -> list:
    """ Get sequence length of all text data for checking statistics value """
    length_list = []
    for text in text_list:
        tmp_text = tokenizing(cfg, text)['attention_mask']
        length_list.append(tmp_text.count(1))
    return length_list

def group_kfold(df: pd.DataFrame, cfg: CFG) -> pd.DataFrame:
    """ GroupKFold """
    fold = GroupKFold(
        n_splits=cfg.n_folds,
    )
    df['fold'] = -1
    for num, (tx, vx) in enumerate(fold.split(X=df, y=df['pct_rank'], groups=df['ancestor_id'])):
        df.loc[vx, "fold"] = int(num)
    return df

def check_null(df: pd.DataFrame) -> pd.Series:
    """ check if input dataframe has null type object...etc """
    return df.isnull().sum()

def drop_columns(df: pd.DataFrame, drop_list: list[int]) -> pd.DataFrame:
    """
    drop columns by drop_list
    Args:
        drop_list: list type, element is column index in dataframe which do you want to drop
    example:
        drop_list = [0, 1, 2]
    """
    tmp_index = [i for i in range(len(df.columns))]
    for idx in drop_list.sort():
        tmp_index.remove(idx)
    pass

In [57]:
"""
ancestor_id, parent_id 모두 지워도 될 것 같다. 어차피 test dataset에는 등장 안해서 context로 사용할 수 없음
"""
df = pd.read_csv(
    'train.csv',
    keep_default_na=False
)
df

Unnamed: 0,id,cell_id,cell_type,source
0,8a2564b730a575,8395ab7c,code,import numpy as np\nimport matplotlib.pyplot a...
1,8a2564b730a575,ebc844d6,code,df_train = pd.read_csv('../input/tensorflow-gr...
2,8a2564b730a575,49251f17,code,"def bbox_inv_iou(boxA, boxB):\n """"""Copied f..."
3,8a2564b730a575,3a6623e3,code,test_sequence_id = np.unique(df_train.sequence...
4,8a2564b730a575,24e09d1a,code,"seq_df_with_cots_ids, stats = find_unique_cots..."
...,...,...,...,...
6370641,a3faba2871daaa,5ce07f7e,markdown,Lets check a random image and its label
6370642,a3faba2871daaa,ad67d1e9,markdown,We can see that there are 5 directories of 5 d...
6370643,a3faba2871daaa,c24be090,markdown,## Also print Misclassified Images:
6370644,a3faba2871daaa,df63943f,markdown,**Normalization**


In [99]:
test = [1,2,3]


[1, 2, 3]

In [82]:
df.isnull().sum()

id           0
cell_id      0
cell_type    0
source       0
dtype: int64

In [12]:
# def tokenizing(text: str) -> dict:
#     inputs = tokenizer.encode_plus(
#         text,
#         max_length=512,
#         padding='max_length',
#         truncation=True,
#         return_tensors=None,
#         add_special_tokens=False,
#     )
#     return inputs

# def sequence_length(text_list: list) -> list:
#     length_list = []
#     for text in text_list:
#         tmp_text = tokenizing(text)['attention_mask']
#         length_list.append(tmp_text.count(1))
#     return length_list

In [None]:
"""
1) Apply markdown_to_text, code_tokenizer (o)
2) Check sequence length 
3) Add Rank Feature, ancestor_id (o)
    => kaggle notebook에서 만들어서 업로드
4) Apply Cross Validation (o)
    => drop ancestor, parent feature 
5) Convert DataFrame shape to dictionary shape
    - A: 0, A: 1, A: 2..... 
    - A: [0, 1, 2, 3, ....]
"""
""" Apply code & markdown tokenizing by custom function """
for i in tqdm(range(len(df))):
    if df.iloc[i, 2] == 'markdown':
        df.iloc[i, 3] = markdown_to_text(df.iloc[i, 3])
    else:
        df.iloc[i, 3] = code_tokenizer(df.iloc[i, 3])
df

In [13]:
sentence_list = df.source.to_list()
length = sequence_length(sentence_list)
print(sum(sequence_length) / 6370651)

0
# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) # Input data files are available in the read-only "../input/" directory # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname _ filenames in os walk '/kaggle/input' for filename in filenames print os path join dirname filename # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"  # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
1
import numpy as np import pandas as pd import random from sklearn model_selection import 

TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]

In [8]:
print(sentence_list[0])

# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) # Input data files are available in the read-only "../input/" directory # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname _ filenames in os walk '/kaggle/input' for filename in filenames print os path join dirname filename # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"  # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
