In [None]:
!pip install transformers > /dev/null

In [None]:
import os

import numpy as np
import pandas as pd
import torch
import transformers
from transformers import BertTokenizer
# from tqdm import tqdm
# tqdm.pandas()

# Base path prefix for every CSV this cell reads and writes.
DRIVE_PATH = '/content/drive/MyDrive/atmacup10/'

train = pd.read_csv(DRIVE_PATH+'features/train.csv')
test = pd.read_csv(DRIVE_PATH+'features/test.csv')

# Language detection results for the text columns
# (presumably the source of the `<column>_lang` columns used below - verify).
texts_lang = pd.read_csv(DRIVE_PATH+'features/texts_lang.csv')

# Stack train on top of test with a fresh 0..n-1 index.
train_test = pd.concat([train, test], ignore_index=True)

# The language columns are attached purely by row position. If the row counts
# differ, the column-wise concat would silently pad the shorter side with NaN,
# so fail loudly instead.
if len(texts_lang) != len(train_test):
    raise ValueError(
        f'texts_lang has {len(texts_lang)} rows but train+test has '
        f'{len(train_test)} rows; they must align row by row'
    )
train_test = pd.concat([train_test, texts_lang], axis=1)

    
# to_csv does not create missing directories, so make sure the output folder exists.
os.makedirs(DRIVE_PATH+'features/texts_lang', exist_ok=True)

for c in ['title', 'description', 'long_title']:
    # Rows not yet assigned to a language group; work on a copy of the full table.
    remaining = train_test.copy()
    parts = {}

    # Peel off the English rows first, then the Dutch rows. The index is renumbered
    # after each removal (as before), so the index column written to the nl/ex
    # files keeps the same values as in earlier runs.
    for lang in ['en', 'nl']:
        is_lang = remaining[f'{c}_lang'] == lang
        parts[lang] = remaining[is_lang]
        remaining = remaining[~is_lang].reset_index(drop=True)

    # Everything whose `{c}_lang` is neither 'en' nor 'nl' ends up in the 'ex' group.
    parts['ex'] = remaining

    print(parts['en'].shape, parts['nl'].shape, parts['ex'].shape)

    # One CSV per (column, language group) holding the object id and the raw text.
    for name, part in parts.items():
        out_df = part[['object_id', c]]
        out_df.to_csv(DRIVE_PATH+f'features/texts_lang/{c}_{name}.csv')

(4670, 22) (18135, 22) (1229, 22)
(7218, 22) (16723, 22) (93, 22)
(5851, 22) (17414, 22) (769, 22)


# English vectorizer

In [None]:
import pandas as pd
import numpy as np
import torch
import transformers

from transformers import BertTokenizer

# Base path prefix for every CSV this cell reads and writes.
DRIVE_PATH = '/content/drive/MyDrive/atmacup10/'

class BertSequenceVectorizer:
    """Embed a sentence as the last-layer hidden state of its first ([CLS]) token.

    Args:
        model_name: Checkpoint name handed to ``from_pretrained`` for both the
            tokenizer and the model. Defaults to ``'bert-base-uncased'``.
        max_len: Number of token ids fed to the model per sentence. Shorter
            inputs are padded and longer ones are truncated to this length.
    """

    def __init__(self, model_name: str = 'bert-base-uncased', max_len: int = 128):
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model_name = model_name
        self.tokenizer = BertTokenizer.from_pretrained(self.model_name)
        self.bert_model = transformers.BertModel.from_pretrained(self.model_name)
        self.bert_model = self.bert_model.to(self.device)
        # Inference only: make evaluation mode (dropout off) explicit.
        self.bert_model.eval()
        self.max_len = max_len

    def vectorize(self, sentence: str) -> np.ndarray:
        """Return the [CLS] embedding of ``sentence`` as a 1-D numpy array.

        Args:
            sentence: Text to embed.

        Returns:
            The last hidden state at position 0, moved to the CPU. Its length is
            the model's hidden size (768 according to the original note).
        """
        # Let the tokenizer truncate: unlike slicing the id list afterwards, this
        # keeps the closing special token and avoids the "sequence length is
        # longer than the specified maximum" warning on long texts.
        token_ids = self.tokenizer.encode(
            sentence, max_length=self.max_len, truncation=True
        )
        n_tokens = len(token_ids)
        n_pad = self.max_len - n_tokens

        # Take the padding id from the tokenizer instead of assuming it is 0;
        # the vocabulary layout depends on the checkpoint.
        pad_id = self.tokenizer.pad_token_id or 0
        inputs = token_ids + [pad_id] * n_pad
        masks = [1] * n_tokens + [0] * n_pad

        inputs_tensor = torch.tensor([inputs], dtype=torch.long).to(self.device)
        masks_tensor = torch.tensor([masks], dtype=torch.long).to(self.device)

        # No gradients are needed, so skip building an autograd graph per sentence.
        with torch.no_grad():
            bert_out = self.bert_model(
                input_ids=inputs_tensor, attention_mask=masks_tensor
            )

        # Batch item 0, token 0 is the [CLS] position.
        cls_vector = bert_out['last_hidden_state'][0][0]
        return cls_vector.detach().cpu().numpy()


# Build the vectorizer once and reuse it for every text column.
BSV = BertSequenceVectorizer()

for c in ['title', 'description', 'long_title']:
    texts = pd.read_csv(DRIVE_PATH+f'features/texts_lang/{c}_en.csv')

    # Replace missing texts with the literal string 'NaN' so every row can be embedded
    texts[c] = texts[c].fillna('NaN')
    texts[f'{c}_feature'] = texts[c].apply(BSV.vectorize)

    # Turn the per-row vectors into a 2-D array: one row per text, one column per dimension
    embeddings = np.stack(texts[f'{c}_feature'])
    feature_names = [f'{c}_bert_feature{i}' for i in range(embeddings.shape[-1])]

    # object_id followed by the embedding columns
    out_df = pd.concat(
        [texts[['object_id']], pd.DataFrame(embeddings, columns=feature_names)],
        axis=1,
    )

    out_df.to_csv(DRIVE_PATH+f'features/texts_lang/{c}_en_feature.csv')

# オランダ語 ベクタライザー

In [None]:
import pandas as pd
import numpy as np
import torch
import transformers

from transformers import BertTokenizer

# Base path prefix for every CSV this cell reads and writes.
DRIVE_PATH = '/content/drive/MyDrive/atmacup10/'

class BertSequenceVectorizer:
    """Embed a sentence as the last-layer hidden state of its first ([CLS]) token.

    Args:
        model_name: Checkpoint name handed to ``from_pretrained`` for both the
            tokenizer and the model. Defaults to
            ``'wietsedv/bert-base-dutch-cased'``.
        max_len: Number of token ids fed to the model per sentence. Shorter
            inputs are padded and longer ones are truncated to this length.
    """

    def __init__(self, model_name: str = 'wietsedv/bert-base-dutch-cased', max_len: int = 128):
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model_name = model_name
        self.tokenizer = BertTokenizer.from_pretrained(self.model_name)
        self.bert_model = transformers.BertModel.from_pretrained(self.model_name)
        self.bert_model = self.bert_model.to(self.device)
        # Inference only: make evaluation mode (dropout off) explicit.
        self.bert_model.eval()
        self.max_len = max_len

    def vectorize(self, sentence: str) -> np.ndarray:
        """Return the [CLS] embedding of ``sentence`` as a 1-D numpy array.

        Args:
            sentence: Text to embed.

        Returns:
            The last hidden state at position 0, moved to the CPU. Its length is
            the model's hidden size (768 according to the original note).
        """
        # Let the tokenizer truncate: unlike slicing the id list afterwards, this
        # keeps the closing special token and avoids the "sequence length is
        # longer than the specified maximum" warning on long texts.
        token_ids = self.tokenizer.encode(
            sentence, max_length=self.max_len, truncation=True
        )
        n_tokens = len(token_ids)
        n_pad = self.max_len - n_tokens

        # Take the padding id from the tokenizer instead of assuming it is 0;
        # the vocabulary layout depends on the checkpoint.
        pad_id = self.tokenizer.pad_token_id or 0
        inputs = token_ids + [pad_id] * n_pad
        masks = [1] * n_tokens + [0] * n_pad

        inputs_tensor = torch.tensor([inputs], dtype=torch.long).to(self.device)
        masks_tensor = torch.tensor([masks], dtype=torch.long).to(self.device)

        # No gradients are needed, so skip building an autograd graph per sentence.
        with torch.no_grad():
            bert_out = self.bert_model(
                input_ids=inputs_tensor, attention_mask=masks_tensor
            )

        # Batch item 0, token 0 is the [CLS] position.
        cls_vector = bert_out['last_hidden_state'][0][0]
        return cls_vector.detach().cpu().numpy()


# Build the vectorizer once and reuse it for every text column.
BSV = BertSequenceVectorizer()

for c in ['title', 'description', 'long_title']:
    texts = pd.read_csv(DRIVE_PATH+f'features/texts_lang/{c}_nl.csv')

    # Replace missing texts with the literal string 'NaN' so every row can be embedded
    texts[c] = texts[c].fillna('NaN')
    texts[f'{c}_feature'] = texts[c].apply(BSV.vectorize)

    # Turn the per-row vectors into a 2-D array: one row per text, one column per dimension
    embeddings = np.stack(texts[f'{c}_feature'])
    feature_names = [f'{c}_bert_feature{i}' for i in range(embeddings.shape[-1])]

    # object_id followed by the embedding columns
    out_df = pd.concat(
        [texts[['object_id']], pd.DataFrame(embeddings, columns=feature_names)],
        axis=1,
    )

    out_df.to_csv(DRIVE_PATH+f'features/texts_lang/{c}_nl_feature.csv')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=241440.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=438869143.0, style=ProgressStyle(descri…




Token indices sequence length is longer than the specified maximum sequence length for this model (569 > 512). Running this sequence through the model will result in indexing errors


# その他 ベクタライザー

In [None]:
import pandas as pd
import numpy as np
import torch
import transformers

from transformers import BertTokenizer

# Base path prefix for every CSV this cell reads and writes.
DRIVE_PATH = '/content/drive/MyDrive/atmacup10/'

class BertSequenceVectorizer:
    """Embed a sentence as the last-layer hidden state of its first ([CLS]) token.

    Args:
        model_name: Checkpoint name handed to ``from_pretrained`` for both the
            tokenizer and the model. Defaults to
            ``'bert-base-multilingual-cased'``.
        max_len: Number of token ids fed to the model per sentence. Shorter
            inputs are padded and longer ones are truncated to this length.
    """

    def __init__(self, model_name: str = 'bert-base-multilingual-cased', max_len: int = 128):
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model_name = model_name
        self.tokenizer = BertTokenizer.from_pretrained(self.model_name)
        self.bert_model = transformers.BertModel.from_pretrained(self.model_name)
        self.bert_model = self.bert_model.to(self.device)
        # Inference only: make evaluation mode (dropout off) explicit.
        self.bert_model.eval()
        self.max_len = max_len

    def vectorize(self, sentence: str) -> np.ndarray:
        """Return the [CLS] embedding of ``sentence`` as a 1-D numpy array.

        Args:
            sentence: Text to embed.

        Returns:
            The last hidden state at position 0, moved to the CPU. Its length is
            the model's hidden size (768 according to the original note).
        """
        # Let the tokenizer truncate: unlike slicing the id list afterwards, this
        # keeps the closing special token and avoids the "sequence length is
        # longer than the specified maximum" warning on long texts.
        token_ids = self.tokenizer.encode(
            sentence, max_length=self.max_len, truncation=True
        )
        n_tokens = len(token_ids)
        n_pad = self.max_len - n_tokens

        # Take the padding id from the tokenizer instead of assuming it is 0;
        # the vocabulary layout depends on the checkpoint.
        pad_id = self.tokenizer.pad_token_id or 0
        inputs = token_ids + [pad_id] * n_pad
        masks = [1] * n_tokens + [0] * n_pad

        inputs_tensor = torch.tensor([inputs], dtype=torch.long).to(self.device)
        masks_tensor = torch.tensor([masks], dtype=torch.long).to(self.device)

        # No gradients are needed, so skip building an autograd graph per sentence.
        with torch.no_grad():
            bert_out = self.bert_model(
                input_ids=inputs_tensor, attention_mask=masks_tensor
            )

        # Batch item 0, token 0 is the [CLS] position.
        cls_vector = bert_out['last_hidden_state'][0][0]
        return cls_vector.detach().cpu().numpy()


# Build the vectorizer once and reuse it for every text column.
BSV = BertSequenceVectorizer()

for c in ['title', 'description', 'long_title']:
    texts = pd.read_csv(DRIVE_PATH+f'features/texts_lang/{c}_ex.csv')

    # Replace missing texts with the literal string 'NaN' so every row can be embedded
    texts[c] = texts[c].fillna('NaN')
    texts[f'{c}_feature'] = texts[c].apply(BSV.vectorize)

    # Turn the per-row vectors into a 2-D array: one row per text, one column per dimension
    embeddings = np.stack(texts[f'{c}_feature'])
    feature_names = [f'{c}_bert_feature{i}' for i in range(embeddings.shape[-1])]

    # object_id followed by the embedding columns
    out_df = pd.concat(
        [texts[['object_id']], pd.DataFrame(embeddings, columns=feature_names)],
        axis=1,
    )

    out_df.to_csv(DRIVE_PATH+f'features/texts_lang/{c}_ex_feature.csv')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=714314041.0, style=ProgressStyle(descri…


