In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd /content/drive/MyDrive/competitions/probspace_pricing/notebooks/

/content/drive/MyDrive/competitions/probspace_pricing/notebooks


In [3]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 22.7 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 71.9 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 54.8 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.24.0


In [4]:
import pandas as pd
import numpy as np
import torch
import transformers

from transformers import BertTokenizer
from tqdm import tqdm
tqdm.pandas()

# 前処理関数

In [5]:
def preprocess_text(df, col):
    # nanを"NAN"に置換
    df = df.replace({col: {np.nan: 'NAN'}})
    return df

In [6]:
def preprocess_amenities(df, col):
    # 不要な文字を削除（Amenitiesにつかう）
    df[col] = df[col].apply(lambda x: x.replace("[", "").replace('"', "").replace(']', ""))

    return df

In [7]:
def preprocess_pt_am_bt(df):
    # 連結する
    text_list = []
    for i in range(len(df)):
        text = "PropertyType is " + df["PropertyType"][i]  + ". Amenities are " + df["Amenities"][i] + ". Bathroom is " + df["BathroomsText"][i]
        text_list.append(text)
    df["conbine_PT_Am_BT"] = np.array(text_list)

    return df

# 分散表現クラス

In [8]:
class BertSequenceVectorizer:
    def __init__(self, model_name):
        self.device = 'cuda' # 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model_name = model_name
        self.tokenizer = BertTokenizer.from_pretrained(self.model_name)
        self.bert_model = transformers.BertModel.from_pretrained(self.model_name)
        self.bert_model = self.bert_model.to(self.device)
        self.max_len = 512


    def vectorize(self, sentence : str) -> np.array:

        # 1.&2.トークナイズを行って、単語のベクトル化を行う
        # ※トークン以外にもトークンタイプとアテンションマスクの情報も付与されている
        input = self.tokenizer(sentence, return_tensors="pt", max_length=self.max_len)
        # GPUに乗せる
        input["input_ids"] = input["input_ids"].to("cuda")
        input["token_type_ids"] = input["token_type_ids"].to("cuda")
        input["attention_mask"] = input["attention_mask"].to("cuda")

        with torch.no_grad(): # 計算の途中結果が保存されなくなる
            # BERTに通す
            outputs = self.bert_model(**input) # 引数に**を付与すると、キーとバリューがキーワード引数とその値となる
            # BERTの最終層を取り出す
            last_hidden_states = outputs.last_hidden_state

            # マスクを取り出す
            attention_mask = input.attention_mask.unsqueeze(-1)
            # [PAD]を除いたトークン数
            valid_token_num = attention_mask.sum(1)

            # 平均値を算出:BERTから出力されたベクトルを平均する場合
            # attention_maskは[PAD]トークンの時に0を返すので、単語ベクトルとの積を取ると[PAD]分が除かれる
            sentence_vec = (last_hidden_states*attention_mask).sum(1) / valid_token_num

            # 平均値を算出:トークン列の特殊トークン[CLS]に対応するベクトルを使用する場合
            # sentence_vec = last_hidden_states[0][0]
        
            # GPU利用時
            sentence_vec = sentence_vec.detach().cpu().numpy()[0]

        return sentence_vec

# 次元削減する関数

In [9]:
def decompose_func(df, col, n_components):

    # 次元削減
    model = TruncatedSVD(n_components=n_components, random_state=123)
    decomponents_array = model.fit_transform(np.vstack(df[col]))

    # dfに格納
    decomponents_df = pd.DataFrame(decomponents_array)
    # カラム名をつける
    decomponents_df.columns = ["bert_svd_{}_{}".format(col, i) for i in range(decomponents_df.shape[1])]

    return decomponents_df

#分散表現＋次元削減をする関数

In [10]:
from sklearn.decomposition import PCA, TruncatedSVD, NMF

In [11]:
def vect_decompose(all_df, col, model_name, n_components=20):

    df = all_df.copy()

    # 分散表現
    BSV = BertSequenceVectorizer(model_name=model_name)
    df[col] = df[col].apply(lambda x: BSV.vectorize(x))

    # 次元削減
    decomponents_df = decompose_func(df, col, n_components=n_components)

    # merge
    df = pd.merge(all_df["ID"], decomponents_df, left_index=True, right_index=True)

    return df

# main

In [12]:
raw_path = "/content/drive/MyDrive/competitions/probspace_pricing/data/raw/"
features_path = "/content/drive/MyDrive/competitions/probspace_pricing/data/features/"

In [13]:
# データの読み込み
train_df = pd.read_csv(raw_path + "train.csv")
test_df = pd.read_csv(raw_path + "test.csv")
review_df = pd.read_csv(raw_path + "review.csv")

all_df = pd.concat([train_df, test_df]).reset_index()

In [14]:
col_list1 = ["OwnerDetail","Description"] 
col_list2 = ["PropertyType","Amenities","BathroomsText"]

OwnerDetail Description

In [17]:
# 多言語
for col in col_list1:
    
    print("【{} start】".format(col))

    model_name = 'bert-base-uncased' # 'bert-base-multilingual-cased'

    # NaNを変換
    all_df = preprocess_text(all_df, col)

    # 分散表現＋SVD
    decompose_df = vect_decompose(all_df, col, model_name=model_name, n_components=20)

    # pickleファイルに格納
    decompose_df.to_pickle(features_path + "{}_vect_svd_{}.pkl".format(col, model_name))

【OwnerDetail start】


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max 

【Description start】


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max 

PropertyType Amenities BathroomsText

In [None]:
# 前処理
all_df = preprocess_amenities(all_df, "Amenities")

In [None]:
# 英語
for col in col_list2:
    
    print("【{} start】".format(col))

    # NaNを変換
    all_df = preprocess_text(all_df, col)

    # 分散表現＋SVD
    decompose_df = vect_decompose(all_df, col, model_name='bert-base-uncased', n_components=20)

    # pickleファイルに格納
    decompose_df.to_pickle(features_path + "{}_vect_svd.pkl".format(col))

PropertyType start


Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max 

Amenities start


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max 

BathroomsText start


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max 

Review

In [19]:
col = "Review"

model_name = 'bert-base-uncased'

# NaNを変換
review_df = preprocess_text(review_df, col)

# 分散表現＋SVD
decompose_df = vect_decompose(review_df, col, model_name=model_name, n_components=20)

# csvに格納
decompose_df.to_pickle(features_path + "{}_vect_svd_{}.pkl".format(col, model_name)))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max 

conbine_PT_Am_BT

In [None]:
# NaNを変換
all_df = preprocess_text(all_df, "BathroomsText")

col = "conbine_PT_Am_BT"

model_name = 'bert-base-uncased'

# 前処理
all_df = preprocess_pt_am_bt(all_df)

# 分散表現＋SVD
decompose_df = vect_decompose(all_df, col, model_name=model_name, n_components=20)

# pickleファイルに格納
decompose_df.to_pickle(features_path + "{}_vect_svd_{}.pkl".format(col, model_name))