In [1]:
# Mount Google Drive at /content/drive; the paths used later in this notebook
# all live underneath that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Switch the working directory to the competition's notebooks folder on the mounted drive.
%cd /content/drive/MyDrive/competitions/probspace_pricing/notebooks/

/content/drive/MyDrive/competitions/probspace_pricing/notebooks


In [3]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 31.9 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 61.9 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 82.3 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [4]:
import pandas as pd
import numpy as np
import torch
import transformers

from transformers import BertTokenizer
from tqdm import tqdm
tqdm.pandas()

# 前処理関数

In [5]:
def preprocess_text(df, col):
    """Replace missing values in one column with the literal string 'NAN'.

    Args:
        df: Frame holding the text column.
        col: Name of the column whose NaN entries are replaced.

    Returns:
        The frame returned by ``DataFrame.replace``; other columns are
        left as they are because the replacement is scoped to ``col``.
    """
    # Nested-dict form of DataFrame.replace: {column: {old_value: new_value}}.
    nan_to_token = {np.nan: 'NAN'}
    per_column_rules = {col: nan_to_token}
    cleaned = df.replace(per_column_rules)
    return cleaned

In [6]:
def preprocess_amenities(df, col):
    """Strip list punctuation (``[``, ``]`` and ``"``) from the strings in ``df[col]``.

    Intended for the Amenities column, whose values look like a serialized
    list. Cells that are not strings (for example NaN) are passed through
    untouched instead of raising ``AttributeError``.

    Args:
        df: Frame to clean. The column is overwritten on this same object.
        col: Name of the column to clean.

    Returns:
        The same ``df`` object, with ``col`` cleaned.
    """
    # One translation table removes all three characters in a single pass.
    strip_table = str.maketrans("", "", '[]"')
    df[col] = df[col].apply(
        lambda value: value.translate(strip_table) if isinstance(value, str) else value
    )

    return df

In [7]:
def preprocess_pt_am(df):
    """Add a ``conbine_PT_Am`` column joining PropertyType and Amenities into one sentence.

    Each row becomes ``"PropertyType is <PropertyType>. Amenities are <Amenities>"``.
    Rows are paired by position, so the frame's index does not need to be
    0..n-1. Both source columns must contain strings; a non-string cell
    (e.g. NaN) raises ``TypeError`` from the string concatenation.

    Args:
        df: Frame with ``PropertyType`` and ``Amenities`` columns. The new
            column is written onto this same object.

    Returns:
        The same ``df`` object, with ``conbine_PT_Am`` added.
    """
    # zip() walks both columns in row order, avoiding label lookups such as
    # df["PropertyType"][i] that fail when the index is not a plain RangeIndex.
    df["conbine_PT_Am"] = [
        "PropertyType is " + property_type + ". Amenities are " + amenities
        for property_type, amenities in zip(df["PropertyType"], df["Amenities"])
    ]

    return df

In [8]:
def preprocess_pt_am_bt(df):
    """Add a ``conbine_PT_Am_BT`` column joining PropertyType, Amenities and BathroomsText.

    Each row becomes
    ``"PropertyType is <PropertyType>. Amenities are <Amenities>. Bathroom is <BathroomsText>"``.
    Rows are paired by position, so the frame's index does not need to be
    0..n-1. All three source columns must contain strings; a non-string cell
    (e.g. NaN) raises ``TypeError`` from the string concatenation.

    Args:
        df: Frame with ``PropertyType``, ``Amenities`` and ``BathroomsText``
            columns. The new column is written onto this same object.

    Returns:
        The same ``df`` object, with ``conbine_PT_Am_BT`` added.
    """
    # zip() walks the columns in row order, avoiding label lookups such as
    # df["PropertyType"][i] that fail when the index is not a plain RangeIndex.
    df["conbine_PT_Am_BT"] = [
        "PropertyType is " + property_type
        + ". Amenities are " + amenities
        + ". Bathroom is " + bathrooms
        for property_type, amenities, bathrooms in zip(
            df["PropertyType"], df["Amenities"], df["BathroomsText"]
        )
    ]

    return df

In [27]:
def preprocess_review(df):
    """Prefix every review with its author: ``"I am <ReviewerName>. <Review>."``.

    The ``Review`` column is overwritten on the frame passed in. Because the
    new text is built from the current ``Review`` value, calling this twice
    on the same frame adds the prefix and trailing period twice.
    Both columns must contain strings; a non-string cell (e.g. NaN) raises
    ``TypeError`` from the string concatenation.

    Args:
        df: Frame with ``ReviewerName`` and ``Review`` columns.

    Returns:
        The same ``df`` object, with ``Review`` rewritten.
    """
    # Pair the columns by name rather than via x[0]/x[1] on a row Series, which
    # depends on integer keys being treated as positions on a string-labelled row.
    # The list is fully built before the assignment replaces the column.
    df["Review"] = [
        "I am " + reviewer + ". " + review + "."
        for reviewer, review in zip(df["ReviewerName"], df["Review"])
    ]

    return df

# 分散表現クラス

In [9]:
class BertSequenceVectorizer:
    """Embed a sentence by mean-pooling the last hidden layer of a pretrained BERT model."""

    def __init__(self, model_name):
        """Load the tokenizer and model for ``model_name`` and move the model to the device.

        Args:
            model_name: Name or path accepted by ``BertTokenizer.from_pretrained``
                and ``transformers.BertModel.from_pretrained``.
        """
        # Use the GPU when one is available, otherwise fall back to CPU so the
        # class also works on machines without CUDA.
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model_name = model_name
        self.tokenizer = BertTokenizer.from_pretrained(self.model_name)
        self.bert_model = transformers.BertModel.from_pretrained(self.model_name)
        self.bert_model = self.bert_model.to(self.device)
        # Upper bound on the tokenized length passed to the model.
        self.max_len = 512

    def vectorize(self, sentence: str) -> np.ndarray:
        """Return the attention-mask-weighted mean of the last hidden states for ``sentence``.

        Args:
            sentence: Text to embed.

        Returns:
            A NumPy array holding the pooled vector of the first (only) batch entry.
        """
        # Tokenize; the encoding also carries token type ids and the attention mask.
        # Truncation is requested explicitly instead of relying on the tokenizer's
        # implicit handling of a bare max_length.
        encoded = self.tokenizer(
            sentence,
            return_tensors="pt",
            truncation=True,
            max_length=self.max_len,
        )
        # Move every tensor in the encoding to the same device as the model.
        encoded = encoded.to(self.device)

        with torch.no_grad():  # inference only: intermediate results are not kept for backprop
            # ** expands the encoding's keys and values into keyword arguments.
            outputs = self.bert_model(**encoded)
            last_hidden_states = outputs.last_hidden_state

            # Add a trailing axis so the mask broadcasts against the hidden states.
            attention_mask = encoded["attention_mask"].unsqueeze(-1)
            # Number of tokens with mask 1 (i.e. excluding [PAD]).
            valid_token_num = attention_mask.sum(1)

            # Mean pooling: the mask is 0 at [PAD] positions, so multiplying by it
            # removes them from the sum before dividing by the valid token count.
            # (Alternative not used here: take the [CLS] vector, last_hidden_states[0][0].)
            sentence_vec = (last_hidden_states * attention_mask).sum(1) / valid_token_num

        # Bring the result back to host memory and drop the batch axis.
        return sentence_vec.detach().cpu().numpy()[0]

# 分散表現クラス（BertClassification）

In [10]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig

In [11]:
class BertClassificationSequenceVectorizer:
    """Embed a sentence by mean-pooling the last hidden layer of a sequence-classification model."""

    def __init__(self, tokenizer_name, model_name, max_len, add_special_tokens):
        """Load tokenizer and model, and move the model to the device.

        Args:
            tokenizer_name: Name or path passed to ``AutoTokenizer.from_pretrained``.
            model_name: Name or path passed to
                ``AutoModelForSequenceClassification.from_pretrained``; the model is
                loaded with ``output_hidden_states=True`` so hidden layers are returned.
            max_len: Maximum tokenized length; longer inputs are truncated.
            add_special_tokens: Forwarded to the tokenizer on every call.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.bert_model = AutoModelForSequenceClassification.from_pretrained(model_name,  output_hidden_states=True)
        # Use the GPU when one is available, otherwise fall back to CPU so the
        # class also works on machines without CUDA.
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.bert_model = self.bert_model.to(self.device)
        self.max_len = max_len
        self.add_special_tokens = add_special_tokens

    def vectorize(self, sentence: str) -> np.ndarray:
        """Return the attention-mask-weighted mean of the last hidden layer for ``sentence``.

        Args:
            sentence: Text to embed.

        Returns:
            A NumPy array holding the pooled vector of the first (only) batch entry.
        """
        # Tokenize; the encoding also carries the attention mask.
        encoded = self.tokenizer(
            sentence,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=self.max_len,
            add_special_tokens=self.add_special_tokens,
        )
        # Move every tensor in the encoding to the same device as the model.
        encoded = encoded.to(self.device)

        with torch.no_grad():  # inference only: intermediate results are not kept for backprop
            # ** expands the encoding's keys and values into keyword arguments.
            outputs = self.bert_model(**encoded)
            # Last entry of the per-layer hidden states requested at load time.
            last_hidden_states = outputs.hidden_states[-1]

            # Add a trailing axis so the mask broadcasts against the hidden states.
            attention_mask = encoded["attention_mask"].unsqueeze(-1)
            # Number of tokens with mask 1 (i.e. excluding padding). Clamped to at
            # least 1 to guard against a division by zero if tokenization yields no
            # tokens (possible when special tokens are disabled) — NOTE(review):
            # whether the model itself accepts a zero-length input is not verified here.
            valid_token_num = attention_mask.sum(1).clamp(min=1)

            # Mean pooling: the mask is 0 at padded positions, so multiplying by it
            # removes them from the sum before dividing by the valid token count.
            # (Alternative not used here: take the first token's vector, last_hidden_states[0][0].)
            sentence_vec = (last_hidden_states * attention_mask).sum(1) / valid_token_num

        # Bring the result back to host memory and drop the batch axis.
        return sentence_vec.detach().cpu().numpy()[0]

# 次元削減する関数

In [12]:
def decompose_func(df, col, n_components):
    """Reduce the per-row vectors stored in ``df[col]`` with truncated SVD.

    Args:
        df: Frame whose ``col`` column holds one equal-length vector per row.
        col: Name of the vector column; also used in the output column names.
        n_components: Number of SVD components to keep.

    Returns:
        A frame with one ``bert_svd_<col>_<i>`` column per component. It carries
        the same index as ``df`` so that joining it back to ``df`` on index pairs
        each row with its own components, whatever the index of ``df`` is.
    """
    # Stack the row vectors into a 2-D (rows x features) matrix.
    vector_matrix = np.vstack(df[col].tolist())

    # Fixed random_state keeps the decomposition reproducible across runs.
    model = TruncatedSVD(n_components=n_components, random_state=123)
    decomponents_array = model.fit_transform(vector_matrix)

    column_names = ["bert_svd_{}_{}".format(col, i) for i in range(decomponents_array.shape[1])]
    decomponents_df = pd.DataFrame(decomponents_array, index=df.index, columns=column_names)

    return decomponents_df

# 分散表現＋次元削減をする関数

In [13]:
from sklearn.decomposition import PCA, TruncatedSVD, NMF

In [14]:
def vect_decompose(all_df, col, model_name, n_components=20):
    """Embed a text column with BERT, reduce it with SVD and return it keyed by ``ID``.

    Args:
        all_df: Frame with an ``ID`` column and the text column ``col``.
            It is not modified.
        col: Name of the text column to embed.
        model_name: Model name or path handed to ``BertSequenceVectorizer``.
        n_components: Number of SVD components to keep (default 20).

    Returns:
        A frame with ``ID`` followed by the reduced components, one row per
        row of ``all_df``.
    """
    # Embedding: only the text column is needed, so copy just that column
    # rather than the whole frame.
    BSV = BertSequenceVectorizer(model_name=model_name)
    vectors_df = all_df[[col]].copy()
    vectors_df[col] = vectors_df[col].apply(BSV.vectorize)

    # Dimensionality reduction.
    decomponents_df = decompose_func(vectors_df, col, n_components=n_components)

    # Row i of the reduced output belongs to row i of all_df; give it the same
    # index so the index join below pairs rows correctly for any index.
    decomponents_df.index = all_df.index

    return pd.merge(all_df["ID"], decomponents_df, left_index=True, right_index=True)

# main

In [15]:
# Directories on the mounted Google Drive (absolute, Colab-specific paths).
# raw_path: raw competition CSV files read below.
raw_path = "/content/drive/MyDrive/competitions/probspace_pricing/data/raw/"
# features_path: destination for the pickled feature frames written by this notebook.
features_path = "/content/drive/MyDrive/competitions/probspace_pricing/data/features/"
# bert_reg_path: location of the df_checkpoint_* pickles loaded below.
bert_reg_path = "/content/drive/MyDrive/competitions/probspace_pricing/notebooks/bert_reg/"

In [16]:
# Load the raw data.
train_df = pd.read_csv(raw_path + "train.csv")
test_df = pd.read_csv(raw_path + "test.csv")
review_df = pd.read_csv(raw_path + "review.csv")

# Stack train and test so features are computed over both at once.
# NOTE(review): reset_index() is called without drop=True, so the previous row
# labels are presumably kept as an extra "index" column — confirm this is intended.
all_df = pd.concat([train_df, test_df]).reset_index()

In [17]:
# Overwrite the text columns with the versions stored in the checkpoint pickles.
# The Series assignments align on index — presumably the checkpoints share
# all_df's row labels; verify against the notebook that wrote them.
# NOTE(review): pickle files can execute arbitrary code when loaded; only read
# checkpoints produced by a trusted run.
all_df["OwnerDetail"] = pd.read_pickle(bert_reg_path + "df_checkpoint_OwnerDetail")["OwnerDetail"]
all_df["Description"] = pd.read_pickle(bert_reg_path + "df_checkpoint_Description")["Description"]
# Replaces the review_df that was read from review.csv above.
review_df = pd.read_pickle(bert_reg_path + "df_checkpoint_Review")

In [18]:
# Text columns embedded in the first BertModel loop below.
col_list1 = ["OwnerDetail","Description"] 
# Columns embedded in the second BertModel loop below.
col_list2 = ["PropertyType","Amenities","BathroomsText"]

## BertModel

OwnerDetail Description

In [None]:
# Columns originally marked "multilingual"; the model actually used below is the
# English uncased one, with the multilingual checkpoint left as a commented alternative.
for col in col_list1:
    
    print("【{} start】".format(col))

    model_name = 'bert-base-uncased' # alternative: 'bert-base-multilingual-cased'

    # Replace NaN with the placeholder string.
    all_df = preprocess_text(all_df, col)

    # Embedding + SVD.
    decompose_df = vect_decompose(all_df, col, model_name=model_name, n_components=20)

    # Save the features as a pickle file.
    decompose_df.to_pickle(features_path + "{}_vect_svd_{}.pkl".format(col, model_name))

PropertyType Amenities BathroomsText

In [None]:
# Preprocessing: strip the list punctuation from Amenities.
# NOTE(review): this runs before the NaN replacement done in the loop that follows —
# confirm Amenities has no missing values at this point.
all_df = preprocess_amenities(all_df, "Amenities")

In [None]:
# English-text columns.
for col in col_list2:
    
    print("【{} start】".format(col))

    # Replace NaN with the placeholder string.
    all_df = preprocess_text(all_df, col)

    # Embedding + SVD.
    decompose_df = vect_decompose(all_df, col, model_name='bert-base-uncased', n_components=20)

    # Save the features as a pickle file (unlike the other BertModel cells, the
    # file name here does not include the model name).
    decompose_df.to_pickle(features_path + "{}_vect_svd.pkl".format(col))

Review

In [None]:
col = "Review"

model_name = 'bert-base-uncased'

# Replace NaN with the placeholder string.
review_df = preprocess_text(review_df, col)

# Embedding + SVD.
decompose_df = vect_decompose(review_df, col, model_name=model_name, n_components=20)

# Save the features as a pickle file.
# (A stray extra closing parenthesis on this line previously made the cell a SyntaxError.)
decompose_df.to_pickle(features_path + "{}_vect_svd_{}.pkl".format(col, model_name))

conbine_PT_Am_BT

In [None]:
# Replace NaN in BathroomsText with the placeholder string.
# NOTE(review): PropertyType and Amenities are not cleaned in this cell; it
# presumably relies on the earlier cells having already done so — confirm.
all_df = preprocess_text(all_df, "BathroomsText")

col = "conbine_PT_Am_BT"

model_name = 'bert-base-uncased'

# Preprocessing: build the combined PropertyType / Amenities / BathroomsText sentence.
all_df = preprocess_pt_am_bt(all_df)

# Embedding + SVD.
decompose_df = vect_decompose(all_df, col, model_name=model_name, n_components=20)

# Save the features as a pickle file.
decompose_df.to_pickle(features_path + "{}_vect_svd_{}.pkl".format(col, model_name))

## BertClassificationModel

OwnerDetail

In [27]:
%%time
# Features from the fine-tuned classification model for OwnerDetail.
col="OwnerDetail"
# Directory of the fine-tuned model for this column.
model_name = "/content/drive/MyDrive/competitions/probspace_pricing/notebooks/bert_reg/{}".format(col) 
# NOTE(review): the tokenizer is "roberta-base", so the fine-tuned checkpoint is
# presumably RoBERTa-based as well — confirm.
tokenizer_name = "roberta-base"
max_len = 512
n_components=20
add_special_tokens=True

print("【{} ClassificationBert start】".format(col))

# Work on a copy so all_df keeps its text column.
df = all_df.copy()

# Replace NaN with the placeholder string.
df = preprocess_text(df, col)

# Embedding: each text is replaced by its pooled hidden-state vector.
BSV = BertClassificationSequenceVectorizer(tokenizer_name, model_name, max_len, add_special_tokens)
df[col] = df[col].apply(lambda x: BSV.vectorize(x))

# Dimensionality reduction.
decomponents_df = decompose_func(df, col, n_components=n_components)

# Join the reduced components to the IDs on the row index.
df = pd.merge(df["ID"], decomponents_df, left_index=True, right_index=True)


# Save the features as a pickle file.
df.to_pickle(features_path + "{}_reg_en_vect_svd.pkl".format(col))

【OwnerDetail ClassificationBert start】


Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

CPU times: user 11min 39s, sys: 6.8 s, total: 11min 46s
Wall time: 12min 21s


Description

In [None]:
%%time
# Features from the fine-tuned classification model for Description.
col="Description"
# Directory of the fine-tuned model for this column.
model_name = "/content/drive/MyDrive/competitions/probspace_pricing/notebooks/bert_reg/{}".format(col) 
tokenizer_name = "roberta-base"
max_len = 512
n_components=20
# NOTE(review): special tokens are disabled here but enabled in the OwnerDetail
# cell — confirm the difference is intentional.
add_special_tokens=False

print("【{} ClassificationBert start】".format(col))

# Work on a copy so all_df keeps its text column.
df = all_df.copy()

# Replace NaN with the placeholder string.
df = preprocess_text(df, col)

# Embedding: each text is replaced by its pooled hidden-state vector.
BSV = BertClassificationSequenceVectorizer(tokenizer_name, model_name, max_len, add_special_tokens)
df[col] = df[col].apply(lambda x: BSV.vectorize(x))

# Dimensionality reduction.
decomponents_df = decompose_func(df, col, n_components=n_components)

# Join the reduced components to the IDs on the row index.
df = pd.merge(df["ID"], decomponents_df, left_index=True, right_index=True)


# Save the features as a pickle file.
df.to_pickle(features_path + "{}_reg_en_vect_svd.pkl".format(col))

conbine_PT_Am

In [133]:
%%time
# English text: features from the fine-tuned classification model for the
# combined PropertyType + Amenities sentence.
col = "conbine_PT_Am"
# Directory of the fine-tuned model for this column.
model_name = "/content/drive/MyDrive/competitions/probspace_pricing/notebooks/bert_reg/{}".format(col)
tokenizer_name = "roberta-base"
max_len = 512
n_components=20
add_special_tokens=False

print("【{} ClassificationBert start】".format(col))


# Work on a copy so all_df is not modified by the in-place preprocessing below.
df = all_df.copy()

# Strip the list punctuation from Amenities.
# NOTE(review): no NaN replacement is done in this cell; it presumably relies on
# PropertyType and Amenities already being free of missing values — confirm.
df = preprocess_amenities(df, "Amenities")

# Build the combined sentence column.
df = preprocess_pt_am(df)


# Embedding: each text is replaced by its pooled hidden-state vector.
BSV = BertClassificationSequenceVectorizer(tokenizer_name, model_name, max_len, add_special_tokens)
df[col] = df[col].apply(lambda x: BSV.vectorize(x))

# Dimensionality reduction.
decomponents_df = decompose_func(df, col, n_components=n_components)


# Join the reduced components to the IDs on the row index.
df = pd.merge(df["ID"], decomponents_df, left_index=True, right_index=True)

# Save the features as a pickle file (note: this file name has no "_en" part,
# unlike the other classification-model cells).
df.to_pickle(features_path + "{}_reg_vect_svd.pkl".format(col))

【conbine_PT_Am ClassificationBert start】


Review

In [28]:
%%time
# Features from the fine-tuned classification model for Review.
col = "Review"
# Directory of the fine-tuned model for this column.
model_name = "/content/drive/MyDrive/competitions/probspace_pricing/notebooks/bert_reg/{}".format(col) 
tokenizer_name = "roberta-base"
max_len = 512
n_components=20
add_special_tokens=False

print("【{} ClassificationBert start】".format(col))

# Work on a copy so review_df is not modified by the in-place preprocessing below.
review = review_df.copy()

# Replace NaN in Review with the placeholder string.
# NOTE(review): ReviewerName is not cleaned here, and preprocess_review
# concatenates it as a string — confirm it has no missing values.
review = preprocess_text(review, col)

# Combine ReviewerName and Review into one sentence.
review = preprocess_review(review)

# Embedding: each text is replaced by its pooled hidden-state vector.
BSV = BertClassificationSequenceVectorizer(tokenizer_name, model_name, max_len, add_special_tokens)
review[col] = review[col].apply(lambda x: BSV.vectorize(x))

# Dimensionality reduction.
decomponents_df = decompose_func(review, col, n_components=n_components)

# Join the reduced components to the IDs on the row index.
df = pd.merge(review["ID"], decomponents_df, left_index=True, right_index=True)


# Save the features as a pickle file.
df.to_pickle(features_path + "{}_reg_en_vect_svd.pkl".format(col))

【Review ClassificationBert start】


Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]