In [1]:
import pickle

import numpy as np
import pandas as pd
import torch
from cuml.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer
# from sklearn.svm import SVC  # use this import instead when no GPU is available

In [2]:
# Experiment identifier; interpolated into the OOF pickle and submission CSV
# file names written later in this notebook.
exp = "exp021"

In [3]:
# Load the raw tables from ../data (paths are relative to the notebook's
# working directory).
train_df = pd.read_csv("../data/train.csv")
test_df = pd.read_csv("../data/test.csv")
# Per-garment metadata; joined onto train/test by "Clothing ID" in the preprocessing cell.
clothing_master_df = pd.read_csv("../data/clothing_master.csv")
# Submission template; its "target" column is filled with the averaged test
# predictions at the end of the notebook.
sample_submission_df = pd.read_csv("../data/sample_submission.csv")

# Preprocessing

In [4]:
# Raw CSV header -> snake_case column name for every column of train.csv.
train_column_names = {
    "Clothing ID": "clothing_id",
    "Age": "age",
    "Title": "title",
    "Review Text": "review_text",
    "Rating": "rating",
    "Recommended IND": "recommended",
    "Positive Feedback Count": "positive_feedback_count",
}

# The test mapping covers only these four headers (no rating / recommended /
# positive-feedback entries), so it is derived from the train mapping.
test_column_names = {
    raw_header: train_column_names[raw_header]
    for raw_header in ("Clothing ID", "Age", "Title", "Review Text")
}

# Raw header -> snake_case name for the garment master table.
clothing_master_column_names = {
    "Clothing ID": "clothing_id",
    "Division Name": "division_name",
    "Department Name": "department_name",
    "Class Name": "class_name",
}

# Normalise the master table first so it can serve as the right-hand side of
# both joins below.
clothing_master_df = clothing_master_df.rename(columns=clothing_master_column_names)

# Rename, then left-join the garment metadata so every review row is kept even
# when its clothing_id has no entry in the master table.
train_df = train_df.rename(columns=train_column_names).merge(
    clothing_master_df, on="clothing_id", how="left"
)
test_df = test_df.rename(columns=test_column_names).merge(
    clothing_master_df, on="clothing_id", how="left"
)

# Feature Engineering

In [5]:
class EmbDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Each item is a dict of ``torch.LongTensor`` values under the keys
    ``input_ids``, ``attention_mask`` and ``token_type_ids``, produced by
    calling ``tokenizer`` with padding to ``max_length`` and truncation on.

    Args:
        texts: Indexable, sized collection of texts.
        max_length: Length passed to the tokenizer for padding/truncation.
        tokenizer: Callable tokenizer; it must be supplied before items are
            accessed, since ``__getitem__`` calls it unconditionally.
    """

    # Fields copied from the tokenizer output into every item, in this order.
    _FIELDS = ("input_ids", "attention_mask", "token_type_ids")

    def __init__(self, texts, max_length=192, tokenizer=None):
        self.texts, self.tokenizer, self.max_length = texts, tokenizer, max_length

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return its fields as LongTensors."""
        encoded = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_token_type_ids=True,
        )
        return {field: torch.LongTensor(encoded[field]) for field in self._FIELDS}

In [6]:
# Hugging Face checkpoint used to embed the review prompts, and the short tag
# that is interpolated into the embedding CSV file names.
model_name = "intfloat/e5-mistral-7b-instruct"
file_name = "e5_mistral"

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Weights are loaded in float16 and then moved onto the selected device.
model = AutoModel.from_pretrained(model_name, torch_dtype=torch.float16).to(device)


def create_text_column(df: pd.DataFrame, sep_token: str) -> pd.DataFrame:
    """Return a copy of ``df`` with a ``text`` column holding the embedding prompt.

    The prompt concatenates a fixed instruction with the ``age``, ``title`` and
    ``review_text`` columns; missing values are rendered as the string "nan".

    Args:
        df: Frame with ``age``, ``title`` and ``review_text`` columns. It is
            not modified.
        sep_token: Accepted for call compatibility (``None`` is allowed) but
            not used: the prompt template contains no separator token.

    Returns:
        A copy of ``df`` with the extra ``text`` column.
    """
    text_df = df.copy()
    # NOTE: the template, including the "Reiviewer" spelling and the "?."
    # punctuation, is kept byte-for-byte. Changing it would change every
    # embedding computed from the prompt.
    text_df["text"] = (
        "Instruction: Does the reviewer recommend the clothes based on the review and title?"
        ". The Reiviewer's age is: <"
        + text_df["age"].fillna("nan").astype(str)
        + ">. The review title is: <"
        + text_df["title"].fillna("nan").astype(str)
        + ">. The review text is: <"
        + text_df["review_text"].fillna("nan").astype(str)
        + ">. Will the reviewer recommend this cloth?"
    )
    return text_df


# Add the prompt column ("text") to both splits; each call returns a new frame,
# so the names are rebound to the result.
train_df = create_text_column(train_df, sep_token=tokenizer.sep_token)
test_df = create_text_column(test_df, sep_token=tokenizer.sep_token)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the sequence axis, ignoring masked positions.

    Args:
        model_output: Token-level embeddings; the caller in this notebook
            passes ``outputs.last_hidden_state``. Axis 1 is reduced, so the
            expected layout is ``(batch, seq_len, hidden)``.
        attention_mask: ``(batch, seq_len)`` tensor that is non-zero for the
            tokens to include in the average.

    Returns:
        ``(batch, hidden)`` tensor of masked means. Floating-point inputs keep
        their dtype; a row whose mask is all zeros yields zeros, not NaN.
    """
    token_embeddings = model_output
    # Accumulate in at least float32: summing a long sequence in float16 can
    # overflow to inf and loses precision.
    acc_dtype = torch.promote_types(token_embeddings.dtype, torch.float32)
    mask = attention_mask.unsqueeze(-1).to(acc_dtype)
    sum_embeddings = (token_embeddings.to(acc_dtype) * mask).sum(dim=1)
    # Clamp so a fully masked row divides by a tiny positive number (0 / eps = 0)
    # rather than by zero.
    sum_mask = mask.sum(dim=1).clamp(min=1e-9)
    pooled = sum_embeddings / sum_mask
    if token_embeddings.is_floating_point():
        # Hand back the caller's dtype (e.g. float16) so downstream code is unchanged.
        pooled = pooled.to(token_embeddings.dtype)
    return pooled


# Embed the train and test prompts with the transformer and collect one array
# per split under embeddings["train"] / embeddings["test"].
embeddings = {}
for key, df in zip(["train", "test"], [train_df, test_df]):
    emb_list = []
    dataset = EmbDataset(df["text"].values, max_length=192, tokenizer=tokenizer)
    data_loader = DataLoader(
        dataset,
        batch_size=128,
        num_workers=0,
        # Row order must be preserved: embeddings are later matched to labels
        # and submission rows purely by position.
        shuffle=False,
    )
    bar = tqdm(data_loader, total=len(data_loader))
    for batch in bar:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # token_type_ids are produced by the dataset but are not passed to the
        # model, so they are not copied to the device.

        with torch.no_grad():
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
            )
            batch_embs = mean_pooling(outputs.last_hidden_state, attention_mask)
        # Move each batch to host memory straight away so only one batch of
        # activations lives on the device at a time.
        emb_list.append(batch_embs.detach().cpu().numpy())
    embeddings[key] = np.concatenate(emb_list)


def create_embedding_df(text_embeddings: np.ndarray) -> pd.DataFrame:
    """Wrap a 2-D embedding matrix in a DataFrame with named columns.

    Args:
        text_embeddings: Array of shape ``(n_rows, n_dims)``.

    Returns:
        DataFrame with one row per input row and columns ``embedding_0`` ...
        ``embedding_{n_dims - 1}``, holding a copy of the data in its original
        dtype.
    """
    text_columns = [f"embedding_{i}" for i in range(text_embeddings.shape[1])]
    # Build the frame directly from the matrix rather than through a per-column
    # dict. copy=True keeps the frame independent of the caller's array.
    return pd.DataFrame(text_embeddings, columns=text_columns, copy=True)


# Turn the per-split embedding arrays into DataFrames with embedding_<i> columns.
train_embedding_df = create_embedding_df(embeddings["train"])
test_embedding_df = create_embedding_df(embeddings["test"])

# Persist the embeddings under ../outputs; index=False keeps the row index out
# of the CSV, so rows are identified by position only.
train_embedding_df.to_csv(f"../outputs/train_embedding_{file_name}.csv", index=False)
test_embedding_df.to_csv(f"../outputs/test_embedding_{file_name}.csv", index=False)

In [8]:
# Reload the embedding tables from the CSV files written under ../outputs.
# NOTE(review): this presumably lets the training cells run without repeating
# the embedding step — confirm.
train_embedding_df = pd.read_csv(f"../outputs/train_embedding_{file_name}.csv")
test_embedding_df = pd.read_csv(f"../outputs/test_embedding_{file_name}.csv")

# Column labels of the embedding table; not referenced again in the cells
# visible in this notebook.
embedding_feature_names = train_embedding_df.columns

# Train

In [10]:
# Visual sanity check of the reloaded training embeddings (pandas truncates the
# displayed rows and columns).
train_embedding_df

Unnamed: 0,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,...,embedding_4086,embedding_4087,embedding_4088,embedding_4089,embedding_4090,embedding_4091,embedding_4092,embedding_4093,embedding_4094,embedding_4095
0,-0.33800,2.717,2.230,-0.70460,0.5537,-3.908,-2.1930,4.504,1.4950,-2.865,...,-2.258,0.47490,0.7007,2.9700,2.658,13.850,3.473,1.1460,-0.5093,2.059
1,-0.93000,3.414,1.290,-1.75000,0.6530,-3.451,-3.4860,4.680,0.7593,-4.562,...,-1.515,0.48300,0.8660,2.1200,1.125,12.360,1.647,2.0020,-2.5820,1.602
2,-0.77000,3.533,1.726,-0.05994,0.5317,-4.043,-4.4800,5.434,1.5150,-2.418,...,-1.033,-1.10400,0.2095,2.2870,2.947,7.250,3.842,1.3080,-1.2250,1.265
3,0.08655,4.010,1.444,-2.12000,1.1880,-3.865,-2.8050,3.880,1.3040,-3.105,...,-2.127,0.01685,1.1250,2.7130,2.357,17.730,1.693,0.8560,-1.2750,2.450
4,0.46600,3.111,1.509,-1.18400,1.4795,-4.980,-1.9070,4.152,1.4380,-2.244,...,-2.033,-0.55800,0.7036,0.2438,1.902,-0.658,1.731,0.9077,-1.2560,2.209
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1.37800,2.254,1.872,-0.07196,0.4004,-4.266,-1.8530,3.572,1.7460,-3.463,...,-2.129,0.39800,0.5810,5.6450,1.859,14.030,3.793,1.1190,-3.7010,1.421
9996,0.53500,3.516,1.463,-1.77000,0.4807,-5.660,-1.2690,4.484,2.3440,-3.994,...,-1.512,-0.17330,0.8335,2.1250,1.623,11.540,4.000,0.8086,-1.8240,1.656
9997,0.20310,2.893,0.806,-1.07200,0.6560,-3.635,-3.0000,4.594,1.7190,-1.063,...,-1.039,-0.27540,0.6850,2.2600,2.383,5.695,3.918,1.0090,-2.3570,0.510
9998,-0.43290,1.993,1.505,-1.59300,0.4797,-5.110,-1.6370,4.203,2.3280,-3.213,...,-1.579,0.31050,0.1473,2.8100,1.652,16.080,2.613,1.6600,-2.1900,1.560


In [11]:
def _positive_class_proba(estimator, features) -> np.ndarray:
    """Return column 1 of ``estimator.predict_proba(features)`` as a NumPy vector.

    The result is converted with ``to_numpy()`` when it offers that method (the
    cuML path used in this notebook) and with ``np.asarray`` otherwise, so the
    scikit-learn ``SVC`` fallback, which returns a plain ndarray, also works.
    Column 1 is presumably the probability of the label ``1`` — verify against
    the estimator's class ordering.
    """
    proba = estimator.predict_proba(features)
    proba = proba.to_numpy() if hasattr(proba, "to_numpy") else np.asarray(proba)
    return proba[:, 1]


# 5-fold stratified CV on the embedding features: collect out-of-fold
# predictions for the training rows and one test prediction vector per fold.
labels = train_df["recommended"].to_numpy()
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
auc_scores = []
oof = np.zeros_like(labels, dtype=float)
test_preds = []

for train_index, valid_index in cv.split(train_df, labels):
    # Embedding rows are aligned with train_df rows by position.
    train_features = train_embedding_df.iloc[train_index].reset_index(drop=True)
    valid_features = train_embedding_df.iloc[valid_index].reset_index(drop=True)

    train_labels = labels[train_index]
    valid_labels = labels[valid_index]

    # NOTE: this rebinds the notebook-level name `model`, which until here
    # referred to the transformer encoder.
    model = SVC(probability=True)
    model.fit(train_features, train_labels)

    valid_pred = _positive_class_proba(model, valid_features)
    oof[valid_index] = valid_pred
    test_pred = _positive_class_proba(model, test_embedding_df)
    test_preds.append(test_pred)

    auc = roc_auc_score(valid_labels, valid_pred)
    auc_scores.append(auc)
    print(f"AUC: {auc}")

# Average the per-fold test predictions into a single vector.
test_preds = np.mean(test_preds, axis=0)
# Use a context manager so the pickle file is flushed and closed deterministically.
with open(f"../outputs/oof{exp}.pkl", "wb") as oof_file:
    pickle.dump(oof, oof_file)
overall_auc = roc_auc_score(labels, oof)
print(f"Overall AUC: {overall_auc: .4f}")

AUC: 0.963326594457147
AUC: 0.9657174526708527
AUC: 0.9658384146341463
AUC: 0.9591353319783197
AUC: 0.9700423441734417
Overall AUC:  0.9648


In [13]:
# Fill the submission template with the fold-averaged test predictions.
# NOTE(review): the assignment is positional, so this assumes the rows of
# sample_submission.csv are in the same order as test.csv — confirm.
sample_submission_df["target"] = test_preds
sample_submission_df

Unnamed: 0,target
0,0.996966
1,0.847690
2,0.991780
3,0.370026
4,0.937510
...,...
11150,0.986862
11151,0.991425
11152,0.996870
11153,0.978660


In [14]:
# Write the submission file for this experiment; index=False keeps the row index out of the CSV.
sample_submission_df.to_csv(f"../outputs/submission_{exp}.csv", index=False)