In [None]:
import csv
from typing import List

import numpy as np
import torch
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer
from xgboost.sklearn import XGBClassifier

# Directory handed to ``from_pretrained`` as ``cache_dir`` when the tokenizer
# and model are loaded.
cache = "/kaggle/working/hf_model"

class TransformerExtractor:
    """Turn raw strings into fixed-size feature vectors with a pretrained model.

    Each text is tokenized and pushed through the model on its own; the
    model's first output is averaged along axis 1 to give one vector per
    text.
    """

    def __init__(self, name="distilbert-base-uncased"):
        """Load the tokenizer and model identified by ``name``.

        Args:
            name: Identifier passed to ``from_pretrained``. Downloaded files
                are cached under the module-level ``cache`` directory.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(name, cache_dir=cache)
        self.model = AutoModel.from_pretrained(name, cache_dir=cache)

    def extract(self, texts: List[str]) -> np.ndarray:
        """Compute one mean-pooled feature vector per input text.

        Args:
            texts: Strings to embed. An empty list yields an array with zero
                rows.

        Returns:
            A ``float16`` array of shape ``(len(texts), hidden_size)``, where
            ``hidden_size`` is read from the loaded model's config.
        """
        # Take the width from the model instead of assuming 768, so that any
        # ``name`` given to the constructor produces a correctly sized array.
        hidden_size = self.model.config.hidden_size
        feats = np.zeros((len(texts), hidden_size), dtype=np.float16)

        # Inference only: skip building the autograd graph for every text.
        with torch.no_grad():
            for itt, text in enumerate(tqdm(texts)):
                # truncation=True keeps over-long texts within the model's
                # maximum input length instead of failing in the forward pass.
                tokenized_text = self.tokenizer(
                    text, return_tensors="pt", truncation=True
                )
                # Index 0 is presumably the last hidden state with shape
                # (1, tokens, hidden) -- TODO confirm for non-BERT-style models.
                model_output = self.model(**tokenized_text)[0].cpu()
                # Averaging over axis 1 leaves a (1, hidden) array, which
                # NumPy broadcasts into the row.
                feats[itt, :] = model_output.numpy().mean(axis=1)
        return feats


def read_csvfile_to_rows(filename: str) -> List[List[str]]:
    """Read a comma-separated file and return all of its rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        Every row of the file, in order and including any header row, as a
        list of string fields. An empty file yields an empty list.
    """
    # newline="" disables universal-newline translation so the csv module
    # sees the raw line endings; otherwise "\r\n" embedded in a quoted field
    # would be silently rewritten to "\n".
    with open(filename, newline="") as f:
        csvreader = csv.reader(f, delimiter=",", quotechar='"')
        return list(csvreader)


def write_results_file(ids, preds, filename) -> None:
    """Write predictions to a two-column CSV file.

    The file starts with the header ``PassengerId,Survived`` followed by one
    ``id,prediction`` row per pair.

    Args:
        ids: Iterable of identifiers, one per prediction.
        preds: Iterable of predictions, paired with ``ids`` by position. If
            the two differ in length, the extra items of the longer one are
            ignored (``zip`` semantics).
        filename: Path of the file to create or overwrite.
    """
    # newline="" leaves line termination to the csv writer; without it,
    # platforms that translate "\n" on write end up with "\r\r\n" rows.
    with open(filename, "w", newline="") as to_file:
        csvwriter = csv.writer(to_file, delimiter=",", quotechar='"')
        csvwriter.writerow(["PassengerId", "Survived"])
        csvwriter.writerows(zip(ids, preds))


def titanic_using_transformers(
    train_path: str = "/kaggle/input/titanic/train.csv",
    test_path: str = "/kaggle/input/titanic/test.csv",
    output_path: str = "submission.csv",
) -> None:
    """Train on transformer embeddings of CSV rows and write test predictions.

    Every data row is flattened into a single string (its columns joined
    with ``", "``), embedded with ``TransformerExtractor``, standardized and
    fed to an XGBoost classifier.

    Args:
        train_path: Training CSV. The first row is skipped as a header;
            column 1 is parsed as the integer label and columns 2 onward
            form the text.
        test_path: Test CSV. The first row is skipped as a header; column 0
            is the id written to the output and columns 1 onward form the
            text.
        output_path: Destination of the ``PassengerId,Survived`` results
            file.
    """
    extractor = TransformerExtractor()
    scaler = StandardScaler()
    classifier = XGBClassifier(use_label_encoder=False)

    # Training: fit the scaler and the classifier on the embedded rows.
    rows = read_csvfile_to_rows(train_path)
    train_labels = [int(row[1]) for row in rows[1:]]
    texts = [", ".join(row[2:]) for row in rows[1:]]
    train_features = scaler.fit_transform(extractor.extract(texts))
    classifier.fit(train_features, train_labels)

    # Inference: reuse the scaler fitted on the training features so both
    # sets are standardized with the same statistics.
    rows = read_csvfile_to_rows(test_path)
    ids = [row[0] for row in rows[1:]]
    texts = [", ".join(row[1:]) for row in rows[1:]]
    preds = classifier.predict(scaler.transform(extractor.extract(texts)))
    write_results_file(ids, preds, output_path)


# Run the full train/predict/write pipeline when this cell or script executes.
titanic_using_transformers()