In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install tez

import tez
import torch
import torch.nn as nn
import transformers
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score

In [None]:
# Use the first CUDA device when torch reports one is available, else the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda:0")
else:
    DEVICE = torch.device("cpu")

In [None]:
class BertDataset:
    """Index-able dataset that tokenizes one text per item for BERT.

    Args:
        texts: Indexable collection of texts; each item is passed through
            ``str()`` before tokenization.
        targets: Indexable collection of integer class labels, aligned with
            ``texts``.
        max_len: Every item is padded or truncated to this many tokens.

    Each item is a dict with the keys ``ids``, ``mask``, ``token_type_ids``
    and ``targets``, all ``torch.long`` tensors.
    """

    def __init__(self, texts, targets, max_len = 64):
        self.texts = texts
        self.targets = targets
        # The "uncased" checkpoint is meant to be fed lower-cased text, so the
        # tokenizer must lower-case its input; with do_lower_case=False cased
        # words would not be matched against the vocabulary as intended.
        self.tokenizer = transformers.BertTokenizer.from_pretrained(
            "bert-base-uncased",
            do_lower_case = True
            )
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return it with its label as tensors."""
        text = str(self.texts[idx])
        # Single-sentence input (no text pair), padded/truncated to max_len.
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens = True,
            max_length = self.max_len,
            padding = 'max_length',
            truncation = True
        )
        resp = {
            'ids': torch.tensor(inputs['input_ids'], dtype = torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype = torch.long),
            'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype = torch.long),
            'targets': torch.tensor(self.targets[idx], dtype = torch.long),
        }
        return resp

In [None]:
class TextModel(tez.Model):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        num_classes: Number of output classes (width of the final layer).
        num_train_steps: Total number of optimizer steps, used as the length
            of the linear learning-rate schedule.
    """

    def __init__(self, num_classes, num_train_steps):
        super().__init__()
        # return_dict=False makes the encoder return a plain tuple, which
        # forward() unpacks positionally.
        self.bert = transformers.BertModel.from_pretrained(
            'bert-base-uncased', return_dict = False)
        self.bert_drop = nn.Dropout(0.3)
        # Size the head from the encoder's own config instead of hard-coding
        # 768, so the layer stays correct if the checkpoint is swapped.
        self.out = nn.Linear(self.bert.config.hidden_size, num_classes)
        self.num_train_steps = num_train_steps
        # Read by tez; presumably steps the scheduler after every batch
        # rather than every epoch -- confirm against the tez version in use.
        self.step_scheduler_after = 'batch'

    def fetch_optimizer(self):
        """Return an AdamW optimizer over all parameters with lr 3e-5."""
        opt = AdamW(self.parameters(), lr = 3e-5)
        return opt

    def fetch_scheduler(self):
        """Return a linear-decay schedule without warmup over num_train_steps.

        Relies on ``self.optimizer`` already being set (presumably by tez
        after ``fetch_optimizer`` -- confirm).
        """
        sch = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps = 0,
            num_training_steps = self.num_train_steps)
        return sch

    def loss(self, outputs, targets):
        """Cross-entropy between the class scores and the integer targets."""
        return nn.CrossEntropyLoss()(outputs, targets)

    def monitor_metrics(self, outputs, targets):
        """Return ``{'accuracy': ...}`` for the arg-max class of each row."""
        predicted = torch.argmax(outputs, dim = 1).cpu().detach().numpy()
        actual = targets.cpu().detach().numpy()

        # accuracy_score's signature is (y_true, y_pred).
        return {'accuracy' : accuracy_score(actual, predicted)}

    def forward(self, ids, mask, token_type_ids, targets = None):
        """Run the encoder and head.

        Returns:
            ``(scores, loss, metrics)`` when ``targets`` is given, otherwise
            ``(scores, 0, {})``.
        """
        # Only the second element of the encoder's output tuple is used
        # (the pooled output, per the Hugging Face BertModel documentation).
        _, x = self.bert(ids, attention_mask = mask, token_type_ids = token_type_ids)
        x = self.bert_drop(x)
        x = self.out(x)
        if targets is not None:
            loss = self.loss(x, targets)
            met = self.monitor_metrics(x, targets)
            return x, loss, met
        return x, 0, {}

In [None]:
def train_model(fold, df, train_idx, val_idx):
    """Train one fold with early stopping and predict on every row of ``df``.

    Args:
        fold: Fold number; currently unused inside the function.
        df: DataFrame with ``Review`` and ``Rating`` columns.
        train_idx: Positional (``iloc``) row indices of ``df`` to train on.
        val_idx: Positional (``iloc``) row indices of ``df`` to validate on.

    Returns:
        A ``pd.Series`` with one entry per item produced by iterating
        ``model.predict`` over the whole of ``df`` (presumably one array of
        class scores per batch -- confirm against the tez version in use).

    Reads the module-level ``TRAIN_BS``, ``EPOCHS`` and ``DEVICE``, and writes
    the best checkpoint to ``model.bin`` (overwritten on each call).
    """
    df_train = df.iloc[train_idx, :].reset_index(drop = True)
    df_val = df.iloc[val_idx, :].reset_index(drop = True)

    train_dataset = BertDataset(df_train.Review.values, df_train.Rating.values)
    val_dataset = BertDataset(df_val.Review.values, df_val.Rating.values)
    whole_dataset = BertDataset(df.Review.values, df.Rating.values)

    n_train_steps = int(len(df_train) / TRAIN_BS * EPOCHS)
    model = TextModel(num_classes = 5,
                      num_train_steps = n_train_steps)

    es = tez.callbacks.EarlyStopping(monitor = 'valid_loss', patience = 1, model_path = "model.bin")
    # Use the same EPOCHS / TRAIN_BS that sized the scheduler above; the
    # previously hard-coded 200 / 32 would silently drift out of sync with
    # the schedule if either constant were changed.
    model.fit(
        train_dataset,
        valid_dataset = val_dataset,
        device = DEVICE,
        epochs = EPOCHS,
        train_bs = TRAIN_BS,
        callbacks = [es])
    # Reload the checkpoint written by the early-stopping callback before
    # predicting, rather than using the weights from the last epoch run.
    model.load('model.bin', device = DEVICE)
    pred = model.predict(whole_dataset, device = DEVICE)
    return pd.Series(list(pred))
    

In [None]:
# Load the reviews and shift every Rating down by one.
df = pd.read_csv(
    '../input/trip-advisor-hotel-reviews/tripadvisor_hotel_reviews.csv'
)
df['Rating'] = df['Rating'] - 1

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the rows as a test set; the split is seeded for repeatability.
train, test = train_test_split(
    df,
    test_size = 0.1,
    train_size = 0.9,
    random_state = 0,
)

In [None]:
# Batch size and epoch count; train_model reads both when it computes the
# total number of scheduler steps.
TRAIN_BS, EPOCHS = 32, 200
from sklearn.model_selection import StratifiedKFold
# Five shuffled, label-stratified folds with a fixed seed.
skf = StratifiedKFold(
    n_splits = 5,
    shuffle = True,
    random_state = 0,
)

In [None]:
# Train one model per fold and keep each fold's predictions over all of df.
df['pred'] = 0
pred_folds = {}

# skf.split(train) yields positions relative to `train`, but train_model
# applies them to `df` with .iloc. Translate them into positions within `df`
# first; otherwise the folds are drawn from the wrong rows and held-out test
# rows leak into training.
train_positions = df.index.get_indexer(train.index)
if (train_positions < 0).any():
    raise ValueError("train contains rows whose index is not present in df")

for i, (train_idx, val_idx) in enumerate(skf.split(X = train, y = train.Rating)):
    pred = train_model(
        fold = i,
        df = df,
        train_idx = train_positions[train_idx],
        val_idx = train_positions[val_idx])
    pred_folds[i] = pred

In [None]:
# Sum the per-fold class scores for every row of df, then take the arg-max
# class as the ensemble prediction.
prob_preds = np.zeros((len(df), 5))
for fold_batches in pred_folds.values():
    # Each fold holds a sequence of per-batch score arrays; stacking them
    # yields one (rows, classes) matrix without assuming a batch size or a
    # batch count.
    fold_scores = np.concatenate(
        [np.asarray(batch).reshape(-1, prob_preds.shape[1]) for batch in fold_batches],
        axis = 0)
    # Fail loudly on a mismatch instead of silently dropping rows.
    if fold_scores.shape[0] != prob_preds.shape[0]:
        raise ValueError(
            f"fold produced {fold_scores.shape[0]} predictions, expected {prob_preds.shape[0]}")
    prob_preds += fold_scores
preds = pd.DataFrame(np.argmax(prob_preds, axis = 1))

In [None]:
# Report ensemble accuracy on the training rows and on the held-out rows.
for label, split in (('Train CV accuracy:', train), ('Test accuracy:', test)):
    print(label, accuracy_score(split.Rating, preds.iloc[split.index, :]))