# Install library

In [1]:
!pip install transformers torchmetrics

[0m

## Import library

In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torchvision
import torchmetrics
import albumentations
import albumentations.pytorch

from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader
from PIL import Image

## Config path
**Note**: The test file is **self-labelled** and is used only for verification purposes. We **do not** use the test file in the training process.

In [3]:
# Kaggle input datasets: one holds the segmented text CSVs, the other the resized images.
_TEXT_DATASET_ROOT = '/kaggle/input/machine-learning-preproc'
_IMAGE_DATASET_ROOT = '/kaggle/input/int3405-ml-preprocess-image-data-v2'

DATASET_TRAIN_TEXT_FILE = f'{_TEXT_DATASET_ROOT}/train_merged_segmented_v2.csv'
DATASET_TRAIN_IMAGE_FOLDER = f'{_IMAGE_DATASET_ROOT}/image_train_resized'
DATASET_TEST_TEXT_FILE = f'{_TEXT_DATASET_ROOT}/test_labelled_segmented.csv'
DATASET_TEST_IMAGE_FOLDER = f'{_IMAGE_DATASET_ROOT}/image_test_resized'

## Import PhoBERT tokenizer

In [4]:
# Slow (pure-Python) tokenizer for the checkpoint that the model cell also loads.
PHOBERT_CHECKPOINT = "wonrax/phobert-base-vietnamese-sentiment"
tokenizer = AutoTokenizer.from_pretrained(PHOBERT_CHECKPOINT, use_fast=False)

Downloading:   0%|          | 0.00/285 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/874k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/17.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

## Import dataset

In [5]:
# Read the training table once, then pull out the columns used by the sample-building loop:
# X = review comments, X_rev = review ids (used as image file names), Y = ratings.
df = pd.read_csv(DATASET_TRAIN_TEXT_FILE)
X, X_rev = (df[column].tolist() for column in ("Comment", "RevId"))
Y = df["Rating"].to_numpy()

In [6]:
# Preprocessing applied to every image before it is stored in a sample tuple.
# NOTE(review): with mean 0 / std 1 / max_pixel_value 255 the Normalize step presumably
# just rescales pixels to [0, 1]; confirm against the albumentations documentation.
_image_steps = [
    albumentations.Normalize(mean=[0.0], std=[1.0], max_pixel_value=255.0),
    albumentations.pytorch.ToTensorV2(),
]
image_transform = albumentations.Compose(_image_steps)

In [7]:
# Build the in-memory training set: one (input_ids, attention_mask, image, label) tuple
# per review whose image can be read and whose comment can be tokenised.
X_train = []
skipped_train = []  # (row index, RevId, error) for every sample that could not be built
for idx in tqdm(range(len(X))):
    try:
        # The context manager closes the file even if decoding the pixels fails.
        with Image.open(f"{DATASET_TRAIN_IMAGE_FOLDER}/{str(X_rev[idx])}.jpg") as im:
            image = np.array(im, dtype=np.float32)
        encoding = tokenizer.encode_plus(
            X[idx],
            truncation=True,
            add_special_tokens=True,
            max_length=140,
            padding='max_length',
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt',
        )
        # return_tensors='pt' yields a leading batch axis; flatten it away per sample.
        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()
        image = image_transform(image=image)['image']
        X_train.append((input_ids, attention_mask, image, Y[idx]))
    except Exception as err:
        # Still best-effort (a bad sample is skipped), but unlike a bare `except:` this
        # lets KeyboardInterrupt/SystemExit through and keeps a record of what was lost.
        skipped_train.append((idx, X_rev[idx], repr(err)))

if skipped_train:
    print(f"Skipped {len(skipped_train)} of {len(X)} training samples; "
          f"first error: {skipped_train[0][2]}")

train_loader = DataLoader(X_train, batch_size=64, shuffle=True)

100%|██████████| 12041/12041 [01:23<00:00, 144.82it/s]


In [8]:
# Read the self-labelled test table and pull out ids, comments (two independent
# list copies) and ratings.
test_d = pd.read_csv(DATASET_TEST_TEXT_FILE)
Rev_d = test_d["RevId"].tolist()
_test_comments = test_d["Comment"]
X_d, X_d_clone = _test_comments.tolist(), _test_comments.tolist()
Y_d = test_d["Rating"].to_numpy()

In [9]:
# Build the in-memory test set: one (input_ids, attention_mask, image, label) tuple
# per review whose image can be read and whose comment can be tokenised.
X_test = []
skipped_test = []  # (row index, RevId, error) for every sample that could not be built
for idx in tqdm(range(len(X_d))):
    try:
        # The context manager closes the file even if decoding the pixels fails.
        with Image.open(f"{DATASET_TEST_IMAGE_FOLDER}/{str(Rev_d[idx])}.jpg") as im:
            image = np.array(im, dtype=np.float32)
        encoding = tokenizer.encode_plus(
            X_d[idx],
            truncation=True,
            add_special_tokens=True,
            max_length=140,
            padding='max_length',
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt',
        )
        # return_tensors='pt' yields a leading batch axis; flatten it away per sample.
        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()
        image = image_transform(image=image)['image']
        X_test.append((input_ids, attention_mask, image, Y_d[idx]))
    except Exception as err:
        # Still best-effort, but KeyboardInterrupt/SystemExit propagate and the loss is recorded.
        skipped_test.append((idx, Rev_d[idx], repr(err)))

if skipped_test:
    print(f"Skipped {len(skipped_test)} of {len(X_d)} test samples; "
          f"first error: {skipped_test[0][2]}")
    # Later cells pair predictions with Y_d / Rev_d purely by position, so rows that
    # produced no sample must be removed from those sequences as well.
    # X_d_clone is intentionally left as the unfiltered original.
    dropped_rows = {row for row, _, _ in skipped_test}
    kept_rows = [row for row in range(len(X_d)) if row not in dropped_rows]
    Rev_d = [Rev_d[row] for row in kept_rows]
    X_d = [X_d[row] for row in kept_rows]
    Y_d = Y_d[np.asarray(kept_rows, dtype=int)]

# shuffle=False keeps prediction order identical to Y_d / Rev_d order.
test_loader = DataLoader(X_test, batch_size=64, shuffle=False)

100%|██████████| 5103/5103 [00:38<00:00, 131.82it/s]


## Build Metrics

In [10]:
class Metrics():
    """Running averages of the loss and of four binary-classification scores.

    Every ``compute_*`` call folds one new per-batch value into the stored mean.
    Each batch carries the same weight regardless of how many samples it holds,
    so the stored numbers are means of per-batch scores.
    """

    def __init__(self):
        # All running means start at zero; the first update (idx == 0) overwrites them.
        self.loss = self.precision = self.recall = self.f1 = self.accuracy = 0.0

    def avg_compute_fn(self, previous_metric_value, added_value, length):
        """Return the mean of ``length + 1`` values.

        ``previous_metric_value`` is the mean of the first ``length`` values and
        ``added_value`` is the new one.
        """
        running_total = previous_metric_value * length + added_value
        return running_total / (length+1)

    def compute_loss(self, loss, idx):
        """Fold the loss of batch number ``idx`` (0-based) into ``self.loss``."""
        self.loss = self.avg_compute_fn(self.loss, loss, idx)

    def compute_metrics(self, y_pred, y_true, idx):
        """Fold precision, recall, F1 and accuracy of batch ``idx`` into the running means.

        ``y_pred`` and ``y_true`` are passed unchanged to the torchmetrics functional
        ``binary_*`` scorers.
        """
        scorers = torchmetrics.functional.classification
        per_batch = {
            "precision": scorers.binary_precision,
            "recall": scorers.binary_recall,
            "f1": scorers.binary_f1_score,
            "accuracy": scorers.binary_accuracy,
        }
        for attr, scorer in per_batch.items():
            batch_value = scorer(y_pred, y_true).item()
            setattr(self, attr, self.avg_compute_fn(getattr(self, attr), batch_value, idx))

    def print(self, type):
        """Print every running mean on one line, prefixed with ``[type]``."""
        fields = [
            f"[{type}] Loss: {self.loss:.4f}",
            f"Precision: {self.precision:.4f}",
            f"Recall: {self.recall:.4f}",
            f"F1: {self.f1:.4f}",
            f"Accuracy: {self.accuracy:.4f}",
        ]
        print(",   ".join(fields))

## Build Utilities

In [11]:
from sklearn.metrics import roc_auc_score

def test_roc_auc(result, threshold, y_true=None):
    """Binarise predicted scores at ``threshold`` and return their ROC AUC.

    Args:
        result: scores with one single-element row per sample (a list of
            one-element lists or an ``(n, 1)`` array). It is never modified.
        threshold: scores ``>= threshold`` become 1, all others 0.
        y_true: ground-truth labels in the same order as ``result``. Defaults to
            the notebook-level ``Y_d`` so existing two-argument calls keep working.

    Returns:
        ``roc_auc_score`` of the thresholded 0/1 predictions against ``y_true``.
    """
    scores = np.asarray(result, dtype=float).squeeze(1)
    # A single comparison builds a fresh array: the caller's data is left untouched
    # (the previous in-place masks wrote through to an ndarray argument) and the
    # result no longer depends on the order of two successive overwrites.
    hard_labels = (scores >= threshold).astype(float)
    if y_true is None:
        y_true = Y_d
    return roc_auc_score(y_true, hard_labels)

def search_threshold(result):
    """Scan cut-offs 0.000, 0.001, ... below 1.0 and return the one with the best ``test_roc_auc``.

    Only a strictly better score replaces the current best, so on ties the lowest
    cut-off wins; if no cut-off scores above 0.0 the function returns 0.0.
    """
    top_score, top_cutoff = 0.0, 0.0
    for cutoff in np.arange(0.0, 1.0, 0.001):
        score = test_roc_auc(result, cutoff)
        if not score > top_score:
            continue
        top_score, top_cutoff = score, cutoff
    return top_cutoff

In [12]:
from sklearn.metrics import roc_auc_score
def fit(model, train_data, valid_data, loss_fn, n_epoch, optimizer):
    """Train ``model`` and return the predictions of its best epoch.

    After every epoch the model is evaluated on ``valid_data``; the scores are
    thresholded and ranked with ``test_roc_auc`` / ``search_threshold``.

    Note: those helpers compare against the notebook-level ``Y_d`` labels, so
    ``valid_data`` must yield the samples in exactly that order (no shuffling).

    Args:
        model: module called as ``model(input_ids, attention_mask, images)``.
        train_data: iterable of ``(input_ids, attention_mask, images, labels)`` batches.
        valid_data: iterable of batches with the same layout, used for evaluation.
        loss_fn: callable taking ``(predictions, float32 labels)``.
        n_epoch: number of passes over ``train_data``.
        optimizer: optimizer already bound to ``model``'s parameters.

    Returns:
        ``(best_result, best_threshold)``: the validation scores of the epoch with
        the highest thresholded ROC AUC and the threshold that achieved it.
        ``([], 0.0)`` if no epoch scored above 0.
    """
    # Follow the device the model already lives on instead of hard-coding CUDA,
    # so the same loop also runs on CPU.
    device = next(model.parameters()).device

    best_result = []
    best_roc = 0.0
    best_threshold = 0.0
    for epoch in range(n_epoch):
        loop = tqdm(train_data)
        metrics = Metrics()
        model.train()

        for idx, (input1, input2, images, labels) in enumerate(loop):
            input1 = input1.to(device)
            input2 = input2.to(device)
            images = images.to(device)
            # Labels arrive as a flat batch; add an axis to match the (batch, 1) output.
            labels = labels.unsqueeze(1).to(device)
            predict = model(input1, input2, images)
            loss = loss_fn(predict, labels.to(torch.float32))
            metrics.compute_loss(loss.item(), idx)
            # Metrics are for reporting only, so they do not need the autograd graph.
            metrics.compute_metrics(predict.detach(), labels.int(), idx)
            loop.set_postfix(loss=metrics.loss)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print(f"[Epoch] {epoch}")
        metrics.print("Train")

        with torch.no_grad():
            result = []
            metrics = Metrics()
            model.eval()
            for idx, (input1, input2, images, labels) in enumerate(valid_data):
                input1 = input1.to(device)
                input2 = input2.to(device)
                images = images.to(device)
                labels = labels.unsqueeze(1).to(device)
                predict = model(input1, input2, images)
                # Keep raw scores in loader order for the threshold search below.
                result.extend(predict.tolist())
                loss = loss_fn(predict, labels.to(torch.float32))
                metrics.compute_loss(loss.item(), idx)
                metrics.compute_metrics(predict, labels.int(), idx)
            metrics.print("Valid")

        roc_auc = test_roc_auc(result, 0.6)
        print("roc_auc_with_[threshold=0.6]=", roc_auc)
        current_best_threshold = search_threshold(result)
        current_best_roc_auc = test_roc_auc(result, current_best_threshold)
        print("found_best_rou_auc_threshold_at=", current_best_threshold, ", best_roc_auc=", current_best_roc_auc)

        # Remember the scores and threshold of the best epoch seen so far.
        if current_best_roc_auc > best_roc:
            best_roc = current_best_roc_auc
            best_threshold = current_best_threshold
            best_result = list(result)
    return (best_result, best_threshold)

## Build Model

In [13]:
class Classifier(nn.Module):
    """Two-branch binary classifier that fuses a text encoder and an image network.

    Each branch is projected to 256 features; the two projections are concatenated
    (512 features) and mapped to one sigmoid output per sample.
    """

    def __init__(self):
        super().__init__()

        # Text branch: pretrained encoder followed by a 256-d projection.
        self.bert = AutoModel.from_pretrained("wonrax/phobert-base-vietnamese-sentiment")
        text_width = self.bert.config.hidden_size
        self.bert_clf = nn.Sequential(nn.Dropout(p=0.1), nn.Linear(text_width, 256))

        # Image branch. NOTE: despite the attribute name this is VGG-11 with batch norm;
        # the name is kept so existing references and checkpoints stay valid.
        self.resnet = torchvision.models.vgg11_bn(pretrained=True)
        # 1000 input features: presumably the size of the torchvision model's output.
        self.resnet_clf = nn.Sequential(nn.Dropout(p=0.1), nn.Linear(1000, 256))

        # Fusion head over the concatenated 256 + 256 features.
        # NOTE(review): no activation sits between the two Linear layers.
        fusion_layers = [
            nn.Dropout(p=0.2),
            nn.Linear(512, 256),
            nn.Dropout(p=0.25),
            nn.Linear(256, 1),
            nn.Sigmoid(),
        ]
        self.final_clf = nn.Sequential(*fusion_layers)

    def forward(self, input_ids, attention_mask, input_img):
        """Return the sigmoid output of the fusion head for a batch of (text, image) pairs."""
        # With return_dict=False the encoder returns a tuple; only its second item is used.
        _, pooled = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)
        text_features = self.bert_clf(pooled)

        image_features = self.resnet_clf(self.resnet(input_img))

        fused = torch.cat([text_features, image_features], dim=1)
        return self.final_clf(fused)

## Train model

In [14]:
# Hyper-parameters of the baseline run.
LEARNING_RATE = 2e-5
N_EPOCH = 25

# Module.cuda() returns the module itself, so construction and transfer can be chained.
model_baseline = Classifier().cuda()
loss_fn_baseline = nn.BCELoss()
optimizer = torch.optim.Adam(model_baseline.parameters(), lr=LEARNING_RATE)
result, threshold = fit(
    model=model_baseline,
    train_data=train_loader,
    valid_data=test_loader,
    loss_fn=loss_fn_baseline,
    n_epoch=N_EPOCH,
    optimizer=optimizer,
)

Downloading:   0%|          | 0.00/999 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/515M [00:00<?, ?B/s]

Some weights of the model checkpoint at wonrax/phobert-base-vietnamese-sentiment were not used when initializing RobertaModel: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at wonrax/phobert-base-vietnamese-sentiment and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

  0%|          | 0.00/507M [00:00<?, ?B/s]

100%|██████████| 189/189 [02:43<00:00,  1.16it/s, loss=0.379]


[Epoch] 0
[Train] Loss: 0.3792,   Precision: 0.8571,   Recall: 0.9055,   F1: 0.8779,   Accuracy: 0.8459
[Valid] Loss: 0.2387,   Precision: 0.9586,   Recall: 0.9329,   F1: 0.9450,   Accuracy: 0.9160
roc_auc_with_[threshold=0.6]= 0.8922022119773808
found_best_rou_auc_threshold_at= 0.658 , best_roc_auc= 0.8950928848817932


100%|██████████| 189/189 [02:37<00:00,  1.20it/s, loss=0.271]


[Epoch] 1
[Train] Loss: 0.2705,   Precision: 0.9036,   Recall: 0.9403,   F1: 0.9203,   Accuracy: 0.9011
[Valid] Loss: 0.1909,   Precision: 0.9669,   Recall: 0.9472,   F1: 0.9565,   Accuracy: 0.9335
roc_auc_with_[threshold=0.6]= 0.9189124066041379
found_best_rou_auc_threshold_at= 0.614 , best_roc_auc= 0.9192317125792019


100%|██████████| 189/189 [02:37<00:00,  1.20it/s, loss=0.158]


[Epoch] 2
[Train] Loss: 0.1578,   Precision: 0.9449,   Recall: 0.9632,   F1: 0.9530,   Accuracy: 0.9428
[Valid] Loss: 0.1503,   Precision: 0.9882,   Recall: 0.9511,   F1: 0.9690,   Accuracy: 0.9529
roc_auc_with_[threshold=0.6]= 0.9524651965571276
found_best_rou_auc_threshold_at= 0.429 , best_roc_auc= 0.9560386528285605


100%|██████████| 189/189 [02:37<00:00,  1.20it/s, loss=0.0479]


[Epoch] 3
[Train] Loss: 0.0479,   Precision: 0.9853,   Recall: 0.9903,   F1: 0.9876,   Accuracy: 0.9850
[Valid] Loss: 0.1653,   Precision: 0.9935,   Recall: 0.9542,   F1: 0.9732,   Accuracy: 0.9592
roc_auc_with_[threshold=0.6]= 0.9638598323984284
found_best_rou_auc_threshold_at= 0.28200000000000003 , best_roc_auc= 0.9683320464197307


100%|██████████| 189/189 [02:37<00:00,  1.20it/s, loss=0.0231]


[Epoch] 4
[Train] Loss: 0.0231,   Precision: 0.9958,   Recall: 0.9955,   F1: 0.9956,   Accuracy: 0.9946
[Valid] Loss: 0.2534,   Precision: 0.9952,   Recall: 0.9373,   F1: 0.9650,   Accuracy: 0.9472
roc_auc_with_[threshold=0.6]= 0.9574689437467355
found_best_rou_auc_threshold_at= 0.053 , best_roc_auc= 0.970604773692458


100%|██████████| 189/189 [02:37<00:00,  1.20it/s, loss=0.0129]


[Epoch] 5
[Train] Loss: 0.0129,   Precision: 0.9972,   Recall: 0.9972,   F1: 0.9972,   Accuracy: 0.9966
[Valid] Loss: 0.2581,   Precision: 0.9946,   Recall: 0.9401,   F1: 0.9663,   Accuracy: 0.9492
roc_auc_with_[threshold=0.6]= 0.9588879930960871
found_best_rou_auc_threshold_at= 0.065 , best_roc_auc= 0.9704554538641473


100%|██████████| 189/189 [02:37<00:00,  1.20it/s, loss=0.0131] 


[Epoch] 6
[Train] Loss: 0.0131,   Precision: 0.9979,   Recall: 0.9970,   F1: 0.9974,   Accuracy: 0.9965
[Valid] Loss: 0.1879,   Precision: 0.9918,   Recall: 0.9651,   F1: 0.9781,   Accuracy: 0.9665
roc_auc_with_[threshold=0.6]= 0.9671731428701201
found_best_rou_auc_threshold_at= 0.136 , best_roc_auc= 0.9705766129947994


100%|██████████| 189/189 [02:37<00:00,  1.20it/s, loss=0.0253]


[Epoch] 7
[Train] Loss: 0.0253,   Precision: 0.9928,   Recall: 0.9949,   F1: 0.9936,   Accuracy: 0.9927
[Valid] Loss: 0.3005,   Precision: 0.9949,   Recall: 0.9368,   F1: 0.9645,   Accuracy: 0.9465
roc_auc_with_[threshold=0.6]= 0.9567645856516703
found_best_rou_auc_threshold_at= 0.029 , best_roc_auc= 0.9700356550768741


100%|██████████| 189/189 [02:37<00:00,  1.20it/s, loss=0.0239]


[Epoch] 8
[Train] Loss: 0.0239,   Precision: 0.9946,   Recall: 0.9931,   F1: 0.9936,   Accuracy: 0.9924
[Valid] Loss: 0.2735,   Precision: 0.9951,   Recall: 0.9354,   F1: 0.9640,   Accuracy: 0.9460
roc_auc_with_[threshold=0.6]= 0.9580483955215406
found_best_rou_auc_threshold_at= 0.028 , best_roc_auc= 0.9735226988849273


100%|██████████| 189/189 [02:37<00:00,  1.20it/s, loss=0.00515]


[Epoch] 9
[Train] Loss: 0.0052,   Precision: 0.9987,   Recall: 0.9992,   F1: 0.9990,   Accuracy: 0.9987
[Valid] Loss: 0.2761,   Precision: 0.9952,   Recall: 0.9373,   F1: 0.9651,   Accuracy: 0.9475
roc_auc_with_[threshold=0.6]= 0.9572191311062158
found_best_rou_auc_threshold_at= 0.026000000000000002 , best_roc_auc= 0.9731930597506415


100%|██████████| 189/189 [02:37<00:00,  1.20it/s, loss=0.00514]


[Epoch] 10
[Train] Loss: 0.0051,   Precision: 0.9991,   Recall: 0.9989,   F1: 0.9990,   Accuracy: 0.9988
[Valid] Loss: 0.3555,   Precision: 0.9951,   Recall: 0.9353,   F1: 0.9639,   Accuracy: 0.9458
roc_auc_with_[threshold=0.6]= 0.9566744259986829
found_best_rou_auc_threshold_at= 0.022 , best_roc_auc= 0.9703897077192105


100%|██████████| 189/189 [02:37<00:00,  1.20it/s, loss=0.00451]


[Epoch] 11
[Train] Loss: 0.0045,   Precision: 0.9988,   Recall: 0.9992,   F1: 0.9990,   Accuracy: 0.9988
[Valid] Loss: 0.2663,   Precision: 0.9947,   Recall: 0.9480,   F1: 0.9705,   Accuracy: 0.9554
roc_auc_with_[threshold=0.6]= 0.9633846206254401
found_best_rou_auc_threshold_at= 0.014 , best_roc_auc= 0.9725093679740194


100%|██████████| 189/189 [02:37<00:00,  1.20it/s, loss=0.00399]


[Epoch] 12
[Train] Loss: 0.0040,   Precision: 0.9986,   Recall: 0.9988,   F1: 0.9986,   Accuracy: 0.9983
[Valid] Loss: 0.3283,   Precision: 0.9951,   Recall: 0.9300,   F1: 0.9611,   Accuracy: 0.9417
roc_auc_with_[threshold=0.6]= 0.9555051892898508
found_best_rou_auc_threshold_at= 0.006 , best_roc_auc= 0.974532282606227


100%|██████████| 189/189 [02:37<00:00,  1.20it/s, loss=0.00608]


[Epoch] 13
[Train] Loss: 0.0061,   Precision: 0.9978,   Recall: 0.9988,   F1: 0.9982,   Accuracy: 0.9978
[Valid] Loss: 0.3121,   Precision: 0.9949,   Recall: 0.9337,   F1: 0.9631,   Accuracy: 0.9445
roc_auc_with_[threshold=0.6]= 0.9549708173415393
found_best_rou_auc_threshold_at= 0.018000000000000002 , best_roc_auc= 0.971329798105966


100%|██████████| 189/189 [02:37<00:00,  1.20it/s, loss=0.00801]


[Epoch] 14
[Train] Loss: 0.0080,   Precision: 0.9989,   Recall: 0.9977,   F1: 0.9983,   Accuracy: 0.9979
[Valid] Loss: 0.2858,   Precision: 0.9952,   Recall: 0.9404,   F1: 0.9668,   Accuracy: 0.9498
roc_auc_with_[threshold=0.6]= 0.9580934753480345
found_best_rou_auc_threshold_at= 0.02 , best_roc_auc= 0.970760679490382


100%|██████████| 189/189 [02:37<00:00,  1.20it/s, loss=0.00539]


[Epoch] 15
[Train] Loss: 0.0054,   Precision: 0.9992,   Recall: 0.9986,   F1: 0.9989,   Accuracy: 0.9987
[Valid] Loss: 0.3311,   Precision: 0.9954,   Recall: 0.9377,   F1: 0.9654,   Accuracy: 0.9477
roc_auc_with_[threshold=0.6]= 0.9591274725773852
found_best_rou_auc_threshold_at= 0.004 , best_roc_auc= 0.9726342742942793


100%|██████████| 189/189 [02:37<00:00,  1.20it/s, loss=0.00349]


[Epoch] 16
[Train] Loss: 0.0035,   Precision: 0.9990,   Recall: 0.9993,   F1: 0.9991,   Accuracy: 0.9990
[Valid] Loss: 0.3189,   Precision: 0.9954,   Recall: 0.9292,   F1: 0.9609,   Accuracy: 0.9412
roc_auc_with_[threshold=0.6]= 0.9535517679921877
found_best_rou_auc_threshold_at= 0.013000000000000001 , best_roc_auc= 0.9737725115254469


100%|██████████| 189/189 [02:37<00:00,  1.20it/s, loss=0.00367]


[Epoch] 17
[Train] Loss: 0.0037,   Precision: 0.9993,   Recall: 0.9989,   F1: 0.9991,   Accuracy: 0.9989
[Valid] Loss: 0.3397,   Precision: 0.9951,   Recall: 0.9357,   F1: 0.9642,   Accuracy: 0.9461
roc_auc_with_[threshold=0.6]= 0.9582982081620601
found_best_rou_auc_threshold_at= 0.002 , best_roc_auc= 0.973588445029864


100%|██████████| 189/189 [02:37<00:00,  1.20it/s, loss=0.00483]


[Epoch] 18
[Train] Loss: 0.0048,   Precision: 0.9990,   Recall: 0.9980,   F1: 0.9984,   Accuracy: 0.9982
[Valid] Loss: 0.2597,   Precision: 0.9945,   Recall: 0.9514,   F1: 0.9722,   Accuracy: 0.9576
roc_auc_with_[threshold=0.6]= 0.9649285762950514
found_best_rou_auc_threshold_at= 0.021 , best_roc_auc= 0.9754517066745396


100%|██████████| 189/189 [02:37<00:00,  1.20it/s, loss=0.016] 


[Epoch] 19
[Train] Loss: 0.0160,   Precision: 0.9967,   Recall: 0.9965,   F1: 0.9965,   Accuracy: 0.9959
[Valid] Loss: 0.3056,   Precision: 0.9955,   Recall: 0.9423,   F1: 0.9678,   Accuracy: 0.9514
roc_auc_with_[threshold=0.6]= 0.9610461472077759
found_best_rou_auc_threshold_at= 0.10300000000000001 , best_roc_auc= 0.9712987986283015


100%|██████████| 189/189 [02:37<00:00,  1.20it/s, loss=0.00769]


[Epoch] 20
[Train] Loss: 0.0077,   Precision: 0.9981,   Recall: 0.9983,   F1: 0.9982,   Accuracy: 0.9978
[Valid] Loss: 0.4244,   Precision: 0.9953,   Recall: 0.9160,   F1: 0.9536,   Accuracy: 0.9309
roc_auc_with_[threshold=0.6]= 0.9472613721527038
found_best_rou_auc_threshold_at= 0.003 , best_roc_auc= 0.9714199577589536


100%|██████████| 189/189 [02:37<00:00,  1.20it/s, loss=0.00634]


[Epoch] 21
[Train] Loss: 0.0063,   Precision: 0.9987,   Recall: 0.9988,   F1: 0.9987,   Accuracy: 0.9984
[Valid] Loss: 0.4920,   Precision: 0.9955,   Recall: 0.8967,   F1: 0.9430,   Accuracy: 0.9157
roc_auc_with_[threshold=0.6]= 0.938767742375037
found_best_rou_auc_threshold_at= 0.002 , best_roc_auc= 0.9685921922194718


100%|██████████| 189/189 [02:37<00:00,  1.20it/s, loss=0.00345]


[Epoch] 22
[Train] Loss: 0.0034,   Precision: 0.9996,   Recall: 0.9990,   F1: 0.9993,   Accuracy: 0.9992
[Valid] Loss: 0.3083,   Precision: 0.9944,   Recall: 0.9472,   F1: 0.9699,   Accuracy: 0.9545
roc_auc_with_[threshold=0.6]= 0.9621806372493358
found_best_rou_auc_threshold_at= 0.029 , best_roc_auc= 0.9729432471101218


100%|██████████| 189/189 [02:37<00:00,  1.20it/s, loss=0.00353]


[Epoch] 23
[Train] Loss: 0.0035,   Precision: 0.9990,   Recall: 0.9993,   F1: 0.9991,   Accuracy: 0.9989
[Valid] Loss: 0.3859,   Precision: 0.9951,   Recall: 0.9362,   F1: 0.9645,   Accuracy: 0.9465
roc_auc_with_[threshold=0.6]= 0.9581733018418005
found_best_rou_auc_threshold_at= 0.005 , best_roc_auc= 0.9722839688415505


100%|██████████| 189/189 [02:37<00:00,  1.20it/s, loss=0.00447]


[Epoch] 24
[Train] Loss: 0.0045,   Precision: 0.9988,   Recall: 0.9991,   F1: 0.9989,   Accuracy: 0.9987
[Valid] Loss: 0.4309,   Precision: 0.9948,   Recall: 0.9167,   F1: 0.9537,   Accuracy: 0.9307
roc_auc_with_[threshold=0.6]= 0.9470566393386777
found_best_rou_auc_threshold_at= 0.016 , best_roc_auc= 0.9686166057275226


## Verify result and export submission

In [15]:
# Binarise the best epoch's scores at the selected threshold with one comparison.
# Two successive in-place masks would re-test values the first mask had already
# overwritten. The float dtype is kept so later cells can still call int() on each entry.
result_d = (np.asarray(result).squeeze(1) >= threshold).astype(np.float64)

In [16]:
from sklearn.metrics import roc_auc_score

# Score the hard 0/1 predictions in result_d against the test labels Y_d;
# both are matched by position.
roc_auc = roc_auc_score(Y_d, result_d)
print(roc_auc)

0.9754517066745396


In [17]:
# Pair every prediction with the RevId found at the same position in the test table.
out = [[Rev_d[row], int(result_d[row])] for row in range(len(result_d))]
review_d = pd.DataFrame(out, columns=['RevId', 'Rating'])
review_d

Unnamed: 0,RevId,Rating
0,781115,0
1,1219481,0
2,1703765,1
3,4870346,0
4,2638711,1
...,...,...
5098,1025826,1
5099,1278470,1
5100,2565212,0
5101,3766155,0


In [18]:
# Write the submission table without the DataFrame index column.
SUBMISSION_FILE = "submission.csv"
review_d.to_csv(SUBMISSION_FILE, index=False)