# Install library

In [1]:
!pip install transformers torchmetrics

[0m

## Import library

In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torchvision
import torchmetrics
import albumentations
import albumentations.pytorch

from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader
from PIL import Image

## Config path
**Note**: The test file is **self-labelled** for verification purposes only and may not match the real test labels. We **do not** use the test file to train the model's weights; it is, however, used below as the validation set for checkpoint and threshold selection.

In [3]:
# Kaggle input locations used by the loading cells below.
# The *_TEXT_FILE CSVs are read with pandas (columns "Comment", "RevId", "Rating");
# the *_IMAGE_FOLDER directories are searched for "<RevId>.jpg" files.
DATASET_TRAIN_TEXT_FILE = '/kaggle/input/machine-learning-preproc/train_merged_segmented_v2.csv'
DATASET_TRAIN_IMAGE_FOLDER = '/kaggle/input/int3405-ml-preprocess-image-data-v2/image_train_resized'
# Self-labelled test split (see the note above): used for validation only.
DATASET_TEST_TEXT_FILE = '/kaggle/input/machine-learning-preproc/test_labelled_segmented.csv'
DATASET_TEST_IMAGE_FOLDER = '/kaggle/input/int3405-ml-preprocess-image-data-v2/image_test_resized'

## Import PhoBERT tokenizer

In [4]:
# Tokenizer that matches the PhoBERT checkpoint loaded later by the model;
# use_fast=False requests the slow (pure-Python) tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(
    "wonrax/phobert-base-vietnamese-sentiment",
    use_fast=False,
)

Downloading:   0%|          | 0.00/285 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/874k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/17.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

## Import dataset

In [5]:
# Read the training table once, then pull out the three columns used below:
# labels as a NumPy array, review ids and comments as plain Python lists.
df = pd.read_csv(DATASET_TRAIN_TEXT_FILE)
Y = df["Rating"].to_numpy()
X_rev = df["RevId"].tolist()
X = df["Comment"].tolist()

In [6]:
# Image preprocessing shared by the train and test sample loops.
image_transform = albumentations.Compose(
    transforms=[
        # With mean=0 and std=1 this only rescales pixels by 1/max_pixel_value.
        albumentations.Normalize(max_pixel_value=255.0, mean=[0.0], std=[1.0]),
        # Convert the NumPy image into a torch tensor for the model.
        albumentations.pytorch.ToTensorV2(),
    ]
)

In [7]:
# Build the in-memory training set: one (input_ids, attention_mask, image, label)
# tuple per review whose image and comment could both be prepared.
X_train = []
train_skipped = []  # (RevId, error repr) for every sample that had to be dropped
for rev_id, comment, rating in tqdm(zip(X_rev, X, Y), total=len(X)):
    try:
        # The context manager closes the file even when decoding raises.
        with Image.open(f"{DATASET_TRAIN_IMAGE_FOLDER}/{rev_id}.jpg") as im:
            image = np.array(im, dtype=np.float32)
        encoding = tokenizer.encode_plus(
            comment,
            truncation=True,
            add_special_tokens=True,
            max_length=140,
            padding='max_length',
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt',
        )
        # return_tensors='pt' yields a leading batch axis of size 1; drop it.
        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()
        image = image_transform(image=image)['image']
    except Exception as err:
        # Best effort: a sample that cannot be prepared (missing/corrupt image,
        # non-string comment, ...) is dropped, but it is recorded rather than
        # silently swallowed. KeyboardInterrupt is no longer caught.
        train_skipped.append((rev_id, repr(err)))
        continue
    X_train.append((input_ids, attention_mask, image, rating))

if train_skipped:
    print(f"Skipped {len(train_skipped)} of {len(X)} training samples; "
          f"first failures: {train_skipped[:5]}")

train_loader = DataLoader(X_train, batch_size=64, shuffle=True)

100%|██████████| 12041/12041 [01:34<00:00, 127.65it/s]


In [8]:
# Read the self-labelled test table and split it into per-column sequences.
test_d = pd.read_csv(DATASET_TEST_TEXT_FILE)
Y_d = test_d["Rating"].to_numpy()
Rev_d = test_d["RevId"].tolist()
X_d = test_d["Comment"].tolist()
# Independent second list holding the same comments.
X_d_clone = list(X_d)

In [9]:
# Build the in-memory test set: one (input_ids, attention_mask, image, label)
# tuple per review whose image and comment could both be prepared.
X_test = []
test_kept_rows = []  # positions (into Rev_d / X_d / Y_d) of the rows that were kept
test_skipped = []    # (RevId, error repr) for every sample that had to be dropped
for row, (rev_id, comment, rating) in enumerate(tqdm(zip(Rev_d, X_d, Y_d), total=len(X_d))):
    try:
        # The context manager closes the file even when decoding raises.
        with Image.open(f"{DATASET_TEST_IMAGE_FOLDER}/{rev_id}.jpg") as im:
            image = np.array(im, dtype=np.float32)
        encoding = tokenizer.encode_plus(
            comment,
            truncation=True,
            add_special_tokens=True,
            max_length=140,
            padding='max_length',
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt',
        )
        # return_tensors='pt' yields a leading batch axis of size 1; drop it.
        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()
        image = image_transform(image=image)['image']
    except Exception as err:
        # Best effort: drop the sample, but record it instead of hiding it.
        # KeyboardInterrupt is no longer caught.
        test_skipped.append((rev_id, repr(err)))
        continue
    X_test.append((input_ids, attention_mask, image, rating))
    test_kept_rows.append(row)

if test_skipped:
    print(f"Skipped {len(test_skipped)} of {len(X_d)} test samples; "
          f"first failures: {test_skipped[:5]}")
    # Later cells pair the model's predictions with Rev_d / Y_d purely by
    # position, so keep those sequences aligned with the rows kept in X_test.
    # (X_d_clone is intentionally left untouched.)
    Rev_d = [Rev_d[row] for row in test_kept_rows]
    X_d = [X_d[row] for row in test_kept_rows]
    Y_d = Y_d[test_kept_rows]

test_loader = DataLoader(X_test, batch_size=64, shuffle=False)

100%|██████████| 5103/5103 [00:44<00:00, 113.55it/s]


## Build Metrics

In [10]:
class Metrics():
    """Running per-batch averages of the loss and binary classification scores.

    Every ``compute_*`` call folds one batch value into the stored mean;
    ``idx`` is the number of batches already folded in (0 for the first batch).
    """

    def __init__(self):
        self.loss = self.precision = self.recall = self.f1 = self.accuracy = 0.0

    def avg_compute_fn(self, previous_metric_value, added_value, length):
        """Return the mean of ``length + 1`` values given the mean of the first ``length``."""
        running_total = previous_metric_value * length + added_value
        return running_total / (length+1)

    def compute_loss(self, loss, idx):
        """Fold one batch loss (a plain number) into the running mean loss."""
        self.loss = self.avg_compute_fn(self.loss, loss, idx)

    def compute_metrics(self, y_pred, y_true, idx):
        """Fold one batch's precision, recall, F1 and accuracy into the running means."""
        scorers = torchmetrics.functional.classification
        tracked = (
            ("precision", scorers.binary_precision),
            ("recall", scorers.binary_recall),
            ("f1", scorers.binary_f1_score),
            ("accuracy", scorers.binary_accuracy),
        )
        for attr, scorer in tracked:
            batch_value = scorer(y_pred, y_true).item()
            setattr(self, attr, self.avg_compute_fn(getattr(self, attr), batch_value, idx))

    def print(self, type):
        """Print all running means on one line, prefixed with ``[type]``."""
        fields = [
            f"[{type}] Loss: {self.loss:.4f}",
            f"Precision: {self.precision:.4f}",
            f"Recall: {self.recall:.4f}",
            f"F1: {self.f1:.4f}",
            f"Accuracy: {self.accuracy:.4f}",
        ]
        print(",   ".join(fields))

## Build Utilities

In [11]:
from sklearn.metrics import roc_auc_score

def test_roc_auc(result, threshold):
    result_d = np.asarray(result).squeeze(1)
    result_d[result_d >= threshold] = 1
    result_d[result_d < threshold] = 0
    roc_auc = roc_auc_score(Y_d, result_d)
    return roc_auc

def search_threshold(result):
    """Grid-search the decision threshold that maximises ``test_roc_auc``.

    Candidates are ``np.arange(0.0, 1.0, 0.001)``. The first candidate that
    reaches the highest score wins; if no candidate scores above 0.0, the
    initial value 0.0 is returned.
    """
    top_score = 0.0
    top_threshold = 0.0
    for candidate in np.arange(0.0, 1.0, 0.001):
        score = test_roc_auc(result, candidate)
        if score > top_score:
            top_score, top_threshold = score, candidate
    return top_threshold

In [12]:
from sklearn.metrics import roc_auc_score
def _batch_to_device(batch, device):
    """Move one (input_ids, attention_mask, images, labels) batch to ``device``; labels get a trailing axis."""
    input_ids, attention_mask, images, labels = batch
    return (
        input_ids.to(device),
        attention_mask.to(device),
        images.to(device),
        labels.unsqueeze(1).to(device),
    )


def fit(model, train_data, valid_data, loss_fn, n_epoch, optimizer):
    """Train ``model`` and keep the epoch with the best thresholded ROC AUC.

    After every epoch the model is run over ``valid_data``, the best decision
    threshold is searched with ``search_threshold``, and - when the resulting
    score beats all previous epochs - the weights are written to
    ``checkpoint.pt`` and the predictions are remembered.

    Args:
        model: Module called as ``model(input_ids, attention_mask, images)``.
        train_data: Iterable of (input_ids, attention_mask, images, labels) batches.
        valid_data: Iterable of batches in the same layout. Its order must match
            the labels ``test_roc_auc`` scores against (the module-level ``Y_d``).
        loss_fn: Callable ``loss_fn(predictions, float_labels)``.
        n_epoch: Number of passes over ``train_data``.
        optimizer: Optimizer already bound to ``model``'s parameters.

    Returns:
        ``(best_result, best_threshold)``: the validation predictions of the best
        epoch (list of one-element lists) and the threshold found for them.
        ``([], 0.0)`` if no epoch scored above 0.
    """
    # Follow the model's own device instead of assuming the default CUDA device,
    # so batches always land where the weights are (CPU or any GPU index).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cuda")

    best_result = []
    best_roc = 0.0
    best_threshold = 0.0
    for epoch in range(n_epoch):
        loop = tqdm(train_data)
        metrics = Metrics()
        model.train()

        for idx, batch in enumerate(loop):
            input1, input2, images, labels = _batch_to_device(batch, device)
            predict = model(input1, input2, images)
            loss = loss_fn(predict, labels.to(torch.float32))
            metrics.compute_loss(loss.item(), idx)
            metrics.compute_metrics(predict, labels.int(), idx)
            loop.set_postfix(loss=metrics.loss)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print(f"[Epoch] {epoch}")
        metrics.print("Train")

        with torch.no_grad():
            result = []
            metrics = Metrics()
            model.eval()
            for idx, batch in enumerate(valid_data):
                input1, input2, images, labels = _batch_to_device(batch, device)
                predict = model(input1, input2, images)
                result.extend(predict.tolist())
                loss = loss_fn(predict, labels.to(torch.float32))
                metrics.compute_loss(loss.item(), idx)
                metrics.compute_metrics(predict, labels.int(), idx)
            metrics.print("Valid")

        # Reference score at a fixed threshold, then the best threshold for this epoch.
        roc_auc = test_roc_auc(result, 0.6)
        print("roc_auc_with_[threshold=0.6]=", roc_auc)
        current_best_threshold = search_threshold(result)
        current_best_roc_auc = test_roc_auc(result, current_best_threshold)
        print("found_best_rou_auc_threshold_at=", current_best_threshold, ", best_roc_auc=", current_best_roc_auc)

        # Checkpoint only when this epoch beats every previous one.
        if current_best_roc_auc > best_roc:
            best_roc = current_best_roc_auc
            best_threshold = current_best_threshold
            best_result = list(result)
            torch.save(model.state_dict(), 'checkpoint.pt')
    return (best_result, best_threshold)

## Build Model

In [13]:
class Classifier(nn.Module):
    """Two-branch text + image classifier producing one sigmoid probability.

    The text branch projects the PhoBERT pooled output to 256 features, the
    image branch projects the 1000-dimensional backbone output to 256 features,
    and the concatenated 512 features go through the final head.
    """

    def __init__(self):
        super().__init__()
        projection_dim = 256

        # Text branch.
        self.bert = AutoModel.from_pretrained("wonrax/phobert-base-vietnamese-sentiment")
        self.bert_clf = nn.Sequential(
            nn.Dropout(p=0.1),
            nn.Linear(self.bert.config.hidden_size, projection_dim),
        )

        # Image branch. NOTE: the attribute is called `resnet`, but the backbone
        # is VGG-11 with batch norm; the name is kept so state_dict keys stay stable.
        self.resnet = torchvision.models.vgg11_bn(pretrained=True)
        self.resnet_clf = nn.Sequential(
            nn.Dropout(p=0.1),
            nn.Linear(1000, projection_dim),
        )

        # Fusion head over the concatenated text and image features.
        self.final_clf = nn.Sequential(
            nn.Dropout(p=0.2),
            nn.Linear(2 * projection_dim, projection_dim),
            nn.Dropout(p=0.25),
            nn.Linear(projection_dim, 1),
            nn.Sigmoid(),
        )

    def forward(self, input_ids, attention_mask, input_img):
        """Return the model output for a batch of tokenised comments and images."""
        # With return_dict=False the encoder returns a tuple; only its second
        # element (the pooled output) is used here.
        _, pooled = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False,
        )
        text_features = self.bert_clf(pooled)
        image_features = self.resnet_clf(self.resnet(input_img))
        fused = torch.cat([text_features, image_features], dim=1)
        return self.final_clf(fused)

## Train model

In [14]:
# Seed the RNGs before the model is built so that the initialisation of the new
# layers, dropout, and the DataLoader shuffle order are repeatable across
# Restart & Run All (GPU kernels may still introduce small non-determinism).
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

model_baseline = Classifier()
model_baseline.cuda()
# The model ends in a Sigmoid, so plain binary cross-entropy on probabilities is used.
loss_fn_baseline = nn.BCELoss()
optimizer = torch.optim.Adam(params=model_baseline.parameters(), lr=2e-5)
# The test loader doubles as the validation set for checkpoint/threshold selection.
result, threshold = fit(model_baseline, train_loader, test_loader, loss_fn_baseline, n_epoch=25, optimizer=optimizer)

Downloading:   0%|          | 0.00/999 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/515M [00:00<?, ?B/s]

Some weights of the model checkpoint at wonrax/phobert-base-vietnamese-sentiment were not used when initializing RobertaModel: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at wonrax/phobert-base-vietnamese-sentiment and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

  0%|          | 0.00/507M [00:00<?, ?B/s]

100%|██████████| 189/189 [02:45<00:00,  1.14it/s, loss=0.381]


[Epoch] 0
[Train] Loss: 0.3806,   Precision: 0.8561,   Recall: 0.9055,   F1: 0.8772,   Accuracy: 0.8444
[Valid] Loss: 0.2574,   Precision: 0.9618,   Recall: 0.9205,   F1: 0.9400,   Accuracy: 0.9093
roc_auc_with_[threshold=0.6]= 0.8896871664433493
found_best_rou_auc_threshold_at= 0.48 , best_roc_auc= 0.8960498943973837


100%|██████████| 189/189 [02:38<00:00,  1.19it/s, loss=0.272]


[Epoch] 1
[Train] Loss: 0.2725,   Precision: 0.9009,   Recall: 0.9376,   F1: 0.9178,   Accuracy: 0.8995
[Valid] Loss: 0.2035,   Precision: 0.9662,   Recall: 0.9467,   F1: 0.9559,   Accuracy: 0.9325
roc_auc_with_[threshold=0.6]= 0.9197069243521905
found_best_rou_auc_threshold_at= 0.619 , best_roc_auc= 0.9223440374264755


100%|██████████| 189/189 [02:38<00:00,  1.19it/s, loss=0.152]


[Epoch] 2
[Train] Loss: 0.1521,   Precision: 0.9472,   Recall: 0.9684,   F1: 0.9571,   Accuracy: 0.9473
[Valid] Loss: 0.1945,   Precision: 0.9906,   Recall: 0.9251,   F1: 0.9564,   Accuracy: 0.9349
roc_auc_with_[threshold=0.6]= 0.942636091113483
found_best_rou_auc_threshold_at= 0.229 , best_roc_auc= 0.9568228374173915


100%|██████████| 189/189 [02:38<00:00,  1.19it/s, loss=0.0406]


[Epoch] 3
[Train] Loss: 0.0406,   Precision: 0.9894,   Recall: 0.9919,   F1: 0.9905,   Accuracy: 0.9883
[Valid] Loss: 0.1795,   Precision: 0.9936,   Recall: 0.9432,   F1: 0.9674,   Accuracy: 0.9506
roc_auc_with_[threshold=0.6]= 0.9580239820134899
found_best_rou_auc_threshold_at= 0.232 , best_roc_auc= 0.9671280630436264


100%|██████████| 189/189 [02:38<00:00,  1.19it/s, loss=0.02]  


[Epoch] 4
[Train] Loss: 0.0200,   Precision: 0.9967,   Recall: 0.9959,   F1: 0.9961,   Accuracy: 0.9955
[Valid] Loss: 0.2206,   Precision: 0.9950,   Recall: 0.9409,   F1: 0.9669,   Accuracy: 0.9500
roc_auc_with_[threshold=0.6]= 0.9592974587241387
found_best_rou_auc_threshold_at= 0.041 , best_roc_auc= 0.967578861308564


100%|██████████| 189/189 [02:38<00:00,  1.19it/s, loss=0.0141]


[Epoch] 5
[Train] Loss: 0.0141,   Precision: 0.9952,   Recall: 0.9974,   F1: 0.9962,   Accuracy: 0.9954
[Valid] Loss: 0.2952,   Precision: 0.9959,   Recall: 0.9212,   F1: 0.9567,   Accuracy: 0.9355
roc_auc_with_[threshold=0.6]= 0.9510883882542637
found_best_rou_auc_threshold_at= 0.03 , best_roc_auc= 0.966388958281289


100%|██████████| 189/189 [02:38<00:00,  1.19it/s, loss=0.0155]


[Epoch] 6
[Train] Loss: 0.0155,   Precision: 0.9964,   Recall: 0.9966,   F1: 0.9964,   Accuracy: 0.9956
[Valid] Loss: 0.3883,   Precision: 0.9961,   Recall: 0.9101,   F1: 0.9508,   Accuracy: 0.9268
roc_auc_with_[threshold=0.6]= 0.9450478050552995
found_best_rou_auc_threshold_at= 0.008 , best_roc_auc= 0.9665138646015489


100%|██████████| 189/189 [02:38<00:00,  1.19it/s, loss=0.00811]


[Epoch] 7
[Train] Loss: 0.0081,   Precision: 0.9984,   Recall: 0.9984,   F1: 0.9984,   Accuracy: 0.9981
[Valid] Loss: 0.2325,   Precision: 0.9935,   Recall: 0.9519,   F1: 0.9720,   Accuracy: 0.9574
roc_auc_with_[threshold=0.6]= 0.9624060363818046
found_best_rou_auc_threshold_at= 0.07 , best_roc_auc= 0.9713401312651876


100%|██████████| 189/189 [02:38<00:00,  1.19it/s, loss=0.00925]


[Epoch] 8
[Train] Loss: 0.0093,   Precision: 0.9976,   Recall: 0.9978,   F1: 0.9977,   Accuracy: 0.9972
[Valid] Loss: 0.3098,   Precision: 0.9946,   Recall: 0.9336,   F1: 0.9628,   Accuracy: 0.9441
roc_auc_with_[threshold=0.6]= 0.9558000817568643
found_best_rou_auc_threshold_at= 0.078 , best_roc_auc= 0.9689114981945358


100%|██████████| 189/189 [02:38<00:00,  1.19it/s, loss=0.00551]


[Epoch] 9
[Train] Loss: 0.0055,   Precision: 0.9987,   Recall: 0.9991,   F1: 0.9989,   Accuracy: 0.9986
[Valid] Loss: 0.3103,   Precision: 0.9949,   Recall: 0.9434,   F1: 0.9682,   Accuracy: 0.9520
roc_auc_with_[threshold=0.6]= 0.9606366815797243
found_best_rou_auc_threshold_at= 0.013000000000000001 , best_roc_auc= 0.9711804782776554


100%|██████████| 189/189 [02:38<00:00,  1.19it/s, loss=0.00813]


[Epoch] 10
[Train] Loss: 0.0081,   Precision: 0.9975,   Recall: 0.9989,   F1: 0.9982,   Accuracy: 0.9979
[Valid] Loss: 0.4276,   Precision: 0.9953,   Recall: 0.9135,   F1: 0.9523,   Accuracy: 0.9288
roc_auc_with_[threshold=0.6]= 0.9467166670451707
found_best_rou_auc_threshold_at= 0.012 , best_roc_auc= 0.9673675425249245


100%|██████████| 189/189 [02:38<00:00,  1.19it/s, loss=0.0101] 


[Epoch] 11
[Train] Loss: 0.0101,   Precision: 0.9979,   Recall: 0.9969,   F1: 0.9974,   Accuracy: 0.9969
[Valid] Loss: 0.3465,   Precision: 0.9951,   Recall: 0.9247,   F1: 0.9583,   Accuracy: 0.9377
roc_auc_with_[threshold=0.6]= 0.9517232530147844
found_best_rou_auc_threshold_at= 0.009000000000000001 , best_roc_auc= 0.9699558285831081


100%|██████████| 189/189 [02:38<00:00,  1.19it/s, loss=0.00764]


[Epoch] 12
[Train] Loss: 0.0076,   Precision: 0.9986,   Recall: 0.9980,   F1: 0.9983,   Accuracy: 0.9979
[Valid] Loss: 0.3540,   Precision: 0.9943,   Recall: 0.9339,   F1: 0.9628,   Accuracy: 0.9441
roc_auc_with_[threshold=0.6]= 0.9554356959553062
found_best_rou_auc_threshold_at= 0.005 , best_roc_auc= 0.9705559466763565


100%|██████████| 189/189 [02:38<00:00,  1.19it/s, loss=0.0123] 


[Epoch] 13
[Train] Loss: 0.0123,   Precision: 0.9984,   Recall: 0.9981,   F1: 0.9981,   Accuracy: 0.9978
[Valid] Loss: 0.3732,   Precision: 0.9947,   Recall: 0.9406,   F1: 0.9666,   Accuracy: 0.9496
roc_auc_with_[threshold=0.6]= 0.9589330729225809
found_best_rou_auc_threshold_at= 0.012 , best_roc_auc= 0.9724192083210319


100%|██████████| 189/189 [02:38<00:00,  1.19it/s, loss=0.0465]


[Epoch] 14
[Train] Loss: 0.0465,   Precision: 0.9865,   Recall: 0.9859,   F1: 0.9858,   Accuracy: 0.9827
[Valid] Loss: 0.3099,   Precision: 0.9952,   Recall: 0.9200,   F1: 0.9558,   Accuracy: 0.9338
roc_auc_with_[threshold=0.6]= 0.9496796720641336
found_best_rou_auc_threshold_at= 0.041 , best_roc_auc= 0.96576442667999


100%|██████████| 189/189 [02:38<00:00,  1.19it/s, loss=0.0078] 


[Epoch] 15
[Train] Loss: 0.0078,   Precision: 0.9982,   Recall: 0.9984,   F1: 0.9983,   Accuracy: 0.9979
[Valid] Loss: 0.3478,   Precision: 0.9948,   Recall: 0.9187,   F1: 0.9549,   Accuracy: 0.9326
roc_auc_with_[threshold=0.6]= 0.948180796221016
found_best_rou_auc_threshold_at= 0.017 , best_roc_auc= 0.9671383962028478


100%|██████████| 189/189 [02:38<00:00,  1.19it/s, loss=0.00675]


[Epoch] 16
[Train] Loss: 0.0068,   Precision: 0.9969,   Recall: 0.9988,   F1: 0.9977,   Accuracy: 0.9977
[Valid] Loss: 0.4689,   Precision: 0.9952,   Recall: 0.8922,   F1: 0.9405,   Accuracy: 0.9125
roc_auc_with_[threshold=0.6]= 0.9348158199532168
found_best_rou_auc_threshold_at= 0.006 , best_roc_auc= 0.9671975563781708


100%|██████████| 189/189 [02:38<00:00,  1.19it/s, loss=0.0167]


[Epoch] 17
[Train] Loss: 0.0167,   Precision: 0.9957,   Recall: 0.9967,   F1: 0.9962,   Accuracy: 0.9954
[Valid] Loss: 0.3658,   Precision: 0.9944,   Recall: 0.9279,   F1: 0.9596,   Accuracy: 0.9394
roc_auc_with_[threshold=0.6]= 0.9506094292916676
found_best_rou_auc_threshold_at= 0.011 , best_roc_auc= 0.9687725115254469


100%|██████████| 189/189 [02:38<00:00,  1.19it/s, loss=0.00842]


[Epoch] 18
[Train] Loss: 0.0084,   Precision: 0.9980,   Recall: 0.9985,   F1: 0.9982,   Accuracy: 0.9977
[Valid] Loss: 0.3706,   Precision: 0.9954,   Recall: 0.9194,   F1: 0.9555,   Accuracy: 0.9335
roc_auc_with_[threshold=0.6]= 0.9482606227147822
found_best_rou_auc_threshold_at= 0.006 , best_roc_auc= 0.9717495968932393


100%|██████████| 189/189 [02:38<00:00,  1.19it/s, loss=0.00732]


[Epoch] 19
[Train] Loss: 0.0073,   Precision: 0.9988,   Recall: 0.9981,   F1: 0.9985,   Accuracy: 0.9980
[Valid] Loss: 0.2952,   Precision: 0.9947,   Recall: 0.9395,   F1: 0.9660,   Accuracy: 0.9486
roc_auc_with_[threshold=0.6]= 0.9593077918833601
found_best_rou_auc_threshold_at= 0.016 , best_roc_auc= 0.9741575636454477


100%|██████████| 189/189 [02:38<00:00,  1.19it/s, loss=0.00443]


[Epoch] 20
[Train] Loss: 0.0044,   Precision: 0.9992,   Recall: 0.9989,   F1: 0.9991,   Accuracy: 0.9988
[Valid] Loss: 0.3086,   Precision: 0.9949,   Recall: 0.9352,   F1: 0.9639,   Accuracy: 0.9456
roc_auc_with_[threshold=0.6]= 0.9551408034882928
found_best_rou_auc_threshold_at= 0.007 , best_roc_auc= 0.9743275497922013


100%|██████████| 189/189 [02:38<00:00,  1.19it/s, loss=0.00459]


[Epoch] 21
[Train] Loss: 0.0046,   Precision: 0.9988,   Recall: 0.9986,   F1: 0.9987,   Accuracy: 0.9984
[Valid] Loss: 0.3185,   Precision: 0.9941,   Recall: 0.9455,   F1: 0.9690,   Accuracy: 0.9529
roc_auc_with_[threshold=0.6]= 0.9615561056480367
found_best_rou_auc_threshold_at= 0.013000000000000001 , best_roc_auc= 0.9737584311766175


100%|██████████| 189/189 [02:38<00:00,  1.19it/s, loss=0.00383]


[Epoch] 22
[Train] Loss: 0.0038,   Precision: 0.9992,   Recall: 0.9982,   F1: 0.9987,   Accuracy: 0.9982
[Valid] Loss: 0.3485,   Precision: 0.9947,   Recall: 0.9470,   F1: 0.9700,   Accuracy: 0.9545
roc_auc_with_[threshold=0.6]= 0.9624304498898553
found_best_rou_auc_threshold_at= 0.006 , best_roc_auc= 0.9749624145527219


100%|██████████| 189/189 [02:38<00:00,  1.19it/s, loss=0.00875]


[Epoch] 23
[Train] Loss: 0.0087,   Precision: 0.9976,   Recall: 0.9984,   F1: 0.9980,   Accuracy: 0.9975
[Valid] Loss: 0.5136,   Precision: 0.9950,   Recall: 0.9061,   F1: 0.9481,   Accuracy: 0.9228
roc_auc_with_[threshold=0.6]= 0.9415607612472463
found_best_rou_auc_threshold_at= 0.004 , best_roc_auc= 0.9701605613971339


100%|██████████| 189/189 [02:38<00:00,  1.19it/s, loss=0.00523]


[Epoch] 24
[Train] Loss: 0.0052,   Precision: 0.9990,   Recall: 0.9974,   F1: 0.9981,   Accuracy: 0.9978
[Valid] Loss: 0.4196,   Precision: 0.9951,   Recall: 0.9303,   F1: 0.9613,   Accuracy: 0.9419
roc_auc_with_[threshold=0.6]= 0.9548008311947856
found_best_rou_auc_threshold_at= 0.008 , best_roc_auc= 0.9723741284945382


## Verify result and export submission

In [15]:
# Turn the best epoch's scores into hard 0/1 labels with the selected threshold:
# scores at or above the threshold become 1, everything below becomes 0.
result_d = np.where(np.asarray(result).squeeze(1) >= threshold, 1.0, 0.0)

In [16]:
from sklearn.metrics import roc_auc_score

# Sanity check: this should reproduce the best score reported during training.
roc_auc = roc_auc_score(y_true=Y_d, y_score=result_d)
print(roc_auc)

0.9749624145527219


In [17]:
# Predictions are paired with review ids purely by position, so refuse to build
# a submission if the two sequences do not line up one-to-one.
if len(result_d) != len(Rev_d):
    raise ValueError(
        f"{len(result_d)} predictions cannot be paired with {len(Rev_d)} review ids"
    )

# Build the frame in one step instead of row by row.
review_d = pd.DataFrame({
    'RevId': Rev_d,
    'Rating': result_d.astype(int),
})
review_d

Unnamed: 0,RevId,Rating
0,781115,0
1,1219481,0
2,1703765,1
3,4870346,0
4,2638711,1
...,...,...
5098,1025826,1
5099,1278470,1
5100,2565212,0
5101,3766155,0


In [18]:
# Write the submission without the DataFrame index column.
review_d.to_csv(path_or_buf="submission.csv", index=False)