In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/bert-base-uncased/vocab.txt
/kaggle/input/bert-base-uncased/pytorch_model.bin
/kaggle/input/bert-base-uncased/config.json
/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


In [13]:
import transformers

DEVICE = "cuda"
MAX_LEN = 512
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 10

BERT_PATH = "../input/bert-base-uncased"
MODEL_PATH = "model.bin"
TRAINING_FILE = "../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv"
TOKENIZER = transformers.BertTokenizer.from_pretrained(
BERT_PATH,
    do_lower_case = True
)

In [19]:


class BERTBaseUncased(nn.Module):
    def __init__(self):
        super(BERTBaseUncased, self).__init__()
        self.bert = transformers.BertModel.from_pretrained(BERT_PATH)
        self.bert_drop = nn.Dropout(0.3)
        self.out = nn.Linear(768, 1)

    def forward(self, ids, mask, token_type_ids):
        _, o2 = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)
        bo = self.bert_drop(o2)
        output = self.out(bo)
        return output


In [20]:
class BERTDataset:
    def __init__(self,review,target):
        self.review = review
        self.target = target
        self.tokenizer = TOKENIZER
        self.max_len = MAX_LEN
        
    def __len__(self):
        return len(self.review)
    def _getitem__(self,item):
        review = str(self.review)
        review = "".join(review.split())
        
        #encodes two strings at a time
        inputs = self.tokenizer.encode_plus(
        review,
        None,
        add_special_tokens =True,#cls tokens
        max_length = self.max_len
        )  
        
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]

        padding_length = self.max_len = len(ids)
        ids = ids + ([0] + padding_length)
        mask  = mask +([0] + padding_length)
        token_type_ids = token_type_ids +([0] + padding_length)

        return {
            'ids' :  torch.Tensor(ids,dtype=torch.long),
            'mask' : torch.Tensor(mask,dtype= torch.long),
            'token_type_ids': torch.Tensor(token_type_ids, dtype=torch.long),
            'target' : torch.Tensor(self.target[item],dtype=torch.float)
        }


In [21]:
from tqdm import tqdm 
import torch


def loss_fn():
    return nn.BCEWithLogitsLoss()(outputs,targets)

def train_fn(data_loader,model,optimizer,device, scheduler):
    model.train()
    for bi,d in tqdm(enumerate(data_loader), total = len(data_loader)):
        ids = d["ids"]
        token_type_ids= d["token_type_ids"]
        mask = d["mask"]
        targets = d["targets"]
        
        ids = ids.to(device,dtype=torch.long)
        token_type_ids = token_type_ids.to(device,dtype=torch.long)
        mask = mask.to(device,dtype=torch.long)
        targets = targets.to(device,dtype=torch.float)
        
        optimizers.zero_grad()
        outputs = model(
            ids = ids, 
            mask = mask,
            token_type_ids  =token_type_ids
        )
        
        loss = loss_fn(outputs,targets)
        loss.backward()
        optimizer.step()
        scheduler.step()
        
    def eval_fn(data_loader,model,device):
        fin_train = []
        fin_outputs = []
        model.eval()
        with torch.no_grad():
            for bi,d in tqdm(enumerate(data_loader), total = len(data_loader)):
                ids = d["ids"]
                token_type_ids= d["token_type_ids"]
                mask = d["mask"]
                targets = d["targets"]

                ids = ids.to(device,dtype=torch.long)
                token_type_ids = token_type_ids.to(device,dtype=torch.long)
                mask = mask.to(device,dtype=torch.long)
                targets = targets.to(device,dtype=torch.float)


                outputs = model(
                    ids = ids, 
                    mask = mask,
                    token_type_ids  = token_type_ids
                    
                )
                
                fin_targets.extend(targets.cpu().detach().numpy().tolist())
                fin_outputs.extend(torch.sigmoid(targets).cpu().detach().numpy().tolist())
           
    return fin_targets,fin_outputs


In [25]:
import pandas as pd
from sklearn import model_selection
from sklearn import model_selection
from sklearn import metrics
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup





def run():
    dfx = pd.read_csv(TRAINING_FILE).fillna("none")
    dfx.sentiment = dfx.sentiment.apply(
    lambda x: 1 if x=="positive" else 0
    )
    
    df_train,df_valid = model_selection.train_test_split(
         dfx,
        test_size = 0.1,
        random_state = 42,
        stratify  =dfx.sentiment.values
    )
    
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)
    
    train_dataset = BERTDataset(
        review=df_train.review.values, target=df_train.sentiment.values
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=TRAIN_BATCH_SIZE, num_workers=4
    )

    valid_dataset = BERTDataset(
        review=df_valid.review.values, target=df_valid.sentiment.values
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=VALID_BATCH_SIZE, num_workers=1
    )

    device = torch.device(DEVICE)
    model = BERTBaseUncased()
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    num_train_steps = int(len(df_train) / TRAIN_BATCH_SIZE * EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
    )

    best_accuracy = 0
    for epoch in range(EPOCHS):
        train_fn(train_data_loader, model, optimizer, device, scheduler)
        outputs, targets = eval_fn(valid_data_loader, model, device)
        outputs = np.array(outputs) >= 0.5
        accuracy = metrics.accuracy_score(targets, outputs)
        print(f"Accuracy Score = {accuracy}")
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), MODEL_PATH)
            best_accuracy = accuracy


if __name__ == "__main__":
    run()
    
    

TypeError: train_fn() takes 4 positional arguments but 5 were given

In [None]:
import pandas as pd 
from sklearn import model_selection

In [None]:
df = pd.read_csv("../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")

In [None]:
df.sentiment =df.sentiment.apply(
lambda x:1 if x=="positive" else 0
)

In [None]:
df["kfold"] = -1

In [None]:
df = df.sample(frac=1).reset_index(drop=True)

In [None]:
y=df.sentiment.values
kf = model_selection.StratifiedKFold(n_splits=5)
for f,(t_,v_) in enumerate(kf.split(X=df,y=y)):
    df.loc[v_,'kfold']=f
    


In [None]:
df.head()

In [None]:
import torch

class IMDBDataset:
    def __init__(self,reviews,targets):
        self.reviews = reviews
        self.target = targets
    def __len__(self):
        return len(self.reviews)
    def __getitem__(self,item):
        review = self.reviews[item,:]
        target = self.target[item]
        
        return{
            "review" : torch.tensor(review,dtype = torch.long),
            "target" : torch.tensor(target,dtype= torch.float)
        }

In [None]:
import torch.nn as nn
class LSTM(nn.Module):
    def __init__(self,eembedding_matrix):
        super(LSTM,self).__init__()
        num_words = embedding_matrix.shape[0]
        embed_dim = embedding_matrix.shape[1]
        self.embedding = nn.Embedding(num_embedding=num_words,embedding_dim = embed_dim)

        self.embedding.weight=nn.Parameter(torch.tensor(
        embedding_matrix,
            dtype=torch.float32
        )
                                          )

        self.embedding.weight.requires_grad=False

        self.lstm = nn.LSTM(embed_dim,
                           128,bidirectional=True,batch=True)

        self.out = nn.Linear(512,1)
    def forward(self,x):
        x= self.embedding(x)
        x,_= self.lstm(x)
        avg_pool = torch.mean(x,1)
        max_pool,_=torch.max(x,1)
        
        
        out=torch.cat((avg_pool,max_pool),1)
        return out
