In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader
import transformers
import matplotlib.pyplot as plt
import time



# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the tab-separated train and test archives with identical parser settings.
train_data, test_data = (
    pd.read_csv(archive_path, sep='\t')
    for archive_path in (
        "../input/sentiment-analysis-on-movie-reviews/train.tsv.zip",
        "../input/sentiment-analysis-on-movie-reviews/test.tsv.zip",
    )
)
train_data.head()

In [None]:
# Pull the text column and the label column out of the training frame as arrays.
phrase, sentiment = (
    train_data[column].values for column in ('Phrase', 'Sentiment')
)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled phrases; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    phrase,
    sentiment,
    test_size=0.2,
    random_state=42,
)
print(*(len(part) for part in (X_train, Y_train, X_test, Y_test)))

In [None]:
from transformers import BertTokenizer,BertModel,BertConfig
# Load the pretrained tokenizer published with the "bert-base-uncased" checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [None]:
# Tokenize every training phrase in one call, padding/truncating and returning PyTorch tensors.
X_train_tokenized = tokenizer(
    list(X_train),
    padding=True,
    truncation=True,
    return_tensors='pt',
)

In [None]:
# Tokenize every held-out phrase in one call, padding/truncating and returning PyTorch tensors.
X_test_tokenized = tokenizer(
    list(X_test),
    padding=True,
    truncation=True,
    return_tensors='pt',
)

In [None]:
# Show which fields the tokenizer produced. `keys` must be called; the bare
# attribute only displays the bound-method repr instead of the key names.
X_train_tokenized.keys()

In [None]:
class MyDataset(Dataset):
    """Index-aligned view over tokenizer output and its labels.

    Args:
        data: Mapping that provides 'input_ids', 'token_type_ids' and
            'attention_mask', each indexable by sample position.
        labels: Indexable collection of labels, one per sample.
    """

    def __init__(self, data, labels):
        self.input_ids = data['input_ids']
        self.token_type_ids = data['token_type_ids']
        self.attention_mask = data['attention_mask']
        self.labels = labels
        # The dataset size is defined by the number of input_ids entries.
        self.length = len(self.input_ids)

    def __len__(self):
        """Return the number of samples."""
        return self.length

    def __getitem__(self, index):
        """Return (input_ids, token_type_ids, attention_mask, label) for one sample."""
        fields = (
            self.input_ids,
            self.token_type_ids,
            self.attention_mask,
            self.labels,
        )
        return tuple(field[index] for field in fields)
# Wrap the tokenized splits together with their labels.
train_set = MyDataset(data=X_train_tokenized, labels=Y_train)
test_set = MyDataset(data=X_test_tokenized, labels=Y_test)

In [None]:
def collate_fn(x):
    """Batch a list of (input_ids, token_type_ids, attention_mask, label) samples.

    Args:
        x: Sequence of 4-tuples; the first three entries are tensors of equal
            shape across samples, the fourth is the label.

    Returns:
        A pair ``(inputs, labels)``: ``inputs`` is a dict with keys
        'input_ids', 'token_type_ids' and 'attention_mask', each holding the
        per-sample tensors stacked along a new leading dimension; ``labels``
        is a plain list of the per-sample labels in input order.
    """
    field_names = ('input_ids', 'token_type_ids', 'attention_mask')
    # Stacking on dim 0 is the same as unsqueezing each sample and concatenating.
    inputs = {
        field: torch.stack([sample[position] for sample in x], dim=0)
        for position, field in enumerate(field_names)
    }
    labels = [sample[3] for sample in x]
    return inputs, labels

In [None]:
# Number of samples per mini-batch.
batch_size = 128
# Number of distinct values found in the Sentiment column (output classes).
label_nums = len(set(sentiment))

In [None]:
# The evaluation loader keeps a fixed order so validation runs are repeatable;
# only the training loader is reshuffled every epoch.
test_dataloader = DataLoader(test_set,batch_size=batch_size,shuffle=False)
train_dataloader = DataLoader(train_set,batch_size=batch_size,shuffle=True)

In [None]:
# Sanity check: fetch one batch from the training loader and show its length.
first_batch = next(iter(train_dataloader))
len(first_batch)

In [None]:
class Net(nn.Module):
    """Pretrained BERT encoder followed by one linear classification layer.

    Args:
        label_nums: Number of output classes (size of the logits' last dimension).
    """

    def __init__(self,label_nums):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        # Reuse the config attached to the loaded weights rather than fetching it
        # a second time; it is the same "bert-base-uncased" configuration.
        self.config = self.bert.config
        self.linear = nn.Linear(self.config.hidden_size,label_nums)

    def forward(self,input_ids,token_type_ids,attention_mask):
        """Return unnormalised class logits for a batch.

        Args:
            input_ids: Token id tensor produced by the tokenizer.
            token_type_ids: Segment id tensor produced by the tokenizer.
            attention_mask: Padding mask tensor produced by the tokenizer.

        Returns:
            Tensor of logits with ``label_nums`` entries per sample.
        """
        # Keyword arguments are essential here: BertModel.forward's positional
        # order is (input_ids, attention_mask, token_type_ids), which differs
        # from this method's order. Passing positionally would silently feed the
        # segment ids in as the attention mask and vice versa.
        output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        logits = self.linear(output.pooler_output)
        return logits
        

In [None]:
# Build the classifier with one output per sentiment class.
model = Net(label_nums=label_nums)

In [None]:
# Only update the last layer: freeze every parameter that belongs to the BERT encoder.
for frozen_param in model.bert.parameters():
    frozen_param.requires_grad = False

In [None]:
# for p in model.named_parameters():
#     print(p)

In [None]:
def train(model,dataloader,criterion,lr = 0.01,optimizer = None):
    """Run one training epoch and return the mean per-batch loss and accuracy.

    Reads the module-level ``device`` and calls the module-level ``accuracy``.

    Args:
        model: Module called as ``model(input_ids, token_type_ids, attention_mask)``.
        dataloader: Iterable of batches indexed as
            ``(input_ids, token_type_ids, attention_mask, labels)``.
        criterion: Loss callable applied to ``(logits, labels)``.
        lr: Learning rate for the SGD optimizer created when ``optimizer`` is None.
        optimizer: Optional pre-built optimizer. Pass the same instance on every
            call to keep its state (e.g. momentum buffers) across epochs; when
            omitted a fresh SGD optimizer (momentum 0.9) is created for this call.

    Returns:
        Tuple ``(mean_loss, mean_accuracy)`` averaged over the batches.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    n_batches = len(dataloader)
    if n_batches == 0:
        raise ValueError("train() needs a dataloader with at least one batch")

    model.to(device)
    if optimizer is None:
        # Frozen parameters never receive gradients, so leave them out of the
        # optimizer instead of tracking them for nothing.
        trainable = [p for p in model.parameters() if p.requires_grad]
        optimizer = torch.optim.SGD(trainable, lr=lr, momentum=0.9)

    model.train()
    all_loss = 0.0
    all_acc = 0.0
    for data in dataloader:
        model.zero_grad()
        input_ids = data[0].to(device)
        token_type_ids = data[1].to(device)
        attention_mask = data[2].to(device)
        labels = data[3].to(device)

        logits = model(input_ids,token_type_ids,attention_mask)
        loss = criterion(logits,labels)
        loss.backward()
        optimizer.step()

        all_acc += accuracy(logits,labels).item()
        all_loss += loss.item()

    return all_loss/n_batches,all_acc/n_batches
        

In [None]:
# train(model,train_dataloader,tokenizer,10) # GPU
# Elapsed: 0.0511s  (tokenizer time)
# Elapsed: 25.5765s (time for one gradient update)
# 0 tensor(1.4917, grad_fn=<NllLossBackward>)
# Elapsed: 0.0471s
# Elapsed: 24.5215s
# Elapsed: 0.0503s

# On the GPU the tokenizer became the performance bottleneck instead
# Elapsed: 0.0501s
# Elapsed: 1.0306s
# 0 tensor(1.7268, device='cuda:0', grad_fn=<NllLossBackward>)
# Elapsed: 0.0426s
# Elapsed: 0.0135s 

In [None]:
def accuracy(prediction, label):
    """
    Returns accuracy per batch

    Args:
        prediction: Tensor of per-class scores; the class axis is dim 1.
        label: Tensor of true class indices, one per row of ``prediction``.

    Returns:
        0-dim float tensor holding the fraction of rows whose highest-scoring
        class equals the label.
    """
    # Softmax is strictly increasing, so it cannot change which class wins; taking
    # argmax on the raw scores skips the extra work and avoids artificial ties
    # when nearly equal scores round to the same probability.
    predicted_class = torch.argmax(prediction, dim=1)
    return (predicted_class == label).float().mean()

In [None]:
def evaluate(model,dataloader,criterion):
    """Score ``model`` on ``dataloader`` without updating any weights.

    Reads the module-level ``device`` and calls the module-level ``accuracy``.

    Args:
        model: Module called as ``model(input_ids, token_type_ids, attention_mask)``.
        dataloader: Iterable of batches indexed as
            ``(input_ids, token_type_ids, attention_mask, labels)``.
        criterion: Loss callable applied to ``(prediction, labels)``.

    Returns:
        Tuple ``(mean_loss, mean_accuracy)`` averaged over the batches.
    """
    epoch_loss = 0
    epoch_acc = 0

    # Mirror train(): make sure the model lives on the same device as the batches,
    # so evaluate() also works when it is called before any training has run.
    model.to(device)
    model.eval()

    with torch.no_grad():
        for data in dataloader:
            input_ids = data[0].to(device)
            token_type_ids = data[1].to(device)
            attention_mask = data[2].to(device)

            labels = data[3].to(device)

            prediction = model(input_ids,token_type_ids,attention_mask)

            loss = criterion(prediction,labels)

            acc = accuracy(prediction,labels)

            epoch_acc += acc.item()
            epoch_loss += loss.item()

    return epoch_loss/len(dataloader),epoch_acc / len(dataloader)

In [None]:
# evaluate(model,test_dataloader,nn.CrossEntropyLoss())

In [None]:
def epoch_time(start_time, end_time):
    """Split the interval between two timestamps into whole minutes and seconds.

    Args:
        start_time: Start timestamp in seconds.
        end_time: End timestamp in seconds.

    Returns:
        Tuple ``(minutes, seconds)`` of ints, both truncated toward zero.
    """
    total_seconds = end_time - start_time
    whole_minutes = int(total_seconds / 60)
    # Whatever is left after removing the whole minutes, truncated to an int.
    leftover_seconds = int(total_seconds - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [None]:
def main(n_epochs = 10):
    """Train and validate for ``n_epochs`` epochs, checkpointing the best model.

    Uses the module-level ``model``, ``train_dataloader`` and ``test_dataloader``.
    After each epoch the timing and the train/validation metrics are printed; the
    model's state dict is written to 'model.pt' whenever validation accuracy
    improves. The best epoch number and its accuracy are printed at the end.

    Args:
        n_epochs: Number of passes over the training data (defaults to 10).
    """
    best_epoch = 0
    best_valid_acc = 0.0

    # The loss module does not depend on the epoch, so build it once up front.
    ce_loss = nn.CrossEntropyLoss()

    for epoch in range(n_epochs):
        start_time = time.time()

        train_loss, train_acc = train(model,train_dataloader,ce_loss)
        valid_loss, valid_acc = evaluate(model, test_dataloader, ce_loss)

        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        # Checkpoint on validation accuracy, keeping only the best epoch so far.
        if valid_acc > best_valid_acc:
            best_epoch = epoch + 1
            best_valid_acc = valid_acc
            torch.save(model.state_dict(),'model.pt')

        print(f'Epoch: {epoch+1} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
    print(best_epoch,best_valid_acc)

In [None]:
# Shell escape: print the GPU status reported by the nvidia-smi tool.
!nvidia-smi

In [None]:
# Pick the GPU when CUDA is usable, otherwise fall back to the CPU.
# is_available must be *called*: the bare function object is always truthy,
# which would select 'cuda' even on a machine without a GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Kick off the training/validation loop defined in main().
main()