In [1]:
import pandas as pd
import numpy as np
import nltk
import string
#import fasttext
#import contractions
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

In [2]:
from snownlp import SnowNLP

In [3]:
text = SnowNLP(u'口感很好，喝起来味道不错，包装也很精美，送人也很大气。')
sent = text.sentences
for sen in sent:
    s = SnowNLP(sen)
    print(s.sentiments)

0.9052074671668017
0.7045793841375075
0.8142993898625756
0.7122102717678036


In [4]:
print(s.words)

['送', '人', '也', '很', '大', '气']


In [5]:
list(s.tags)

[('送', 'v'), ('人', 'n'), ('也', 'd'), ('很', 'd'), ('大', 'a'), ('气', 'n')]

In [6]:
#plt.xticks(rotation=70)
pd.options.mode.chained_assignment = None
pd.set_option('display.max_colwidth', 100)
%matplotlib inline

In [7]:
train = pd.read_csv("teapro.csv", encoding="GBK")

In [8]:
print(train)

        no                  rateContent  package  quality  price  service  \
0        1  口感很好，喝起来味道不错，包装也很精美，送人也很大气。        1        1      0        0   
1        2  送朋友的，如果里盒不破就更好了，细节也很重要哦，谢谢！        1        0      0        0   
2        3            茶的味道很纯正，使用方便，包装很好        1        1      0        0   
3        4              茶叶不错，味道挺好的，5分好评        0        1      0        0   
4        5                     口感特别好~~！        0        1      0        0   
...    ...                          ...      ...      ...    ...      ...   
3842  3843                很好，很新鲜，不错，好评！        0        0      0        0   
3843  3844               不错，是正品，下次还会再来。        0        1      0        0   
3844  3845      老板态度好，发货及时，茶叶很好，口感很好，甘甜        0        1      0        1   
3845  3846                     哎，，，，，，，        0        0      0        0   
3846  3847                       味道真的不错        0        1      0        0   

      logistics  other  sentiment  
0             0      0          0  
1  

In [9]:
#pip install transformers

In [10]:
import torch

import random
import numpy as np

SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

In [11]:
len(tokenizer.vocab)

21128

In [12]:
print(train.loc[:,"rateContent"])

0       口感很好，喝起来味道不错，包装也很精美，送人也很大气。
1       送朋友的，如果里盒不破就更好了，细节也很重要哦，谢谢！
2                 茶的味道很纯正，使用方便，包装很好
3                   茶叶不错，味道挺好的，5分好评
4                          口感特别好~~！
                   ...             
3842                  很好，很新鲜，不错，好评！
3843                 不错，是正品，下次还会再来。
3844        老板态度好，发货及时，茶叶很好，口感很好，甘甜
3845                       哎，，，，，，，
3846                         味道真的不错
Name: rateContent, Length: 3847, dtype: object


In [13]:
training_data_list = []
print(train.loc[:,"rateContent"][0])

口感很好，喝起来味道不错，包装也很精美，送人也很大气。


In [14]:
print(len(train.loc[:,"rateContent"]))

3847


In [15]:
for i in range(len(train.loc[:,"rateContent"])-100):
    training_data_list.append(train.loc[:,"rateContent"][i])

In [16]:
vali_data_list = []
for j in range(len(train.loc[:,"rateContent"])-100,len(train.loc[:,"rateContent"])):
    vali_data_list.append(train.loc[:,"rateContent"][j])

In [17]:
label_list = []
for i in range(len(train.loc[:,"rateContent"])-100):
    label_list.append(train.loc[:,"package"][i])

In [18]:
vali_label_list = []
for j in range(len(train.loc[:,"rateContent"])-100,len(train.loc[:,"rateContent"])):
    vali_label_list.append(train.loc[:,"package"][j])

In [19]:
def preprocessing_for_bert(data):
    """Perform required preprocessing steps for pretrained BERT.
    @param    data (np.array): Array of texts to be processed.
    @return   input_ids (torch.Tensor): Tensor of token ids to be fed to a model.
    @return   attention_masks (torch.Tensor): Tensor of indices specifying which
                  tokens should be attended to by the model.
    """
    # Create empty lists to store outputs
    input_ids = []
    attention_masks = []

    # For every sentence...
    for sent in data:
        # `encode_plus` will:
        #    (1) Tokenize the sentence
        #    (2) Add the `[CLS]` and `[SEP]` token to the start and end
        #    (3) Truncate/Pad sentence to max length
        #    (4) Map tokens to their IDs
        #    (5) Create attention mask
        #    (6) Return a dictionary of outputs
        encoded_sent = tokenizer.encode_plus(
            sent, # Preprocess sentence
            add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
            max_length=64,                  # Max length to truncate/pad
            pad_to_max_length=True,         # Pad sentence to max length
            #return_tensors='pt',           # Return PyTorch tensor
            return_attention_mask=True      # Return attention mask
            )
        
        # Add the outputs to the lists
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))

    # Convert lists to tensors
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)

    return input_ids, attention_masks

In [20]:
train_inputs, train_masks = preprocessing_for_bert(training_data_list)
val_inputs, val_masks = preprocessing_for_bert(vali_data_list)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [21]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Convert other data types to torch.Tensor
train_labels = torch.tensor(label_list)
val_labels = torch.tensor(vali_label_list)

# For fine-tuning BERT, the authors recommend a batch size of 16 or 32.
batch_size = 16

# Create the DataLoader for our training set
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

In [22]:
import torch
import torch.nn as nn
from transformers import BertModel


class RNN1(nn.Module):

    def __init__(self, freeze_bert=False ):
      
        super(RNN1, self).__init__()
        # Specify hidden size of BERT, hidden size of our classifier, and number of labels
        D_in, H, D_out = 768, 50, 8

        # Instantiate BERT model
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        # Instantiate an one-layer feed-forward classifier
        self.classifier = nn.Sequential(
            #BiLSTM
            # nn.Linear(2*H, H),
            #LSTM
            nn.Linear(H, H),
            nn.ReLU(),
            nn.Linear(H, D_out)
        )
    
        self.bilstm = nn.LSTM(D_in, H, batch_first = False, bidirectional=True)

        # Freeze the BERT model
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
        
    def forward(self, input_ids, attention_mask):
  
        

        # Feed input to BERT
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)
        a = outputs[0].tolist()
        #print("size out of bert:", np.array(a).shape)

        output =  self.bilstm(outputs[0])
        #print("output of BiLSTM ",len(list(outputs[0])))
         # Extract the last hidden state of the token `[CLS]` for classification task
        last_hidden_state_cls = output[0][:, 0, :]

        # Feed input to classifier to compute logits
        logits = self.classifier(last_hidden_state_cls)

        return logits

In [23]:
from transformers import AdamW, get_linear_schedule_with_warmup

def initialize_model(epochs=4):
    """Initialize the Bert Classifier, the optimizer and the learning rate scheduler.
    """
    # Instantiate Bert Classifier
    RNN1 = RNN1(freeze_bert=False)

    # Tell PyTorch to run the model on GPU
    RNN1.to(device)

    # Create the optimizer
    optimizer = AdamW(RNN1.parameters(),
                      lr=5e-5,    # Default learning rate
                      eps=1e-8    # Default epsilon value
                      )

    # Total number of training steps
    total_steps = len(train_dataloader) * epochs

    # Set up the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0, # Default value
                                                num_training_steps=total_steps)
    return RNN1, optimizer, scheduler

In [24]:
import random
import time

# Specify loss function
loss_fn = nn.CrossEntropyLoss()

def set_seed(seed_value=42):
    """Set seed for reproducibility.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False):
    """Train the BertClassifier model.
    """
    # Start training loop
    print("Start training...\n")
    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================
        # Print the header of the result table
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*70)

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0

        # Put the model into the training mode
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            batch_counts +=1
            # Load batch to GPU
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            # Zero out any previously calculated gradients
            model.zero_grad()

            # Perform a forward pass. This will return logits.
            logits = model(b_input_ids, b_attn_mask)

            # Compute loss and accumulate the loss values
            loss = loss_fn(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and the learning rate
            optimizer.step()
            scheduler.step()

            # Print the loss values and time elapsed for every 20 batches
            if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                # Calculate time elapsed for 20 batches
                time_elapsed = time.time() - t0_batch

                # Print training results
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)

        print("-"*70)
        # =======================================
        #               Evaluation
        # =======================================
        if evaluation == True:
            # After the completion of each training epoch, measure the model's performance
            # on our validation set.
            val_loss, val_accuracy = evaluate(model, val_dataloader)

            # Print performance over the entire training data
            time_elapsed = time.time() - t0_epoch
            
            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*70)
        print("\n")
    
    print("Training complete!")
def evaluate(model, val_dataloader):
    """After the completion of each training epoch, measure the model's performance
    on our validation set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    # Tracking variables
    val_accuracy = []
    val_loss = []

    # For each batch in our validation set...
    for batch in val_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)

        # Compute loss
        loss = loss_fn(logits, b_labels)
        val_loss.append(loss.item())

        # Get the predictions
        preds = torch.argmax(logits, dim=1).flatten()

        # Calculate the accuracy rate
        accuracy = (preds == b_labels).cpu().numpy().mean() * 100
        val_accuracy.append(accuracy)

    # Compute the average accuracy and loss over the validation set.
    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)

    return val_loss, val_accuracy

In [25]:
if torch.cuda.is_available():       
    device = torch.device("cuda:0")


In [26]:
set_seed(42)    # Set seed for reproducibility
RNN1, optimizer, scheduler = initialize_model(epochs=2)
train(RNN1, train_dataloader, val_dataloader, epochs=2, evaluation=True)

UnboundLocalError: local variable 'RNN1' referenced before assignment