In [1]:
import torch
import pandas as pd 
import numpy as np 
from torch.utils.data import TensorDataset, SequentialSampler, DataLoader
from transformers import BertForSequenceClassification, BertTokenizer
from torch.nn import functional as F
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer

In [2]:
# Directory passed to from_pretrained() for both the model and the tokenizer.
output_dir = './bert_save_model/'

In [3]:
# Load the BERT sequence-classification model stored in output_dir.
model = BertForSequenceClassification.from_pretrained(output_dir)

In [4]:
# Load the BertTokenizer stored in the same directory as the model.
tokenizer = BertTokenizer.from_pretrained(output_dir)

In [None]:
# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    # Report how many GPUs are visible and the name of device 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

In [None]:
# Put the model in evaluation mode before running inference.
model.eval()

In [None]:
# Download the WordNet corpus that WordNetLemmatizer relies on.
nltk.download('wordnet')
# Shared lemmatizer instance; preprocess_text() reads this module-level name.
lemmatizer = WordNetLemmatizer()

In [8]:
def tweet_tokenize(text):
    """Lower-case a tweet and split it into tokens with NLTK's TweetTokenizer.

    Args:
        text: The raw tweet. Anything that is not a ``str`` (for example the
            float NaN pandas uses for a missing CSV cell, or ``None``) is
            treated as an empty tweet.

    Returns:
        list[str]: Tokens of the lower-cased tweet, or an empty list when
        ``text`` is not a string.
    """
    # Missing values have no .lower(); return no tokens instead of raising
    # AttributeError part-way through DataFrame.apply.
    if not isinstance(text, str):
        return []

    # Build the TweetTokenizer once and reuse it for every row. It is cached
    # on the function under its own name so it is never confused with the
    # notebook-level BERT `tokenizer`.
    tweet_tokenizer = getattr(tweet_tokenize, "_tweet_tokenizer", None)
    if tweet_tokenizer is None:
        tweet_tokenizer = TweetTokenizer()
        tweet_tokenize._tweet_tokenizer = tweet_tokenizer

    return tweet_tokenizer.tokenize(text.lower())

In [9]:
def preprocess_text(tokens):
    """Lemmatize tokens and join them into one whitespace-normalised string.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        str: The lemmatized tokens separated by single spaces, with no
        leading or trailing whitespace.
    """
    # `lemmatizer` is the module-level WordNetLemmatizer instance.
    lemmas = (lemmatizer.lemmatize(word) for word in tokens)
    # Splitting on whitespace and re-joining collapses repeated blanks and
    # drops empty pieces.
    return " ".join(" ".join(lemmas).split())

In [10]:
# Load the evaluation set; later cells read its 'tweet' and 'class' columns.
data = pd.read_csv('Davidson_modified.csv')

In [11]:
# Tokenize and lemmatize every tweet in one pass, replacing the raw text
# in the 'tweet' column with its cleaned form.
data['tweet'] = data['tweet'].apply(
    lambda raw_tweet: preprocess_text(tweet_tokenize(raw_tweet))
)
# Cleaned tweets as an array, consumed by the encoding step below.
sentences = data['tweet'].values

In [12]:
# Accumulators for the per-sentence token ids and attention masks.
input_ids, attention_masks = [], []

In [13]:
# Every encoded sentence is padded or truncated to this many tokens
# (the count includes the special [CLS] and [SEP] tokens).
max_length = 64

In [14]:
# Encode every sentence in one batched tokenizer call.
#
# The tensors are built fresh from `sentences` here rather than by appending
# to lists created in an earlier cell and then rebinding those names to
# tensors. That makes the cell safe to re-run: a second run no longer fails
# with "'Tensor' object has no attribute 'append'".
encoded_batch = tokenizer(
    list(sentences),               # plain list of str -> treated as a batch
    add_special_tokens=True,       # Add '[CLS]' and '[SEP]'
    max_length=max_length,         # Pad & truncate all sentences.
    padding='max_length',
    truncation=True,
    return_attention_mask=True,    # Construct attn. masks.
    return_tensors='pt',           # Return pytorch tensors.
)

# Both tensors have one row per sentence and `max_length` columns; the
# attention mask simply differentiates padding from non-padding.
input_ids = encoded_batch['input_ids']
attention_masks = encoded_batch['attention_mask']

# Set the batch size.
batch_size = 16

# Create the DataLoader. The sequential sampler keeps the rows in the same
# order as `data`, so predictions can be written back by position.
prediction_data = TensorDataset(input_ids, attention_masks)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

In [None]:
# Move the model to the selected device before running inference.
model.to(device)

# One array of raw logits per batch, in dataloader order.
predictions = []
for b_input_ids, b_input_mask in prediction_dataloader:
    # Put this batch's tensors on the same device as the model.
    b_input_ids = b_input_ids.to(device)
    b_input_mask = b_input_mask.to(device)

    # No gradients are needed for prediction, which saves time and memory.
    with torch.no_grad():
        outputs = model(b_input_ids, attention_mask=b_input_mask)

    # The first element of the model output holds the logits; bring them
    # back to the CPU as a NumPy array.
    predictions.append(outputs[0].detach().cpu().numpy())

# Stack all batches into one (rows, classes) array, turn the logits into
# probabilities and keep the probability of class index 1.
stack = np.vstack(predictions)
final_preds = F.softmax(torch.from_numpy(stack), dim=1)[:, 1].numpy()

data['predictions'] = final_preds
print(data[['tweet', 'predictions']])

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Ground-truth labels and hard predictions (class 1 when its probability
# is strictly greater than 0.5).
labels_true = data['class'].values
binary_predictions = (final_preds > 0.5).astype(int)

# Score the hard predictions against the labels, one metric at a time.
accuracy, f1, recall, precision = (
    metric(labels_true, binary_predictions)
    for metric in (accuracy_score, f1_score, recall_score, precision_score)
)

print("Accuracy:", accuracy)
print("F1-score:", f1)
print("Recall:", recall)
print("Precision:", precision)

# Replace the probability column with the thresholded 0/1 labels.
data['predictions'] = binary_predictions