In [6]:
import torch
import pandas as pd
import numpy as np

from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Lower-casing tokenizer for the uncased BERT base checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

# BERT base with a two-label sequence-classification head; attention maps and
# hidden states are not requested in the forward outputs.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)

model.to(device)

# Replace the pretrained weights with the saved checkpoint. The checkpoint
# tensors are mapped to the CPU while being deserialized.
# NOTE(review): torch.load unpickles the file - only load checkpoints you trust.
model.load_state_dict(
    torch.load('MyModel_BERT.model', map_location=torch.device('cpu'))
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [5]:
# Read the labelled messages from disk and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='data/final_df.csv')
df.head(n=5)

Unnamed: 0,message,actual,source,tbdf
0,I noticed that Prepared Statements seem to be ...,0,msr,none
1,"Yes, prepared statements are on my todo list. ...",1,msr,entitlement
2,The only downside with the SQL-based approach ...,0,msr,none
3,Prepared statements use a range of additional ...,0,msr,none
4,How does https://github.com/sidorares/nodejs-m...,1,msr,impatience


In [None]:
def test_model(dataloader_test):
    """Run the classifier over a dataloader and collect the raw logits.

    Uses the notebook-level ``model`` and ``device`` globals.

    Parameters
    ----------
    dataloader_test : iterable of tuple of torch.Tensor
        Each batch is ``(input_ids, attention_mask, ...)``. Any further
        elements (for example labels) are ignored: inference does not need
        them, and omitting them avoids computing a loss that is discarded.

    Returns
    -------
    numpy.ndarray
        Logits of every batch concatenated along axis 0, in dataloader order.
        For an empty dataloader an empty ``(0, model.config.num_labels)``
        array is returned instead of raising.
    """
    model.eval()

    predictions = []

    # Gradients are never needed for inference, so disable them for the loop.
    with torch.no_grad():
        for batch in dataloader_test:
            outputs = model(input_ids=batch[0].to(device),
                            attention_mask=batch[1].to(device))

            # With no `labels` passed there is no loss entry, so the logits
            # are the first element of the model output.
            logits = outputs[0]
            predictions.append(logits.cpu().numpy())

    if not predictions:
        # np.concatenate raises on an empty list; return an empty result.
        return np.empty((0, model.config.num_labels), dtype=np.float32)

    return np.concatenate(predictions, axis=0)

In [None]:
def detection(df, batch_size=8, max_length=512):
    """Predict a class index for every message in ``df``.

    Uses the notebook-level ``tokenizer`` and ``test_model``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``message`` column (texts to classify) and an
        ``actual`` column (labels, carried along in each batch).
    batch_size : int, default 8
        Number of messages per inference batch.
    max_length : int, default 512
        Token length every message is padded or truncated to.

    Returns
    -------
    numpy.ndarray
        1-D array with the argmax class index per row of ``df``, in row order.
        Empty when ``df`` has no rows.
    """
    if len(df) == 0:
        # Nothing to tokenize or classify.
        return np.array([], dtype=np.int64)

    # `padding`/`truncation` replace the deprecated `pad_to_max_length` flag;
    # explicit truncation keeps over-long messages within `max_length` tokens.
    encoded_test_val = tokenizer.batch_encode_plus(
        df.message.values.tolist(),
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )

    input_ids_test = encoded_test_val['input_ids']
    attention_masks_test = encoded_test_val['attention_mask']
    labels_test = torch.tensor(df.actual.values)

    dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)
    # Sequential sampling keeps predictions aligned with the rows of `df`.
    dataloader_test = DataLoader(dataset_test,
                                 sampler=SequentialSampler(dataset_test),
                                 batch_size=batch_size)

    pred_test = test_model(dataloader_test)
    preds_flat_test = np.argmax(pred_test, axis=1).flatten()
    return preds_flat_test

In [None]:
# Predicted class index for every row of `df`.
pred_by_refined_model = detection(df)

# Collect message, prediction and ground truth side by side. The label column
# belongs on the DataFrame: `pred_by_refined_model` is the array returned by
# `detection`, and indexing it with a string raises IndexError.
pred_by_refined_model_df = pd.DataFrame({
    'message': df['message'],
    'pred_by_refined_model': pred_by_refined_model,
    'actual': df['actual'],
})

display(pred_by_refined_model_df.head())

pred_by_refined_model_df.to_csv('data/pred_by_refined_model.csv', index=False)

In [None]:
from sklearn.metrics import classification_report

# Human-readable report for the notebook output.
print(classification_report(df['actual'], pred_by_refined_model))

# Same metrics as a table; after the transpose each row is one report entry
# and the index holds that entry's name.
report_df = pd.DataFrame(
    classification_report(df['actual'], pred_by_refined_model, output_dict=True)
).transpose()

# Keep the index when saving: it is the only place the row names are stored,
# so dropping it would leave the metrics unattributable.
report_df.to_csv('data/report_by_refined_model.csv', index=True, index_label='class')