In [1]:
%load_ext autoreload
%autoreload 2

In [4]:
from bert_codes.feature_generation import combine_features,return_dataloader,return_dataloader_inference,return_cnngru_dataloader
from bert_codes.data_extractor import data_collector
from bert_codes.own_bert_models import *
from bert_codes.utils import *
from transformers import *
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import StratifiedKFold
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from sklearn.metrics import accuracy_score,f1_score
from utils_function import mapping_to_actual_data

In [5]:
if torch.cuda.is_available():    
	# Tell PyTorch to use the GPU.    
	device = torch.device("cuda")
	print('There are %d GPU(s) available.' % torch.cuda.device_count())
	print('We will use the GPU:', torch.cuda.get_device_name(0))
# If not...
else:
	print('No GPU available, using the CPU instead.')
	device = torch.device("cpu")



There are 2 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


In [6]:
torch.cuda.set_device(1)

In [7]:
params={
    'max_length':128,
    'path_files': 'models_saved/mbert_fearspeech/',
    'what_bert':'normal',
    'batch_size':32,
    'is_train':True,
    'learning_rate':2e-5,
    'epsilon':1e-8,
    'random_seed':2020,
    'weights':[1.0,9.0],
    'epochs':5

}

In [71]:
def BERT_for_inference(params,total_data=None):
    tokenizer = BertTokenizer.from_pretrained(params['path_files'], do_lower_case=False)
    model=select_model(params['what_bert'],params['path_files'],params['weights'])
    model.cuda()
    model.eval()
    all_sentences = total_data.text    
    input_total_ids,att_masks_total=combine_features(all_sentences,tokenizer,params['max_length'],
                                                     take_pair=False,take_target=False)
    train_dataloader = return_dataloader_inference(input_total_ids,att_masks_total,batch_size=params['batch_size'],is_train=False)
    
    softmax = nn.Softmax(dim=1)
    
    pred_labels=[]
    pred_probab=[]
    
    for step, batch in tqdm(enumerate(train_dataloader)):
        # Add batch to GPU
        t0 = time.time()

        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask = batch
        # Telling the model not to compute or store gradients, saving memory and
        # speeding up validation
        with torch.no_grad():        
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask)

        logits = outputs[0]
        logits = softmax(logits)
        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        # Accumulate the total accuracy.
        pred_labels+=list(np.argmax(logits, axis=1).flatten())
        pred_probab+=list([ele[1] for ele in logits])
        
        # Track the number of batches

    # Report the final accuracy for this validation run.
    total_data['preds']=pred_labels
    total_data['pred_probab']=pred_probab
    print(" Test took: {:}".format(format_time(time.time() - t0)))
    return total_data

In [72]:
total_data=pd.read_pickle('../../Data/data_to_be_annotated.pkl')

In [73]:
total_data_filter=total_data[total_data['keywords_count']>1]
total_data_left=total_data[total_data['keywords_count']<=1]

In [74]:
len(total_data_filter)

29091

In [75]:
total_dataframe=BERT_for_inference(params,total_data_filter)

I0427 18:52:50.893791 140362865473344 tokenization_utils.py:335] Model name 'models_saved/mbert_fearspeech/' not found in model shortcut name list (bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese, bert-base-german-cased, bert-large-uncased-whole-word-masking, bert-large-cased-whole-word-masking, bert-large-uncased-whole-word-masking-finetuned-squad, bert-large-cased-whole-word-masking-finetuned-squad, bert-base-cased-finetuned-mrpc, bert-base-german-dbmdz-cased, bert-base-german-dbmdz-uncased, bert-base-finnish-cased-v1, bert-base-finnish-uncased-v1). Assuming 'models_saved/mbert_fearspeech/' is a path, a model identifier, or url to a directory containing tokenizer files.
I0427 18:52:50.895341 140362865473344 tokenization_utils.py:364] Didn't find file models_saved/mbert_fearspeech/added_tokens.json. We won't load it.
I0427 18:52:50.896470 140362865473344 tokenization_utils.py:416]

tokenizing in fear
Input shape before truncating [[101, 100, 11483, 100, 569, 31614, 35877, 44779, 10390, 142, 21809, 566, 15807, 11231, 100, 117, 102, 100, 100, 21426, 100, 100, 100, 100, 100, 579, 11231, 73893, 13610, 534, 45498, 11354, 100, 119, 102], [101, 84859, 100, 85234, 570, 13560, 27185, 100, 29416, 64188, 52557, 100, 100, 170, 100, 100, 100, 11263, 119, 119, 119, 119, 119, 119, 119, 119, 119, 100, 87410, 100, 170, 170, 100, 100, 100, 100, 100, 118, 100, 11142, 73349, 49946, 16985, 11263, 170, 100, 553, 91680, 10824, 95691, 37120, 10949, 100, 100, 100, 576, 72292, 52460, 10944, 577, 16985, 11263, 170, 170, 102, 100, 100, 11263, 11263, 100, 100, 170, 100, 85234, 570, 13560, 27185, 100, 29416, 64188, 52557, 100, 100, 170, 170, 100, 100, 100, 100, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 15513, 100, 100, 83329, 37524, 100, 170, 24137, 100, 100, 100, 48413, 119, 119, 119, 119, 119, 100, 85234, 54022, 10949, 100, 170, 170, 100, 100, 100, 11263, 102], [101, 100, 47505, 707

910it [02:23,  6.33it/s]


 Test took: 0:00:05


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [76]:
total_data_left['preds']=list(np.zeros((len(total_data_left)),dtype=int))
total_data_left['pred_probab']=list(np.zeros((len(total_data_left)),dtype=float))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [77]:
temp=pd.concat([total_dataframe,total_data_left],axis=0)

In [78]:
total_data_

NameError: name 'total_data_' is not defined

In [None]:
total_data_to_annnotate=pd.read_pickle('../../Data/data_to_be_annotated.pkl')
actual_data=pd.read_csv('../../Data/new_data_lang_without_spam_translated.csv')