In [1]:
%load_ext autoreload
%autoreload 2

In [4]:
from bert_codes.feature_generation import combine_features,return_dataloader,return_dataloader_inference,return_cnngru_dataloader
from bert_codes.data_extractor import data_collector
from bert_codes.own_bert_models import *
from bert_codes.utils import *
from transformers import *
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import StratifiedKFold
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from sklearn.metrics import accuracy_score,f1_score
from utils_function import mapping_to_actual_data

In [5]:
if torch.cuda.is_available():    
	# Tell PyTorch to use the GPU.    
	device = torch.device("cuda")
	print('There are %d GPU(s) available.' % torch.cuda.device_count())
	print('We will use the GPU:', torch.cuda.get_device_name(0))
# If not...
else:
	print('No GPU available, using the CPU instead.')
	device = torch.device("cpu")



There are 2 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


In [6]:
torch.cuda.set_device(1)

In [7]:
params={
    'max_length':128,
    'path_files': 'models_saved/mbert_fearspeech/',
    'what_bert':'normal',
    'batch_size':32,
    'is_train':True,
    'learning_rate':2e-5,
    'epsilon':1e-8,
    'random_seed':2020,
    'weights':[1.0,9.0],
    'epochs':5

}

In [45]:
def BERT_for_inference(params,total_data=None):
    tokenizer = BertTokenizer.from_pretrained(params['path_files'], do_lower_case=False)
    model=select_model(params['what_bert'],params['path_files'],params['weights'])
    model.cuda()
    model.eval()
    all_sentences = total_data.message_text    
    input_total_ids,att_masks_total=combine_features(all_sentences,tokenizer,params['max_length'],
                                                     take_pair=False,take_target=False)
    train_dataloader = return_dataloader_inference(input_total_ids,att_masks_total,batch_size=params['batch_size'],is_train=False)
    
    softmax = nn.Softmax(dim=1)
    
    pred_labels=[]
    pred_probab=[]
    
    for step, batch in tqdm(enumerate(train_dataloader)):
        # Add batch to GPU
        t0 = time.time()

        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask = batch
        # Telling the model not to compute or store gradients, saving memory and
        # speeding up validation
        with torch.no_grad():        
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask)

        logits = outputs[0]
        logits = softmax(logits)
        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        # Accumulate the total accuracy.
        pred_labels+=list(np.argmax(logits, axis=1).flatten())
        pred_probab+=list([ele[1] for ele in logits])
        
        # Track the number of batches

    # Report the final accuracy for this validation run.
    total_data['preds']=pred_labels
    total_data['pred_probab']=pred_probab
    print(" Test took: {:}".format(format_time(time.time() - t0)))
    return total_data

In [51]:
total_data=pd.read_pickle('../../Data/data_to_be_annotated.pkl')

In [52]:
total_data_filter=total_data[total_data['keywords_count']>1]
total_data_left=total_data[total_data['keywords_count']<=1]

In [54]:
len(total_data_filter)

29091

In [55]:
total_dataframe=BERT_for_inference(params,total_data_filter)

I0427 18:42:03.274488 140362865473344 tokenization_utils.py:335] Model name 'models_saved/mbert_fearspeech/' not found in model shortcut name list (bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese, bert-base-german-cased, bert-large-uncased-whole-word-masking, bert-large-cased-whole-word-masking, bert-large-uncased-whole-word-masking-finetuned-squad, bert-large-cased-whole-word-masking-finetuned-squad, bert-base-cased-finetuned-mrpc, bert-base-german-dbmdz-cased, bert-base-german-dbmdz-uncased, bert-base-finnish-cased-v1, bert-base-finnish-uncased-v1). Assuming 'models_saved/mbert_fearspeech/' is a path, a model identifier, or url to a directory containing tokenizer files.
I0427 18:42:03.275853 140362865473344 tokenization_utils.py:364] Didn't find file models_saved/mbert_fearspeech/added_tokens.json. We won't load it.
I0427 18:42:03.276958 140362865473344 tokenization_utils.py:416]

tokenizing in fear


0it [00:00, ?it/s]

Input shape before truncating [[101, 100, 100, 100, 100, 70142, 100, 100, 100, 10944, 100, 100, 591, 100, 100, 100, 70142, 100, 100, 100, 100, 10944, 100, 100, 102, 53836, 11354, 591, 100, 100, 567, 54060, 16129, 100, 100, 100, 100, 100, 100, 100, 100, 100, 15259, 100, 100, 100, 100, 10944, 100, 100, 591, 102], [101, 20696, 100, 11677, 31519, 142, 10534, 142, 11291, 10285, 142, 12873, 100, 118, 84068, 27577, 96807, 11551, 100, 577, 37120, 15678, 100, 100, 102, 573, 10824, 117, 100, 100, 17553, 142, 10171, 142, 22722, 100, 577, 11231, 11483, 100, 100, 100, 13492, 557, 10949, 33912, 15674, 100, 100, 102], [101, 14372, 100, 11384, 533, 13764, 25695, 100, 100, 100, 100, 19294, 100, 569, 47786, 10949, 21426, 100, 30441, 67715, 10855, 85066, 569, 47786, 11231, 117, 81508, 39509, 100, 547, 37107, 10949, 14500, 107, 571, 11231, 23562, 11483, 576, 68309, 566, 100, 107, 591, 88607, 47064, 11354, 37208, 100, 100, 591, 14965, 64581, 142, 12828, 142, 10171, 142, 150, 102, 61998, 10123, 118, 19452, 

32it [00:04,  6.43it/s]

 Test took: 0:00:00





Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,Unnamed: 0.1.1.1.1,group_id_anonymized,lang,message_text,phone_num_anonymized,timestamp,translated,preds,pred_probab
0,0,0,0,4.0,4.0,577,hi,*लघु सिंचाई* निःशुल्क बोरिंग योजना हेतु 55 कर...,178320,1549559608000,* Provision of Rs. 55 crore for minor irrigati...,0,0.00512
1,1,1,1,9.0,9.0,2037,hi,*📰FR62* *👉दिल्ली-एनसीआर में शिमला जैसी बर्फबार...,39877,1549559738000,* 📰 📰FR62 👉 * * Shimla-like snowfall in Delh...,0,0.004702
2,2,2,2,10.0,10.0,3634,hi,कल मैने एक आदमी से लिफ्ट माँगा उसके बाद धन्यवा...,198635,1549559843000,"Yesterday I asked for a lift from a man, after...",1,0.980105
3,3,3,3,11.0,11.0,2284,hi,कल मैने एक आदमी से लिफ्ट माँगा उसके बाद धन्यवा...,198635,1549559851000,"Yesterday I asked for a lift from a man, after...",1,0.980105
4,4,4,4,13.0,13.0,3700,te,సర్వేలు సూసి మురిసి పోవద్దు మిత్రులు బాబు జిత్...,153553,1549559896000,Surveys Don't Surprise Friends,0,0.015361


In [None]:
total_data_to_annnotate=pd.read_pickle('../../Data/data_to_be_annotated.pkl')
actual_data=pd.read_csv('../../Data/new_data_lang_without_spam_translated.csv')