In [1]:
%load_ext autoreload
%autoreload 2

In [89]:
from bert_codes.feature_generation import combine_features,return_dataloader,return_dataloader_inference,return_cnngru_dataloader
from bert_codes.data_extractor import data_collector
from bert_codes.own_bert_models import *
from bert_codes.utils import *
from transformers import *
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import StratifiedKFold
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from sklearn.metrics import accuracy_score,f1_score
from utils_function import mapping_to_actual_data

In [5]:
if torch.cuda.is_available():    
	# Tell PyTorch to use the GPU.    
	device = torch.device("cuda")
	print('There are %d GPU(s) available.' % torch.cuda.device_count())
	print('We will use the GPU:', torch.cuda.get_device_name(0))
# If not...
else:
	print('No GPU available, using the CPU instead.')
	device = torch.device("cpu")



There are 2 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


In [6]:
torch.cuda.set_device(1)

In [7]:
params={
    'max_length':128,
    'path_files': 'models_saved/mbert_fearspeech/',
    'what_bert':'normal',
    'batch_size':32,
    'is_train':True,
    'learning_rate':2e-5,
    'epsilon':1e-8,
    'random_seed':2020,
    'weights':[1.0,9.0],
    'epochs':5

}

In [71]:
def BERT_for_inference(params,total_data=None):
    tokenizer = BertTokenizer.from_pretrained(params['path_files'], do_lower_case=False)
    model=select_model(params['what_bert'],params['path_files'],params['weights'])
    model.cuda()
    model.eval()
    all_sentences = total_data.text    
    input_total_ids,att_masks_total=combine_features(all_sentences,tokenizer,params['max_length'],
                                                     take_pair=False,take_target=False)
    train_dataloader = return_dataloader_inference(input_total_ids,att_masks_total,batch_size=params['batch_size'],is_train=False)
    
    softmax = nn.Softmax(dim=1)
    
    pred_labels=[]
    pred_probab=[]
    
    for step, batch in tqdm(enumerate(train_dataloader)):
        # Add batch to GPU
        t0 = time.time()

        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask = batch
        # Telling the model not to compute or store gradients, saving memory and
        # speeding up validation
        with torch.no_grad():        
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask)

        logits = outputs[0]
        logits = softmax(logits)
        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        # Accumulate the total accuracy.
        pred_labels+=list(np.argmax(logits, axis=1).flatten())
        pred_probab+=list([ele[1] for ele in logits])
        
        # Track the number of batches

    # Report the final accuracy for this validation run.
    total_data['preds']=pred_labels
    total_data['pred_probab']=pred_probab
    print(" Test took: {:}".format(format_time(time.time() - t0)))
    return total_data

In [72]:
total_data=pd.read_pickle('../../Data/data_to_be_annotated.pkl')

In [95]:
total_data_filter=total_data[total_data['keywords_count']>=1]
total_data_left=total_data[total_data['keywords_count']<1]

In [96]:
len(total_data_filter)

47748

In [97]:
total_dataframe=BERT_for_inference(params,total_data_filter)

I0427 19:34:56.639440 140362865473344 tokenization_utils.py:335] Model name 'models_saved/mbert_fearspeech/' not found in model shortcut name list (bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese, bert-base-german-cased, bert-large-uncased-whole-word-masking, bert-large-cased-whole-word-masking, bert-large-uncased-whole-word-masking-finetuned-squad, bert-large-cased-whole-word-masking-finetuned-squad, bert-base-cased-finetuned-mrpc, bert-base-german-dbmdz-cased, bert-base-german-dbmdz-uncased, bert-base-finnish-cased-v1, bert-base-finnish-uncased-v1). Assuming 'models_saved/mbert_fearspeech/' is a path, a model identifier, or url to a directory containing tokenizer files.
I0427 19:34:56.641127 140362865473344 tokenization_utils.py:364] Didn't find file models_saved/mbert_fearspeech/added_tokens.json. We won't load it.
I0427 19:34:56.642192 140362865473344 tokenization_utils.py:416]

tokenizing in fear
Input shape before truncating [[101, 100, 100, 100, 100, 100, 11483, 70843, 564, 35877, 102, 564, 117, 577, 90471, 100, 100, 571, 11231, 23562, 118, 23507, 38688, 100, 100, 102], [101, 100, 11483, 100, 100, 100, 100, 100, 11263, 569, 73183, 10949, 100, 571, 11231, 23562, 100, 100, 31984, 100, 100, 11483, 100, 37208, 13901, 100, 100, 45729, 42516, 12334, 100, 571, 11231, 23562, 100, 95358, 39509, 11483, 100, 100, 73349, 13060, 100, 95200, 118, 100, 100, 11263, 36533, 21671, 13328, 100, 21426, 532, 78135, 566, 13060, 14870, 102, 100, 100, 100, 100, 12334, 100, 100, 58106, 100, 100, 11142, 29521, 100, 95200, 118, 100, 100, 100, 170, 11677, 31519, 142, 10534, 142, 11291, 10285, 142, 12873, 44779, 10390, 142, 21809, 100, 100, 100, 44779, 10390, 142, 21809, 100, 100, 10944, 100, 18262, 100, 100, 102], [101, 100, 117, 100, 117, 100, 100, 100, 100, 20574, 100, 100, 102, 564, 532, 14256, 21310, 11483, 100, 100, 11263, 100, 119, 100, 15807, 100, 102], [101, 19142, 100, 100, 19

1493it [03:56,  6.33it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


 Test took: 0:00:05


In [98]:
total_data_left['preds']=list(np.zeros((len(total_data_left)),dtype=int))
total_data_left['pred_probab']=list(np.zeros((len(total_data_left)),dtype=float))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [99]:
temp=pd.concat([total_dataframe,total_data_left],axis=0,sort=True)

In [100]:
temp.head(5)

Unnamed: 0_level_0,keywords_count,pred_probab,preds,repeated messages,text,three annotator,translated
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
24,1,0.01063,0,[14967],"दिग्गज BJP ऩेता ने मुस्लिम को बनाया दामाद, ...","[17c0af28-67ba-4d4d-a42d-75c12a47484c, 4af38fe...","Veteran BJP leader made Muslim son-in-law, Mod..."
176,1,0.377079,0,"[1416733, 1593926]",14 फ़रवरी को पुलवामा में हुए आतंकी हमले का बदल...,"[35a4bb0c-447e-4aaf-9143-c6641041513f, 9d5aea6...","On February 14, Narendra Modi took the revenge..."
183,1,0.006503,0,[1593860],": अमेरिका, ब्रिटेन, फ्रांस ने UN में दिया आतं...","[3dd88c74-f576-4c4e-a8f2-74621846366e, c99e0d4...",": America, Britain, France gave a proposal to ..."
201,1,0.118407,0,[276271],": भारतीय एक्शन के बाद PAK पर सख्त अमेरिका, कह...","[c44cc375-b7ea-4371-bc00-716fda661aa7, a178c3a...",": Strict America on PAK after Indian action, s..."
371,1,0.010218,0,[616176],Launching Ceremony of Quranic Encyclopedia ht...,"[b3a926b5-d17e-4256-946f-99213973ce2b, ffdca11...",Launching Ceremony of Quranic Encyclopedia ht...


In [101]:
#total_data_to_annnotate=pd.read_pickle('../../Data/data_to_be_annotated.pkl')
actual_data=pd.read_csv('../../Data/new_data_lang_without_spam_translated.csv')

In [102]:
temp2=mapping_to_actual_data(temp,actual_data)

100%|██████████| 975989/975989 [01:23<00:00, 11707.82it/s]


[(0, 0, 0.0), (1, 0, 0.0), (2, 0, 0.0), (3, 0, 0.0), (4, 0, 0.0)]


In [104]:
len(temp2[temp2['pred_probabrobab']>0.8])

23232

In [106]:
temp2.to_csv('../../Data/new_data_lang_without_spam_translated_BERT_pred.csv',index=False)