In [3]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [108]:
from bert_codes.feature_generation import combine_features,return_dataloader,return_cnngru_dataloader
from bert_codes.data_extractor import data_collector
from bert_codes.own_bert_models import *
from bert_codes.utils import *
from transformers import *
from tqdm import tqdm
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig

In [113]:
if torch.cuda.is_available():    
	# Tell PyTorch to use the GPU.    
	device = torch.device("cuda")
	print('There are %d GPU(s) available.' % torch.cuda.device_count())
	print('We will use the GPU:', torch.cuda.get_device_name(0))
# If not...
else:
	print('No GPU available, using the CPU instead.')
	device = torch.device("cpu")



There are 2 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


In [114]:
torch.cuda.set_device(1)

In [115]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
X = [[1, 2], [3, 4], [1, 2], [3, 4]]
y = [0, 0, 1, 1]
skf = StratifiedKFold(n_splits=2)
skf.get_n_splits(X, y)


skf=StratifiedKFold(n_splits=10, random_state=None, shuffle=False)
for train_index, test_index in skf.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]


ValueError: Cannot have number of splits n_splits=10 greater than the number of samples: n_samples=4.

In [116]:
params={
    'max_length':128,
    'path_files': '../../multilingual_hatespeech/multilingual_bert',
    'what_bert':'normal',
    'batch_size':32,
    'is_train':True,
    'learning_rate':2e-5,
    'epsilon':1e-8,
    'random_seed':2020,
    'epochs':5

}

In [117]:
def Eval_phase(params,test_loader,which_files='test',model=None):
    model.eval()
    print("Running eval on ",which_files,"...")
    t0 = time.time()
    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    # Tracking variables 
    eval_loss=0.0
    true_labels=[]
    pred_labels=[]
    # Evaluate data for one epoch
    for batch in test_dataloader:
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        # Telling the model not to compute or store gradients, saving memory and
        # speeding up validation
        with torch.no_grad():        
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask)

        logits = outputs[0]
        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Accumulate the total accuracy.
        pred_labels+=list(np.argmax(logits, axis=1).flatten())
        true_labels+=list(label_ids.flatten())

        # Track the number of batches
        nb_eval_steps += 1

    testf1=f1_score(true_labels, pred_labels, average='macro')
    testacc=accuracy_score(true_labels,pred_labels)

    # Report the final accuracy for this validation run.
    print(" Accuracy: {0:.2f}".format(testacc))
    print(" Fscore: {0:.2f}".format(testf1))
    print(" Test took: {:}".format(format_time(time.time() - t0)))
    return testf1,testacc


In [125]:
def cross_validate_bert(params):
    total_data=pd.read_csv('Total_data_annotated.csv')
    all_sentences = total_data.text
    all_labels=total_data.label
    print('Loading BERT tokenizer...')
    tokenizer = BertTokenizer.from_pretrained(params['path_files'], do_lower_case=False)
    input_total_ids,att_masks_total=combine_features(all_sentences,tokenizer,params['max_length'],
                                                     take_pair=False,take_target=False)
    
    ###optimizer
    
        
    skf=StratifiedKFold(n_splits=10, random_state=params['random_seed'], shuffle=False)
    for train_index, test_index in skf.split(input_total_ids, all_labels):
        print("TRAIN:", train_index, "TEST:", test_index)
        input_train_ids,att_masks_train,labels_train=input_total_ids[train_index],att_masks_total[train_index],all_labels[train_index]
        input_val_ids,att_masks_val,labels_val=input_total_ids[test_index],att_masks_total[test_index],all_labels[test_index]
        
        model=select_model(params['what_bert'],params['path_files'])
        model.cuda()
        optimizer = AdamW(model.parameters(),
                      lr = params['learning_rate'], # args.learning_rate - default is 5e-5, our notebook had 2e-5
                      eps = params['epsilon'] # args.adam_epsilon  - default is 1e-8.
                    )

        
        train_dataloader = return_dataloader(input_train_ids,labels_train,att_masks_train,batch_size=params['batch_size'],is_train=params['is_train'])
        validation_dataloader=return_dataloader(input_val_ids,labels_val,att_masks_val,batch_size=params['batch_size'],is_train=False)
        total_steps = len(train_dataloader) * params['epochs']

        # Create the learning rate scheduler.
        scheduler = get_linear_schedule_with_warmup(optimizer, 
                                                    num_warmup_steps = int(total_steps/10), # Default value in run_glue.py
                                                    num_training_steps = total_steps)

        # Set the seed value all over the place to make this reproducible.
        fix_the_random(seed_val = params['random_seed'])
        # Store the averaggit pull origin master --allow-unrelated-historiese loss after each epoch so we can plot them.
        loss_values = []

        bert_model = params['path_files']
        best_val_fscore=0
        best_test_fscore=0
        for epoch_i in range(0, params['epochs']):
            print("")
            print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, params['epochs']))
            print('Training...')

            # Measure how long the training epoch takes.
            t0 = time.time()

            # Reset the total loss for this epoch.
            total_loss = 0
            model.train()

            # For each batch of training data...
            for step, batch in tqdm(enumerate(train_dataloader)):

                # Progress update every 40 batches.
                if step % 40 == 0 and not step == 0:
                    # Calculate elapsed time in minutes.
                    elapsed = format_time(time.time() - t0)
                # `batch` contains three pytorch tensors:
                #   [0]: input ids 
                #   [1]: attention masks
                #   [2]: labels 
                b_input_ids = batch[0].to(device)
                b_input_mask = batch[1].to(device)
                b_labels = batch[2].to(device)
                # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
                model.zero_grad()        

                outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask, 
                            labels=b_labels)

                # The call to `model` always returns a tuple, so we need to pull the 
                # loss value out of the tuple.
                loss = outputs[0]
                # if(params['logging']=='neptune'):
                # 	neptune.log_metric('batch_loss',loss)
                # Accumulate the training loss over all of the batches so that we can
                # calculate the average loss at the end. `loss` is a Tensor containing a
                # single value; the `.item()` function just returns the Python value 
                # from the tensor.
                total_loss += loss.item()

                # Perform a backward pass to calculate the gradients.
                loss.backward()

                # Clip the norm of the gradients to 1.0.
                # This is to help prevent the "exploding gradients" problem.
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                # Update parameters and take a step using the computed gradient.
                # The optimizer dictates the "update rule"--how the parameters are
                # modified based on their gradients, the learning rate, etc.
                optimizer.step()
                # Update the learning rate.
                scheduler.step()

            # Calculate the average loss over the training data.
            avg_train_loss = total_loss / len(train_dataloader)
            train_fscore,train_accuracy=Eval_phase(params,'train',model)
            print('avg_train_loss',avg_train_loss)
            print('train_fscore',train_fscore)
            print('train_accuracy',train_accuracy)
            # Store the loss value for plotting the learning curve.
            loss_values.append(avg_train_loss)
            val_fscore,val_accuracy=Eval_phase(params,'val',model)		
            #Report the final accuracy for this validation run.
        

In [126]:
sent="I go here"

front_sent=sent[0:2]
back_sent=sent[-2:]
print(front_sent)
print(back_sent)

I 
re


In [127]:
cross_validate_bert(params)

I0426 18:27:41.718567 139787478947648 tokenization_utils.py:335] Model name '../../multilingual_hatespeech/multilingual_bert' not found in model shortcut name list (bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese, bert-base-german-cased, bert-large-uncased-whole-word-masking, bert-large-cased-whole-word-masking, bert-large-uncased-whole-word-masking-finetuned-squad, bert-large-cased-whole-word-masking-finetuned-squad, bert-base-cased-finetuned-mrpc, bert-base-german-dbmdz-cased, bert-base-german-dbmdz-uncased, bert-base-finnish-cased-v1, bert-base-finnish-uncased-v1). Assuming '../../multilingual_hatespeech/multilingual_bert' is a path, a model identifier, or url to a directory containing tokenizer files.
I0426 18:27:41.719625 139787478947648 tokenization_utils.py:364] Didn't find file ../../multilingual_hatespeech/multilingual_bert/added_tokens.json. We won't load it.
I0426 18:27:

Loading BERT tokenizer...
tokenizing in fear


I0426 18:27:43.344985 139787478947648 configuration_utils.py:231] loading configuration file ../../multilingual_hatespeech/multilingual_bert/config.json
I0426 18:27:43.346127 139787478947648 configuration_utils.py:256] Model config BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "directionality": "bidi",
  "do_sample": false,
  "eos_token_ids": 0,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.3,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_beams": 1,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "num_return_sequences": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "outpu

Input shape before truncating [[101, 57533, 14473, 100, 31984, 10225, 18782, 24800, 10824, 11483, 100, 100, 100, 576, 19713, 13328, 14187, 117, 100, 100, 10944, 70142, 100, 100, 10944, 15721, 11142, 100, 100, 100, 13492, 100, 100, 10944, 26347, 100, 100, 60449, 14187, 117, 100, 100, 11142, 100, 100, 11483, 100, 14187, 119, 119, 119, 100, 100, 554, 21304, 29988, 100, 100, 37955, 100, 100, 100, 15721, 117, 102, 582, 100, 119, 119, 119, 119, 119, 119, 119, 100, 100, 100, 100, 49004, 579, 11231, 25500, 119, 119, 119, 119, 14500, 533, 21377, 41469, 100, 15721, 100, 10944, 53836, 54540, 13142, 119, 119, 136, 136, 136, 136, 100, 100, 17024, 100, 100, 10944, 11384, 67021, 533, 13764, 25695, 11483, 100, 100, 11142, 11384, 100, 100, 11483, 100, 100, 100, 100, 100, 102], [101, 115, 100, 100, 45557, 32152, 12670, 100, 100, 70345, 117, 29931, 100, 100, 117, 46301, 554, 27185, 11059, 100, 115, 100, 100, 117, 45557, 32152, 12670, 100, 100, 100, 100, 100, 100, 11142, 100, 100, 36244, 100, 100, 29931, 

I0426 18:27:48.250554 139787478947648 modeling_utils.py:525] Weights of BertForSequenceClassification not initialized from pretrained model: ['classifier.weight', 'classifier.bias']
I0426 18:27:48.251491 139787478947648 modeling_utils.py:531] Weights from pretrained model not used in BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
0it [00:00, ?it/s]


Training...


7it [00:04,  1.89it/s]


NameError: name 'which_files' is not defined

In [None]:
total_data=pd.read_csv('Total_data_annotated.csv')

In [None]:
total_data.label.unique()