In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
import torch.nn as nn
import transformers

import tokenizers
from tqdm import tqdm

import random

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import accuracy_score, log_loss

from scipy.special import softmax

import warnings
warnings.filterwarnings('ignore')

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
import os 
os.getcwd()

'/home/thanish/Competition/Zindi/Tech4MentalHealth/Notebook'

In [3]:
config = {'train_path' : '../data/train_corrected.csv',
          'test_path' : '../data/test_corrected.csv',
          'train_batch_size' : 4,
          'valid_batch_size' : 8,
          'test_batch_size' : 64,
          'MAX_LEN' : 196,
          'EPOCH' : 3, 
          'BERT_PATH_desktop': '/home/thanish/bert_base_uncased',
          }

TOKENIZER = transformers.BertTokenizer(os.path.join(config['BERT_PATH_desktop'], 'vocab.txt'),lowercase = True)

In [4]:
train_DF = pd.read_csv(config['train_path'])
test_DF = pd.read_csv(config['test_path'])

train_DF.head()

Unnamed: 0,ID,text,label
0,SUAVK39Z,i feel that it was better i dream happy,Depression
1,9JDAGUV3,why do i get hallucinations,Drugs
2,419WR1LQ,i am stressed due to lack of financial support...,Depression
3,6UY7DX6Q,why is life important,Suicide
4,FYC0FTFB,how could i be helped to go through the depres...,Depression


In [5]:
# Convert the label to OHE
train_DF = pd.concat([train_DF[['ID', 'text']], pd.get_dummies(train_DF.label)], axis = 1)
train_DF

Unnamed: 0,ID,text,Alcohol,Depression,Drugs,Suicide
0,SUAVK39Z,i feel that it was better i dream happy,0,1,0,0
1,9JDAGUV3,why do i get hallucinations,0,0,1,0
2,419WR1LQ,i am stressed due to lack of financial support...,0,1,0,0
3,6UY7DX6Q,why is life important,0,0,0,1
4,FYC0FTFB,how could i be helped to go through the depres...,0,1,0,0
...,...,...,...,...,...,...
611,BOHSNXCN,what should i do to stop alcoholism,1,0,0,0
612,GVDXRQPY,how to become my oneself again,0,0,0,1
613,IO4JHIQS,how can someone stop it,1,0,0,0
614,1DS3P1XO,i feel unworthy,0,1,0,0


In [6]:
np.random.seed(100)
train_local, valid_local = train_test_split(train_DF,
                                            test_size = 0.2,
                                            random_state = 100)

train_local = train_local.reset_index(drop = True)
valid_local = valid_local.reset_index(drop = True)

print(train_local.shape, valid_local.shape)

(492, 6) (124, 6)


In [7]:
class form_input():
    
    def __init__(self, text_id, text, label, data_type = 'test'):
        self.data_type = data_type
        self.text_id = text_id
        self.text = text
        self.label = label
        self.max_len = config['MAX_LEN']
        self.tokenizer = TOKENIZER

        
    def __len__(self):
        return len(self.text)
    
    def __getitem__(self, item):
        inputs =  TOKENIZER.encode_plus(self.text[item])
        
        sub_id = self.text_id[item]
        ids = inputs['input_ids']
        tok_type_id = inputs['token_type_ids']
        att_mask = inputs['attention_mask']
        pad_len = self.max_len - len(ids)

        ids = ids + [0]*pad_len
        tok_type_id = tok_type_id + [0]*pad_len
        att_mask = att_mask + [0]*pad_len
        
        if self.data_type != 'test':
            label = self.label[item]
        else:
            label = 1
        
        return {'sub_id': sub_id,
                #'Actual_text': self.text[item],
                'ids': torch.tensor(ids, dtype = torch.long),
                'mask': torch.tensor(att_mask, dtype = torch.long),
                'token_type_ids': torch.tensor(tok_type_id, dtype = torch.long),
                'targets': torch.tensor(label, dtype = torch.long)}
    

In [8]:
lab_columns = ['Alcohol', 'Depression', 'Drugs', 'Suicide']

train_local_data = form_input(train_local.ID, train_local.text, train_local[lab_columns].values, 'train')
valid_local_data = form_input(valid_local.ID, valid_local.text, valid_local[lab_columns].values, 'train')
train_prod_data = form_input(train_DF.ID, train_DF.text, train_DF[lab_columns].values, 'train')
test_prod_data = form_input(test_DF.ID, test_DF.text, None, 'test')

train_local_data[10]

{'sub_id': 'BOHSNXCN',
 'ids': tensor([  101,  2054,  2323,  1045,  2079,  2000,  2644, 25519,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,

In [9]:
train_local_data_loader = DataLoader(train_local_data, 
                                     #shuffle=True,
                                     sampler = RandomSampler(train_local_data),
                                     batch_size=config['train_batch_size'])
valid_local_data_loader = DataLoader(valid_local_data,
                                     #shuffle=True,
                                     sampler = RandomSampler(valid_local_data),
                                     batch_size=config['valid_batch_size'])

train_prod_data_loader = DataLoader(train_prod_data, 
                                    #shuffle=True,
                                    sampler = RandomSampler(train_prod_data),
                                    batch_size=config['train_batch_size'])

test_prod_data_loader = DataLoader(test_prod_data,
                                   #shuffle=False,
                                   sampler = SequentialSampler(test_prod_data),
                                   batch_size=config['test_batch_size'])


In [10]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [11]:
class BERTMultiLabelSequenceClassification(torch.nn.Module):
    def __init__(self, num_labels):
        super(BERTMultiLabelSequenceClassification, self).__init__()
        self.num_labels = num_labels
        self.xlnet = transformers.BertModel.from_pretrained(config['BERT_PATH_Azure'])
        self.classifier = torch.nn.Linear(768, self.num_labels)
        
    def pool_hidden_state(self, last_hidden_state):
        "Pool the hidden output into a single mean vector"
        last_hidden_state = last_hidden_state[0]
        mean_last_hidden_state = torch.mean(last_hidden_state, 1)
        return mean_last_hidden_state
        
    def forward(self, input_ids, token_type_ids = None, attention_mask = None, labels = None):
        # Last layer
        last_hidden_state = self.xlnet(input_ids = input_ids, 
                                       token_type_ids = token_type_ids,
                                       attention_mask = attention_mask
                                       )
        # Pooled the outputs in a mean vector
        mean_last_hidden_state = self.pool_hidden_state(last_hidden_state)
        logits = self.classifier(mean_last_hidden_state)
        
        if labels is not None:
            loss_fct = nn.BCEWithLogitsLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), 
                            labels.view(-1, self.num_labels))
            return loss, logits
        else:
            return logits
    
        

In [12]:
if torch.cuda.device_count()>1:
    print("It has {} GPUs".format(torch.cuda.device_count()))

    random.seed(100)
    np.random.seed(100)
    torch.manual_seed(100)
    torch.cuda.manual_seed_all(100)
    
    model = BERTMultiLabelSequenceClassification(num_labels = len(lab_columns))
    
    model = nn.DataParallel(model)
    model.to(device)


It has 2 GPUs


In [13]:
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.named_parameters()) 
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

    
#loss = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(optimizer_grouped_parameters, lr = 3e-5)

In [14]:
def train_fn(data_loader, model, optimizer):
    model.train()
    
    random.seed(100)
    np.random.seed(100)
    torch.manual_seed(100)
    torch.cuda.manual_seed_all(100)
    
    train_loss  = 0
    for index, dataset in tqdm(enumerate(data_loader), total = len(data_loader)):
        ids = dataset['ids'].to(device, dtype = torch.long)
        mask = dataset['mask'].to(device, dtype = torch.long)
        token_type_ids = dataset['token_type_ids'].to(device, dtype = torch.long)
        target = dataset['targets'].to(device, dtype = torch.float)
    
        output = model(input_ids = ids,
                       token_type_ids = token_type_ids,
                       attention_mask = mask,
                       labels = target
                      )
        
        step_loss = output[0]
         
        step_loss.sum().backward()
        optimizer.step()
        optimizer.zero_grad()
        
        train_loss += step_loss
        
    print('Saving the model')
    torch.save(model, '../output/best_Bert_base_model.bin')
    
    print("Avg Train loss" , (train_loss/len(data_loader)))

In [15]:
def eval_fn(data_loader, model):
    model.eval()
    
    eval_loss = 0
    actual_output = []
    predicted_output = []
    with torch.no_grad():
        for index, dataset in tqdm(enumerate(data_loader), total = len(data_loader)):
            ids = dataset['ids'].to(device)
            token_type_ids = dataset['token_type_ids'].to(device)
            mask = dataset['mask'].to(device)
            target = dataset['targets'].to(device, dtype = torch.float)
            
            output = model(input_ids = ids,
                       token_type_ids = token_type_ids,
                       attention_mask = mask,
                       labels = target
                      )
            
            step_loss = output[0]
            prediction = output[1]
            
            eval_loss += step_loss
            
            actual_output.extend(target.detach().cpu().numpy().tolist())
            predicted_output.extend(prediction.detach().cpu().numpy().tolist())
        
        print("Avg Eval loss" , (eval_loss/len(data_loader)))
        
        return actual_output, predicted_output
            

In [16]:
EPOCHS = 3

for epoch in range(EPOCHS):
    
    # Training
    train_fn(data_loader = train_prod_data_loader,
             model = model,
             optimizer = optimizer)
    
    # Evaluation
    actual, predicted = eval_fn(data_loader = valid_local_data_loader,
                                model = model)

    actual = np.array(actual)
    #predicted_prob = np.array(predicted)
    predicted_prob = predicted
    predicted_class = np.argmax(np.array(predicted), axis = 1)

#    acc = accuracy_score(actual, predicted_class)
    log_ls = log_loss(actual, torch.tensor(predicted_prob).sigmoid())    
    
#    print("Epoch {}/{} Eval Accuracy: {}, Logloss: {}".format(epoch, EPOCHS, acc, log_ls))
    print("Epoch {}/{} Logloss: {}".format(epoch, EPOCHS, log_ls))

100%|██████████| 154/154 [01:09<00:00,  2.22it/s]


Saving the model


  0%|          | 0/16 [00:00<?, ?it/s]

Avg Train loss tensor([0.2913, 0.2686], device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 16/16 [00:03<00:00,  4.87it/s]
  0%|          | 0/154 [00:00<?, ?it/s]

Avg Eval loss tensor([0.1332, 0.0963], device='cuda:0')
Epoch 0/3 Logloss: 0.257651980305391


100%|██████████| 154/154 [01:08<00:00,  2.26it/s]


Saving the model


  0%|          | 0/16 [00:00<?, ?it/s]

Avg Train loss tensor([0.0807, 0.0752], device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 16/16 [00:03<00:00,  4.89it/s]
  0%|          | 0/154 [00:00<?, ?it/s]

Avg Eval loss tensor([0.0460, 0.0434], device='cuda:0')
Epoch 1/3 Logloss: 0.10625287875412934


100%|██████████| 154/154 [01:08<00:00,  2.26it/s]


Saving the model


  0%|          | 0/16 [00:00<?, ?it/s]

Avg Train loss tensor([0.0405, 0.0363], device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 16/16 [00:03<00:00,  4.95it/s]

Avg Eval loss tensor([0.0389, 0.0428], device='cuda:0')
Epoch 2/3 Logloss: 0.09431236906487855





In [22]:
actual, predicted_class, torch.tensor(predicted_prob).sigmoid()

(array([[1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 0., 1.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 0., 1.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 0., 1.],
        [0., 1., 0., 0.],
        [0., 0., 0., 1.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.],
        [0., 0., 0., 1.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0.,

# Test evaluation

In [23]:
model.eval()
actual_output = []
predicted_output = []

submission_ID = []
with torch.no_grad():
    for index, dataset in tqdm(enumerate(test_prod_data_loader), total = len(test_prod_data_loader)):
        sub_id = dataset['sub_id']
        ids = dataset['ids'].to(device)
        token_type_ids = dataset['token_type_ids'].to(device)
        mask = dataset['mask'].to(device)

        output = model(input_ids = ids,
                       token_type_ids = token_type_ids,
                       attention_mask = mask)
        
        submission_ID.extend(sub_id)
        predicted_output.extend(output.sigmoid().detach().cpu().numpy().tolist())
    predicted_output = np.array(predicted_output)
            
        #predicted_output.extend(output.sigmoid().detach().cpu().numpy().tolist())

100%|██████████| 5/5 [00:04<00:00,  1.22it/s]


In [24]:
final_output = pd.DataFrame(predicted_output)
final_output.columns = ['Alcohol', 'Depression', 'Drugs', 'Suicide']
final_output['ID'] = submission_ID

final_output = final_output[['ID', 'Depression', 'Alcohol', 'Suicide', 'Drugs']]
final_output

Unnamed: 0,ID,Depression,Alcohol,Suicide,Drugs
0,02V56KMO,0.840222,0.017918,0.178331,0.000914
1,03BMGTOK,0.996217,0.005663,0.005181,0.004419
2,03LZVFM6,0.996504,0.005618,0.004886,0.004279
3,0EPULUM5,0.996423,0.005957,0.004646,0.004397
4,0GM4C5GD,0.022224,0.977285,0.004340,0.009564
...,...,...,...,...,...
304,Z9A6ACLK,0.919801,0.016678,0.022881,0.005580
305,ZDUOIGKN,0.993578,0.011853,0.005127,0.003320
306,ZHQ60CCH,0.833529,0.033575,0.066997,0.001984
307,ZVIJMA4O,0.003794,0.588533,0.001981,0.378880
