In [None]:
import subprocess
for package in ["transformers==4.17.0",
"sentencepiece==0.1.97",
"torch==1.11.0+cu113",
"tokenizers==0.12.1",
"spacy==3.2.3",
"scikit-learn==1.1.1",
"pandas==1.3.4",
"numpy==1.20.3",
"nltk==3.6.5",
"matplotlib==3.4.3",
"datasets==2.6.1"]:
  subprocess.run(f"pip install {package}", shell=True)
subprocess.run("python -m spacy download en_core_web_sm", shell=True)


In [1]:
import re
import datetime
import spacy
import math
import random
import json
import hashlib
import datasets


from tqdm import tqdm
from collections import Counter
from typing import Dict, Any

import numpy as np

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers.tokenization_utils import TruncationStrategy

## bert
from transformers import BertTokenizer, BertModel,BertTokenizerFast
## longformer
from transformers import LongformerModel, LongformerTokenizer,LongformerTokenizerFast

##bigbert
from transformers import BigBirdTokenizer, BigBirdModel,BigBirdTokenizerFast
# torch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import Adam,AdamW
from torch.utils.data import Dataset, DataLoader

# sklearn
from sklearn.metrics import f1_score, accuracy_score
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score,matthews_corrcoef
from sklearn.metrics import confusion_matrix, classification_report


import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

2023-06-08 15:14:21.939226: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package punkt to /home/chunbae/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/chunbae/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/chunbae/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
import pandas as pd

train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [3]:
torch.manual_seed(0)
random.seed(0)
np.random.seed(0)


In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2478 entries, 0 to 2477
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   ID                  2478 non-null   object
 1   first_party         2478 non-null   object
 2   second_party        2478 non-null   object
 3   facts               2478 non-null   object
 4   first_party_winner  2478 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 96.9+ KB


In [5]:
porter = PorterStemmer()

def stem_text_func(text):
    token_words=word_tokenize(text)
    stem_sentence = []
    for word in token_words:
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

train_df['facts'] =train_df.facts.apply(lambda x : stem_text_func(x))

In [6]:
test_df['facts'] =test_df.facts.apply(lambda x : stem_text_func(x))

In [7]:
def bert_tok_len(text,tokenizer):
  inputs = tokenizer(text, return_tensors="pt",padding=True)
  return len(inputs['input_ids'][0])

In [8]:
def normalized_counter(c: Counter) -> Counter:
    total = sum(c.values(), 0.0)
    for key in c:
        c[key] /= total
    return c

In [9]:
def dict_hash(dictionary: Dict[str, Any]) -> str:
    """MD5 hash of a dictionary."""
    dhash = hashlib.md5()
    encoded = json.dumps(dictionary, sort_keys=True).encode()
    dhash.update(encoded)
    return dhash.hexdigest()

In [10]:
def get_stats(results,epoch_num,origin_labels,predict_outputs,dataset='test'):
    results[epoch_num + 1][dataset] = {}
    results[epoch_num + 1][dataset]['accuracy'] = accuracy_score(origin_labels, predict_outputs)

    results[epoch_num + 1][dataset]['recall_weighted'] = recall_score(origin_labels, predict_outputs, average='weighted')
    results[epoch_num + 1][dataset]['recall_micro'] = recall_score(origin_labels, predict_outputs, average='micro')
    results[epoch_num + 1][dataset]['recall_macro'] = recall_score(origin_labels, predict_outputs, average='macro')

    results[epoch_num + 1][dataset]['precision_weighted'] = precision_score(origin_labels, predict_outputs, average='weighted')
    results[epoch_num + 1][dataset]['precision_micro'] = precision_score(origin_labels, predict_outputs, average='micro')
    results[epoch_num + 1][dataset]['precision_macro'] = precision_score(origin_labels, predict_outputs, average='macro')
    results[epoch_num + 1][dataset]['classification_report'] = classification_report(origin_labels, predict_outputs, target_names=['lose','win'])
    results[epoch_num + 1][dataset]['confusion_matrix'] =  confusion_matrix(origin_labels, predict_outputs).tolist()
    results[epoch_num + 1][dataset]['matthews_corrcoef'] =  matthews_corrcoef(origin_labels, predict_outputs)
    
    return results

In [11]:
def filter_match_len(df_train,df_test,df_val,tokenizer,max_length):
    df_train['num_of_bert_token'] = df_train[TEXT_COL].apply(lambda x : bert_tok_len(x,tokenizer))
    df_test['num_of_bert_token'] = df_test[TEXT_COL].apply(lambda x : bert_tok_len(x,tokenizer))
    df_val['num_of_bert_token'] = df_val[TEXT_COL].apply(lambda x : bert_tok_len(x,tokenizer))
    df_train = df_train[df_train['num_of_bert_token']<=max_length]
    df_test = df_test[df_test['num_of_bert_token']<=max_length]
    df_val = df_val[df_val['num_of_bert_token']<=max_length]
    test = df_test.copy(deep=True)
    train = df_train.copy(deep=True)
    val = df_val.copy(deep=True)
    return train,test,val

In [12]:
class Dataset(torch.utils.data.Dataset):

    def __init__(self, df, label_col, text_col, max_length, tokenizer):
        self.labels = df[label_col].values
        self.texts = [tokenizer(text, 
                               padding='max_length', max_length = max_length, truncation=True,
                                return_tensors="pt") for text in df[text_col]]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

In [13]:
ACTIVATION_FUNCTIONS = {
    'Softmax' :nn.Softmax(),
    'LeakyRelu' : nn.LeakyReLU(),
    'Relu':nn.ReLU(),
      'GELU':nn.GELU()
}

In [14]:
class TextClassifier(nn.Module):

    def __init__(self, model, num_labels, num_features, dropout=0.001,freeze_layers = False,activation_func = 'Softmax'):
      
        super(TextClassifier, self).__init__()

        self.model = model
                
        if freeze_layers:
            for layer in self.model.encoder.layer[:-2]:
                for param in layer.parameters():
                    param.requires_grad = False

        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(num_features, num_labels)
        self.activation_layer = ACTIVATION_FUNCTIONS.get(activation_func)

    def forward(self, input_id, mask):

        _, pooled_output = self.model(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.activation_layer(linear_output)

        return final_layer

In [15]:
from tqdm.auto import tqdm

def train(model_obj, tokenizer, train_data, val_data, test_data, config,parameters,model_num,model_name='',accum_iter = 16,early_stopping = True,weight_decay=0 ):


    the_last_loss = 100
    triggertimes = 0
    patience = 2 if  early_stopping else config['epochs']
    
    
    train, val, test = Dataset(train_data, config['label_col'], config['text_col'], config['max_length'], tokenizer), \
                       Dataset(val_data, config['label_col'], config['text_col'], config['max_length'], tokenizer), \
                       Dataset(test_data, config['label_col'], config['text_col'], config['max_length'], tokenizer)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=config['batch_size'], shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=config['batch_size'])
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=config['batch_size'])

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    model = model_obj
    

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=parameters['learning_rate'],weight_decay = parameters['weight_decay'])

    if use_cuda:

            model = model.cuda()
            criterion = criterion.cuda()
    results = {}
    for epoch_num in range(config['epochs']):

            total_acc_train = 0
            total_loss_train = 0
                            
            total_acc_val = 0
            total_loss_val = 0
            
            train_labels = []
            train_outputs = []
            test_labels = []
            test_outputs = []
            val_labels = []
            val_outputs = []

            
            scaler = torch.cuda.amp.GradScaler() 
            
            for batch_idx,( train_text_input, train_label) in  enumerate(tqdm(train_dataloader)):

                train_label = train_label.to(device)
                mask = train_text_input['attention_mask'].to(device)
                input_id = train_text_input['input_ids'].squeeze(1).to(device)
                
                with torch.cuda.amp.autocast(): 
                    output = model(input_id, mask)
                
                    batch_loss = criterion(output, train_label)
                
                    batch_loss = batch_loss / accum_iter
                
                    total_loss_train += batch_loss.item()
                
                    acc = (output.argmax(dim=1) == train_label).sum().item()
                    total_acc_train += acc
                
                    train_labels.append(train_label.cpu().tolist())
                    train_outputs.append(output.argmax(dim=1).cpu().tolist())
                

                if ((batch_idx + 1) % accum_iter == 0) or (batch_idx + 1 == len(train_dataloader)):
                    model.zero_grad()
                    scaler.scale(batch_loss).backward()
                    scaler.step(optimizer)
                    scaler.update()



            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)
                    
                    
                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label)
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc
                    val_labels.append(val_label.cpu().tolist())
                    val_outputs.append(output.argmax(dim=1).cpu().tolist())


            with torch.no_grad():

                for test_input, test_label in test_dataloader:

                    test_label = test_label.to(device)
                    mask = test_input['attention_mask'].to(device)
                    input_id = test_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)

                    test_labels.append(test_label.cpu().tolist())
                    test_outputs.append(output.argmax(dim=1).cpu().tolist())

        
            val_labels = [item for sublist in val_labels for item in sublist]
            val_outputs = [item for sublist in val_outputs for item in sublist]
            
            test_labels = [item for sublist in test_labels for item in sublist]
            test_outputs = [item for sublist in test_outputs for item in sublist]
            
            train_labels = [item for sublist in train_labels for item in sublist]
            train_outputs = [item for sublist in train_outputs for item in sublist]
            
            
            results[epoch_num + 1] = {'train_loss': total_loss_train / len(train_data),
                                     'val_loss': total_loss_val / len(val_data),
                                     'train_acc': total_acc_train / len(train_data),
                                     'val_acc': total_acc_val / len(val_data)}
            
            
            results = get_stats(results,epoch_num,train_labels,train_outputs,dataset='train')
            results = get_stats(results,epoch_num,val_labels,val_outputs,dataset='val')
            results = get_stats(results,epoch_num,test_labels,test_outputs,dataset='test')


            
            #### Early Stopping ####
            
            current_loss = total_loss_val / len(val_data)
            if current_loss > the_last_loss:
                trigger_times += 1
                print('Trigger Times:', trigger_times)

                if trigger_times >= patience:
                    print('Early Stopping!\nStart to test process.')
                    return epoch_num + 1 ,results

            else:
                print('Trigger Times: 0')
                trigger_times = 0

            the_last_loss = current_loss
            
            
    return epoch_num + 1 ,results


In [16]:
TEXT_COL =   'facts' 
FILTER_LEN = 'truncate'

In [21]:
MAX_LENGTH = 512

models_conf = {
    f'LegalBert_{MAX_LENGTH}_1': 
    {'model': BertModel.from_pretrained('nlpaueb/legal-bert-base-uncased',force_download = True), 
    'tokenizer': BertTokenizer.from_pretrained('nlpaueb/legal-bert-base-uncased'),
    'max_length': [MAX_LENGTH],
    'batch_size': [8]},   
}

params = {
    'freeze':False,
    'truncate_text': FILTER_LEN,  #'truncate','filter_matching_length'                
    'learning_rate': 1e-5,   
    'activation_func': 'Softmax',  
    'mask_entities': False,
    'dropout': 0.001,
    'batch_size': 8,
    'weight_decay':0,  
}

config = {'label_col': 'first_party_winner',
          'text_col': TEXT_COL,
          'epochs':10}

Downloading:   0%|          | 0.00/0.99k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at nlpaueb/legal-bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [22]:
def train_and_evaluate(parameters,models_conf,config,base_df):
    
    df_tmp = base_df
    results = {}
    model_num = 0
    params_hash = dict_hash(parameters)


    for model_name, model_objects in models_conf.items():
        results = {}
        for max_length, dataset_batch_size in zip(model_objects['max_length'], model_objects['batch_size']):
            tokenizer = model_objects['tokenizer']
            pretrained_model =model_objects['model']
            
            df = df_tmp.reset_index()
                                    
            model = TextClassifier(model=pretrained_model, num_labels=df[config['label_col']].nunique(), num_features=768,dropout=parameters['dropout'],freeze_layers=parameters['freeze'],activation_func = parameters['activation_func'] )
            
            
            case_ids_train, case_ids_test = train_test_split(
            df.ID.drop_duplicates(), test_size=0.35, stratify=df[['ID',config['label_col']]].drop_duplicates()[config['label_col']], random_state=int(model_name.split('_')[-1]))

            df_test_tmp = df[df.ID.isin(case_ids_test.tolist())]

            case_ids_val, case_ids_test = train_test_split(case_ids_test, test_size=0.5, stratify=df_test_tmp[['ID',config['label_col']]].drop_duplicates()[config['label_col']], random_state=int(model_name.split('_')[-1]))

            df_train = df[df.ID.isin(case_ids_train.tolist())]
            df_test= df[df.ID.isin(case_ids_test.tolist())]
            df_val= df[df.ID.isin(case_ids_val.tolist())]
            
            
            if parameters['truncate_text'] == 'filter_matching_length':
                df_train, df_test, df_val = filter_match_len(df_train,df_test,df_val,tokenizer,max_length)
            
            

            config['max_length']=max_length
            config['batch_size']=dataset_batch_size
            
            # train
            max_epoch,results[f'{model_name}_{max_length}'] = train(model, tokenizer,df_train,df_val, df_test, config,parameters,model_num=model_num,model_name=model_name,accum_iter = parameters['batch_size']/dataset_batch_size)
            
            results[f'{model_name}_{max_length}']['params'] = parameters
            results[f'{model_name}_{max_length}']['label_dist'] = {'train':normalized_counter(Counter(df_train[config['label_col']])),
                                            'test':normalized_counter(Counter(df_test[config['label_col']])),
                                            'val':normalized_counter(Counter(df_val[config['label_col']]))}
            
            best_epoch = config['epochs'] if max_epoch == config['epochs'] else max_epoch-2
            results[f'{model_name}_{max_length}']['best_epoch'] = best_epoch        
            model_num += 1
            

            val = Dataset(df_val, config['label_col'], config['text_col'], config['max_length'],tokenizer)
            val_dataloader = torch.utils.data.DataLoader(val, batch_size=1)    
                                    
            torch.cuda.empty_cache()
            del model
            del tokenizer
            del pretrained_model
           
            print(results[f'{model_name}_{max_length}'][max_epoch]['test']['accuracy'])
            
        print('finished',model_name)
    return results

In [23]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()

In [24]:
import warnings
warnings.filterwarnings('ignore')

model_performance = train_and_evaluate(params,models_conf,config,base_df = train_df)

  0%|          | 0/202 [00:00<?, ?it/s]

Trigger Times: 0


  0%|          | 0/202 [00:00<?, ?it/s]

Trigger Times: 0


  0%|          | 0/202 [00:00<?, ?it/s]

Trigger Times: 1


  0%|          | 0/202 [00:00<?, ?it/s]

Trigger Times: 2
Early Stopping!
Start to test process.
0.5852534562211982
finished LegalBert_512_1
