<a href="https://colab.research.google.com/github/pds2122/capstone-project-kabobe/blob/main/models/model_zero_shot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party packages imported in the next cell (--quiet suppresses pip's output).
!pip install ndjson --quiet
!pip install transformers --quiet
!pip install datasets --quiet
!pip install --upgrade simpletransformers --quiet

In [None]:
import ndjson
import pandas as pd
#import gzip
from pathlib import Path
from google.colab import drive
import numpy as np
import gc

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
import torch
from datasets import Dataset, DatasetDict, load_metric
from simpletransformers.classification import ClassificationModel
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score, classification_report

import os
import tarfile

# Load train & test data

In [None]:
# Mount Google Drive and read the test data from it
drive.mount('/gdrive')
data_path = Path('/gdrive/MyDrive/industry_data_processed/data/final/')
file_name = 'df_test_de_sentiment.ndjson'

# ndjson.load parses the complete file in one call
with (data_path / file_name).open('rt', encoding='UTF-8') as file:
    data = ndjson.load(file)

df_test = pd.DataFrame(data)

Mounted at /gdrive


In [None]:
drive.mount('/gdrive')
data_path = Path('/gdrive/MyDrive/industry_data_processed/data/final/')
file_name = 'df_train_de_sentiment.ndjson'

# Parse the file line by line; ndjson.loads returns a list of records for every line
# (if the file is gzipped, open it with gzip.open() instead)
with (data_path / file_name).open('rt', encoding='UTF-8') as file:
    data = [ndjson.loads(line) for line in file]

# data is a nested list (one sub-list per line): flatten it to get the data into a DataFrame
flat_list = []
for sublist in data:
    flat_list.extend(sublist)
df_train = pd.DataFrame(flat_list)

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


# Zero shot - German distilBERT

Following the tutorial on: https://www.philschmid.de/bert-text-classification-in-a-different-language

When training the zero-shot model, make sure the GPU runtime is turned on!

In [None]:
# Industry names; the position of a name in this list is its integer class id.
class_list = [
    'Human Resources',
    'Information Technology and Services',
    'Automotive',
    'Mechanical or Industrial Engineering',
    'Medical Practice',
    'Insurance',
    'Construction',
    'Management Consulting',
    'Consumer Goods',
    'Legal Services',
    'Financial Services',
    'Wholesale',
    'Marketing and Advertising',
    'Real Estate',
    'Telecommunications',
    'Logistics and Supply Chain',
    'Renewables & Environment',
    'Recreational Facilities and Services',
    'Leisure, Travel & Tourism',
]

# Encode the industry name as its class id, keep text + id only (text first, id second)
# and give the two columns the names 'text' and 'label'.
df_test = (
    df_test
    .assign(pred_class=df_test['industry_label'].map(class_list.index))
    [['concatenated', 'pred_class']]
    .rename(columns={'concatenated': 'text', 'pred_class': 'label'})
)
df_train = (
    df_train
    .assign(pred_class=df_train['industry_label'].map(class_list.index))
    [['concatenated', 'pred_class']]
    .rename(columns={'concatenated': 'text', 'pred_class': 'label'})
)

In [None]:
# https://simpletransformers.ai/docs/classification-models/
train_args = {
    'reprocess_input_data': True,
    'fp16': False,
    'num_train_epochs': 5,
    # NOTE(review): early stopping presumably only takes effect when evaluation during
    # training is enabled as well - confirm against the simpletransformers docs.
    'use_early_stopping': True,
    #'overwrite_output_dir': True # use if already trained once in the same runtime
}

# The model type has to match the architecture of the checkpoint:
# 'distilbert-base-german-cased' is a DistilBERT checkpoint, so it is loaded with the
# 'distilbert' model type. With 'bert' the pretrained weights presumably do not load into
# the BERT model class (TODO confirm against the load warnings of a 'bert' run).
# NOTE(review): metrics recorded in this notebook were produced with the previous 'bert'
# setting; any code that reloads the saved model must use the 'distilbert' type as well.
# num_labels follows class_list so the output size always matches the label encoding.
model_distilBERT = ClassificationModel(
    'distilbert',
    'distilbert-base-german-cased',
    args=train_args,
    num_labels=len(class_list),
)

In [None]:
# Fine-tune model_distilBERT on the training DataFrame.
model_distilBERT.train_model(df_train)

In [None]:
def f1_multiclass(labels, preds):
    """Return the micro-averaged F1 score for multiclass predictions.

    The previous ``average='None'`` was the *string* 'None', which scikit-learn
    rejects. Micro averaging is used because the recorded F1 in this notebook
    equals the accuracy, which is what micro averaging yields for single-label
    multiclass data.
    """
    return f1_score(labels, preds, average='micro')


def recall_multiclass(labels, preds):
    """Return the macro-averaged recall (the default ``average='binary'`` fails on multiclass targets)."""
    return recall_score(labels, preds, average='macro')


def precision_multiclass(labels, preds):
    """Return the macro-averaged precision (the default ``average='binary'`` fails on multiclass targets)."""
    return precision_score(labels, preds, average='macro')


# Every keyword argument is an extra metric called as metric(labels, preds).
result, model_outputs, wrong_predictions = model_distilBERT.eval_model(
    df_test,
    f1=f1_multiclass,
    acc=accuracy_score,
    recall=recall_multiclass,
    precision=precision_multiclass,
)

Results:

* acc = 0.5321581705574083
* eval_loss = 1.9038320411386944
* f1 = 0.5321581705574083
* mcc = 0.4834822098667312

In [None]:
# save model
def pack_model(model_path='', file_name=''):
    """Pack the files located directly in ``model_path`` into ``<file_name>.tar.gz``.

    Only entries that ``os.walk`` reports as files of the top-level directory are
    archived; sub-directories are not descended into. Each file is added under the
    name ``{model_path}/{file}``.

    Args:
        model_path: Directory that contains the model files.
        file_name: Path of the archive to create, without the ``.tar.gz`` suffix.

    Raises:
        FileNotFoundError: If ``model_path`` is not an existing directory
            (previously this surfaced as an opaque IndexError).
    """
    # os.walk yields the top-level directory first, so taking only the first entry
    # avoids traversing the whole tree just to throw the rest away.
    top_level = next(os.walk(model_path), None)
    if top_level is None:
        raise FileNotFoundError(f'model directory not found: {model_path!r}')
    _, _, files = top_level

    with tarfile.open(file_name + '.tar.gz', 'w:gz') as f:
        # sorted() makes the member order independent of the directory listing order
        for file in sorted(files):
            f.add(f'{model_path}/{file}')

# run the function
pack_model('/content','distilBERT_german')

drive.mount('/gdrive')
!cp distilBERT_ger.tar.gz '/gdrive/MyDrive/industry_data_processed/models/zero_shot_distilbert'

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


# Zero shot - using auto model (sequence classification)

This model is not integrated into the app because it scores lower than the German distilBERT model. It also requires its input as a Hugging Face `Dataset` instead of a pandas `DataFrame`, which makes the handling in the Gradio app more complex. It is kept in the notebook for completeness.

In [None]:
# If executing this after executing the distilBERT section, make sure to reload df_train and df_test
# (that section reduces both frames to the columns 'text' and 'label').
# NOTE(review): this reads an 'industry' column, while the label-encoding cells read
# 'industry_label' - confirm that 'industry' exists in the reloaded data.
num_labels = len(df_test.industry.unique())

# Tokenizer and model are loaded from the same 'bert-base-cased' checkpoint;
# num_labels is passed so the classifier is built for that many classes.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
model = AutoModelForSequenceClassification.from_pretrained('bert-base-cased', num_labels=num_labels)

In [None]:
# Industry names; the position of a name in this list is its integer class id.
class_list = [
    'Human Resources',
    'Information Technology and Services',
    'Automotive',
    'Mechanical or Industrial Engineering',
    'Medical Practice',
    'Insurance',
    'Construction',
    'Management Consulting',
    'Consumer Goods',
    'Legal Services',
    'Financial Services',
    'Wholesale',
    'Marketing and Advertising',
    'Real Estate',
    'Telecommunications',
    'Logistics and Supply Chain',
    'Renewables & Environment',
    'Recreational Facilities and Services',
    'Leisure, Travel & Tourism',
]

# Add the class id of every row's industry name as the column 'pred_class'.
df_test['pred_class'] = df_test['industry_label'].map(class_list.index)
df_train['pred_class'] = df_train['industry_label'].map(class_list.index)

In [None]:
# Keep only the class id and the text, in that order.
kept_columns = ['pred_class', 'concatenated']
df_train = df_train[kept_columns]
df_test = df_test[kept_columns]

In [None]:
# change names: one shared mapping applied to both frames
column_names = {'pred_class': 'label', 'concatenated': 'text'}
df_train = df_train.rename(columns=column_names)
df_test = df_test.rename(columns=column_names)

In [None]:
# AutoModelForSequenceClassification needs input of type dataset -> convert df to dataset class from huggingface
# https://huggingface.co/docs/datasets/loading_datasets.html
dataset_train = Dataset.from_pandas(df_train)
dataset_test = Dataset.from_pandas(df_test)

# Bundle both splits under the keys 'train' and 'test'
final = DatasetDict({'train': dataset_train, 'test': dataset_test})

# use this if train test split still needs to be done
#dataset_train = dataset_train.train_test_split(test_size=0.3)

In [None]:
# following the tutorial from lecture 09 (NLP with huggingface)

def tokenize_function(df):
    """Tokenize the 'text' entries of ``df``, padding to the maximum length and truncating longer inputs."""
    texts = df['text']
    return tokenizer(texts, padding='max_length', truncation=True)

# batched=True hands the function batches of rows instead of single rows
tokenized_datasets = final.map(tokenize_function, batched=True)

  0%|          | 0/17 [00:00<?, ?ba/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

In [None]:
# Display the tokenized splits (features and row counts).
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16809
    })
    test: Dataset({
        features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 8396
    })
})

In [None]:
# Full splits
full_train_dataset = tokenized_datasets['train']
full_eval_dataset = tokenized_datasets['test']

# Reproducible 1000-row samples of each split (fixed shuffle seed) for quick runs
small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(1000))
small_eval_dataset = full_eval_dataset.shuffle(seed=42).select(range(1000))

In [None]:
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Compute the accuracy metric for a ``(logits, labels)`` pair.

    The predicted class of each row is the index of its largest logit.
    """
    logits, labels = eval_pred
    return metric.compute(predictions=np.argmax(logits, axis=-1), references=labels)

In [None]:
# 'test_trainer' is the output directory of the run.
# report_to takes the *string* 'none' to switch off all logging integrations; the Python
# value None leaves the default ("all installed integrations") in place, which is what
# the `--report_to` warning printed by the previous version pointed out.
training_args = TrainingArguments('test_trainer', report_to='none')

# Train and evaluate on the 1000-row samples.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Fine-tune the model on the trainer's train_dataset.
trainer.train()

In [None]:
# Re-create the Trainer around the same model object, now with compute_metrics attached
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Evaluate on the trainer's eval_dataset; the returned metrics are shown as the cell output.
trainer.evaluate()

In [None]:
# https://stackoverflow.com/questions/64663385/saving-and-reload-huggingface-fine-tuned-transformer

model_dir = '/gdrive/MyDrive/industry_data_processed/models'
# Join the path components instead of concatenating strings: the plain '+' dropped the
# separator and produced '.../modelszero_shot_bert/model'.
trainer.save_model(os.path.join(model_dir, 'zero_shot_bert', 'model'))