In [17]:
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, DistilBertForSequenceClassification, TrainingArguments, Trainer
import torch
# from sklearn.model_selection import train_test_split
import evaluate
import json
import datasets
from datasets import load_dataset
import html

In [9]:
def dataset():
    with open("reddit_dataset.json", "r") as f:
        data = json.load(f)
        
        
    sentences = []
    labels = []
    # dataset = {}
    for x in data:
        # print(x)
        sentences.append(x['parent_body']+" "+x['body'])
        labels.append(x['topic'])
        
    return sentences, labels

In [10]:
topic_to_label_map={
    'Education': 0,
    'Politics': 1,
    'Healthcare': 2,
    'Environment': 3,
    'Technology': 4,
    'unknown': 5
    }

sentences, labels = dataset()
# tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def make_dataset():
  ds = pd.DataFrame(list(zip(sentences,labels)), columns=['sentence','label'])
  ds_education = ds[ds['label'] == 'Education']
  ds_politics = ds[ds['label'] == 'Politics']
  ds_healthcare = ds[ds['label'] == 'Healthcare']
  ds_environment = ds[ds['label'] == 'Environment']
  ds_technology = ds[ds['label'] == 'Technology']
  ds_unknown = ds[ds['label'] == 'unknown']

  ds_education_test = ds_education.sample(1000)
  ds_politics_test = ds_politics.sample(1000)
  ds_healthcare_test = ds_healthcare.sample(1000)
  ds_environment_test = ds_environment.sample(1000)
  ds_technology_test = ds_technology.sample(1000)
  ds_unknown_test = ds_unknown.sample(1000)

  ds_education_train = ds_education[~(ds_education.index.isin(ds_education_test.index))]
  ds_politics_train = ds_politics[~(ds_politics.index.isin(ds_politics_test.index))]
  ds_healthcare_train = ds_healthcare[~(ds_healthcare.index.isin(ds_healthcare_test.index))]
  ds_environment_train = ds_environment[~(ds_environment.index.isin(ds_environment_test.index))]
  ds_technology_train = ds_technology[~(ds_technology.index.isin(ds_technology_test.index))]
  ds_unknown_train = ds_unknown[~(ds_unknown.index.isin(ds_unknown_test.index))]

  ds_train = pd.concat([ds_education_train, ds_politics_train, ds_healthcare_train, ds_environment_train, ds_technology_train, ds_unknown_train], ignore_index=True)
  ds_test = pd.concat([ds_education_test, ds_politics_test, ds_healthcare_test, ds_environment_test, ds_technology_test, ds_unknown_test], ignore_index=True)


  ds_train['label'] = [topic_to_label_map[x] for x in ds_train['label']]
  ds_test['label'] = [topic_to_label_map[x] for x in ds_test['label']]

  ds_train = ds_train.sample(frac=1)
  ds_test = ds_test.sample(frac=1)

  X_train, y_train = ds_train['sentence'].tolist(), ds_train['label'].tolist()
  X_test, y_test = ds_test['sentence'].tolist(), ds_test['label'].tolist()

#   X_train, X_val, y_train, y_val = train_test_split(X_train,y_train, test_size=0.2)

  
  # train_encodings = tokenizer(X_train, padding="max_length", truncation=True)
  # val_encodings = tokenizer(X_val, padding="max_length", truncation=True)
  # test_encodings = tokenizer(X_test, padding="max_length", truncation=True)

  return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = make_dataset()

In [11]:
train_dataset = datasets.Dataset.from_dict({'text':X_train,'labels':y_train})
val_dataset = datasets.Dataset.from_dict({'text':X_test,'labels':y_test})

In [13]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [14]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=6)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model=model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier

In [18]:
def tokenization(batched_text):
    batched_text['text'] = [html.unescape(o).replace("\r","").replace("\t","").replace("\n","").replace("\\","") for o in batched_text["text"]]
    result = tokenizer(batched_text['text'], padding = "max_length", truncation=True, max_length=16, return_overflowing_tokens=True, return_tensors='pt')
    sample_map = result.pop("overflow_to_sample_mapping")
    for key, values in batched_text.items():
        result[key] = [values[i] for i in sample_map]
    return result 

In [19]:
train_dataset = train_dataset.map(tokenization, batched='True', remove_columns=['text'])
val_dataset = val_dataset.map(tokenization, batched='True', remove_columns=['text'])

Map:   0%|          | 0/89868 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

In [22]:
train_dataset = train_dataset.remove_columns(['text'])
val_dataset = val_dataset.remove_columns(['text'])

In [23]:
def preprocess_logits_for_metrics(logits, labels):
  logits = torch.argmax(logits, dim=-1)
  return logits

In [24]:
acc = evaluate.load("accuracy")
prec = evaluate.load("precision")
rec = evaluate.load("recall")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # predictions = np.argmax(logits, axis=-1)
    return {'accuracy':acc.compute(predictions=predictions, references=labels), 'precision':prec.compute(predictions=predictions, references=labels, average='weighted')}

In [25]:
training_args = TrainingArguments(
    output_dir="topic_classifier_large",
#     overwrite_output_dir=True,                              
    max_steps=int(1203121*1/512),
    per_device_train_batch_size=512,
    per_device_eval_batch_size=32,
    evaluation_strategy="steps",
    logging_strategy = 'steps',
    save_strategy='no',
    gradient_accumulation_steps=30,
    logging_steps=340,
#     save_steps=1320,
    eval_steps=340,
    fp16=True,
    fp16_full_eval=False,
    learning_rate=1e-3,
    warmup_steps=100,
    lr_scheduler_type="cosine",
    weight_decay=0.01,
    disable_tqdm=False,
    push_to_hub=True,
    hub_strategy="end",
    hub_token="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"
)

In [26]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    compute_metrics=compute_metrics
)

Cloning https://huggingface.co/sentientconch/topic_classifier_large into local empty directory.


In [27]:
trainer.train()



Step,Training Loss,Validation Loss,Accuracy,Precision
340,0.7823,0.798974,{'accuracy': 0.756439923712651},{'precision': 0.7651589363456403}
680,0.3076,0.912868,{'accuracy': 0.7768213604577241},{'precision': 0.7817739819232371}
1020,0.1521,1.09699,{'accuracy': 0.786141131595677},{'precision': 0.7874208106676195}
1360,0.0772,1.337041,{'accuracy': 0.790794659885569},{'precision': 0.791876395754874}
1700,0.0467,1.516505,{'accuracy': 0.7934774316592499},{'precision': 0.794922506216383}
2040,0.0345,1.634045,{'accuracy': 0.7948251748251748},{'precision': 0.7963014349393203}


Trainer is attempting to log a value of "{'accuracy': 0.756439923712651}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.7651589363456403}" of type <class 'dict'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.7768213604577241}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.7817739819232371}" of type <class 'dict'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.786141131595677}"

TrainOutput(global_step=2349, training_loss=0.20676038335261826, metrics={'train_runtime': 5937.9727, 'train_samples_per_second': 6076.256, 'train_steps_per_second': 0.396, 'total_flos': 1.493608112869369e+17, 'train_loss': 0.20676038335261826, 'epoch': 29.99})

In [28]:
model.eval()
trainer.evaluate()

Trainer is attempting to log a value of "{'accuracy': 0.7949268912905276}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.7963768071805374}" of type <class 'dict'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 1.6683437824249268,
 'eval_accuracy': {'accuracy': 0.7949268912905276},
 'eval_precision': {'precision': 0.7963768071805374},
 'eval_runtime': 28.4818,
 'eval_samples_per_second': 2761.408,
 'eval_steps_per_second': 86.301,
 'epoch': 29.99}

In [None]:
trainer.push_to_hub(commit_message="1 epoch")

In [None]:
trainer.save_model()

Several commits (2) will be pushed upstream.
The progress bars may be unreliable.
