In [43]:
import pandas as pd

## Setup

Format the train and test datasets so that the label is encoded as 0 = human, 1 = bot,
and remove the extraneous columns.

In [44]:
# Load the raw training split (the first CSV row holds the column names) and keep
# only the label and the tweet text; this also discards "screen_name" and "class_type".
train_df = pd.read_csv('archive/train.csv', header=0)[['account.type', 'text']].copy()

# Encode the label column only: 0 = human, 1 = bot.
# The previous loop rewrote 'human'/'bot' in *every* column, so a tweet whose whole
# text was "bot" or "human" was silently turned into a number. It also depended on
# DataFrame.iteritems(), which was removed in pandas 2.0.
# Values other than 'human'/'bot' are passed through unchanged, as before.
label_codes = {'human': 0, 'bot': 1}
train_df['account.type'] = train_df['account.type'].map(
    lambda label: label_codes.get(label, label)
)

train_df.head()

Unnamed: 0,account.type,text
0,1,YEA now that note GOOD
1,0,Listen to This Charming Man by The Smiths htt...
2,1,wish i can i would be seeing other hoes on the...
3,1,The decade in the significantly easier schedul...
4,1,"""Theim class=\""alignnone size-full wp-image-60..."


In [45]:
# Load the raw test split (the first CSV row holds the column names) and keep
# only the label and the tweet text; this also discards "screen_name" and "class_type".
test_df = pd.read_csv('archive/test.csv', header=0)[['account.type', 'text']].copy()

# Encode the label column only: 0 = human, 1 = bot.
# The previous loop rewrote 'human'/'bot' in *every* column, so a tweet whose whole
# text was "bot" or "human" was silently turned into a number. It also depended on
# DataFrame.iteritems(), which was removed in pandas 2.0.
# Values other than 'human'/'bot' are passed through unchanged, as before.
label_codes = {'human': 0, 'bot': 1}
test_df['account.type'] = test_df['account.type'].map(
    lambda label: label_codes.get(label, label)
)

test_df.head()

Unnamed: 0,account.type,text
0,0,justin timberlake really one of the goats if y...
1,0,Thank you @PMBhutan for your gracious prayers ...
2,0,Theory: the number of red lights you will hit ...
3,1,Respects on the Upt of the I good with the peo...
4,0,Might give the BASIC #10Liner game contest ano...


Reshape the data frames into the four-column layout that BERT expects:

Column 0: An ID for the row
Column 1: The label for the row (should be an int)
Column 2: A column of the same letter for all rows. BERT wants this so we’ll give it, but we don’t have a use for it.
Column 3: The text for the row

In [46]:
# Assemble the four BERT input columns for the training split:
# a running row id, the label, a constant filler letter, and the tweet text
# with every newline replaced by a single space.
n_train_rows = len(train_df)
train_text_single_line = train_df['text'].replace(r'\n', ' ', regex=True)
train_df_bert = pd.DataFrame(
    {
        'id': range(n_train_rows),
        'label': train_df['account.type'],
        'alpha': ['a'] * n_train_rows,
        'text': train_text_single_line,
    }
)
train_df_bert.head()

Unnamed: 0,id,label,alpha,text
0,0,1,a,YEA now that note GOOD
1,1,0,a,Listen to This Charming Man by The Smiths htt...
2,2,1,a,wish i can i would be seeing other hoes on the...
3,3,1,a,The decade in the significantly easier schedul...
4,4,1,a,"""Theim class=\""alignnone size-full wp-image-60..."


In [47]:
# Assemble the four BERT input columns for the dev split (built from the test data):
# a running row id, the label, a constant filler letter, and the tweet text
# with every newline replaced by a single space.
n_dev_rows = len(test_df)
dev_text_single_line = test_df['text'].replace(r'\n', ' ', regex=True)
dev_df_bert = pd.DataFrame(
    {
        'id': range(n_dev_rows),
        'label': test_df['account.type'],
        'alpha': ['a'] * n_dev_rows,
        'text': dev_text_single_line,
    }
)
dev_df_bert.head()

Unnamed: 0,id,label,alpha,text
0,0,0,a,justin timberlake really one of the goats if y...
1,1,0,a,Thank you @PMBhutan for your gracious prayers ...
2,2,0,a,Theory: the number of red lights you will hit ...
3,3,1,a,Respects on the Upt of the I good with the peo...
4,4,0,a,Might give the BASIC #10Liner game contest ano...


Save the data as CSV files (`data/train.csv` and `data/dev.csv`), without a header row or index column

In [48]:
# Write the training frame to data/train.csv with neither the index nor a header row.
# NOTE(review): the "data" directory is assumed to exist already — confirm.
train_df_bert.to_csv('data/train.csv', index=False, header=False)

In [49]:
# Write the dev frame to data/dev.csv with neither the index nor a header row.
# NOTE(review): the "data" directory is assumed to exist already — confirm.
dev_df_bert.to_csv('data/dev.csv', index=False, header=False)

## Data to Features

In [50]:
import torch
import pickle
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
from torch.nn import CrossEntropyLoss, MSELoss

from tqdm import tqdm_notebook, trange
import os
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM, BertForSequenceClassification
from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule

from multiprocessing import Pool, cpu_count
from bert_tools import *
import convert_examples_to_features

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
# Emit INFO-level log records (e.g. the vocabulary-loading message shown when the tokenizer is created).
logging.basicConfig(level=logging.INFO)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [51]:
# The input data dir. Should contain the data files for the task
# (the cells above write train.csv and dev.csv into it).
DATA_DIR = "data/"

# Bert pre-trained model selected in the list: bert-base-uncased,
# bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased,
# bert-base-multilingual-cased, bert-base-chinese.
# NOTE(review): the tokenizer cell further down loads 'bert-base-uncased' directly instead of
# reading this constant — confirm which checkpoint is intended.
BERT_MODEL = 'bert-base-cased'

# The name of the task to train; it is embedded in the output and report paths below.
TASK_NAME = 'tweep_tuned'

# The output directory where the fine-tuned model and checkpoints will be written.
OUTPUT_DIR = f'outputs/{TASK_NAME}/'

# The directory where the evaluation reports will be written to.
REPORTS_DIR = f'reports/{TASK_NAME}_evaluation_report/'

# This is where BERT will look for pre-trained models to load parameters from.
CACHE_DIR = 'cache/'

# The maximum total input sequence length after WordPiece tokenization.
# Sequences longer than this will be truncated, and sequences shorter than this will be padded.
MAX_SEQ_LENGTH = 128

# Optimisation hyper-parameters.
TRAIN_BATCH_SIZE = 24
EVAL_BATCH_SIZE = 32
# NOTE(review): 2e-3 is far above the 2e-5 to 5e-5 range usually recommended for
# fine-tuning BERT — confirm this value is intentional.
LEARNING_RATE = 2e-3
NUM_TRAIN_EPOCHS = 1
RANDOM_SEED = 42
GRADIENT_ACCUMULATION_STEPS = 1
WARMUP_PROPORTION = 0.1
OUTPUT_MODE = 'classification'

# File names for the saved model configuration and weights.
CONFIG_NAME = "config.json"
WEIGHTS_NAME = "pytorch_model.bin"

In [52]:
# Lower-case aliases of two configuration constants.
output_mode = OUTPUT_MODE

cache_dir = CACHE_DIR

In [53]:
# Every run writes its evaluation report into its own numbered sub-directory
# (report_0, report_1, ...) of the reports root. The number is the count of
# entries already present in the root.
#
# The root is created unconditionally first. Previously an existing but *empty*
# root got no report_N sub-directory at all, so that run's reports went
# straight into the root.
# NOTE: REPORTS_DIR is rebound to the new sub-directory, so re-running this cell
# without re-running the configuration cell nests a further level.
reports_root = REPORTS_DIR
os.makedirs(reports_root, exist_ok=True)
REPORTS_DIR = os.path.join(reports_root, f'report_{len(os.listdir(reports_root))}')
os.makedirs(REPORTS_DIR)

In [54]:
# Refuse to write into an output directory that already has content;
# create the directory when it does not exist yet.
output_dir_exists = os.path.exists(OUTPUT_DIR)
if output_dir_exists and os.listdir(OUTPUT_DIR):
    raise ValueError("Output directory ({}) already exists and is not empty.".format(OUTPUT_DIR))
if not output_dir_exists:
    os.makedirs(OUTPUT_DIR)

In [55]:
# Build the training examples from the files in DATA_DIR and remember how many there are.
# NOTE(review): BinaryClassificationProcessor is not imported by name; presumably it
# comes from `from bert_tools import *` — verify.
processor = BinaryClassificationProcessor()
train_examples = processor.get_train_examples(DATA_DIR)
train_examples_len = len(train_examples)

In [56]:
# The set of class labels known to the processor, and how many there are.
label_list = processor.get_labels() # [0, 1] for binary classification
num_labels = len(label_list)

In [57]:
# Optimizer updates per epoch: examples divided by batch size and by the number of
# gradient-accumulation steps, truncated to an integer; then scaled by the epoch count.
updates_per_epoch = int(train_examples_len / TRAIN_BATCH_SIZE / GRADIENT_ACCUMULATION_STEPS)
num_train_optimization_steps = updates_per_epoch * NUM_TRAIN_EPOCHS

In [58]:
# Load pre-trained model tokenizer (vocabulary)
# NOTE(review): the checkpoint name is hard-coded here and differs from the
# BERT_MODEL constant ('bert-base-cased') — confirm which one is intended.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

INFO:pytorch_pretrained_bert.tokenization:loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at C:\Users\darkl\.pytorch_pretrained_bert\26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [59]:
# Map every label to its position in label_list.
label_map = {label: i for i, label in enumerate(label_list)}
# One argument tuple per training example:
# (example, label map re-keyed by int(label), MAX_SEQ_LENGTH, tokenizer, OUTPUT_MODE).
train_examples_for_processing = [(example, {int(key):label_map[key] for key in label_map}, MAX_SEQ_LENGTH, tokenizer, OUTPUT_MODE) for example in train_examples]
# Unpack the first tuple into module-level names. This rebinds label_map (now the
# int-keyed copy), tokenizer and output_mode, and raises IndexError when there are no examples.
# NOTE(review): looks like a leftover from interactive debugging — confirm whether later cells rely on these names.
example, label_map, max_seq_length, tokenizer, output_mode = train_examples_for_processing[0]

In [60]:
# Leave one core free, but never request fewer than one worker: on a single-core
# machine cpu_count() - 1 is 0 and Pool(0) raises ValueError.
process_count = max(1, cpu_count() - 1)
if __name__ ==  '__main__':
    # tqdm.tqdm_notebook is deprecated; tqdm.notebook.tqdm is its documented replacement.
    from tqdm.notebook import tqdm

    print(f'Preparing to convert {train_examples_len} examples..')
    print(f'Spawning {process_count} processes..')
    # Convert every (example, label_map, max_seq_length, tokenizer, output_mode) tuple
    # in a worker pool; imap yields results in input order, so train_features stays
    # aligned with train_examples.
    with Pool(process_count) as p:
        train_features = list(tqdm(p.imap(convert_examples_to_features.convert_example_to_feature, train_examples_for_processing), total=train_examples_len))

Preparing to convert 20712 examples..
Spawning 11 processes..


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20712.0), HTML(value='')))




KeyboardInterrupt: 

In [None]:
# Persist the converted training features next to the input data.
features_path = os.path.join(DATA_DIR, "train_features.pkl")
with open(features_path, "wb") as features_file:
    pickle.dump(train_features, features_file)

Download Pre Trained Model

In [None]:
# NOTE(review): this import rebinds BertForSequenceClassification, which was imported
# from pytorch_pretrained_bert earlier in the notebook — confirm the switch is intended.
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# Load pre-trained model (weights)
# The checkpoint name is hard-coded rather than taken from the BERT_MODEL constant.
model = BertForSequenceClassification.from_pretrained("bert-large-uncased", num_labels=num_labels)

Training Config

In [None]:
def _stack_feature(attribute):
    """Collect one attribute from every entry of train_features into a torch.long tensor."""
    return torch.tensor(
        [getattr(feature, attribute) for feature in train_features],
        dtype=torch.long,
    )


all_input_ids = _stack_feature('input_ids')
all_input_mask = _stack_feature('input_mask')
all_segment_ids = _stack_feature('segment_ids')
all_label_ids = _stack_feature('label_id')

feature_tensors = (all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

# Training batches are drawn in random order.
train_data = TensorDataset(*feature_tensors)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=TRAIN_BATCH_SIZE)

# Evaluation batches are read sequentially.
# NOTE(review): the evaluation dataset wraps the very same training tensors, so
# evaluation runs on the training data — confirm this is intended.
eval_data = TensorDataset(*feature_tensors)
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=EVAL_BATCH_SIZE)

In [None]:
# Inspection aid: show the first argument tuple prepared for feature conversion.
print(train_examples_for_processing[0])

In [None]:
from transformers import DistilBertTokenizerFast
# A second tokenizer, for the DistilBERT model used by the Trainer cells below.
tokenizer2 = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Tokenise every training tweet with truncation and padding enabled.
token_list = tokenizer2(train_df_bert['text'].tolist(), truncation=True, padding=True)
# NOTE: this rebinds label_list, which an earlier cell set to processor.get_labels();
# from here on it holds one label per training row.
label_list = train_df_bert['label'].tolist()

In [None]:
import torch

class TweepDataset(torch.utils.data.Dataset):
    """Dataset that pairs per-field encodings with one label per sample."""

    def __init__(self, encodings, labels):
        """Keep references to the encodings mapping and the label sequence."""
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the sample count, which is the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        The dict holds one tensor per encoding field plus the label under 'labels'.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample
    
# Batch size, shuffle flag and worker count.
# NOTE(review): this dict is not referenced anywhere else in the visible notebook — confirm it is still needed.
params = {'batch_size': 1024,
          'shuffle': True,
          'num_workers': 8}
    
def dummy_data_collector(input_id, input_mask, segment_id, label_id):
    """Bundle the four arguments into a dict.

    The keys, in order, are 'input_ids', 'input_mask', 'segment_id' and 'labels';
    the values are the arguments themselves, unmodified.
    """
    return {
        'input_ids': input_id,
        'input_mask': input_mask,
        'segment_id': segment_id,
        'labels': label_id,
    }
    
# Work on a subset whose size is half the number of training tweets.
# The size comes from the label list: len(token_list) is the number of encoding
# fields the tokenizer returned (input_ids, attention_mask, ...), not the number
# of tweets, so slicing by it selected almost nothing.
subset_size = len(label_list) // 2

# Slice every encoding field to the subset so that TweepDataset still receives a
# field -> per-sample-list mapping.
train_encodings = {field: values[:subset_size] for field, values in token_list.items()}
training_set = TweepDataset(train_encodings, label_list[:subset_size])

# TweepDataset needs tokenizer output, not raw strings (it calls .items() on the
# encodings), so the dev tweets are tokenised the same way as the training tweets.
# The dev split is capped at the same subset size as before.
dev_texts = dev_df_bert['text'].tolist()[:subset_size]
dev_labels = dev_df_bert['label'].tolist()[:subset_size]
dev_encodings = tokenizer2(dev_texts, truncation=True, padding=True)
validation_set = TweepDataset(dev_encodings, dev_labels)

In [None]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

# NOTE: this rebinds `model`, replacing the BERT model loaded in an earlier cell.
# NOTE(review): num_labels is not passed here; presumably the library default matches
# the two classes — confirm.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

# Settings for the Hugging Face Trainer: a single epoch, with results and logs
# written under ./results and ./logs.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=1,              # total number of training epochs
    per_device_train_batch_size=64,  # batch size per device during training
    per_device_eval_batch_size=1024,   # batch size for evaluation
    warmup_steps=1,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=1000,
)

# Train on the TweepDataset instances; the TensorDataset variants (train_data / eval_data)
# are left commented out.
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    # train_dataset=train_data,         # training dataset
    train_dataset=training_set,
    # eval_dataset=eval_data            # evaluation dataset
    eval_dataset=validation_set
)

In [None]:
# Run the training loop configured by training_args above.
trainer.train()