# Pre-training Discriminator

In [None]:
!pip install keras-core --upgrade
!pip install -q keras-nlp --upgrade

Collecting keras-core
  Downloading keras_core-0.1.7-py3-none-any.whl (950 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m950.8/950.8 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Collecting namex (from keras-core)
  Downloading namex-0.0.7-py3-none-any.whl (5.8 kB)
Installing collected packages: namex, keras-core
Successfully installed keras-core-0.1.7 namex-0.0.7
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.1/590.1 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.5/6.5 MB[0m [31m50.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import numpy as np
import pandas as pd

import tensorflow as tf
import keras_core as keras
import keras_nlp

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

Using TensorFlow backend


In [None]:
# Load the labelled tweets from `train.csv` (read relative to the working
# directory) and preview the first rows.
df_train = pd.read_csv("train.csv")
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [None]:
# Hyper-parameters for discriminator pre-training.
# BATCH_SIZE: mini-batch size passed to `classifier.fit`.
# VAL_SPLIT:  fraction of the data held out by `train_test_split`.
# EPOCHS:     number of epochs passed to `classifier.fit`.
BATCH_SIZE = 4
VAL_SPLIT = 0.2
EPOCHS = 1

In [None]:
# Inputs are the raw tweet texts; labels are the `target` column.
X, y = df_train["text"], df_train["target"]

# Hold out VAL_SPLIT of the rows for validation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=VAL_SPLIT,
    random_state=42,
)

In [None]:
# Preset shared by the preprocessor and the classifier so that the vocabulary
# and the backbone weights match.
preset = "bert_tiny_en_uncased"

# Tokenizer + packer for tweets; `sequence_length=160` is the length passed to
# the preprocessor for every input sequence.
preprocessor = keras_nlp.models.BertPreprocessor.from_preset(preset, sequence_length=160, name="preprocessor_4_tweets")
# Two-class classifier built on the same preset, with the preprocessor attached
# so raw strings can be passed to fit/predict.
classifier = keras_nlp.models.BertClassifier.from_preset(preset, preprocessor=preprocessor, num_classes=2)

Downloading data from https://storage.googleapis.com/keras-nlp/models/bert_tiny_en_uncased/v1/vocab.txt
Downloading data from https://storage.googleapis.com/keras-nlp/models/bert_tiny_en_uncased/v1/model.h5


In [None]:
classifier.compile(
    # Sparse categorical loss: labels come from the `target` column and the
    # classifier was built with num_classes=2.
    # from_logits=True assumes the classifier head emits raw logits — TODO confirm.
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(1e-5),
    metrics=["accuracy"]
)

In [None]:
# Pre-train the discriminator on the training split, evaluating on the
# held-out split after each epoch; the returned object is kept in `history`.
history = classifier.fit(x=X_train,
                         y=y_train,
                         batch_size=BATCH_SIZE,
                         epochs=EPOCHS,
                         validation_data=(X_val, y_val)
                        )



# Pre-training Generator

In [None]:
%%capture
!pip install transformers
!pip install accelerate

In [None]:
import re
import string
import pandas as pd
import numpy as np

In [None]:
from transformers import Trainer, TextDataset, DataCollatorForLanguageModeling, TrainingArguments
from transformers import GPT2Tokenizer, PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast

In [None]:
df = pd.read_csv("train.csv")

# Keep only the rows whose `target` is 1; their text is the generator's
# training material.
texts = df.query("target == 1")
tweets = texts['text']

In [None]:
def preprocess_text(text):
    """Remove URLs and non-ASCII characters from a string.

    Two substitutions are applied in order: every run starting with ``http``
    up to the next whitespace is deleted, then every run of characters outside
    the ASCII range is deleted. Surrounding whitespace is left untouched.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    for pattern in (r'http\S+', r'[^\x00-\x7F]+'):
        text = re.sub(pattern, '', text)
    return text

In [None]:
cleaned_tweets = [preprocess_text(sentence) for sentence in tweets]
preprocessed_data = pd.DataFrame(cleaned_tweets)
preprocessed_data = preprocessed_data.dropna()

# Write a plain-text corpus with one tweet per line. `DataFrame.to_csv` is not
# used here because it adds a "0" header row and wraps tweets containing commas
# or quotes in CSV quoting, and both would leak into the language-model text.
with open('tweets.txt', 'w', encoding='utf-8') as corpus_file:
    for cleaned in cleaned_tweets:
        # Collapse runs of whitespace (including embedded newlines) so each
        # tweet occupies exactly one line, and skip tweets that cleaning
        # reduced to nothing.
        one_line = " ".join(cleaned.split())
        if one_line:
            corpus_file.write(one_line + '\n')

preprocessed_data.head()

Unnamed: 0,0
0,Our Deeds are the Reason of this #earthquake M...
1,Forest fire near La Ronge Sask. Canada
2,All residents asked to 'shelter in place' are ...
3,"13,000 people receive #wildfires evacuation or..."
4,Just got sent this photo from Ruby #Alaska as ...


In [None]:
from os import path

def load_sentences(file_path):
    """Read a UTF-8 text file and return its non-blank lines.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one entry per line that is non-empty after stripping
        leading and trailing whitespace, in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        stripped_lines = (raw_line.strip() for raw_line in handle)
        return [text for text in stripped_lines if text]

def preprocess_sentences(sentences, tokenizer, block_size=128):
    """Tokenize sentences into fixed-length lists of input ids.

    Side effect: ``tokenizer.pad_token`` is set to ``tokenizer.eos_token`` so
    that ``padding='max_length'`` has a token to pad with.

    Args:
        sentences: Iterable of strings to tokenize.
        tokenizer: Callable tokenizer exposing ``eos_token`` / ``pad_token`` and
            returning a mapping with an ``'input_ids'`` entry.
        block_size: Length every sequence is truncated or padded to.

    Returns:
        A list with the ``'input_ids'`` of each sentence, in input order.
    """
    # The pad token does not depend on the sentence, so assign it once rather
    # than on every loop iteration.
    tokenizer.pad_token = tokenizer.eos_token

    return [
        tokenizer(
            sentence,
            add_special_tokens=True,
            truncation=True,
            padding='max_length',
            max_length=block_size,
        )['input_ids']
        for sentence in sentences
    ]

def train(train_file_path, model_name, output_dir, overwrite_output_dir,
          per_device_train_batch_size, num_train_epochs, save_steps):
    """Fine-tune a GPT-2 language model on a text file and save it to ``output_dir``.

    If ``output_dir`` already exists, the tokenizer and model are loaded from
    it (continuing from the previous run); otherwise they are loaded from
    ``model_name``.

    Args:
        train_file_path: Path of the plain-text training corpus.
        model_name: Model name or path used when no previous output exists.
        output_dir: Directory the tokenizer, model and checkpoints are saved to.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Forwarded to ``TrainingArguments``.
    """
    # Continue from a previously saved model when one exists in output_dir.
    if path.exists(output_dir):
        model_name = output_dir

    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    # Reuse EOS as the padding token so the saved tokenizer and the collator
    # have a pad token defined.
    tokenizer.pad_token = tokenizer.eos_token

    # overwrite_cache=True: TextDataset otherwise reuses features cached on
    # disk for this file name, which would ignore a rewritten training file.
    # NOTE(review): TextDataset appears to emit only full block_size-token
    # blocks, so a very small file may yield an empty dataset — confirm.
    dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=train_file_path,
        block_size=128,
        overwrite_cache=True,
    )

    # Causal language modelling (mlm=False): labels are the inputs themselves.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    tokenizer.save_pretrained(output_dir)

    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Save the starting weights too, so output_dir is loadable even if
    # training is interrupted.
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        save_steps=save_steps,
        report_to='none'
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )

    trainer.train()
    trainer.save_model()

In [None]:
# Generator pre-training configuration.
train_file_path = 'tweets.txt'      # corpus written by the preprocessing cell
model_name = 'gpt2'                 # starting checkpoint
output_dir = 'output_model'         # where the fine-tuned model is saved
overwrite_output_dir = True
per_device_train_batch_size = 4
num_train_epochs = 20
save_steps = 1000

train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps,
)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Step,Training Loss
500,3.9136
1000,2.9683
1500,2.3358
2000,1.8956
2500,1.592
3000,1.4403


In [None]:
def generate_text(sequence, max_length, gpt_model=None, gpt_tokenizer=None):
    """Sample one continuation of ``sequence`` from the fine-tuned generator.

    Args:
        sequence: Prompt; it is formatted to a string before encoding.
        max_length: Maximum total length passed to ``generate``.
        gpt_model: Optional already-loaded model. When omitted, the model is
            loaded from ``'output_model'``.
        gpt_tokenizer: Optional already-loaded tokenizer. When omitted, the
            tokenizer is loaded from ``'output_model'``.

    Returns:
        The decoded first generated sequence, with special tokens skipped.
    """
    # Loading from disk is expensive; callers generating many samples should
    # pass in a model/tokenizer they loaded once.
    if gpt_model is None:
        gpt_model = GPT2LMHeadModel.from_pretrained('output_model')
    if gpt_tokenizer is None:
        gpt_tokenizer = GPT2Tokenizer.from_pretrained('output_model')

    ids = gpt_tokenizer.encode(f'{sequence}', return_tensors='pt')
    final_outputs = gpt_model.generate(
        ids,
        do_sample=True,
        max_length=max_length,
        pad_token_id=gpt_model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    return gpt_tokenizer.decode(final_outputs[0], skip_special_tokens=True)

def generate_batch_fake(n):
    """Generate ``n`` sentences from the fine-tuned generator.

    Every sentence is sampled from a single-space prompt with a maximum length
    of 50. The model and tokenizer are loaded from ``'output_model'`` once per
    call and reused for all ``n`` samples.

    Args:
        n: Number of sentences to generate.

    Returns:
        A list of ``n`` generated strings (empty when ``n`` is not positive).
    """
    if n <= 0:
        return []

    max_len = 50
    sequence = " "

    # Load once per batch instead of once per sample.
    gpt_model = GPT2LMHeadModel.from_pretrained('output_model')
    gpt_tokenizer = GPT2Tokenizer.from_pretrained('output_model')

    return [
        generate_text(sequence, max_len, gpt_model, gpt_tokenizer)
        for _ in range(n)
    ]

# Test Them

In [None]:
# Dataset tweets used alongside generated ones in the adversarial round:
# skip the first three columns, then keep the rows whose `target` is 1.
# NOTE(review): the saved output below appears to show non-disaster tweets,
# which does not match the `target == 1` filter — confirm which label is intended.
examples = (
    pd.read_csv('train.csv')
    .iloc[:, 3:]
    .query("target == 1")
)
examples['text'].head()

15                  What's up man?
16                   I love fruits
17                Summer is lovely
18               My car is so fast
19    What a goooooooaaaaaal!!!!!!
Name: text, dtype: object

In [None]:
def train_bert(sentences, labels, epochs=2):
    """Fine-tune the global ``classifier`` on labelled sentences.

    Args:
        sentences: Input texts.
        labels: Class label for each sentence, in the same order.
        epochs: Number of passes over the data.
    """
    # Use the caller-supplied epoch count rather than the global EPOCHS
    # constant, so the `epochs` argument actually takes effect.
    classifier.fit(x=sentences,
                   y=labels,
                   epochs=epochs,
                   shuffle=True)

def test_bert(sentences):
    """Return the predicted class index for each sentence.

    The global ``classifier`` scores the sentences and the index of the
    highest score along axis 1 is taken for each row.
    """
    scores = classifier.predict(sentences)
    return np.argmax(scores, axis=1)

In [None]:
import random

def unison_shuffled_copies(a, b):
    """Shuffle two equal-length sequences with the same random permutation.

    Args:
        a: First sequence.
        b: Second sequence; must have the same length as ``a``.

    Returns:
        A ``zip`` object that yields the shuffled elements of ``a`` as one
        tuple and then the shuffled elements of ``b`` as another, with the
        original pairing preserved.
    """
    assert len(a) == len(b)
    # Shuffle the (a_i, b_i) pairs together, then transpose back.
    pairs = [(left, right) for left, right in zip(a, b)]
    random.shuffle(pairs)
    return zip(*pairs)

def training(N_EXAMPLES):
    """Run one adversarial round between the classifier and the generator.

    Steps:
      1. Generate ``N_EXAMPLES`` sentences with the generator (label 1) and
         sample ``N_EXAMPLES`` tweets from ``examples`` (label 0).
      2. Classify the combined batch and collect the "tricky" sentences:
         those with label 0 that the classifier predicted as class 1.
      3. Fine-tune the classifier on the combined batch.
      4. If there are tricky sentences, write them to
         ``tweets_supervision.txt`` and fine-tune the generator on that file.

    Args:
        N_EXAMPLES: Number of generated sentences and of sampled tweets.

    NOTE(review): with the labels assigned here, only the sampled dataset
    tweets (label 0) can end up in ``tricky``; generated sentences (label 1)
    never can. Confirm this matches the intended adversarial objective.
    """
    sentences = generate_batch_fake(N_EXAMPLES)
    labels = [1] * len(sentences)

    sampled_tweets = list(examples['text'].sample(n=N_EXAMPLES))
    sentences.extend(sampled_tweets)
    labels.extend([0] * len(sampled_tweets))

    # Score the batch before the classifier is updated on it.
    predictions = test_bert(sentences)

    # Keep the sentences whose predicted class is greater than their label,
    # i.e. label 0 predicted as 1.
    tricky = [
        sentence
        for sentence, predicted, label in zip(sentences, predictions, labels)
        if predicted > label
    ]

    # Fine-tune the classifier on the combined batch, requesting 2 epochs.
    train_bert(sentences, labels, 2)

    if not tricky:
        return

    train_file_path = 'tweets_supervision.txt'
    # Write as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding; the sampled tweets are raw text and may
    # contain non-ASCII characters.
    with open(train_file_path, 'w', encoding='utf-8') as f:
        for sentence in tricky:
            f.write(sentence + '\n')

    model_name = 'gpt2'  # or other GPT-2 variants
    output_dir = 'output_model'
    overwrite_output_dir = True
    # One batch holds every tricky sentence.
    per_device_train_batch_size = len(tricky)
    num_train_epochs = 2
    save_steps = 1

    train(train_file_path, model_name, output_dir, overwrite_output_dir, per_device_train_batch_size, num_train_epochs, save_steps)

In [None]:
# Run a single adversarial round with one generated sentence and one sampled tweet.
training(1)