# Importing libraries and downloading dataset

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
%%capture
!pip install -q keras-nlp --upgrade

In [3]:
import numpy as np
import pandas as pd

import tensorflow as tf
import keras_nlp

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

Using TensorFlow backend


Downloading data from google drive

In [4]:
LABEL_REAL = 1
LABEL_FAKE = 0

In [6]:
df_train = pd.read_csv("../data/interim/train.csv")
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


Taking only first 100 rows for pre-training discriminator model

In [8]:
df = df_train.sample(n=1000)

In [9]:
BATCH_SIZE = 4
VAL_SPLIT = 0.8
EPOCHS = 1

Splitting data to train and test

In [10]:
X = df["text"].values
y = df["target"].values

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=VAL_SPLIT, random_state=42, stratify=y)

# Training Discriminator model

Downloading the model from keras_nlp package

In [11]:
preset = "bert_tiny_en_uncased"
preprocessor = keras_nlp.models.BertPreprocessor.from_preset(preset, sequence_length=160, name="preprocessor_4_tweets")
classifier = keras_nlp.models.BertClassifier.from_preset(preset, preprocessor=preprocessor, num_classes=2)

# preset = "albert_base_en_uncased"
# preprocessor = keras_nlp.models.AlbertPreprocessor.from_preset(preset, sequence_length=160, name="preprocessor_4_tweets")
# classifier = keras_nlp.models.AlbertClassifier.from_preset(preset, preprocessor=preprocessor, num_classes=2)

Downloading data from https://storage.googleapis.com/keras-nlp/models/bert_tiny_en_uncased/v1/vocab.txt
Downloading data from https://storage.googleapis.com/keras-nlp/models/bert_tiny_en_uncased/v1/model.h5


Compiling the discrimimnator model with Adam optimizer

In [12]:
classifier.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), #'binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"]
)

In [13]:
history = classifier.fit(x=X_train,
                         y=y_train,
                         batch_size=BATCH_SIZE,
                         epochs=EPOCHS,
                         validation_data=(X_val, y_val)
                        )



Creating a submission to measure from it

In [14]:
df_test = pd.read_csv('../data/interim/test.csv')
submission = pd.read_csv("../data/interim/sample_submission.csv")

In [15]:
test_pred = np.argmax(classifier.predict(df_test['text'].values), axis=1)



In [16]:
submission['target'] = test_pred.astype(int)
submission.to_csv('../data/final/submission_beg.csv', index=False)

# Pre-training Generator

## Importing libraries and installing needed dependencies

In [17]:
%%capture
!pip install transformers
!pip install accelerate

In [18]:
import re
import string
import pandas as pd
import numpy as np

from transformers import Trainer, TextDataset, DataCollatorForLanguageModeling, TrainingArguments
from transformers import GPT2Tokenizer, PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast

In [19]:
from transformers import logging

# Set the logging verbosity to ERROR to suppress download messages
logging.set_verbosity(logging.ERROR)

## Preprocessing data for Generator

In [20]:
df = pd.read_csv("../data/interim/train.csv")

labels = df['target']
tweets = df['text']

In [21]:
def preprocess_text(i):
    text = tweets[i]
    label = labels[i]

    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    text = f"{label}>>>{text}"
    return text

Taking only 1000 samples to simulate data shortage

In [22]:
NUM_SAMPLES = min(1000, len(labels))

indices = np.array(list(range(len(tweets))))
indices = np.random.permutation(indices)

preprocessed_data = pd.DataFrame([preprocess_text(indices[i]) for i in range(NUM_SAMPLES)])
# preprocessed_data = pd.DataFrame([preprocess_text(i) for i in range(NUM_SAMPLES)])
preprocessed_data = preprocessed_data.dropna()
preprocessed_data.to_csv('tweets.txt', index=False)
preprocessed_data.head()

Unnamed: 0,0
0,1>>>[HIGH PRIORITY] SEVERE THUNDERSTORM WATCH ...
1,1>>>70 Years After Atomic Bombs Japan Still St...
2,0>>>quick shut down the show take the stage do...
3,1>>>Japan on Thursday marks the 70th anniversa...
4,0>>>do he love me do he love me not I ain't a ...


## Creating functions for Generator training

In [23]:
from os import path

def load_sentences(file_path):
    """
    Load sentences from a file.

    Parameters:
        file_path (str): The path to the file containing sentences.

    Returns:
        list: A list of sentences read from the file.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        sentences = [line.strip() for line in file.readlines() if line.strip()]
    return sentences

def preprocess_sentences(sentences, tokenizer, block_size=128):
    """
    Tokenize and preprocess a list of sentences.

    Parameters:
        sentences (list): List of sentences to preprocess.
        tokenizer: Tokenizer object from the transformers library.
        block_size (int): The maximum length of the tokenized sentences.

    Returns:
        list: List of tokenized and preprocessed sentences.
    """
    input_ids = []

    for sentence in sentences:
        tokenizer.pad_token = tokenizer.eos_token
        tokenized_sentence = tokenizer(sentence, add_special_tokens=True, truncation=True, padding='max_length', max_length=block_size)
        input_ids.append(tokenized_sentence['input_ids'])

    return input_ids

def train(train_file_path, model_name, output_dir, overwrite_output_dir,
          per_device_train_batch_size, num_train_epochs, save_steps):

    """
    Train a GPT-like model on a given dataset.

    Parameters:
        train_file_path (str): Path to the training dataset file.
        model_name (str): Name or path of the pre-trained model to fine-tune.
        output_dir (str): Directory to save the trained model and tokenizer.
        overwrite_output_dir (bool): Whether to overwrite the output directory if it exists.
        per_device_train_batch_size (int): Batch size per GPU/CPU for training.
        num_train_epochs (int): Number of training epochs.
        save_steps (int): Number of steps between model checkpoints.

    Returns:
        None
    """

    # Check if the output directory already exists, and if so, use it as the model name
    if path.exists('output_model'):
        model_name = 'output_model'

    # Load tokenizer and sentences
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    sentences = load_sentences(train_file_path)
    input_ids = preprocess_sentences(sentences, tokenizer)

    # Create a TextDataset for training
    dataset = TextDataset(tokenizer=tokenizer, file_path=train_file_path, block_size=128)

    # Create data collator for language modeling
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    # Save tokenizer configuration
    tokenizer.save_pretrained(output_dir)

    # Load or create GPT-like model
    model = GPT2LMHeadModel.from_pretrained(model_name)
    model.save_pretrained(output_dir)

    # Set up training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        save_steps=save_steps,
        report_to='none',
        logging_dir=None
    )

    # Set up trainer and perform training
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )

    # Train the model
    trainer.train()

    # Save the trained model
    trainer.save_model()

In [24]:
train_file_path = 'tweets.txt'
model_name = 'gpt2'
output_dir = 'output_model'
overwrite_output_dir = True
per_device_train_batch_size = 4
num_train_epochs = 10
save_steps = 3000

train(train_file_path, model_name, output_dir, overwrite_output_dir, per_device_train_batch_size, num_train_epochs, save_steps)

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

{'loss': 3.6341, 'learning_rate': 0.0, 'epoch': 10.0}
{'train_runtime': 94.7053, 'train_samples_per_second': 21.118, 'train_steps_per_second': 5.28, 'train_loss': 3.634103759765625, 'epoch': 10.0}


In [25]:
def generate_text(sequence, max_length):
    """
    Generate text using a pre-trained GPT-2 model.

    Parameters:
        sequence (str): The input sequence to start text generation.
        max_length (int): The maximum length of the generated text.

    Returns:
        str: The generated text.
    """

    # Load pre-trained GPT-2 model and tokenizer
    gpt_model = GPT2LMHeadModel.from_pretrained('output_model')
    gpt_tokenizer = GPT2Tokenizer.from_pretrained('output_model')

    # Tokenize the input sequence and generate text
    ids = gpt_tokenizer.encode(f'{sequence}', return_tensors='pt')
    final_outputs = gpt_model.generate(
        ids,
        do_sample=True,
        max_length=max_length,
        pad_token_id=gpt_model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )

    # Decode and return the generated text without special tokens
    return gpt_tokenizer.decode(final_outputs[0], skip_special_tokens=True)

def generate_batch_fake(n):
    """
    Generate a batch of fake sentences using the generate_text function.

    Parameters:
        n (int): The number of fake sentences to generate.

    Returns:
        list: A list of generated fake sentences.

    Example usage:
        >>> fake_sentences = generate_batch_fake(1)
        >>> print(fake_sentences)
            Wisdom from the?????? 0@Toucan_brian I just liked that you say 'when there's a fight it breaks out and you've got to stop'.0@jakey930
    """
    fake = list()
    max_len = 50
    sequence = "1>>>" # Initial sequence to start generation

    # Generate n fake sentences
    for i in range(n):
        fake.append(generate_text(sequence, max_len))

    # Remove placeholder and additional tokens from the generated sentences
    fake_ = [sentence.replace('1>>>', '') for sentence in fake]
    fake_f = [sentence.replace('>>>', '') for sentence in fake_]
    fake_final = [sentence.replace('\n', '') for sentence in fake_f]

    return fake_final

In [26]:
fake = generate_batch_fake(1)
fake[0]

'Wreckage: 0Why are criminals still killing civilians? @JustinRove @JustinMScrobbins @davidgolfer @sam_golfer @BjRyleB'

# Train Generator and Discriminator together

In [27]:
examples = pd.read_csv('train.csv')
examples.drop([examples.columns[i] for i in range(3)], axis=1, inplace=True)
examples = examples[examples['target'] == LABEL_REAL]
examples['text'].head()

0    Our Deeds are the Reason of this #earthquake M...
1               Forest fire near La Ronge Sask. Canada
2    All residents asked to 'shelter in place' are ...
3    13,000 people receive #wildfires evacuation or...
4    Just got sent this photo from Ruby #Alaska as ...
Name: text, dtype: object

##Defining training functions

In [28]:
def train_discriminator(_sentences, _labels, epochs=2):
    """
    Train a discriminator model on input sentences and labels.

    Parameters:
        _sentences (list): List of input sentences for training.
        _labels (list): List of corresponding labels for training.
        epochs (int): Number of training epochs (default is 2).

    Returns:
        None
    """
    classifier.fit(x=_sentences,
                   y=_labels,
                   epochs=epochs,
                   verbose=0)

def test_discriminator(sentences):
    """
    Test a discriminator model on input sentences.

    Parameters:
        sentences (list): List of input sentences for testing.

    Returns:
        numpy.ndarray: Predicted labels for the input sentences.
    """
    # Using numpy to find the index of the maximum value in the predictions
    return np.argmax(classifier.predict(sentences, verbose=0), axis=1)

In [29]:
def accuracy(labels, predicted):
    """
    Calculate the accuracy of predicted labels compared to true labels.

    Parameters:
        labels (list): List of true labels.
        predicted (list): List of predicted labels.

    Returns:
        float: Accuracy as a percentage (value between 0 and 1).

    Example Usage:
        >>> true_labels =       [1, 0, 1, 1, 0]
        >>> predicted_labels =  [1, 0, 0, 1, 1]
        >>> acc = accuracy(true_labels, predicted_labels)
        >>> print(f"Accuracy: {acc}")
    """
    total = 0
    n = len(labels)

    # Count the number of correct predictions
    for i in range(n):
        if labels[i] == predicted[i]:
            total += 1

    # Calculate accuracy as the ratio of correct predictions to the total number of samples
    accuracy_value = total / n
    return accuracy_value

In [30]:
import random

def unison_shuffled_copies(a, b):
    """
    Shuffle two lists in unison.

    Parameters:
        a (list): First list to shuffle.
        b (list): Second list to shuffle.

    Returns:
        tuple: A tuple of shuffled lists (a, b).
    """
    assert len(a) == len(b)
    zipped = list(zip(a, b))
    random.shuffle(zipped)
    return zip(*zipped)

def training(N_EXAMPLES, debug=True, debug_file='debug.txt'):
    """
    Perform a training process, including data generation, model training, and evaluation.

    Parameters:
        N_EXAMPLES (int): Number of examples to generate and use for training.

    Returns:
        None
    """

    # Generate fake sentences and labels
    sentences = generate_batch_fake(N_EXAMPLES)
    labels = [LABEL_FAKE for i in range(len(sentences))]

    # Generate additional examples for comparison
    sentences.extend(list(examples['text'].sample(n=N_EXAMPLES).values))
    labels.extend([LABEL_REAL for i in range(len(sentences) - len(labels))])

    # Shuffle the sentences and labels together
    sentences, labels = unison_shuffled_copies(sentences, labels)

    # Convert to lists
    sentences_list = list(sentences)
    labels_list = list(labels)

    if debug:
        with open(debug_file, 'w') as f:
            f.write('Sentences: [')
            for i, sentence in enumerate(sentences_list):
                f.write('"' + sentence + '"')
                if i != len(sentences_list) - 1:
                    f.write(', ')
            f.write(']')

    # Test the discriminator model on the generated data
    values = test_discriminator(sentences_list)

    print('Labels: ', labels_list)
    print('Preds:  ', list(values))

    # Print and log the accuracy of the evaluator
    print(f"Accuracy of Evaluator: {accuracy(labels_list, values)}")

    # Train the discriminator odel on the shuffled data for 2 epochs
    train_discriminator(sentences_list, labels_list, 2)

    print(f'GPT training logs: ', end='')
    # Save the labeled data to a file for further training
    file_path = "tweets_supervision.txt"
    with open(file_path, 'w') as f:
        f.write("0\n")
        for i in range(len(values)):
            if values[i] == LABEL_FAKE and labels_list[i] == LABEL_FAKE:
                continue

            label_to_write = values[i]
            if labels_list[i] == LABEL_FAKE:
                label_to_write = LABEL_REAL
            f.write(f"{label_to_write}>>>{sentences_list[i]}\n")

    # Train the GPT-2 model using the labeled data
    train_file_path = 'tweets_supervision.txt'
    model_name = 'gpt2'
    output_dir = 'output_model'
    overwrite_output_dir = True
    per_device_train_batch_size = len(values)
    num_train_epochs = 3
    save_steps = 1

    train(train_file_path, model_name, output_dir, overwrite_output_dir, per_device_train_batch_size, num_train_epochs, save_steps)

## Starting the training

In [31]:
NUM_EPOCHS = 5
TRAINING_SAMPLES = 20

for i in range(NUM_EPOCHS):
    print(f'Epoch #{i + 1}:')
    training(TRAINING_SAMPLES)
    print()

Epoch #1:
Labels:  [0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]
Preds:   [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Accuracy of Evaluator: 0.55
GPT training logs: {'train_runtime': 21.587, 'train_samples_per_second': 0.695, 'train_steps_per_second': 0.139, 'train_loss': 4.371808369954427, 'epoch': 3.0}

Epoch #2:
Labels:  [0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]
Preds:   [0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Accuracy of Evaluator: 0.7
GPT training logs: {'train_runtime': 37.6216, 'train_samples_per_second': 0.399, 'train_steps_per_second': 0.08, 'train_loss': 3.665914535522461, 'epoch': 3.0}

Epoch #3:
Labels:  [0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 

## Evaluating the discriminator

In [32]:
test_pred = np.argmax(classifier.predict(df_test['text'].values), axis=1)



In [33]:
submission['target'] = test_pred.astype(int)
submission.to_csv('../data/final/submission_end.csv', index=False)

## Printing samples of generated text

In [34]:
PRINT_EXAMPLES = 5

generated_sentences = generate_batch_fake(PRINT_EXAMPLES)
for sentence in generated_sentences:
    print(sentence, '\n', sep='')

#Vietnam War Medallion #72nd Wounded Legionnaire's Photo Released by the [US] Secret Services #Criminal_War... #veterans #entertainment #fiction 0#best #film

Greetings All!!To all of you burning with panic &amp; &amp; &amp; this is for the fans.To all of you who stand with us in fighting for #SaveOurMusic:0

#GPS HotSpot: Several people have been evacuated from a home in southern California after a lightning strike northeast of town. (ABC News) http://t.co/U0mAjQJxf2 https://t

USATODAY.COM I'M BEING APOCALYPSE #BooMIA. Fox News.?? #jewishtourism.????0The Latest: More homes have been razed by

#hot  Discussing my personal favorite movie of all time: Star Wars   @BobbyC-10@ROBERGAMES GO AWAY ASKING FOR YOUR HELP WITH THE ENTIRE FILM COSTUME



## Comparing performances

In [35]:
history2 = classifier.evaluate(X_val, y_val)



In [36]:
print('PrevAcc: ', round(history.history["val_accuracy"][0], 2), 'PrevLoss:', round(history.history["val_loss"][0], 2))
print('CurAcc:  ', round(history2[1], 2), 'CurLoss: ', round(history2[0], 2))

PrevAcc:  0.61 PrevLoss: 0.68
CurAcc:   0.61 CurLoss:  1.01


# Generating fake tweets

In [40]:
gen_sent = generate_batch_fake(50)
with open('../data/final/generated.txt', 'w') as f:
    for sent in gen_sent:
        f.write(sent + '\n')

# Clearing files

In [None]:
import os

test = os.listdir()

for item in test:
    if not item.endswith(".ipynb"):
        os.remove(item)