## GAN generating disaster tweets

In [None]:
!gdown 1_0B8RD8TaRqYTwxjpO0mViXfJhihg6Ai

Downloading...
From: https://drive.google.com/uc?id=1_0B8RD8TaRqYTwxjpO0mViXfJhihg6Ai
To: /content/nlp-getting-started.zip
  0% 0.00/607k [00:00<?, ?B/s]100% 607k/607k [00:00<00:00, 170MB/s]


In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.33.3-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m36.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m83.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m82.7 MB/s[0m eta [36m0:00:0

In [None]:
!unzip /content/nlp-getting-started.zip

Archive:  /content/nlp-getting-started.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [None]:
import pandas as pd
import numpy as np
import re
import string

In [None]:
df = pd.read_csv("train.csv", encoding="ISO-8859-1")
df = df.dropna()
df.head()

Unnamed: 0,id,keyword,location,text,target
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1
34,52,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0


In [None]:
texts = df[df['target'] == 1]
tweets = texts['text']

In [None]:
# cleaning texts1:

def preprocess_text(text):
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    return text

In [None]:
preprocessed_data = pd.DataFrame([preprocess_text(sentence) for sentence in tweets])
preprocessed_data = preprocessed_data.dropna()
preprocessed_data.head()

Unnamed: 0,0
0,@bbcmtd Wholesale Markets ablaze
1,#AFRICANBAZE: Breaking news:Nigeria flag set a...
2,INEC Office in Abia Set Ablaze -
3,How the West was burned: Thousands of wildfire...
4,Deputies: Man shot before Brighton home set ab...


In [None]:
preprocessed_data.to_csv('tweets.txt', index=False)

In [None]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

In [59]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

def load_sentences(file_path):
    # Read the sentences from the text file
    with open(file_path, 'r', encoding='utf-8') as file:
        sentences = [line.strip() for line in file.readlines() if line.strip()]
    return sentences

def preprocess_sentences(sentences, tokenizer, block_size=128):
    # Tokenize and prepare the data for language modeling
    input_ids = []
    for sentence in sentences:
        tokenizer.pad_token = tokenizer.eos_token
        tokenized_sentence = tokenizer(sentence, add_special_tokens=True, truncation=True, padding='max_length', max_length=block_size)
        input_ids.append(tokenized_sentence['input_ids'])

    return input_ids

def train(train_file_path, model_name, output_dir, overwrite_output_dir,
          per_device_train_batch_size, num_train_epochs, save_steps):
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    sentences = load_sentences(train_file_path)
    input_ids = preprocess_sentences(sentences, tokenizer)

    # Create a TextDataset
    dataset = TextDataset(tokenizer=tokenizer, file_path="/content/tweets.txt", block_size=128)

    # Create a DataCollatorForLanguageModeling
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    tokenizer.save_pretrained(output_dir)

    model = GPT2LMHeadModel.from_pretrained(model_name)

    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        save_steps=save_steps,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )

    trainer.train()
    trainer.save_model()



In [None]:
!pip install accelerate -U

In [60]:
# Example usage
train_file_path = '/content/tweets.txt'
model_name = 'gpt2'  # or other GPT-2 variants
output_dir = '/content/output_model'
overwrite_output_dir = True
per_device_train_batch_size = 8
num_train_epochs = 20
save_steps = 100

train(train_file_path, model_name, output_dir, overwrite_output_dir, per_device_train_batch_size, num_train_epochs, save_steps)




Step,Training Loss
500,4.4235
1000,3.508
1500,2.9323
2000,2.5893


In [61]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer

In [62]:
def load_model(model_path):
    model = GPT2LMHeadModel.from_pretrained(model_path)
    return model


def load_tokenizer(tokenizer_path):
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
    return tokenizer


def generate_text(sequence, max_length):
    model_path = "/content/output_model"
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)
    ids = tokenizer.encode(f'{sequence}', return_tensors='pt')
    final_outputs = model.generate(
        ids,
        do_sample=True,
        max_length=max_length,
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    print(tokenizer.decode(final_outputs[0], skip_special_tokens=True))

In [71]:
sequence = " "
max_len = 50

generate_text(sequence, max_len)

  
A dog attack dog attack that injured a school bus last month has been caught on tape. FOXNewYork
Accident fatality rate ratio ratio ratio bicyclist fatalities percent 
'He is so fast.' I asked what he was
