# Installing Necessary Libraries

In [None]:
# Install the libraries imported by the cells below (transformers, datasets, nltk, spacy, ...).
!pip install transformers datasets nltk spacy pandas torch sentencepiece
# Download the spaCy English pipeline that is later loaded with spacy.load("en_core_web_sm").
!python -m spacy download en_core_web_sm

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_c

# Loading Dataset

In [None]:
from datasets import load_dataset

# Load WikiText-2 (the "wikitext-2-v1" configuration of the "wikitext" dataset)
dataset = load_dataset("wikitext", "wikitext-2-v1")
print(dataset["train"][0]["text"])  # Sample text: first record of the train split

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/685k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.07M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/618k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]




In [None]:
# Report how many records each split of `dataset` holds.
split_labels = [("Train", "train"), ("Validation", "validation"), ("Test", "test")]
for label, split_key in split_labels:
    print(f"{label} samples: {len(dataset[split_key])}")

Train samples: 36718
Validation samples: 3760
Test samples: 4358


# Data Preprocessing

In [None]:
import re
import spacy
from transformers import GPT2Tokenizer

# spaCy English pipeline; called by tokenize_and_lemmatize() below.
nlp = spacy.load("en_core_web_sm")
# Pretrained GPT-2 tokenizer; called by tokenize() below.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token for padding so padding="max_length" in tokenize() has a pad token.
tokenizer.pad_token = tokenizer.eos_token # Define the padding token

def clean_text(text):
    """Strip URLs, @mentions, '#' signs and special characters from `text`.

    Steps, in order:
      1. remove URLs (anything starting with ``http`` or ``www`` up to the
         next whitespace);
      2. replace ``@mentions``, ``#`` and newlines with a space;
      3. drop every character that is not a word character, whitespace or
         one of ``. , ! ?``;
      4. collapse each run of whitespace into one space and trim both ends.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned string (may be empty).
    """
    # `http\S+` already matches `https...`, so no separate alternative is needed.
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'@\w+|#|\n', ' ', text)
    text = re.sub(r'[^\w\s.,!?]', '', text)
    # The removals above leave gaps of several spaces inside the text;
    # strip() alone only trims the ends, so collapse the inner runs as well.
    return re.sub(r'\s+', ' ', text).strip()

def tokenize_and_lemmatize(text):
    """Return the lemmas of the content tokens of `text`, joined by single spaces.

    The text is parsed with the module-level spaCy pipeline ``nlp``. Stop
    words, punctuation and whitespace-only tokens are left out.

    Args:
        text: the string to process.

    Returns:
        A single string of lemmas separated by one space each.
    """
    doc = nlp(text)
    # Whitespace tokens are neither stop words nor punctuation, so without the
    # is_space check their lemma (the whitespace itself) would be joined in.
    lemmas = [
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    return " ".join(lemmas)

# Add two derived columns to every split: the cleaned text, then its lemmatised form.
def _add_cleaned_text(example):
    """Build the `cleaned_text` column from the raw `text` field."""
    return {"cleaned_text": clean_text(example["text"])}


def _add_processed_text(example):
    """Build the `processed_text` column from the `cleaned_text` field."""
    return {"processed_text": tokenize_and_lemmatize(example["cleaned_text"])}


dataset = dataset.map(_add_cleaned_text)
dataset = dataset.map(_add_processed_text)

# Tokenize for GPT-2
def tokenize(batch):
    """Encode ``batch["processed_text"]`` with the GPT-2 tokenizer and attach LM labels.

    Every sequence is truncated/padded to 512 tokens. ``labels`` equal
    ``input_ids`` on real tokens and -100 on padding positions
    (``attention_mask == 0``), the index the loss ignores. No manual shift is
    applied here; the causal-LM head shifts the labels itself.

    Args:
        batch: a mapping with a ``"processed_text"`` entry, either a list of
            strings (batched map) or a single string.

    Returns:
        The tokenizer output with an added ``"labels"`` entry.
    """
    tokenized_batch = tokenizer(batch["processed_text"], truncation=True, max_length=512, padding="max_length")

    def _mask_padding(ids, mask):
        # The pad token is the EOS token, so unmasked padding would train the
        # model to predict EOS at every padded position.
        return [tok if keep else -100 for tok, keep in zip(ids, mask)]

    input_ids = tokenized_batch["input_ids"]
    attention_mask = tokenized_batch["attention_mask"]
    if input_ids and isinstance(input_ids[0], list):
        # Batched call: one list of ids per example.
        labels = [_mask_padding(ids, mask) for ids, mask in zip(input_ids, attention_mask)]
    else:
        # Single-example call: a flat list of ids.
        labels = _mask_padding(input_ids, attention_mask)
    tokenized_batch["labels"] = labels
    return tokenized_batch

tokenized_dataset = dataset.map(tokenize, batched=True)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Map:   0%|          | 0/4358 [00:00<?, ? examples/s]

Map:   0%|          | 0/36718 [00:00<?, ? examples/s]

Map:   0%|          | 0/3760 [00:00<?, ? examples/s]

Map:   0%|          | 0/4358 [00:00<?, ? examples/s]

Map:   0%|          | 0/36718 [00:00<?, ? examples/s]

Map:   0%|          | 0/3760 [00:00<?, ? examples/s]

Map:   0%|          | 0/4358 [00:00<?, ? examples/s]

Map:   0%|          | 0/36718 [00:00<?, ? examples/s]

Map:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; a later cell loads the fine-tuned model from /content/drive/MyDrive/.
drive.mount('/content/drive')

Mounted at /content/drive


# Splitting The Data

In [None]:
# Hold out 20% of the tokenized train split for validation.
# A fixed seed keeps the shuffle, and so the reported losses, reproducible across runs.
split_dataset = tokenized_dataset["train"].train_test_split(test_size=0.2, seed=42)
train_data = split_dataset["train"]
val_data = split_dataset["test"]

# Fine-tuning GPT-2

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
import wandb

# Initialize wandb (add your API key if needed)
wandb.init(project="gpt2-wikitext2")

# Initialize model and tokenizer
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Set pad_token_id to eos_token_id
model.config.pad_token_id = model.config.eos_token_id
tokenizer.pad_token = tokenizer.eos_token  # Ensure tokenizer uses EOS token as padding

# Sequence length used when a split still has to be tokenized here; it matches
# the max_length=512 used by the preprocessing cell.
MAX_LENGTH = 512


def tokenize_function(examples):
    """Tokenize ``examples["text"]`` and return aligned input_ids/attention_mask/labels.

    Sequences are truncated/padded to MAX_LENGTH. Labels equal input_ids on
    real tokens and -100 (ignored by the loss) on padding, so all three
    columns always have the same length.
    """
    encoded = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
    )
    encoded["labels"] = [
        [tok if keep else -100 for tok, keep in zip(ids, mask)]
        for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"])
    ]
    return encoded


def _ensure_tokenized(split):
    """Return `split` unchanged if it is already tokenized, else tokenize it."""
    required = {"input_ids", "attention_mask", "labels"}
    if required <= set(split.column_names):
        # train_data / val_data come from tokenized_dataset and already carry
        # aligned columns. Re-tokenizing the raw text would overwrite
        # input_ids/attention_mask but leave the old labels in place, giving
        # inputs and labels of different lengths.
        return split
    return split.map(tokenize_function, batched=True)


train_data = _ensure_tokenized(train_data)
val_data = _ensure_tokenized(val_data)

# Training configuration
training_args = TrainingArguments(
    output_dir="./gpt2-wikitext2",  # Checkpoints are written here
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    num_train_epochs=3,
    fp16=True,                      # Mixed-precision training
    logging_steps=100,
    save_strategy="epoch",          # Save a checkpoint after every epoch
    save_total_limit=2,
    eval_strategy="epoch",          # Evaluate after every epoch
    load_best_model_at_end=True,    # Reload the checkpoint with the best metric
    warmup_steps=100,               # Learning-rate warm-up steps
    report_to="wandb",              # Send logs to wandb
    metric_for_best_model="eval_loss",
    greater_is_better=False,        # Lower eval_loss is better
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,          # Required for eval_strategy
)

# Train the model
trainer.train()

# Save final model and tokenizer side by side.
# NOTE(review): the evaluation cell loads from
# "/content/drive/MyDrive/gpt2-wikitext2/final_model"; confirm this directory
# is copied to Drive, or align the two paths.
model.save_pretrained("./gpt2-wikitext2/final_model")
tokenizer.save_pretrained("./gpt2-wikitext2/final_model")

print("Training complete! Final model saved to './gpt2-wikitext2/final_model'")



`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,0.3502,0.3336
2,0.3407,0.324433
3,0.3241,0.322543


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Training complete! Final model saved to './gpt2-wikitext2/final_model'


In [None]:
from nltk.translate.bleu_score import sentence_bleu
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load fine-tuned GPT-2 (model weights and tokenizer files from the same directory)
# NOTE(review): the training cell saves to './gpt2-wikitext2/final_model', but this
# loads from Google Drive. Presumably the directory was copied there by hand; confirm.
model = GPT2LMHeadModel.from_pretrained("/content/drive/MyDrive/gpt2-wikitext2/final_model")
tokenizer = GPT2Tokenizer.from_pretrained("/content/drive/MyDrive/gpt2-wikitext2/final_model")

# Generate sample and compute BLEU
def evaluate_bleu(model, tokenizer, reference_text):
    """Generate text from `reference_text` and score it against that same text.

    Args:
        model: a model exposing ``generate``.
        tokenizer: the tokenizer matching `model`.
        reference_text: the prompt, which also serves as the BLEU reference.

    Returns:
        The ``sentence_bleu`` score of the whitespace-split generated text
        against the whitespace-split `reference_text`.
    """
    # Calling the tokenizer (rather than .encode) also yields the attention mask.
    encoded = tokenizer(reference_text, return_tensors="pt")
    # Pass the mask and pad id explicitly; otherwise generate() warns that
    # results may be unreliable and falls back to the EOS id on its own.
    output = model.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        pad_token_id=tokenizer.eos_token_id,
        max_length=50,
    )
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    reference = [reference_text.split()]
    candidate = generated_text.split()
    return sentence_bleu(reference, candidate)

# Score one short example prompt and print the result.
sample_reference = "The quick brown fox jumps"
bleu_score = evaluate_bleu(model, tokenizer, sample_reference)
print(f"BLEU Score: {bleu_score}")




The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


BLEU Score: 0.1250076305588977


# Apache Airflow Setup in Colab

In [None]:
# Set up environment
import os
# Airflow's home directory; the dags/ folder below is created under this path.
os.environ["AIRFLOW_HOME"] = "/content/airflow"

# Install Airflow (pinned to 2.6.3, with the matching constraints file)
# NOTE(review): the constraints file is the Python 3.8 one, while the pip output
# earlier in this notebook shows cp311 wheels. Confirm it matches the runtime.
!pip install apache-airflow==2.6.3 --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.6.3/constraints-3.8.txt" --quiet

# Create necessary folders
!mkdir -p /content/airflow/dags
# Initialise the Airflow metadata database.
!airflow db init

# Write your DAG
# The DAG source is kept in a string and written into the dags/ folder.
# It imports PythonOperator from `airflow.operators.python` and passes
# `schedule=`; the older `airflow.operators.python_operator` module and the
# `schedule_interval` argument are deprecated in Airflow 2.
dag_code = """from datetime import datetime, timedelta
from airflow import DAG
from airflow.operators.python import PythonOperator

def preprocess_data():
    print("Preprocessing data...")

def train_model():
    print("Training the model...")

def generate_content():
    print("Generating content...")

def postprocess_results():
    print("Post-processing results...")

default_args = {
    'owner': 'shahd',
    'depends_on_past': False,
    'start_date': datetime(2025, 4, 1),
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

with DAG(
    'generative_ai_pipeline',
    default_args=default_args,
    description='Generative AI Pipeline with Airflow',
    schedule='@daily',
    catchup=False,
) as dag:

    task_preprocess = PythonOperator(
        task_id='preprocess_data',
        python_callable=preprocess_data
    )

    task_train = PythonOperator(
        task_id='train_model',
        python_callable=train_model
    )

    task_generate = PythonOperator(
        task_id='generate_content',
        python_callable=generate_content
    )

    task_postprocess = PythonOperator(
        task_id='postprocess_results',
        python_callable=postprocess_results
    )

    task_preprocess >> task_train >> task_generate >> task_postprocess
"""
with open("/content/airflow/dags/generative_ai_pipeline_dag.py", "w") as f:
    f.write(dag_code)

# Create Airflow user (only needs to be done once)
# NOTE(review): the admin password is hard-coded on the command line below, and
# later cells expose the web UI on a public ngrok URL. Change it before sharing
# this notebook or the URL.
!airflow users create \
    --username admin \
    --firstname Shahd \
    --lastname Admin \
    --email shahd@example.com \
    --role Admin \
    --password adminpassword


DB: sqlite:////content/airflow/airflow.db
[[34m2025-04-25T02:43:47.626+0000[0m] {[34mmigration.py:[0m213} INFO[0m - Context impl [01mSQLiteImpl[22m.[0m
[[34m2025-04-25T02:43:47.627+0000[0m] {[34mmigration.py:[0m216} INFO[0m - Will assume [01mnon-transactional[22m DDL.[0m
[[34m2025-04-25T02:43:47.793+0000[0m] {[34mmigration.py:[0m213} INFO[0m - Context impl [01mSQLiteImpl[22m.[0m
[[34m2025-04-25T02:43:47.793+0000[0m] {[34mmigration.py:[0m216} INFO[0m - Will assume [01mnon-transactional[22m DDL.[0m
[[34m2025-04-25T02:43:47.795+0000[0m] {[34mdb.py:[0m1591} INFO[0m - Creating tables[0m
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
WARNI [unusual_prefix_19638d6c2ac35c14f65ccf0e474b8b1cafb70fa4_tutorial_taskflow_api_virtualenv] The tutorial_taskflow_api_virtualenv example DAG requires virtualenv, please install it.
WARNI [unusual_prefix_10765d35ea3754dc632e0439fbd4ee37c2f51

In [None]:
!pip install pyngrok --quiet
from pyngrok import ngrok
import time

# Set your real authtoken
ngrok.set_auth_token("2w9JtMI8fEuiIytsX4Wsve888J9_3nW29NkXWjEJeVsi2pR6o")

# Start scheduler in background
!airflow scheduler > /content/scheduler.log 2>&1 &

# Start ngrok tunnel
public_url = ngrok.connect(8090)
print("🔗 Airflow UI:", public_url)

# Wait a few seconds for everything to initialize
time.sleep(5)

# Start webserver and keep it running
!airflow webserver -p 8090 > /content/webserver.log 2>&1



🔗 Airflow UI: NgrokTunnel: "https://0487-34-141-141-197.ngrok-free.app" -> "http://localhost:8090"




^C


In [None]:
# Show the last 50 lines of the log file the webserver cell redirects its output to.
!tail -n 50 /content/webserver.log


[2025-04-25 02:52:37 +0000] [16520] [INFO] Starting gunicorn 20.1.0
[2025-04-25 02:52:37 +0000] [16520] [INFO] Listening at: http://0.0.0.0:8090 (16520)
[2025-04-25 02:52:37 +0000] [16520] [INFO] Using worker: sync
[2025-04-25 02:52:37 +0000] [16576] [INFO] Booting worker with pid: 16576
[2025-04-25 02:52:37 +0000] [16578] [INFO] Booting worker with pid: 16578
[2025-04-25 02:52:37 +0000] [16579] [INFO] Booting worker with pid: 16579
[2025-04-25 02:52:37 +0000] [16581] [INFO] Booting worker with pid: 16581
[2025-04-25 02:53:24 +0000] [16520] [INFO] Handling signal: int
  ____________       _____________
 ____    |__( )_________  __/__  /________      __
____  /| |_  /__  ___/_  /_ __  /_  __ \_ | /| / /
___  ___ |  / _  /   _  __/ _  / / /_/ /_ |/ |/ /
 _/_/  |_/_/  /_/    /_/    /_/  \____/____/|__/
Running the Gunicorn Server with:
Workers: 4 sync
Host: 0.0.0.0:8090
Timeout: 120
Logfiles: - -
Access Logformat: 
[[34m2025-04-25T02:53:24.935+0000[0m] {[34mwebserver_command.py:[0m432

In [None]:
# Run the Airflow webserver in the foreground on port 8090 with --debug, so its output appears in the cell.
!airflow webserver --debug -p 8090


  ____________       _____________
 ____    |__( )_________  __/__  /________      __
____  /| |_  /__  ___/_  /_ __  /_  __ \_ | /| / /
___  ___ |  / _  /   _  __/ _  / / /_/ /_ |/ |/ /
 _/_/  |_/_/  /_/    /_/    /_/  \____/____/|__/
Starting the web server on port 8090 and host 0.0.0.0.
 * Serving Flask app 'airflow.www.app'
 * Debug mode: on
 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:8090
 * Running on http://172.28.0.12:8090[0m
[[34m2025-04-25T02:54:00.838+0000[0m] {[34m_internal.py:[0m224} INFO[0m - [33mPress CTRL+C to quit[0m[0m
[[34m2025-04-25T02:54:00.839+0000[0m] {[34m_internal.py:[0m224} INFO[0m -  * Restarting with stat[0m
  ____________       _____________
 ____    |__( )_________  __/__  /________      __
____  /| |_  /__  ___/_  /_ __  /_  __ \_ | /| / /
___  ___ |  / _  /   _  __/ _  / / /_/ /_ |/ |/ /
 _/_/  |_/_/  /_/    /_/    /_/  \____/____/|__/
Starting the web server on port 8090 and host 0.0.0.0.
[[34m2025-04-25T02:54:04

In [None]:
import os
from getpass import getpass

from pyngrok import ngrok

# Read the ngrok authtoken from the NGROK_AUTHTOKEN environment variable, or
# prompt for it, instead of hard-coding it in the notebook.
# NOTE: a token that was committed in an earlier version of this cell must be
# treated as leaked and revoked in the ngrok dashboard.
ngrok.set_auth_token(os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: "))

# Tunnel port 8090 (since Airflow is running there)
public_url = ngrok.connect(8090)
print("🌐 Airflow UI is available at:", public_url)


🌐 Airflow UI is available at: NgrokTunnel: "https://fd41-34-141-141-197.ngrok-free.app" -> "http://localhost:8090"


In [None]:
import threading
import time
import os
from getpass import getpass

from pyngrok import ngrok

# Set environment
os.environ['AIRFLOW_HOME'] = '/content/airflow'

# Read the ngrok authtoken from the NGROK_AUTHTOKEN environment variable, or
# prompt for it, instead of hard-coding it in the notebook.
# NOTE: a token that was committed in an earlier version of this cell must be
# treated as leaked and revoked in the ngrok dashboard.
ngrok.set_auth_token(os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: "))

# Function to run Airflow webserver
def run_airflow():
    """Run `airflow webserver -p 8090` through the shell; returns when that command exits."""
    os.system("airflow webserver -p 8090")

# Start Airflow webserver in a thread so this cell is not blocked by it
t = threading.Thread(target=run_airflow)
t.start()

# Give Airflow a few seconds to start
time.sleep(10)

# Open Ngrok tunnel to Airflow
public_url = ngrok.connect(8090)
print("🌐 Airflow UI is available at:", public_url)


🌐 Airflow UI is available at: NgrokTunnel: "https://a384-34-141-141-197.ngrok-free.app" -> "http://localhost:8090"


In [None]:
import threading

# Launch the Airflow scheduler on a background thread so the cell returns immediately.
def run_scheduler():
    """Run `airflow scheduler` through the shell; returns when that command exits."""
    scheduler_command = "airflow scheduler"
    os.system(scheduler_command)


scheduler_thread = threading.Thread(target=run_scheduler)
scheduler_thread.start()


In [None]:
# Trigger a run of the DAG whose id is 'generative_ai_pipeline' (defined in the DAG file written earlier).
!airflow dags trigger generative_ai_pipeline


[[34m2025-04-25T03:05:59.845+0000[0m] {[34m__init__.py:[0m42} INFO[0m - Loaded API auth backend: [01mairflow.api.auth.backend.session[22m[0m
[1m     [0m|[1m      [0m|[1m      [0m|[1m      [0m|[1m      [0m|[1m      [0m|[1m      [0m|[1m [0m[1mlast[0m[1m [0m|[1m     [0m|[1m      [0m|[1m     [0m|[1m      [0m
[1m     [0m|[1m      [0m|[1m      [0m|[1m [0m[1mdata[0m[1m [0m|[1m [0m[1mdata[0m[1m [0m|[1m      [0m|[1m      [0m|[1m [0m[1m_sch[0m[1m [0m|[1m     [0m|[1m      [0m|[1m     [0m|[1m      [0m
[1m     [0m|[1m      [0m|[1m      [0m|[1m [0m[1m_int[0m[1m [0m|[1m [0m[1m_int[0m[1m [0m|[1m      [0m|[1m [0m[1mexte[0m[1m [0m|[1m [0m[1medul[0m[1m [0m|[1m [0m[1mlog[0m[1m [0m|[1m      [0m|[1m [0m[1msta[0m[1m [0m|[1m      [0m
[1m     [0m|[1m      [0m|[1m [0m[1mdag_[0m[1m [0m|[1m [0m[1merva[0m[1m [0m|[1m [0m[1merva[0m[1m [0m|[1m      [0m|[1m [0m[1mrnal[0m[