<a href="https://colab.research.google.com/github/saimdev/NLP_USING_GPT/blob/master/GPTModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m66.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m38.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m115.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m105.1 MB/s[0m eta [36m0:00

In [3]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.utils.data import Dataset, DataLoader
import torch
from tqdm import tqdm


In [4]:
training_data = [
    "Introduction to ICT\nICT stands for Information and Communication Technology. It encompasses the technologies used to handle information and aid communication.",
    "Components of a Computer\nA computer consists of several components, including the CPU, memory, storage devices, input devices, and output devices.",
    "Operating Systems\nAn operating system is software that manages computer hardware and software resources and provides common services for computer programs.",
    "Networks and Internet\nA network is a collection of devices that are connected to each other to share resources and communicate. The internet is a global network of networks.",
    "Database Management Systems\nA database management system is software that allows for the storage, organization, and retrieval of data in a structured manner.",
]

In [5]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [6]:
tokenized_data = [tokenizer.encode(text) for text in training_data]

In [7]:
class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return torch.tensor(self.data[idx])

In [21]:
def collate_fn(batch):
    return torch.nn.utils.rnn.pad_sequence(batch, batch_first=True)

In [22]:
train_dataset = CustomDataset(tokenized_data)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)

In [9]:
model = GPT2LMHeadModel.from_pretrained("gpt2")

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [23]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

In [25]:
num_epochs = 5
progress_bar = tqdm(range(num_epochs), desc="Training")
for epoch in progress_bar:
    model.train()
    total_loss = 0

    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()

        # Shift the input sequence to the right (right padding)
        inputs = batch[:, :-1]
        targets = batch[:, 1:]

        # Forward pass
        outputs = model(inputs)
        logits = outputs.logits

        # Compute the loss
        loss = torch.nn.functional.cross_entropy(
            logits.reshape(-1, logits.shape[-1]), targets.reshape(-1)
        )
        total_loss += loss.item()

        # Backward pass and optimization step
        loss.backward()
        optimizer.step()

    average_loss = total_loss / len(train_loader)
    progress_bar.set_postfix({"loss": average_loss})

Training: 100%|██████████| 5/5 [00:15<00:00,  3.05s/it, loss=2.44]


In [26]:
model.save_pretrained("fine_tuned_model")
tokenizer.save_pretrained("fine_tuned_model")


('fine_tuned_model/tokenizer_config.json',
 'fine_tuned_model/special_tokens_map.json',
 'fine_tuned_model/vocab.json',
 'fine_tuned_model/merges.txt',
 'fine_tuned_model/added_tokens.json')

**MODEL IS TRAINED, NOW WE ARE USING THIS TO TEST**

In [27]:
model = GPT2LMHeadModel.from_pretrained("fine_tuned_model")
tokenizer = GPT2Tokenizer.from_pretrained("fine_tuned_model")

In [28]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [35]:
def generate_content(prompt, max_length=100, num_return_sequences=1):
    encoded_prompt = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt").to(device)
    output_sequences = model.generate(
        input_ids=encoded_prompt,
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.1,
        top_k=50,
        top_p=0.92,
    )

    generated_texts = []
    for output_sequence in output_sequences:
        generated_text = tokenizer.decode(output_sequence, skip_special_tokens=True)
        generated_texts.append(generated_text)

    return generated_texts

In [36]:
prompt = "Can you explain the concept of computer networks?"
generated_content = generate_content(prompt)

In [37]:
for text in generated_content:
    print(text)

Can you explain the concept of computer networks?

The idea of computer networks is that you can connect computers to each other. You can connect computers to each other by sending data. You can connect computers to each other by sending data. You can connect computers to each other by sending data. You can connect computers to each other by sending data. You can connect computers to each other by sending data. You can connect computers to each other by sending data. You can connect computers to each other by
