#Testing on CoLA Dataset

In [None]:
!pip install datasets
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dill<0.3.7,>=0.3.0
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp
  Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess
  Downloading multiprocess-0.70.14-py39-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.11.0
  Downloading h

In [None]:
from datasets import load_dataset
data = load_dataset("glue", "cola")
unlabelled_dataset=data['train']['sentence'][:850]

In [None]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Tokenize the unlabelled data
tokenized_data = [tokenizer.encode(text, add_special_tokens=True) for text in unlabelled_dataset]


In [None]:
import random

# Mask a random percentage of tokens in each sequence
masked_data = []
for sequence in tokenized_data:
    masked_sequence = sequence.copy()
    num_masks = int(len(sequence) * 0.15)  # Mask 15% of the tokens
    indices = random.sample(range(1, len(sequence) - 1), num_masks)  # Exclude special tokens [CLS] and [SEP]
    for index in indices:
        if random.random() < 0.8:  # 80% of the time, replace the token with [MASK]
            masked_sequence[index] = tokenizer.mask_token_id
        elif random.random() < 0.5:  # 10% of the time, replace the token with a random token
            masked_sequence[index] = random.randint(0, len(tokenizer) - 1)
    masked_data.append(masked_sequence)


In [None]:
import torch

In [None]:
# Set the maximum sequence length
max_length = 512

# Pad the sequences that are shorter than max_length
padded_data = []
for sequence in masked_data:
    if len(sequence) < max_length:
        sequence += [tokenizer.pad_token_id] * (max_length - len(sequence))
    padded_data.append(sequence)

# Convert the padded data to PyTorch tensors
input_ids = torch.tensor(padded_data)

In [None]:
import torch
from transformers import BertTokenizer, BertForMaskedLM
from torch.optim import AdamW
from tqdm import tqdm  # Import tqdm for the progress bar

# Load the tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
model = BertForMaskedLM.from_pretrained("bert-base-cased")
device = torch.device("cuda")
model.to(device)

# Define the optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Loop through the data by batch
for epoch in range(1):
    for batch in tqdm(input_ids):  # Add tqdm to loop for progress bar
        batch = batch.to(device)  # Move the batch to the GPU

        # Set 15% of tokens to be masked
        mask_indices = torch.bernoulli(torch.full(batch.shape, 0.15)).bool()
        batch[mask_indices] = tokenizer.mask_token_id

        # Set another 15% of tokens to be randomly replaced
        random_indices = torch.bernoulli(torch.full(batch.shape, 0.15)).bool() & ~mask_indices
        random_tokens = torch.randint(len(tokenizer), batch.shape, dtype=torch.long).to(device)  # Move to the GPU
        batch[random_indices] = random_tokens[random_indices]

        # Clear gradients and compute forward pass
        optimizer.zero_grad()
        loss = model(input_ids=batch.unsqueeze(0), labels=batch.unsqueeze(0))[0]

        # Backpropagate the loss and update the weights
        loss.backward()
        optimizer.step()


In [None]:
# Input a sentence
sentence = "The quick brown fox jumps over the lazy dog."

# Tokenize the sentence
tokens = tokenizer.tokenize(sentence)

# Mask a random token in the sentence
masked_index = torch.randint(len(tokens), (1,)).item()
tokens[masked_index] = tokenizer.mask_token

# Convert tokens to input_ids
input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])

# Move input_ids to the GPU
input_ids = input_ids.to(device)

# Get the model's predictions
with torch.no_grad():
    outputs = model(input_ids)
    predictions = outputs[0]

# Get the predicted token for the masked position
predicted_index = torch.argmax(predictions[0, masked_index]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]

# Print the original sentence, masked sentence, and predicted token
print("Original sentence:", sentence)
print("Masked sentence:", tokenizer.convert_tokens_to_string(tokens))
print("Predicted token:", predicted_token)


In [None]:
# Ask a question
question = "hello"
question_tokens = tokenizer.tokenize(question)
question_ids = torch.tensor([tokenizer.convert_tokens_to_ids(question_tokens)])

# Move question_ids to the GPU
question_ids = question_ids.to(device)

# Get the model's predictions for the question
with torch.no_grad():
    outputs = model(input_ids=question_ids)
    predictions = outputs[0]

# Get the predicted token for the masked position
predicted_index = torch.argmax(predictions[0, -1]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]

# Print the question and predicted answer
print("Question:", question)
print("Answer:", predicted_token)


In [None]:
import tensorflow as tf
tf.keras.backend.clear_session()

#Slips Preprocess document files

In [None]:
import requests
import zipfile
import io
import os

if not(os.path.exists('master.zip')):
    !wget https://github.com/stratosphereips/StratosphereLinuxIPS/archive/refs/heads/master.zip
    !unzip master.zip

    import shutil

    # Define the path of the docs folder
    docs_folder = "StratosphereLinuxIPS-master/docs"

    # Move the docs folder to the current working directory
    shutil.move(docs_folder, ".")

--2023-04-13 15:00:17--  https://github.com/stratosphereips/StratosphereLinuxIPS/archive/refs/heads/master.zip
Resolving github.com (github.com)... 140.82.113.3
Connecting to github.com (github.com)|140.82.113.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://codeload.github.com/stratosphereips/StratosphereLinuxIPS/zip/refs/heads/master [following]
--2023-04-13 15:00:17--  https://codeload.github.com/stratosphereips/StratosphereLinuxIPS/zip/refs/heads/master
Resolving codeload.github.com (codeload.github.com)... 140.82.112.9
Connecting to codeload.github.com (codeload.github.com)|140.82.112.9|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/zip]
Saving to: ‘master.zip’

master.zip              [        <=>         ]  19.47M  8.46MB/s    in 2.3s    

2023-04-13 15:00:20 (8.46 MB/s) - ‘master.zip’ saved [20414766]

Archive:  master.zip
99d9c3b725a4ab6b754f3a4f87085f09edc64ab5
   creating: Stratospher

In [None]:
!ls -R ./docs 

./docs:
architecture.md        FAQ.md		Makefile
code_documentation.md  features.md	P2P.md
conf.py		       flowalerts.md	slips_in_action.md
contributing.md        images		training.md
create_new_module.md   index.rst	usage.md
detection_modules.md   installation.md
exporting.md	       make.bat

./docs/images:
alerts_log.png			   qakbot.png
directions.png			   saefko_first_alert.png
droidjack_alert.png		   slips.gif
emotet_alerts.png		   slips_internal_architecture.jpg
expanding_infected_timewindow.png  slips_logo.png
kalipso.png			   slips_workflow.png
kalispo_infected_tw.png		   trickbot.png
module.gif			   updating_remote_feeds.png
modules_starting.png		   web_interface.png


In [None]:
import os

# Load the files in the current directory
dir_files = []
docs_contents = os.listdir('docs')

# Loop through each file and print its name if it's a file
for item in docs_contents:
    item_path = os.path.join('docs', item)
    if os.path.isfile(item_path):
        dir_files.append(item_path)

# Load the contents of all the files and concatenate them into a single string
all_text = ""
for file_name in dir_files:
    with open(file_name, 'r') as file:
        content = file.read()
        all_text += content + "\n"

In [None]:
len(all_text)

184644

In [None]:
print(all_text)

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

nltk.download('punkt')
nltk.download('stopwords')

# Tokenize the text into words
words = word_tokenize(all_text)

# Remove stop words and punctuation marks
stop_words = set(stopwords.words('english'))
cleaned_words = [word for word in words if word.lower() not in stop_words and word not in punctuation]

# Join the cleaned words into a string
cleaned_text = ' '.join(cleaned_words)

print(cleaned_text)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.




In [None]:
len(cleaned_text)

137097

In [None]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

stop_words = set(stopwords.words('english'))

tokens = nltk.word_tokenize(cleaned_text)
pos_tags = nltk.pos_tag(tokens)

filtered_tokens = [word for word, tag in pos_tags if not word.lower() in stop_words and not tag.startswith('P') and not tag.startswith('C')]

cleaned_text = ' '.join(filtered_tokens)
print(cleaned_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.




In [None]:
cleaned_text = cleaned_text.replace("#", "")

In [None]:
len(cleaned_text)

131837

In [None]:
all_text=cleaned_text

In [None]:
len(all_text)

131837

In [None]:
import re
sentences =re.split(r'[.\n]',all_text)
print(len(sentences))

1052


In [None]:
sentences

['Contributing contributions welcomed thank taking time contribute project set guidelines contributing development Slips contribute Run Slips report bugs needed features suggest ideas Pull requests solved GitHub issue new feature Pull request new detection module Persistent Git Branches following git branches Slips repository permanent master contains stable version Slips new versions least month develop contains latest unstable version Slips also latest features new features based branch Naming Git branches Pull Requests keep Git history clean facilitate revision contributions ask branches follow concise namings branch-naming patterns follow contributing Slips author-bugfix- pull request branch contains bugfix author-docs- pull request branch contains documentation work author-enhance- pull request branch contains enhancement new feature improvement nonetheless author-feature- pull request branch contains new feature author-refactor- pull request branch contains code refactoring branc

In [None]:
filtered_sentences = []

for sentence in sentences:
    if len(sentence) < 15:
        continue
    filtered_sentences.append(sentence)

In [None]:
filtered_sentences

['Contributing contributions welcomed thank taking time contribute project set guidelines contributing development Slips contribute Run Slips report bugs needed features suggest ideas Pull requests solved GitHub issue new feature Pull request new detection module Persistent Git Branches following git branches Slips repository permanent master contains stable version Slips new versions least month develop contains latest unstable version Slips also latest features new features based branch Naming Git branches Pull Requests keep Git history clean facilitate revision contributions ask branches follow concise namings branch-naming patterns follow contributing Slips author-bugfix- pull request branch contains bugfix author-docs- pull request branch contains documentation work author-enhance- pull request branch contains enhancement new feature improvement nonetheless author-feature- pull request branch contains new feature author-refactor- pull request branch contains code refactoring branc

In [None]:
len(filtered_sentences)

749

In [None]:
filtered_sentences[0]

'Contributing contributions welcomed thank taking time contribute project set guidelines contributing development Slips contribute Run Slips report bugs needed features suggest ideas Pull requests solved GitHub issue new feature Pull request new detection module Persistent Git Branches following git branches Slips repository permanent master contains stable version Slips new versions least month develop contains latest unstable version Slips also latest features new features based branch Naming Git branches Pull Requests keep Git history clean facilitate revision contributions ask branches follow concise namings branch-naming patterns follow contributing Slips author-bugfix- pull request branch contains bugfix author-docs- pull request branch contains documentation work author-enhance- pull request branch contains enhancement new feature improvement nonetheless author-feature- pull request branch contains new feature author-refactor- pull request branch contains code refactoring branch

#Training

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m52.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m55.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.27.4


In [None]:
!pip install sacremoses
!pip install SentencePiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m880.6/880.6 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.53-py3-none-any.whl size=895259 sha256=cbd9b7cadca5eb9d11a14d21d56824e3fbb182c29a6fff50f0cf33a766e126e4
  Stored in directory: /root/.cache/pip/wheels/12/1c/3d/46cf06718d63a32ff798a89594b61e7f345ab6b36d909ce033
Successfully built sacremoses
Installing collected packages: sacremoses
Successfully installed sacremoses-0.0.53
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting SentencePiece
  Downloading sentencepiece-0.1.

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import BertTokenizer, BertForMaskedLM

tokenizer = AutoTokenizer.from_pretrained('bert-large-uncased')
model = BertForMaskedLM.from_pretrained('bert-large-uncased')


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
max_length = model.config.max_position_embeddings
print("The maximum input size supported by the model is:", max_length)

The maximum input size supported by the model is: 512


In [None]:
# filtered_sentences

In [None]:
import nltk
nltk.download('punkt')

new_filt_sentences = []

for sentence in filtered_sentences:
    if len(sentence) <= max_length:
        new_filt_sentences.append(sentence)
    else:
        # Split the long sentence into shorter sentences
        sub_sentences = nltk.sent_tokenize(sentence)
        for sub_sentence in sub_sentences:
            if len(sub_sentence) <= max_length:
                new_filt_sentences.append(sub_sentence)
            else:
                start = 0
                while start < len(sub_sentence):
                    end = start + max_length
                    if end > len(sub_sentence):
                        end = len(sub_sentence)
                    truncated_sub_sentence = sub_sentence[start:end]
                    new_filt_sentences.append(truncated_sub_sentence)
                    start = end

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
len(filtered_sentences)

749

In [None]:
len(new_filt_sentences)

825

In [None]:
cnt=0
for f in filtered_sentences: cnt+=len(f)
print(cnt)
cnt=0
for f in new_filt_sentences: cnt+=len(f)
print(cnt)

128773
128756


In [None]:
cnt=0
for f in new_filt_sentences: 
    if(len(f)>max_length): cnt+=1
print(cnt)

0


In [None]:
import torch

tokenized_data = [tokenizer.encode(text, add_special_tokens=True) for text in new_filt_sentences]


In [None]:
import random

# Mask a random percentage of tokens in each sequence
masked_data = []
for sequence in tokenized_data:
    masked_sequence = sequence.copy()
    num_masks = int(len(sequence) * 0.15)  # Mask 15% of the tokens
    indices = random.sample(range(1, len(sequence) - 1), num_masks)  # Exclude special tokens [CLS] and [SEP]
    for index in indices:
        if random.random() < 0.8:  # 80% of the time, replace the token with [MASK]
            masked_sequence[index] = tokenizer.mask_token_id
        elif random.random() < 0.5:  # 10% of the time, replace the token with a random token
            masked_sequence[index] = random.randint(0, len(tokenizer) - 1)
    masked_data.append(masked_sequence)


In [None]:
# Pad the sequences that are shorter than max_length
padded_data = []
for sequence in masked_data:
    if len(sequence) < max_length:
        sequence += [tokenizer.pad_token_id] * (max_length - len(sequence))
    padded_data.append(sequence)

# Convert the padded data to PyTorch tensors
input_ids = torch.tensor(padded_data)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# from transformers import AutoTokenizer, AutoModelForMaskedLM
# from transformers import BertTokenizer, BertForMaskedLM

# model_path = '/content/drive/MyDrive/model'

# # Load the model from the specified path
# tokenizer = AutoTokenizer.from_pretrained('bert-large-uncased')
# model = BertForMaskedLM.from_pretrained(model_path)

In [None]:
import torch
from transformers import BertTokenizer, BertForMaskedLM
from torch.optim import AdamW
from tqdm import tqdm
from torch.utils.data import DataLoader

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define the optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Define the dataset
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, input_ids):
        self.input_ids = input_ids
    
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, index):
        return self.input_ids[index]

# Split the input ids into batches
batch_size = 7
input_ids = torch.tensor(input_ids)  # convert to tensor if it's not already
dataset = TextDataset(input_ids)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

epochs=3
# Loop through the data by batch
for epoch in range(epochs):
    for batch in tqdm(dataloader):  # Add tqdm to loop for progress bar
        batch = batch.to(device)  # Move the batch to the GPU

        # Set 15% of tokens to be masked
        mask_indices = torch.bernoulli(torch.full(batch.shape, 0.15)).bool()
        batch[mask_indices] = tokenizer.mask_token_id

        # Set another 15% of tokens to be randomly replaced
        random_indices = torch.bernoulli(torch.full(batch.shape, 0.15)).bool() & ~mask_indices
        random_tokens = torch.randint(len(tokenizer), batch.shape, dtype=torch.long).to(device)  # Move to the GPU
        batch[random_indices] = random_tokens[random_indices]

        # Clear gradients and compute forward pass
        optimizer.zero_grad()
        loss = model(input_ids=batch, labels=batch)[0]

        # Backpropagate the loss and update the weights
        loss.backward()
        optimizer.step()

    #Test after each epoch
    # Ask a question
    question = "install"
    question_tokens = tokenizer.tokenize(question)
    question_ids = torch.tensor([tokenizer.convert_tokens_to_ids(question_tokens)])

    # Move question_ids to the GPU
    question_ids = question_ids.to(device)

    # Get the model's predictions for the question
    with torch.no_grad():
        outputs = model(input_ids=question_ids)
        predictions = outputs[0]

    # Get the predicted tokens for the next k tokens after the masked token
    k = 15
    predicted_indices = torch.argsort(predictions[0, -1], descending=True)[:k]
    predicted_tokens = tokenizer.convert_ids_to_tokens(predicted_indices)

    # Print the question and predicted answers
    print("Question:", question)
    print("Top-{} predicted answers:".format(k))
    print(predicted_tokens)


  input_ids = torch.tensor(input_ids)  # convert to tensor if it's not already
100%|██████████| 118/118 [04:44<00:00,  2.41s/it]


Question: install
Top-15 predicted answers:
['##dm', '##hari', '##hmi', '##igan', '##chel', '##yst', '##tri', '##hore', '##tray', '##tyn', '##combe', '##hman', 'pleistocene', '##dai', '##lot']


100%|██████████| 118/118 [04:45<00:00,  2.42s/it]


Question: install
Top-15 predicted answers:
['##edly', '##planes', '##icides', '##inatory', '##tropical', '##ᵀ', '##hyllum', 'wealthiest', '##partisan', '##ocene', '##tively', '##nidae', '##quitable', '##ieving', 'townland']


100%|██████████| 118/118 [04:44<00:00,  2.41s/it]


Question: install
Top-15 predicted answers:
['##edly', '##planes', '##icides', '##inatory', '##ᵀ', '##tropical', 'wealthiest', '##partisan', '##hyllum', '##ocene', '##nidae', '##tively', '##quitable', 'townland', '##ieving']


In [None]:
# Replace <model_path> with the path where you want to save the model
model_path = '/content/drive/MyDrive/model'

# Save the model to the specified path
model.save_pretrained(model_path)

In [None]:
####TRAIN MORE

# # Define the optimizer
# optimizer = AdamW(model.parameters(), lr=e-3)

# # Split the input ids into batches
# batch_size = 16
# input_ids = torch.tensor(input_ids)  # convert to tensor if it's not already
# dataset = TextDataset(input_ids)
# dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# epochs=3
# # Loop through the data by batch
# for epoch in range(epochs):
#     for batch in tqdm(dataloader):  # Add tqdm to loop for progress bar
#         batch = batch.to(device)  # Move the batch to the GPU

#         # Set 15% of tokens to be masked
#         mask_indices = torch.bernoulli(torch.full(batch.shape, 0.15)).bool()
#         batch[mask_indices] = tokenizer.mask_token_id

#         # Set another 15% of tokens to be randomly replaced
#         random_indices = torch.bernoulli(torch.full(batch.shape, 0.15)).bool() & ~mask_indices
#         random_tokens = torch.randint(len(tokenizer), batch.shape, dtype=torch.long).to(device)  # Move to the GPU
#         batch[random_indices] = random_tokens[random_indices]

#         # Clear gradients and compute forward pass
#         optimizer.zero_grad()
#         loss = model(input_ids=batch, labels=batch)[0]

#         # Backpropagate the loss and update the weights
#         loss.backward()
#         optimizer.step()
#     # Ask a question
#     question = "install"
#     question_tokens = tokenizer.tokenize(question)
#     question_ids = torch.tensor([tokenizer.convert_tokens_to_ids(question_tokens)])

#     # Move question_ids to the GPU
#     question_ids = question_ids.to(device)

#     # Get the model's predictions for the question
#     with torch.no_grad():
#         outputs = model(input_ids=question_ids)
#         predictions = outputs[0]

#     # Get the predicted tokens for the next k tokens after the masked token
#     k = 15
#     predicted_indices = torch.argsort(predictions[0, -1], descending=True)[:k]
#     predicted_tokens = tokenizer.convert_ids_to_tokens(predicted_indices)

#     # Print the question and predicted answers
#     print("Question:", question)
#     print("Top-{} predicted answers:".format(k))
#     print(predicted_tokens)


In [None]:
# Ask a question
question = "Slips install"
question_tokens = tokenizer.tokenize(question)
question_ids = torch.tensor([tokenizer.convert_tokens_to_ids(question_tokens)])

# Move question_ids to the GPU
question_ids = question_ids.to(device)

# Get the model's predictions for the question
with torch.no_grad():
    outputs = model(input_ids=question_ids)
    predictions = outputs[0]

# Get the predicted tokens for the next k tokens after the masked token
k = 2
predicted_indices = torch.argsort(predictions[0, -1], descending=True)[:k]
predicted_tokens = tokenizer.convert_ids_to_tokens(predicted_indices)

# Print the question and predicted answers
print("Question:", question)
print("Top-{} predicted answers:".format(k))
print(predicted_tokens)

Question: Slips install
Top-2 predicted answers:
['##edly', '##planes']
