install libraries

In [1]:
!pip install -q transformers
!pip install -q "datasets<=2.18.0"
!pip install -q sentence-transformers
!pip install -q langchain
!pip install -q langchain_community
!pip install -q langchainhub
!pip install -q torch
!pip install -q numpy
!pip install -q faiss-gpu
!pip install -q "bitsandbytes<=0.40.2"
!pip install -q accelerate

from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
import numpy as np

In [2]:
from datasets import load_dataset

# List available datasets and configurations
imdb_dataset = load_dataset("imdb")

# Print available splits to ensure correct loading
print(imdb_dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [3]:
import re

# Define the word or phrase you want to filter by
specific_word = "Star Wars"

# Define a function to filter texts containing the specific word or phrase
def filter_texts(example):
    return bool(re.search(rf'\b{specific_word}\b', example['text'], re.IGNORECASE))

# Apply the filter to the train and test datasets
filtered_train_dataset = imdb_dataset['train'].filter(filter_texts)
filtered_test_dataset = imdb_dataset['test'].filter(filter_texts)
filtered_unsupervised_dataset = imdb_dataset['unsupervised'].filter(filter_texts)

# Verify the filtered dataset
print(filtered_train_dataset)
print(filtered_test_dataset)
print(filtered_unsupervised_dataset)

print(filtered_train_dataset[0])
print(filtered_test_dataset[0])
print(filtered_unsupervised_dataset[0])

Dataset({
    features: ['text', 'label'],
    num_rows: 159
})
Dataset({
    features: ['text', 'label'],
    num_rows: 153
})
Dataset({
    features: ['text', 'label'],
    num_rows: 306
})
{'text': 'As a kid I did think the weapon the murderer wielded was cool, however I was a kid and so I was a bit dumb. Even as a dumb kid though the movies plot was stupid and a bit boring when the killer was not using his light knife to kill people. What amazes me is that the movie has a really solid cast in it. What script did they read when agreeing to be in this movie as it is most assuredly boring and only a means to show off a light saber on a very small scale. The plot at times is incomprehensible and the end is totally chaotic. The whole film seems to rotate around aliens and the one weapon. The plot has two kids and some dude having an alien encounter, flash years later and there seems to be a return as it were in the mix. Dead animals and such to be explored and for some reason the one du

In [4]:
from datasets import concatenate_datasets

combined_dataset = concatenate_datasets([filtered_train_dataset, filtered_test_dataset, filtered_unsupervised_dataset])

print(combined_dataset)

print(combined_dataset[0])

Dataset({
    features: ['text', 'label'],
    num_rows: 618
})
{'text': 'As a kid I did think the weapon the murderer wielded was cool, however I was a kid and so I was a bit dumb. Even as a dumb kid though the movies plot was stupid and a bit boring when the killer was not using his light knife to kill people. What amazes me is that the movie has a really solid cast in it. What script did they read when agreeing to be in this movie as it is most assuredly boring and only a means to show off a light saber on a very small scale. The plot at times is incomprehensible and the end is totally chaotic. The whole film seems to rotate around aliens and the one weapon. The plot has two kids and some dude having an alien encounter, flash years later and there seems to be a return as it were in the mix. Dead animals and such to be explored and for some reason the one dude gets the weapon of the aliens and proceeds to use it to go on a very light killing spree. Seriously, you just have to wonder 

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap  = 100,
    length_function = len,
    is_separator_regex=False,
    separators=[
        "\n\n",
        ".",
        "\n",
        " ",
        "",
    ]
)

# Extract texts from the dataset
texts = combined_dataset["text"]

# Concatenate all texts into a single string
concatenated_text = " ".join(texts)

texts2 = text_splitter.create_documents([concatenated_text])

print(texts2[:2])
# print(texts2[1])




In [6]:
from langchain.embeddings import CacheBackedEmbeddings, HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore

store = LocalFileStore("./cache/")

embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

core_embeddings_model = HuggingFaceEmbeddings(
    model_name=embed_model_id
)

embedder = CacheBackedEmbeddings.from_bytes_store(
    core_embeddings_model, store, namespace=embed_model_id
)

vector_store = FAISS.from_documents(texts2, embedder)



In [7]:
query = "What is Star Wars?"
embedding_vector = core_embeddings_model.embed_query(query)
docs = vector_store.similarity_search_by_vector(embedding_vector, k = 4)

for page in docs:
  print(page.page_content)

. Star Wars is more than a movie. it's an idea.<br /><br />How, may you ask? i shall explain. star wars touches on the most universal of stereotypes, good vs evil. it does this so obviously, so profoundly, that literally any person from any environment can understand. Episode VI does the very well, concluding the epic struggle between a son and his used and manipulated father, yet also, with the addition of the prequels, reveals even more to the hinted back story. suddenly, it's Darth Vader at the front, and viewers realize that it's the story about Anakin, not just Luke. but even before 1-3, there was amazing depth to it all. it felt real, as if capsule fell from the sky into Lucas's lap, detailing a historical account of a galaxy far, far away.<br /><br />Star Wars is definitely something far above the norm, and i must admit, whenever i see them, particularly this one, i feel very small. i feel as though i've been thrust into a world where good and evil are so clearly defined
...it's

In [8]:
query = "Who is darth vader?"
embedding_vector = core_embeddings_model.embed_query(query)
docs = vector_store.similarity_search_by_vector(embedding_vector, k = 4)

for page in docs:
  print(page.page_content)

.<br /><br />By the time we get to the third act, though, the pace picks up again, as we intercut between the Ewoks battle against the troops, Lando and the Rebel Forces launching an attack against the Empire's all-new half-completed Death Star, and Luke's final showdown with Darth Vader and the Emperor. The latter ties with the Jabba Palace sequence as the highlight of the movie. Mark Hamill flexes his acting chops once again as Luke Skywalker in these scenes, and watching him as a fully matured Jedi Knight makes for an unforgettable performance. Also, as iconic as James Earl Jones' voice as Darth Vader is, he is rivaled only by the shriveled, crone-like Emperor, played with deliciously raspy, frightening evil by Ian McDiarmid. The tension between this trio heightens the excitement of this climactic moment, which is appropriately darkly lit and menacingly underscored
. The other Rebel characters certainly work in his shadow. The romance between Leia (Carrie Fisher) and Solo is all but

In [9]:
query = "Who is luke skywalker's father?"
embedding_vector = core_embeddings_model.embed_query(query)
docs = vector_store.similarity_search_by_vector(embedding_vector, k = 4)

for page in docs:
  print(page.page_content)

. Leia kills Jabba and goes after Han,Luke and Chewie as well c3po and R2. <br /><br />Everybody's safe again,Luke decides to go to Dagoba to complete his training as a Jedi,as well his promise to Yoda. The problem is that Yoda is too old and sick, since he already has 900 years old, and before he dies, Yoda says to Luke that he does not need more training,but to really be a Jedi, he must fight with Vader again. He confirms to Luke that Vader is Luke's dad, and that there is another Skywalker besides Luke. In his last moments, Yoda asks to Luke to remember his advices about the temptation of the dark side, and to Luke transmit his Jedi knowledge to other people. When Yoda dies,Obi wan's spirit shows up to Luke and tells him that Luke's father killed his good side Anakin to become Darth Vader, and also that he is more machine than a man since he became a sith. Luke stays worried about killing his own dad, and says that he feels that his father still has kindness
.<br /><br />This cute f

In [10]:
query = "Why were the sequals worse than the originals or the prequals?"
embedding_vector = core_embeddings_model.embed_query(query)
docs = vector_store.similarity_search_by_vector(embedding_vector, k = 4)

for page in docs:
  print(page.page_content)

. Never fear, it appears that it's now available on commercial DVD.)<br /><br />It says a great deal about inflation in the movie business that the remake had a "small" budget of "only" $5 million. That would have been a lot of money for the original filmmakers. I also wonder why here in the States we had to wait until September of 2002 to see it when the first comments about it, from a viewer in Turkey, are from February!<br /><br />But whenever it aired, my reaction would be the same: Why did they bother to make it at all? There is so little of the original here that it is essentially a different work. They have taken the story and drained it of its blood. And what does happen goes beyond problems with temporal discontinuities and paradoxes; these people behave without logic or motivation
.). While it maintains the quality of the previous 20 chapters, Vol. II feels slightly shorter in terms of its overall length and atmosphere due to the fact that most of the situations only take pla

In [11]:
query = "Were the prequals worse than the orginals?"
embedding_vector = core_embeddings_model.embed_query(query)
docs = vector_store.similarity_search_by_vector(embedding_vector, k = 4)

for page in docs:
  print(page.page_content)

.<br /><br />It's 1976 and we're still playing about in latex romper-suits. <br /><br />That's about it really. Some movies have an entertainment value in the 'so bad it's good' category. This one doesn't even manage that. It wouldn't even entertain kids. 'Crash Corrigan's' stuff from the 1930's has got more going for it. This is just about one of the dumbest things I've ever seen. Maybe not a worst movie ever contender, but if you haven't seen that many bad ones, this could easily make your Top Ten Worst List. When you consider what was achieved in 1933 with the original "King Kong", you've got to ask yourself why anyone would stoop so low as to produce this debacle. Then, taking it one step further and realizing that the quantum leap to "Star Wars" the following year achieved a new level in sci-fi entertainment, this offering will make you laugh and cry at the same time
. Elga, the beautiful simple and well intentioned lady that was forced to marry the count provides the love triangl

In [12]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load pre-trained model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [35]:
query = "What is Star Wars?"

# Set the padding token to the EOS token
tokenizer.pad_token = tokenizer.eos_token

encoding = tokenizer.encode_plus(query, return_tensors="pt", padding=True)
input_ids = encoding["input_ids"]
attention_mask = encoding["attention_mask"]

# Generate text with attention mask
output = model.generate(
    input_ids,
    attention_mask=attention_mask,
    max_length=100,
    num_beams=1,
    num_return_sequences=1,
    temperature=1,
    top_k=50,
    top_p=1.0,
    do_sample=True
)

# Decode and print the generated text
generated_text = tokenizer.decode(output.tolist()[0], skip_special_tokens=True)
print(generated_text)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


What is Star Wars?

The answer to your question 'why the hell do I keep putting up new characters? Why is Star Wars being seen like it's all that good? Why is this so good?'

Star Wars is also a film, a video game, an animated film, and a TV series, with everything. It's the main engine behind what we would call a film franchise - a narrative, and that's exactly the premise of the original Star Wars trilogy - but also


In [36]:
query = "What is Star Wars?"

# Set the padding token to the EOS token
tokenizer.pad_token = tokenizer.eos_token

encoding = tokenizer.encode_plus(query, return_tensors="pt", padding=True)
input_ids = encoding["input_ids"]
attention_mask = encoding["attention_mask"]

# Generate text with attention mask
output = model.generate(
    input_ids,
    attention_mask=attention_mask,
    max_length=100,
    num_beams=1,
    num_return_sequences=1,
    temperature=1.5,
    top_k=50,
    top_p=1.0,
    do_sample=True
)

# Decode and print the generated text
generated_text = tokenizer.decode(output.tolist()[0], skip_special_tokens=True)
print(generated_text)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


What is Star Wars?

That question gets all the way to the right one on here at Star Trek, obviously, but Star Wars is like anything on television—a television movie that tells a narrative and what you learn in the story is like a book…

That is an example, no one wants to be tied back to this movie—for some reason.

Of course that happens by chance but some of its influences are as good as a movie's best stuff. Just because


In [37]:
query = "What is Star Wars?"

# Set the padding token to the EOS token
tokenizer.pad_token = tokenizer.eos_token

encoding = tokenizer.encode_plus(query, return_tensors="pt", padding=True)
input_ids = encoding["input_ids"]
attention_mask = encoding["attention_mask"]

# Generate text with attention mask
output = model.generate(
    input_ids,
    attention_mask=attention_mask,
    max_length=100,
    num_beams=1,
    num_return_sequences=1,
    temperature=3.0,
    top_k=50,
    top_p=1.0,
    do_sample=True
)

# Decode and print the generated text
generated_text = tokenizer.decode(output.tolist()[0], skip_special_tokens=True)
print(generated_text)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


What is Star Wars? As you could hear above, many fanboy stories seem an impossible story – I'd always been impressed in 'An Middling Age'? You know one that you may've written back and made some progress. 'The Night War Trilogy', you're more correct because some had great adventures before coming through! How about yours as your career? That may go in for debate but with our love! 'Rogue-Net: Rogue Nation Redux', or if not simply the series which has


In [38]:
query = "What is Star Wars?"

# Set the padding token to the EOS token
tokenizer.pad_token = tokenizer.eos_token

encoding = tokenizer.encode_plus(query, return_tensors="pt", padding=True)
input_ids = encoding["input_ids"]
attention_mask = encoding["attention_mask"]

# Generate text with attention mask
output = model.generate(
    input_ids,
    attention_mask=attention_mask,
    max_length=100,
    num_beams=1,
    num_return_sequences=1,
    temperature=10.0,
    top_k=50,
    top_p=1.0,
    do_sample=True
)

# Decode and print the generated text
generated_text = tokenizer.decode(output.tolist()[0], skip_special_tokens=True)
print(generated_text)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


What is Star Wars? Well most readers can answer one last questions while asking any or most or no such, at worst the same few: Do Disney movies, on their entire scale never contain action with no emotion as was rumored, in this age even such big events involving violence have their villains in question which make such massive effects practically insignificant with most Starring movies going off air on one specific spot from such scene's final few minutes; I've known such people at conventions when talking movies over a television
