In [None]:
from google.colab import output
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be read by path
# from this Colab runtime (the training CSV is loaded from /content/drive/MyDrive).
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
import subprocess
import pandas as pd
import transformers
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings

### Setup Model

In [None]:
!pip install git+https://github.com/huggingface/transformers torch accelerate bitsandbytes langchain

from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
from langchain.llms import HuggingFacePipeline
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
# Name of the torch device to prefer: the GPU when CUDA is available, otherwise the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Install the embedding / vector-store dependencies.
# The command is run through the interpreter that backs this kernel
# (`python -m pip ...`) so the packages land in the environment the notebook imports from.
import sys

command = "pip install -U sentence-transformers chromadb"
process = subprocess.Popen(
    [sys.executable, "-m", *command.split()],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,  # capture stderr too; without this the error stream is never returned
    text=True,
)
# Distinct names: binding the result to `output` would shadow the
# `google.colab.output` module imported in the first cell.
pip_stdout, pip_stderr = process.communicate()
# Fail loudly instead of continuing with missing packages.
if process.returncode != 0:
    raise RuntimeError(
        f"'{command}' failed with exit code {process.returncode}:\n{pip_stderr}"
    )

In [None]:
# Quantization settings handed to the model loader: 4-bit weights using the "nf4"
# quant type with double quantization enabled, and bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
# Download the Mistral 7B Instruct model and tokenizer
model_name = "filipealmeida/Mistral-7B-Instruct-v0.1-sharded"
# 4-bit loading is requested solely through `quantization_config`. Passing
# `load_in_4bit=True` as a separate keyword as well is redundant, and recent
# transformers releases reject the combination with a ValueError.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

### Integrating the Model with LangChain


In [None]:
# Create the text-generation pipeline around the quantized model and its tokenizer
text_generation_pipeline = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    # Sampling has to be switched on for `temperature` to have any effect; with the
    # default greedy decoding the temperature value is ignored (and warned about).
    do_sample=True,
    # "temperature" is a hyperparameter that controls the randomness of the generated text.
    # It modulates the probability distribution of the next token predicted by the model.
    temperature=0.2,
    # Values above 1.0 penalize tokens that have already appeared
    repetition_penalty=1.1,
    # Return the prompt followed by the completion, not only the newly generated text
    return_full_text=True,
    # Upper bound on the number of tokens generated per call
    max_new_tokens=1000,
)

In [None]:
# Sentence-embedding model (downloaded on first use) that vectorises the document chunks
# NOTE(review): the "zh" in the checkpoint name suggests the Chinese variant of BGE —
# confirm this is intended if the indexed prompts are English.
embedding_model = SentenceTransformerEmbeddings(
    model_name='BAAI/bge-large-zh-v1.5',
)
# Expose the transformers text-generation pipeline through LangChain's LLM wrapper
mistral_llm = HuggingFacePipeline(
    pipeline=text_generation_pipeline,
)

In [None]:
# Load the prompt sheet and the tweet training data, then pull the prompts out as a plain list
mix_csv_path = "/content/mix.csv"
training_csv_path = "/content/drive/MyDrive/Adobe/Training_Data _new_imgcap_1-52k.csv"

df = pd.read_csv(mix_csv_path)
train_df = pd.read_csv(training_csv_path)

prompt_list = df["prompts"].tolist()

In [None]:
# Pair each prompt in `df` with the text of a tweet from `train_df`:
# row i of `df` is joined with row i + TRAIN_ROW_OFFSET of `train_df`.
NUM_PROMPTS = 1956        # number of prompt rows to pair up
TRAIN_ROW_OFFSET = 25000  # row label in train_df that lines up with df row 0

# The prompt list is built from df["prompts"], while this cell originally read
# df["prompt"]; both cannot exist as the same column. Prefer "prompts" for
# consistency and fall back to "prompt" only if that is what the sheet provides.
prompt_column = "prompts" if "prompts" in df.columns else "prompt"

prompt_withcaps = [
    df.at[i, prompt_column]
    + "\n\ntweet content :"
    + train_df.at[i + TRAIN_ROW_OFFSET, "content"]
    for i in range(NUM_PROMPTS)
]

In [None]:
# Chunk documents
# Wrap the first 1000 prompts in LangChain Document objects
docs = [Document(page_content=prompt_text) for prompt_text in prompt_list[:1000]]
# Splitter that tries the separators in order (blank line, newline, full stop),
# with a chunk size of 500 and an overlap of 10 between neighbouring chunks
text_splitter = RecursiveCharacterTextSplitter(
    separators=['\n\n', '\n', '.'],
    chunk_size=500,
    chunk_overlap=10,
)
# Break the documents into chunks ready for embedding
document_chunks = text_splitter.split_documents(docs)

In [None]:
# Build the vector store: embed the document chunks and index them in Chroma
chroma_db = Chroma.from_documents(
    documents=document_chunks,
    embedding=embedding_model,
)
# Retriever view over the store; handed to the QA chain to look up chunks for a query
retriever = chroma_db.as_retriever()
# The LLM used by the chain is the Mistral pipeline wrapper
llm = mistral_llm

In [None]:
# Prompt template
# Mistral-Instruct expects each user turn to be wrapped in one [INST] ... [/INST] pair.
# The system guidance, the context and the question are therefore sent as a single
# instruction block (no unclosed [INST], no end-of-sequence token in the middle of the prompt).
qa_template = """<s>[INST] You are a guiding, helpful and honest assistant. Answer exactly in few words from the context
Answer the question below from context below :
{context}

{question} [/INST]
"""

# Create a prompt instance. `from_template` derives the input variables
# ({context} and {question}) from the placeholders in the template text, so no
# extra keyword is needed to declare them.
QA_PROMPT = PromptTemplate.from_template(qa_template)


# Custom QA chain: the retriever supplies the documents and the custom prompt
# above replaces the chain's default prompt.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=retriever,
    chain_type_kwargs={"prompt": QA_PROMPT},
)

In [None]:
# Question sent to the retrieval QA chain
question_pp = " Generate an engaging tweet similar to the one posted by AMCTheatres, which includes an image. The tweet was posted on Thursday, 2019-03-14, at 15:08:51 and has received 68 likes. The tweet mentions the company amc. The media can be described as *nan*. The visual content from the media if obtained, is as follows, **, ** What is the tweet content that captures the essence of the media given?"
# Only "query" is passed: the chain obtains its context from the retriever, so
# handing it the whole prompt list under a "context" key adds nothing to the answer.
response = qa_chain({"query": question_pp})
print(response['result'])

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



"Get ready for an unforgettable movie experience with @AMCTheatres! Check out this stunning photo of our latest theater interior. #AMCTheaters #MovieMagic #CinemaLove"
