In [1]:
import logging
logging.disable(logging.INFO)
logging.disable(logging.WARNING)

import warnings
warnings.filterwarnings('ignore')

import os
os.environ["WANDB_DISABLED"] = "true"

In [2]:
import torch
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, 
    bnb_4bit_use_double_quant=True, 
    bnb_4bit_quant_type="nf4", 
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the model and tokenizer
model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    use_cache=False,
    use_flash_attention_2=True,
    device_map='auto'
    # device_map={"": PartialState().process_index},
)

Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

In [4]:
from datasets import load_dataset
from random import randrange

cosmopedia = load_dataset("HuggingFaceTB/cosmopedia", "stories", split="train[:3000]") # [:10%] 3000

print(f"dataset size: {len(cosmopedia)}")
print(cosmopedia[randrange(len(cosmopedia))])

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/43 [00:00<?, ?it/s]

dataset size: 3000
{'text': " I've always been fascinated by insects, but ants have held a special place in my heart since I was young. My grandmother used to tell me stories about them while we tended her garden together. She would point out different species and explain how they worked as a collective unit - all thanks to those magical chemicals called pheromones.\n\nOne summer day, armed with my trusty magnifying glass (for close observation only!) and notebook, I decided to conduct an experiment inspired by Grandma's tales. I placed a piece of chocolate on one side of our yard, hoping to observe firsthand how these tiny creatures communicated using trail pheromones. It wasn't long before an adventurous ant found the treat and began laying down a scent trail back to the nest. Soon enough, a line of ants marched towards the chocolate, creating what looked like a miniature highway of industry.\n\nAs I watched, engrossed in their organized chaos, I noticed something peculiar happening 

In [5]:
def create_prompt(example):
    # Customize the prompt based on your specific needs
    prompt_QA = f"""Story: {example['text']}
      
    Instructions:
    1. Summarize the key details of the story above.
    2. Based on the summary, create a question that captures a significant detail or event in the story. Keep the question concise.
    3. Answer the question based on the story, using only a single phrase if possible.
    4. I need only Question and Answer
    
    ---
    
    Question:
    Answer:
    """

    prompt_Summary_con = f"""You will be provided with a passage called Context. \
    Your task is to create a concise summary that encapsulates the main points and core ideas of the passage in a clear and coherent manner. \
    Ensure that the summary highlights any key events, lessons, or characters involved, using your own words.

    Guidelines:
    1. Focus on the most important details and avoid unnecessary elaboration.
    2. Do not include minor details or specific examples unless they are crucial to understanding the overall context.
    
    ---
    
    Context: {example['text']}
    Summary: Your concise summary here
    """

    prompt_Summary = f"""Task: You will be provided with a passage called Context. Your task consists of two parts:
    1. **Generate a Question or Instruction**: Create a single, clear question or instruction \
    that would prompt a summary of the given context.
    2. **Provide the Summary**: Based on the generated question or instruction, \
    write a concise summary that captures the main points and essence of the passage.

    Important:
    - If you encounter any difficulties or if the question/instruction is unclear, \
    proceed to create a summary of the context anyway.
    - **Always** produce a summary for the given context. \
    If you are unsure, summarize the core points as you understand them.

    
    Context:
    {example['text']}

    Generated Question/Instruction: Your concise generated question or instruction here

    Summary: Your concise summary here

    ---

    Example 1:
    Generated Question/Instruction: "Discuss the key points of how Mia and Eric explored the integration of Artificial Intelligence (AI) in businesses, implementing it in Eric's startup."

    Summary: Mia, a business consultant, and Eric, an entrepreneur, delved into the impact of AI on businesses over several meetings. \
    Eric initially expressed skepticism about over-reliance on machines due to potential job losses and the importance of human interaction. \
    However, Mia highlighted the benefits of AI such as improved productivity, cost savings, and enhanced decision-making abilities. \
    They agreed to use technology to augment rather than replace human capabilities. \
    Inspired by their discussions, they implemented changes in Eric's eco-friendly startup. \
    Initially, they introduced chatbots for customer queries, freeing up team members for complex issues. \
    This strategy proved successful as customers appreciated quick responses while employees felt valued. \
    They then leveraged machine learning for market trend analysis but ensured final decisions remained human-led. \
    This approach helped Eric's company thrive, demonstrating that AI could be utilized without compromising humanity or values.</s>

    ---

    Example 2:
    Generated Question/Instruction: Summarize the story of Pirate Rick's quest to find and reclaim his treasure, \
    and include the lessons learned from his experiences.

    Summary: Pirate Rick buried his treasure after a long search, but a tropical storm and a tsunami altered its location. \
    Determined, Rick resumed digging and spent three hours uncovering his loot, only to find it stolen by other pirates. \
    This series of events teaches us that, despite our efforts, unforeseen circumstances can make goals difficult to achieve. \
    However, learning from these experiences contributes to personal growth and problem-solving skills.</s>

    ---

    Example 3:
    Generated Question/Instruction: Explain how Jack and Oliver's discussion about different car models, \
    specifically Model X and Model Y, led to a change in their approach when consulting clients.

    Summary: Jack and Oliver, two car enthusiasts from Autoville, debated the superiority of Model X over Model Y based on minor differences in speed. \
    Jack calculated percentages to prove his point but Oliver argued that real-world conditions like wind resistance and terrain were more significant. \
    They decided to emphasize other aspects such as safety features, fuel efficiency, comfort, design, and overall value when advising clients. \
    This balanced approach led to increased trust and satisfaction among the community. \
    The incident served as a reminder that understanding context is crucial and became a lesson for them and those seeking their guidance. \
    Subsequently, Jack and Oliver ensured considering all relevant factors before jumping to conclusions.</s>

    ---

    Example 4:
    Generated Question/Instruction: "Describe the situation Nathan Drake is in and how he uses strategy to survive."

    Summary: Nathan Drake, a protagonist who has presumably faced numerous challenging situations before, finds himself hiding from an armored enemy. With limited ammunition, he recalls his mentor's advice about thinking creatively. \
    He lures the guard away by shooting near him, then uses the opportunity to dash across an open space towards cover. \
    After the guard returns, Drake stealthily approaches and knocks him out. \
    This scenario highlights Drake's survival skills and strategic thinking, although it also raises questions about the morality of his actions.</s>

    ---

    Example 5:
    Generated Question/Instruction: "Describe the creative process that the marketing team for GreenTech used to overcome their struggle in creating a new campaign slogan."

    Summary: The GreenTech marketing team, led by their manager Sarah, was stuck in creating a new campaign slogan. \
    To stimulate creativity, Sarah introduced mind mapping where each member wrote down related ideas branching off from the main goal of increasing sustainability awareness. \
    They found a recurring theme of nature in their mind maps. Next, they did a random word exercise using the word "ocean," which led to associations of growth and change. \
    Applying this to their task, they conceptualized a campaign focusing on how adopting sustainable practices allows businesses and communities to grow and adapt. \
    The day ended with role-playing exercises, resulting in a powerful campaign centered around the theme of growth and adaptation.</s>
    """

    prompt_trec = f"""You are a dataset generator. I want a dataset similar to the TREC dataset, which contains 50 fine-grained question types. 
    Here is the full list of question types:
    
    - Abbreviation (e.g., Expression abbreviated)
      - Animal
      - Organ of body
      - Color
      - Invention, book, and other creative pieces
      - Currency name
      - Disease and medicine
      - Event
      - Food
      - Musical instrument
      - Language
      - Letter like a-z
      - Plant
      - Product
      - Religion
      - Sport
      - Element and substance
      - Symbols and sign
      - Techniques and method
      - Equivalent term
      - Vehicle
      - Word with a special property
      - Definition of something
      - Description of something
      - Manner of an action
      - Reason
      - Group or organization of persons
      - Individual
      - Title of a person
      - Description of a person
      - City
      - Country
      - Mountain
      - Other location
      - State
      - Postcode or other code
      - Number of something
      - Date
      - Distance, linear measure
      - Price
      - Order, rank
      - Other number
      - Lasting time of something
      - Percent, fraction
      - Speed
      - Temperature
      - Size, area, and volume
      - Weight
    
    ### Task:
    Given a Context, generate a question based on the information provided. \
    Then, decide on the most appropriate question type from the TREC list based on the nature of the question. 
    Ensure that:
    1. The question is directly related to the Context provided.
    2. The question type matches the specific category from the list of TREC question types.
    3. Use detailed and relevant examples to guide your question generation, avoiding generic or vague questions.
    4. Be concise and precise with both the question and the question type selection.
    
    ### Example:
    Context: 
    {example['text']}
    
    ---
    
    Question: 
    [Write a detailed and specific question based on the context]
    
    Question Type: 
    [Select one from the list above, matching the question generated]

    """
    
    
    messages = [{"role": "user", "content": prompt_trec}]

    return messages

In [6]:
from tqdm import tqdm

result = []

for i in tqdm(range(len(cosmopedia)), leave=False):
    messages = create_prompt(cosmopedia[i])
    input_text = tokenizer.apply_chat_template(messages, tokenize=False)
    inputs = tokenizer.encode(input_text, return_tensors="pt")
    generated_ids = model.generate(inputs, max_new_tokens=4096, temperature=0.2, top_p=0.9, do_sample=True)

    generated_text = tokenizer.decode(generated_ids[0].tolist())
    result.append(generated_text)

print(result[0])

                                                          

<s><s> [INST] You are a dataset generator. I want a dataset similar to the TREC dataset, which contains 50 fine-grained question types. 
    Here is the full list of question types:
    
    - Abbreviation (e.g., Expression abbreviated)
      - Animal
      - Organ of body
      - Color
      - Invention, book, and other creative pieces
      - Currency name
      - Disease and medicine
      - Event
      - Food
      - Musical instrument
      - Language
      - Letter like a-z
      - Plant
      - Product
      - Religion
      - Sport
      - Element and substance
      - Symbols and sign
      - Techniques and method
      - Equivalent term
      - Vehicle
      - Word with a special property
      - Definition of something
      - Description of something
      - Manner of an action
      - Reason
      - Group or organization of persons
      - Individual
      - Title of a person
      - Description of a person
      - City
      - Country
      - Mountain
      - Other locati



In [7]:
result[0]

'<s><s> [INST] You are a dataset generator. I want a dataset similar to the TREC dataset, which contains 50 fine-grained question types. \n    Here is the full list of question types:\n    \n    - Abbreviation (e.g., Expression abbreviated)\n      - Animal\n      - Organ of body\n      - Color\n      - Invention, book, and other creative pieces\n      - Currency name\n      - Disease and medicine\n      - Event\n      - Food\n      - Musical instrument\n      - Language\n      - Letter like a-z\n      - Plant\n      - Product\n      - Religion\n      - Sport\n      - Element and substance\n      - Symbols and sign\n      - Techniques and method\n      - Equivalent term\n      - Vehicle\n      - Word with a special property\n      - Definition of something\n      - Description of something\n      - Manner of an action\n      - Reason\n      - Group or organization of persons\n      - Individual\n      - Title of a person\n      - Description of a person\n      - City\n      - Country\n 

In [8]:
# 1/0
result[709]

"<s><s> [INST] You are a dataset generator. I want a dataset similar to the TREC dataset, which contains 50 fine-grained question types. \n    Here is the full list of question types:\n    \n    - Abbreviation (e.g., Expression abbreviated)\n      - Animal\n      - Organ of body\n      - Color\n      - Invention, book, and other creative pieces\n      - Currency name\n      - Disease and medicine\n      - Event\n      - Food\n      - Musical instrument\n      - Language\n      - Letter like a-z\n      - Plant\n      - Product\n      - Religion\n      - Sport\n      - Element and substance\n      - Symbols and sign\n      - Techniques and method\n      - Equivalent term\n      - Vehicle\n      - Word with a special property\n      - Definition of something\n      - Description of something\n      - Manner of an action\n      - Reason\n      - Group or organization of persons\n      - Individual\n      - Title of a person\n      - Description of a person\n      - City\n      - Country\n 

In [9]:
import pandas as pd

generated_dataset = pd.DataFrame()

In [10]:
# DocQA_cos

# question = []
# answer = []
# content = []

# for i in range(len(result)):
#     text = result[i].split('[/INST]')[1].split('Question:')[1]
#     if 'Answer' in text:
#         question.append(text.split('Answer:')[0])
#         answer.append(text.split('Answer:')[1].split('</s>')[0])
#         content.append(cosmopedia[i]['text'])
#     else:
#         continue


# generated_dataset['Question'] = question
# generated_dataset['Answer'] = answer
# generated_dataset['Content'] = content

In [11]:
# summary_con_cos

# summary = []
# content = []

# for i in range(len(result)):
#     text = result[i].split('[/INST]')[1].split('</s>')[0]
#     summary.append(text)
#     content.append(cosmopedia[i]['text'])

# generated_dataset['Summary'] = summary
# generated_dataset['Content'] = content

In [12]:
# summary_cos

# summary = []
# input_seq = []
# content = []

# for i in range(len(result)):
#     print(i)
#     text = result[i].split('[/INST]')[1].split('Generated Question/Instruction:')[1]
#     if 'Summary:' in text:
#         input_seq.append(text.split('Summary:')[0])
#         summary.append(text.split('Summary:')[1].split('</s>')[0])
#         content.append(cosmopedia[i]['text'])
#     else:
#         continue

# generated_dataset['Summary'] = summary
# generated_dataset['Input'] = input_seq
# generated_dataset['Content'] = content

In [13]:
result[1].split('[/INST]')[1]

" Question: Why might the Wordlings consider words without the letter 'e' to have stronger meanings?\nQuestion Type: Word with a special property</s>"

In [None]:
# TREC_cos

question = []
type = []
content = []

for i in range(len(result)):
    text = result[i].split('[/INST]')[1].split('Question:')[1]
    #print(i)
    if 'Question Type' in text:
        question.append(text.split('Question Type:')[0])
        type.append(text.split('Question Type:')[1].split('</s>')[0])
        content.append(cosmopedia[i]['text'])
    else:
        continue


generated_dataset['Question'] = question
generated_dataset['Question_Type'] = type
generated_dataset['Content'] = content

In [15]:
# TriviaQA_cos



In [16]:
# SAMSum_cos

In [17]:
# PassageCount_cos

In [18]:
# PassageRetrieval-en_cos

In [19]:
file_name = [
    'DocQA_cos', # NarrativeQA Qasper MultiField-en HotpotQA 2WikiMultihopQA MuSiQue
    'summay_con_cos', # GovReport MultiNews
    'summay_cos', # QMSum_cos
    'TREC_cos',
    'TriviaQA_cos',
    'SAMSum_cos',
    'PassageCount_cos',
    'PassageRetrieval-en_cos',
]
# 0 1 2 3

generated_dataset.to_csv('./dataset/'+file_name[3]+'.csv', encoding='utf-8',)