In [1]:
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: OK


# Generate Training Dataset Using GPT-4


In [2]:
from dotenv import load_dotenv
import os
# Move the working directory one level up, but only once per kernel session.
# The path is relative, so without this guard every re-execution of the cell
# would climb one more directory and later relative paths would point elsewhere.
if '_CWD_MOVED_TO_PARENT' not in globals():
    os.chdir('./..')
    _CWD_MOVED_TO_PARENT = True
print(os.getcwd())


/Users/rafaelmadrigal/Documents/Code-Work/snoop-impersonator


In [3]:
load_dotenv()

True

In [2]:
from langchain_openai import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field, validator
from langchain.output_parsers.json import SimpleJsonOutputParser
from langchain_core.prompts import PromptTemplate

# Chat model that backs the generation chain assembled further down.
llm = ChatOpenAI(
    temperature=0.8,
    model='gpt-4',
)


# Few-shot prompt for the generator. `{format_instructions}` and `{query}` are
# the only placeholders. Every example line follows the same `speaker: text`
# layout, so the model sees one consistent pattern to imitate.
INSTRUCTIONS_TEMPLATE = """Write Question and Answer Pairs optimized for finetuning Large Language Models. 
Questions are asked by a human user and should cover anything about a person's life or about any general facts. 
Answers should be limited to less than 550 characters and should be a rhyme as if you were the rapper, Snoop Dogg. 
Answers should explain things like Snoop Dogg would and if the question does not make sense, explain that you are a bit lost and make something up that is both hilarious and relevant

Below are some examples:

user: Yo Snoop, what's the best way to relax after a long day?
snoop_dogg: Roll up a blunt, take a puff, feel the stress fade away, that's enough. Put on some tunes, vibe to the beat, let the rhythm guide your feet. Kick back, chillax, let the worries disappear, ain't nothin' like smokin' good vibes in the atmosphere.

user: Snoop, how do you stay so cool all the time?
snoop_dogg: It's in my DNA, laid-back genes, you see. Can't nobody bring the chill like me.

user: Snoop, what's your favorite food when you're feeling hungry?
snoop_dogg: Gotta have some soul food, keepin' it real. Mac and cheese, fried chicken, that's the deal.

{format_instructions}
{query}
"""

# Schema for one generated question/answer pair.
# (Kept as comments rather than a class docstring so the schema rendered into
# the prompt's format instructions is not altered.)
class Conversation(BaseModel):
    # Question text; checked by the validator below.
    user: str = Field(description="human question about anything")
    # Answer text.
    snoop_dogg: str = Field(description="answer to the question in a rhyme as if the the person answering was Snoop Dogg")

    # You can add custom validation logic easily with Pydantic.
    @validator("user")
    def question_ends_with_question_mark(cls, field):
        """Reject a ``user`` value that does not end with a question mark.

        Returns:
            The unchanged value when it ends with "?".

        Raises:
            ValueError: if ``field`` is empty or its last character is not "?".
        """
        # endswith() rather than field[-1]: an empty string must be reported
        # as a badly formed question, not escape as an IndexError.
        if not field.endswith("?"):
            raise ValueError("Badly formed question!")
        return field

# Build the prompt and the generation chain.
# The Pydantic parser is used here only to render the format instructions that
# are injected into the prompt; the model's reply is decoded by the plain JSON
# parser at the end of the chain, so Conversation's validator is not applied
# to the chain output in this cell.
conversation_parser = PydanticOutputParser(pydantic_object=Conversation)

INSTRUCTIONS_PROMPT = PromptTemplate(
    template=INSTRUCTIONS_TEMPLATE,
    input_variables=["query"],
    partial_variables={
        "format_instructions": conversation_parser.get_format_instructions(),
    },
)

json_parser = SimpleJsonOutputParser()
json_chain = INSTRUCTIONS_PROMPT | llm | json_parser

# Single trial call; the result is displayed as the cell output.
output = json_chain.invoke({"query": "give me one conversation"})
output


{'user': "Hey Snoop, what's your advice for someone who's feeling down?",
 'snoop_dogg': "When life's got you feeling low, let the Snoop Dogg flow. Stay strong, keep your head high, don’t let them haters make you cry. Just keep on pushin', don't ever stop, remember you're a boss, you're right at the top."}

In [7]:
from tqdm import tqdm

In [8]:
# Number of chain invocations; each successful call contributes one record.
N_CONVERSATIONS = 1000

generated_conversations = []
# Count of invocations that raised and were skipped.
failed_generations = 0

for i in tqdm(range(N_CONVERSATIONS)):
    try:
        output_i = json_chain.invoke({"query": "give me one conversation"})
    except Exception as exc:
        # A single API error or unparsable reply must not abort the whole
        # long-running loop and discard everything collected so far.
        failed_generations += 1
        # tqdm.write keeps the message from corrupting the progress bar.
        tqdm.write(f"[{i}] generation failed, skipping: {exc!r}")
        continue
    generated_conversations.append(output_i)



100%|██████████| 1000/1000 [56:39<00:00,  3.40s/it] 


In [9]:
import pandas as pd

# Turn the collected records into a table and write it to disk without the index.
df = pd.DataFrame.from_records(data=generated_conversations)
df.to_csv(path_or_buf='snoop_raw.csv', index=False)

In [18]:
import re

# Matches the literal text "Snoop, " wherever it occurs in a question.
ptn = re.compile(r'(Snoop, )')


def _strip_snoop_address(question):
    """Remove every "Snoop, " from ``question`` and upper-case its first character.

    Returns the empty string unchanged when nothing is left after the removal.
    """
    stripped = ptn.sub('', question)
    # Slice instead of indexing: stripped[0] would raise IndexError when the
    # question is empty after the substitution.
    return stripped[:1].upper() + stripped[1:]


df['user'] = df['user'].map(_strip_snoop_address)
# NOTE(review): this overwrites the 'snoop_raw.csv' written from the unmodified
# frame earlier, so the untouched generations are no longer on disk. Confirm
# that is intended before re-running.
df.to_csv('snoop_raw.csv', index=False)


In [19]:
df

Unnamed: 0,user,snoop_dogg
0,Hey how do you write such dope rhymes?,"Rhymes come easy, like a cool sea breeze, Play..."
1,How do you manage to stay so productive?,"Ah, the secret's in the rhythm, the flow of th..."
2,Hey how do you handle negativity?,"When the hate be spittin', I keep my vibes hit..."
3,Can you tell me how to write a rap?,"Drop a beat, feel the rhythm, let it clap. Put..."
4,What's your take on global warming?,"Global warming, it's a fact, ain't no time to ..."
...,...,...
995,Hey how do you deal with the haters?,"Yo, when it comes to the haters, don't let 'em..."
996,Hey what's your favorite way to spend a Sunday?,"Chill at home, no need to roam, playin' record..."
997,Hey what's your favorite season of the year?,"Summer's the season, ain't no doubt. Sun's bla..."
998,What's your take on global warming?,"Global warming ain't no joke, it's like the Ea..."


In [20]:
# Rename by column name rather than by position: the mapping cannot be silently
# swapped if the frame's column order differs, and re-running the cell after
# further columns have been added is a no-op instead of a length-mismatch error.
df = df.rename(columns={'user': 'instruction', 'snoop_dogg': 'output'})

In [24]:
# Add an `input` column holding the empty string for every row; it is selected
# alongside instruction/output/text when the `train` frame is built below.
df = df.assign(input='')

In [25]:
df

Unnamed: 0,instruction,output,input
0,Hey how do you write such dope rhymes?,"Rhymes come easy, like a cool sea breeze, Play...",
1,How do you manage to stay so productive?,"Ah, the secret's in the rhythm, the flow of th...",
2,Hey how do you handle negativity?,"When the hate be spittin', I keep my vibes hit...",
3,Can you tell me how to write a rap?,"Drop a beat, feel the rhythm, let it clap. Put...",
4,What's your take on global warming?,"Global warming, it's a fact, ain't no time to ...",
...,...,...,...
995,Hey how do you deal with the haters?,"Yo, when it comes to the haters, don't let 'em...",
996,Hey what's your favorite way to spend a Sunday?,"Chill at home, no need to roam, playin' record...",
997,Hey what's your favorite season of the year?,"Summer's the season, ain't no doubt. Sun's bla...",
998,What's your take on global warming?,"Global warming ain't no joke, it's like the Ea...",


In [32]:
def chat_formatting_mistral(row):
    """Render one row as a single ``<s>[INST] ... [/INST] ... </s>`` string.

    Args:
        row: Mapping-like object providing ``'instruction'`` and ``'output'``.

    Returns:
        The instruction wrapped in ``[INST]`` tags followed by the output, the
        whole enclosed in ``<s>`` / ``</s>``.
    """
    instruction = row['instruction']
    answer = row['output']
    return f"<s>[INST] {instruction} [/INST] {answer} </s>"

df['text'] = df.apply(chat_formatting_mistral, axis=1)


In [37]:
# Select the four columns that are written to train.csv, in this fixed order.
train = df.loc[:, ['instruction', 'input', 'output', 'text']]

In [38]:
train.to_csv('train.csv', index=False)