Adapted from https://github.com/yizhongw/self-instruct/blob/main/self_instruct/instruction_visualize.ipynb

Note: The script uses Berkeley Neural Parser to parse the generated instructions, and visualize the results using Plotly.

Please make sure to install benepar following their documentation [here](https://github.com/nikitakit/self-attentive-parser#installation).

In [1]:
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=0

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=0


In [2]:
import nltk
import spacy
import benepar

In [4]:
lang = 'en'

if lang == 'en':
    spacy_model = 'en_core_web_md'
    benepar_model = 'benepar_en3'
elif lang == 'de':
    spacy_model = 'de_core_news_md'
    benepar_model = 'benepar_de2' # https://github.com/nikitakit/self-attentive-parser/issues/103#issuecomment-1701814831

# nlp = spacy.load('en_core_web_md')
nlp = spacy.load(spacy_model, disable=['ner'])
# benepar.download(benepar_model)
nlp.add_pipe("benepar", config={"model": benepar_model})

LookupError: 
**********************************************************************
  Resource [93mbenepar_en3[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> benepar.download('benepar_en3')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mmodels/benepar_en3[0m

  Searched in:
    - '/home/user/shaita/nltk_data'
    - '/home/user/shaita/anaconda3/envs/torch1/nltk_data'
    - '/home/user/shaita/anaconda3/envs/torch1/share/nltk_data'
    - '/home/user/shaita/anaconda3/envs/torch1/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [6]:
de_verb_tags = {
    "VVFIN": "finites Verb, voll",
    "VVIMP": "Imperativ, voll",
    "VVINF": "Infinitiv, voll",
    "VVIZU": "Infinitiv mit “zu”, voll",
    "VVPP": "Partizip Perfekt, voll",
    # "VAFIN": "finites Verb, aux",
    # "VAIMP": "Imperativ, aux",
    # "VAINF": "Infinitiv, aux",
    # "VAPP": "Partizip Perfekt, aux",
    # "VMFIN": "finites Verb, modal",
    # "VMINF": "Infinitiv, modal",
    # "VMPP": "Partizip Perfekt, modal"
    }

de_noun_tags = {
    "NN": "normales Nomen",
    "NE": "Eigennamen",
}

def find_root_verb_and_its_dobj_de(tree_root):
    # first check if the current node and its children satisfy the condition
    if tree_root.tag_ in de_verb_tags.keys():
        for child in tree_root.children:
            if child.dep_ == "oa" and child.tag_ in de_noun_tags.keys():
                # german dobj is "oa" https://aclanthology.org/W08-1007.pdf
                return tree_root.lemma_, child.lemma_
        return tree_root.lemma_, None
    # if not, check its children
    for child in tree_root.children:
        return find_root_verb_and_its_dobj_de(child)
    # if no children satisfy the condition, return None
    return None, None


def find_root_verb_and_its_dobj_en(tree_root):
    # first check if the current node and its children satisfy the condition
    if tree_root.pos_ == "VERB":
        for child in tree_root.children:
            if child.dep_ == "dobj" and child.pos_ == "NOUN":
                return tree_root.lemma_, child.lemma_
        return tree_root.lemma_, None
    # if not, check its children
    for child in tree_root.children:
        return find_root_verb_and_its_dobj_en(child)
    # if no children satisfy the condition, return None
    return None, None

def find_root_verb_and_its_dobj_in_string(s, lang='en'):
    doc = nlp(s)
    first_sent = list(doc.sents)[0]
    if lang == 'en':
        return find_root_verb_and_its_dobj_en(first_sent.root)
    elif lang == 'de':
        return find_root_verb_and_its_dobj_de(first_sent.root)

# print(find_root_verb_and_its_dobj("Erzähl mir einen lustigen Witz!", lang=lang))
# print(find_root_verb_and_its_dobj("Erzähl mir einen lustigen Witz über die Schule!", lang=lang))
# print(find_root_verb_and_its_dobj("Erzähl mir einen lustigen Witz über die Schule, bitte!", lang=lang))
# print(find_root_verb_and_its_dobj("Erzähl mir einen lustigen Witz über die Schule, bitte, und schreib ihn auf!", lang=lang))
# print(find_root_verb_and_its_dobj("Kannst du mir einen lustigen Witz über die Schule erzählen?", lang=lang))
# print(find_root_verb_and_its_dobj("Wie kamen die US-Bundesstaaten zu ihren Namen?", lang=lang))
print(find_root_verb_and_its_dobj_in_string("Write me a story about education.", lang=lang))
print(find_root_verb_and_its_dobj_in_string("Can you write me a story about education.", lang=lang))
print(find_root_verb_and_its_dobj_in_string("Why do horses sleep standing up?", lang=lang))

('write', 'story')
('write', 'story')
('sleep', None)


In [31]:
import pandas as pd
import json
import tqdm

if lang == 'en':
    generated_data_path1 = "../../InstructionWild/data_v2/user_1.jsonl"
    generated_data_path2 = "../../InstructionWild/data_v2/user_2.jsonl"
    generated_data_path3 = "../../InstructionWild/data_v2/user_3.jsonl"
    generated_data_path4 = "../../InstructionWild/data_v2/user_4.jsonl"
elif lang == 'de':
    generated_data_path = "../resources/data/alpaca_eval/alpaca_eval_instructions_de.json"

instructions = []
            
for f in [generated_data_path4]:
    with open(f, 'r', encoding='utf8') as fin:
        for line in fin:
            try:
                instructions.append(json.loads(line.strip())["instruction"])
            except:
                instructions.append(line.strip())

instructions = list(set(instructions))
print(len(instructions))
print(instructions[:5])

103787
['did ioseb geldiashvili win a poker tournament', 'Help me organize the following paragraphs into English keywords that midjourney can understand. Use commas to separate the keywords. Next, I will start to describe the keywords: a monkey lying on a girl, looking at the sky, high-definition, photo style, Blurred background, soft light', 'Write a serious article with silly information, and please include a detailed, eye-catching title and opening statement.', 'Compare the differences and similarities between Kotlin/Swift', 'write a report about the natural community for a biology lesson']


In [32]:
raw_phrases = []
for instruction in tqdm.tqdm(instructions):
    try:
        verb, noun = find_root_verb_and_its_dobj_in_string(instruction)
        raw_phrases.append({
            "verb": verb,
            "noun": noun,
            "instruction": instruction
        })
    except Exception as e:
        print(e)
        print(instruction)

100%|██████████| 103787/103787 [06:18<00:00, 274.53it/s]


In [25]:
raw_phrases = pd.DataFrame(raw_phrases)
phrases = pd.DataFrame(raw_phrases).dropna()
phrases[["verb", "noun"]].groupby(["verb", "noun"]).size().sort_values(ascending=False)

verb   noun       
write  essay          1562
       code           1383
       story           982
       poem            823
       script          749
                      ... 
get    vector            1
       video             1
       visa              1
       volume            1
将just  exploration       1
Length: 16865, dtype: int64

In [26]:
top_verbs = phrases[["verb"]].groupby(["verb"]).size().nlargest(20).reset_index()

df = phrases[phrases["verb"].isin(top_verbs["verb"].tolist())]
# df = df[~df["noun"].isin(["I", "what"])]
# df = phrases
# df[~df["verb"].isin(top_verbs["verb"].tolist())]["verb"] = "other"
# df[~df["verb"].isin(top_verbs["verb"].tolist())]["noun"] = "other"
df = df.groupby(["verb", "noun"]).size().reset_index().rename(columns={0: "count"}).sort_values(by=["count"], ascending=False)
# df = df[df["count"] > 10]
df = df.groupby("verb").apply(lambda x: x.sort_values("count", ascending=False).head(4)).reset_index(drop=True)
df

Unnamed: 0,verb,noun,count
0,add,text,31
1,add,information,22
2,add,paragraph,14
3,add,code,11
4,create,script,83
...,...,...,...
75,use,language,20
76,write,essay,1562
77,write,code,1383
78,write,story,982


In [27]:
df['count'] = df['count'] * 100
df

Unnamed: 0,verb,noun,count
0,add,text,3100
1,add,information,2200
2,add,paragraph,1400
3,add,code,1100
4,create,script,8300
...,...,...,...
75,use,language,2000
76,write,essay,156200
77,write,code,138300
78,write,story,98200


In [30]:

import plotly.graph_objects as go
import plotly.express as px

# df["blank"] = "ROOT"
# df = phrases.groupby(["verb", "noun"]).size().sort_values(ascending=False).head(5).reset_index().rename(columns={0: "count"})
# df['count'] = df['count'] * 100
df = df[df["count"] > 30]
fig = px.sunburst(df, path=['verb', 'noun'], values='count', color='verb', color_discrete_sequence=px.colors.qualitative.Pastel)
# fig.update_layout(uniformtext=dict(minsize=10, mode='hide'))
fig.update_layout(
    margin=dict(l=0, r=0, t=0, b=0),
    # font_family="Times New Roman",
)
fig.show()
# fig.write_html(f"verb_noun-{lang}.html")
# fig.savefig("verb_noun.pdf")

fig.write_image(f"data/verb_noun-{lang}.png")
# fig.to_image(format="png", width=600, height=350, scale=2)

ValueError: 
Image export using the "kaleido" engine requires the kaleido package,
which can be installed using pip:
    $ pip install -U kaleido


In [13]:
instructions


['Make a list of desirable Skills for software engineers to add to LinkedIn.',
 'Why so many people like film photography when digital camera and mobile phone are more convenient and generate higher quality of photos?',
 'The sentence you are given might be too wordy, complicated, or unclear. Rewrite the sentence and make your writing clearer by keeping it concise. Whenever possible, break complex sentences into multiple sentences and eliminate unnecessary words.\n\nIf you have any questions about my rate or if you find it necessary to increase or decrease the scope for this project, please let me know.',
 "How can anti-deressants make people think or act suicidally? Suicidal thoughts or actions' are a side effect of every single anti-depressant I've seen. Is the medicine controlling your mind or something?",
 'explain what the theory of sexual selection is and give an example.',
 'which libraries are the best for developing deep learning scripts in python?',
 'How many atoms are in a 

In [18]:
from collections import defaultdict

questions = defaultdict(list)
for instruction in instructions:
    instruction = instruction.strip().lower()
    if instruction.startswith("who"):
        questions["who"].append(instruction)
    elif instruction.startswith("what"):
        questions["what"].append(instruction)
    elif instruction.startswith("when"):
        questions["when"].append(instruction)
    elif instruction.startswith("where"):
        questions["where"].append(instruction)
    elif instruction.startswith("why"):
        questions["why"].append(instruction)
    elif instruction.startswith("how"):
        questions["how"].append(instruction)
    else:
        questions["non_qu"].append(instruction)

print(len(questions))

7


In [19]:
for key in questions.keys():
    print(key, len(questions[key]))

non_qu 585
why 21
how 76
what 104
where 3
when 7
who 9


In [20]:
21 + 76 + 104 + 3 + 7 + 9

220