In [None]:
import os

from handler.clients.am_api.client import get_accounts
from handler.clients.am_api.requests import GetAccountRequest
from handler.clients.content_api.client import (
    get_answers,
    get_question_count,
    get_questions,
    get_section_count,
    get_sections,
    get_knowledge_bases
)
from handler.clients.content_api.requests import GetAnswerRequest, GetQuestionRequest, GetSectionRequest, GetKnowledgeBaseRequest
from handler.handler import Handler

# Shared handler passed to every client call in this notebook; the token is read from
# the INTERNAL_TOKEN environment variable (a KeyError is raised here if it is not set).
h = Handler(token=os.environ['INTERNAL_TOKEN'])

In [None]:
# Fetch accounts with an unfiltered request and keep industry and id
# in two parallel lists (same position = same account).
r_account = GetAccountRequest()
accounts = get_accounts(h, r_account)

industries, accounts_ids = [], []
for account in accounts:
    industries.append(account.industry)
    accounts_ids.append(account.id)

print(industries)
print(accounts_ids)

In [None]:
# Bar chart of the number of accounts per industry.
# plt.hist() is meant for numeric data: with string categories it splits the axis into
# equal-width numeric bins, so bars do not correspond one-to-one to industries.
# Counting explicitly and drawing one bar per industry avoids that.
from collections import Counter

import matplotlib.pyplot as plt

# Accounts without an industry get an explicit label instead of being passed on as None
industry_counts = Counter(
    industry if industry is not None else 'Unknown' for industry in industries
)
# Most frequent industry first
industry_labels = [label for label, _ in industry_counts.most_common()]
industry_values = [value for _, value in industry_counts.most_common()]

plt.bar(industry_labels, industry_values, alpha=0.5, color='g')
plt.xticks(rotation=45)
plt.xlabel('Industry')
plt.ylabel('Count')
plt.title('Histogram of Industries for Accounts')
plt.show()


# Data


In [None]:
# Parallel accumulators: position k in every list describes the same knowledge base.
# Each name is bound to its own, independent empty list.
(
    kb_account_ids,
    kb_langs,
    kb_industries,
    kb_ids,
    kb_section_counts,
    kb_generated_question_counts,
    kb_gpt_question_counts,
) = ([] for _ in range(7))

In [None]:
# For every account, walk its knowledge bases and record section / question counts
# for the English and Danish ones in the parallel kb_* lists.
for i, (account_id, industry) in enumerate(zip(accounts_ids, industries)):
    r_kb = GetKnowledgeBaseRequest(account_id=account_id)
    print(f"Account {account_id} - {industry} - {i+1}/{len(accounts_ids)}")
    for kb in get_knowledge_bases(h, r_kb):
        # Remove this filter to collect statistics for all languages
        if kb.language not in ['en', 'da']:
            continue

        r_sections = GetSectionRequest(knowledge_base_id=kb.id)
        section_count = get_section_count(h, r_sections)

        # Count generated questions with the same filter the sampling cell uses to fetch them.
        # Previously this request was identical to the GPT one below, so both columns were equal.
        # NOTE(review): confirm get_question_count honours label_method_type.
        r_generated_questions = GetQuestionRequest(knowledge_base_id=kb.id, label_method_type='generated')
        generated_question_count = get_question_count(h, r_generated_questions)

        r_gpt_questions = GetQuestionRequest(knowledge_base_id=kb.id, data_source='api.gpt-35-turbo')
        gpt_question_count = get_question_count(h, r_gpt_questions)

        # Append only after every request succeeded so the parallel lists never get out of step
        kb_account_ids.append(account_id)
        kb_industries.append(industry)
        kb_langs.append(kb.language)
        kb_ids.append(kb.id)
        kb_section_counts.append(section_count)
        kb_generated_question_counts.append(generated_question_count)
        kb_gpt_question_counts.append(gpt_question_count)

        # Printed only for collected knowledge bases: for skipped ones the counts would be
        # stale values from a previous iteration (or undefined on the very first one)
        print(f"   KB {kb.id} - {kb.language} - {section_count} sections - {generated_question_count} generated questions - {gpt_question_count} gpt questions")  # noqa: E501

In [None]:
import csv

# Write the collected per-knowledge-base statistics to CSV, one row per knowledge base.
kb_columns = [kb_account_ids, kb_langs, kb_industries, kb_ids, kb_section_counts, kb_generated_question_counts, kb_gpt_question_counts]
# zip() silently truncates to the shortest input, so make sure no column is short
assert len({len(column) for column in kb_columns}) == 1, "kb_* lists have different lengths"
data = zip(*kb_columns)

# newline='' is what the csv module expects (otherwise extra blank lines appear on Windows);
# utf-8 keeps non-ASCII text readable by pandas.read_csv, whose default encoding is utf-8
with open('knowledge_bases_full.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['account_id', 'language', 'industry', 'knowledge_base_id', 'section_count', 'generated_question_count', 'gpt_question_count'])
    writer.writerows(data)

In [None]:
# Reload the collected statistics from the CSV and export them as a plain HTML table.
# TODO: colour-code the section and question counts (independently of each other);
# this was the original intent of this cell but is not implemented yet.
import pandas as pd
df = pd.read_csv('knowledge_bases_full.csv')
# Keep pandas' default HTML escaping: the cells hold plain text coming from the API
# (e.g. industry names), so characters such as '&' or '<' must not be emitted as raw markup.
df.to_html('knowledge_bases.html')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Grouped bar chart: total number of generated questions per industry for Danish knowledge bases.
# Uses `df` as loaded from knowledge_bases_full.csv by an earlier cell.

# Drop rows without an industry (only that column is checked, as intended)
df = df.dropna(subset=['industry'])

# Count columns drawn as bars; add 'section_count' here to plot it as well
count_types = ['generated_question_count']

# Sum the count columns per (industry, language); the columns are selected before
# summing so that id columns are not aggregated needlessly
grouped = (
    df[df['language'].isin(['da'])]
    .groupby(['industry', 'language'])[['section_count', 'generated_question_count']]
    .sum()
    .reset_index()
)

# Get unique industries and languages
unique_industries = grouped['industry'].unique()
unique_langs = grouped['language'].unique()

# Width of one bar and x position of the first bar of each industry group
bar_width = 0.15
r1 = np.arange(len(unique_industries))
bars_per_group = len(unique_langs) * len(count_types)

# Create a new figure with a specified size
plt.figure(figsize=(10, 6))

# One bar per (language, count type) inside every industry group
for i, lang in enumerate(unique_langs):
    data_by_lang = grouped.loc[grouped['language'] == lang]
    # Industries that actually have data for this language
    has_data = np.isin(unique_industries, data_by_lang['industry'])
    for j, count_type in enumerate(count_types):
        # Every (language, count type) pair gets its own slot so bars never overlap
        slot = i * len(count_types) + j
        r = (r1 + bar_width * slot)[has_data]
        counts = data_by_lang[count_type]
        plt.bar(r, counts, width=bar_width, edgecolor='grey', label=f'{count_type} ({lang})')

# Put the tick in the middle of each group of bars
plt.xlabel('industry', fontweight='bold')
plt.ylabel('count', fontweight='bold')
plt.title('Generated questions per industry')
plt.xticks(r1 + bar_width * (bars_per_group - 1) / 2, unique_industries, rotation=45)
# plt.ylim(0, 5_000)
# Create legend & Show graphic
plt.legend()
plt.show()

# TODO: plot accounts by industries and plot accounts by number of kb_ids

In [None]:
# Sample generated questions from every Danish knowledge base and append them, together with
# the sections they reference, to two .jsonl files (one JSON object per line).
# Each knowledge base contributes in proportion to its share of its industry's generated
# questions, with a budget of `questions_per_industry` questions per industry.
# NOTE: the files are opened in append mode, so re-running this cell adds the sample again.
import json
import random
import math

df = df.dropna()
danish_df = df[df['language'].isin(['da'])]

# Total generated questions per industry, computed from the rows that are iterated below
# instead of relying on variables left behind by the plotting cell
total_industry_sizes = danish_df.groupby('industry')['generated_question_count'].sum().to_dict()

questions_per_industry = 5000

# Dedicated, seeded generator so that a rerun draws the same sample
sampling_rng = random.Random(42)

kb_rows = danish_df[['industry', 'generated_question_count', 'knowledge_base_id']]

with open('generated_questions_2.jsonl', 'a', encoding='utf-8') as questions_file, \
        open('corresponding_sections_2.jsonl', 'a', encoding='utf-8') as sections_file:
    for position, (index, row) in enumerate(kb_rows.iterrows(), start=1):
        (industry, generated_question_count, kb_id) = row

        industry_size = total_industry_sizes[industry]
        if industry_size <= 0:
            # No generated questions in this industry: nothing to sample, and the
            # proportion below would divide by zero
            print(f"{position} / {len(danish_df)} Industry {industry} - skipped, no generated questions")
            continue

        r_questions = GetQuestionRequest(knowledge_base_id=kb_id, label_method_type='generated')
        questions = get_questions(h, r_questions)

        proportion = generated_question_count / industry_size
        proportional_question_count = math.floor(proportion * questions_per_industry)
        # random.sample raises ValueError when asked for more items than are available,
        # which happens whenever an industry holds fewer questions than the budget
        sample_size = min(proportional_question_count, len(questions))
        chosen_questions = sampling_rng.sample(questions, sample_size)

        # Several sampled questions can reference the same section: keep each id once, in order
        corresponding_sections_ids = list(dict.fromkeys(
            question.section_id for question in chosen_questions if question.section_id is not None
        ))
        if corresponding_sections_ids:
            r_sections = GetSectionRequest(knowledge_base_id=kb_id, ids=corresponding_sections_ids)
            corresponding_sections = get_sections(h, r_sections)
        else:
            # Skip the request: how the API treats an empty `ids` filter is not visible from
            # here (it might return every section of the knowledge base)
            corresponding_sections = []

        for question in chosen_questions:
            questions_file.write(question.model_dump_json() + '\n')
        for section in corresponding_sections:
            sections_file.write(section.model_dump_json() + '\n')

        print(f"{position} / {len(danish_df)} Industry {industry} - {generated_question_count} questions - {sample_size} sampled")

In [None]:
# One-off run of the sampling step for a single knowledge base; the
# (industry, generated_question_count, knowledge_base_id) values are entered by hand.
# Relies on `total_industry_sizes`, `questions_per_industry` and the json/random/math
# imports from the sampling cell above.
row = ('Finance', 5796, 293)
(industry, generated_question_count, kb_id) = row
r_questions = GetQuestionRequest(knowledge_base_id=kb_id, label_method_type='generated')
questions = get_questions(h, r_questions)
industry_size = total_industry_sizes[industry]
proportion = generated_question_count / industry_size
# Scale by the per-industry budget, exactly like the loop over all Danish knowledge bases
# (scaling by generated_question_count itself was a copy-paste slip)
proportional_question_count = math.floor(proportion * questions_per_industry)
# random.sample raises ValueError when asked for more items than are available
chosen_questions = random.sample(questions, min(proportional_question_count, len(questions)))

# Questions without a section cannot be looked up, so their None ids are left out
corresponding_sections_ids = [question.section_id for question in chosen_questions if question.section_id is not None]
r_sections = GetSectionRequest(knowledge_base_id=kb_id, ids=corresponding_sections_ids)
corresponding_sections = get_sections(h, r_sections)

# model_dump_json() already returns JSON text; wrapping it in json.dumps() would write each
# record as a quoted string instead of an object and break loading the file as JSON lines
with open('generated_questions_2.jsonl', 'a', encoding='utf-8') as f:
    for question in chosen_questions:
        f.write(question.model_dump_json() + '\n')
with open('corresponding_sections_2.jsonl', 'a', encoding='utf-8') as f:
    for section in corresponding_sections:
        f.write(section.model_dump_json() + '\n')
# Report the knowledge base id; the loop variable `index` of the previous cell is unrelated here
print(f"KB {kb_id} Industry {industry} - {generated_question_count} questions - {proportional_question_count}")

In [None]:
# Ad-hoc check: request the questions of knowledge base 350 with label_method_type='generated'.
# Rebinds the global `questions` used by the sampling cells.
r_questions = GetQuestionRequest(knowledge_base_id=350, label_method_type='generated')
questions = get_questions(h, r_questions)

In [None]:

# Same request as in the previous cell (knowledge base 350, label_method_type='generated');
# only the name the result is stored under differs.
r_questions = GetQuestionRequest(knowledge_base_id=350, label_method_type='generated')
response_questions = get_questions(h, r_questions)

In [None]:
# Ad-hoc check: look up a single section of knowledge base 350 and display the first result.
# NOTE(review): the sampling cells filter with `ids=[...]`; confirm that `id=` is a
# supported field of GetSectionRequest.
r_sections = GetSectionRequest(knowledge_base_id=350, id=89805528)
# NOTE: despite its name, `section_count` holds the return value of get_sections() here
# (it is indexed below); this rebinds the name used for the count in the KB statistics loop.
section_count = get_sections(h, r_sections)
section_count[0]

In [None]:
from datasets import Dataset

# Load the two .jsonl files written by the sampling cells into Dataset objects
questions_ds = Dataset.from_json('generated_questions_2.jsonl')
sections_ds = Dataset.from_json('corresponding_sections_2.jsonl')

# 

In [None]:
# Persist both datasets, each under its own path
for _dataset, _target_path in (
    (questions_ds, 'generated_questions'),
    (sections_ds, 'corresponding_sections'),
):
    _dataset.save_to_disk(_target_path)