In [1]:
import pandas as pd
import numpy as np
from gliner import GLiNER
import os
import sys 
cwd = os.getcwd()

# Add the '../scripts' directory to the system path
sys.path.insert(0, os.path.abspath(os.path.join(cwd, '../scripts')))
sys.path.insert(0, os.path.abspath(os.path.join(cwd, '../src')))
sys.path.insert(0, os.path.abspath(os.path.join(cwd, '../src/data_chunker')))

# Initialize GLiNER with the base model
model = GLiNER.from_pretrained("urchade/gliner_small-v2.1")


  from .autonotebook import tqdm as notebook_tqdm
Fetching 4 files: 100%|██████████| 4/4 [03:10<00:00, 47.55s/it]


In [75]:
df_subset = pd.read_csv('../../assets/csv/data_subset.csv', index_col=0)
df_subset.head()
#
text = df_subset['text'].iloc[4]
text

'The formal basis of doctoral studies is constituted by theDoctoral degree regulations of the School of Business and Economics (May 21, 2010). To register for doctoral studies at the School, please follow the following steps: Step 1: Supervision agreement If you are pursuing a structured doctoral degree programme, you must conclude a written supervision agreement once you have been successfully admitted to the programme by the relevant office. If you are pursuing an individual doctoral degree programme, you must conclude the supervision agreement with your supervisor directly. Step 2: Applying for admission to doctoral studies After concluding your supervision agreement, you must submit an application for admission to doctoral studies to the Graduate Office of the School of Business and Economics. The following documents are required: application for admission to doctoral studies copy of the supervision agreement an up-to-date CV copies of all university degrees (bachelor\'s and master

In [76]:
#check text column and substitute all ' [at] ' with '@'
df_subset['text'] = df_subset['text'].str.replace('(at)', '@')
text = df_subset['text'].iloc[134]
text

"Postal Address:Institute of Accounting and AuditingSchool of Business and EconomicsHumboldt-Universität zu BerlinUnter den Linden 6, 10099 Berlin Office: Dorotheenstraße 1, 10117 Berlin[How to find us] Our professors:Prof. Dr. Hanne BöckemProf. Dr. Ulf BrüggemannProf. Dr. Joachim Gassen Phone: +49 30 2093-99440 Fax: +49 30 2093-99441 E-Mail: wpruefung@wiwi.hu-berlin.de The TRR 266 Accounting for Transparency is a trans-regional Collaborative Research Center funded by the German Research Foundation (Deutsche Forschungsgemeinschaft – DFG). Our team of more than 100 dedicated researchers examines how accounting and taxation affect firm and regulatory transparency and how regulation and transparency impact our economy and society. We intend to help develop effective regulation for firm transparency and a transparent tax system. Naturally, we also ensure transparency of our own research. Aktuelles Latest News: 01.07.2024 - Seminare im Wintersemester 2024/25: Bewerbungsfristen / Seminars fo

In [77]:
# Define labels to look for
labels = ["person", "course", "date", "research_paper", "research_project", "teams", "city", "address", "organisation", "phone_number", "url", "other"]

# Predict entities in the text using the model
entities = model.predict_entities(text, labels)

# Print the number of entities found
print(f"Number of entities before deduplication: {len(entities)}")

# Remove duplicates by considering both 'text' and 'label'
unique_entities = { (entity['text'], entity['label']): entity for entity in entities }.values()

# Convert back to a list of dictionaries
unique_entities = list(unique_entities)

# Print the number of entities after deduplication
print(f"Number of unique entities after deduplication: {len(unique_entities)}")

# Optional: If you want to print out the unique entities
# for entity in unique_entities:
#     print(f"{entity['text']} => {entity['label']}")

unique_entities



Number of entities before deduplication: 15
Number of unique entities after deduplication: 14


[{'start': 107,
  'end': 118,
  'text': 'BerlinUnter',
  'label': 'city',
  'score': 0.7392323017120361},
 {'start': 179,
  'end': 185,
  'text': 'Berlin',
  'label': 'city',
  'score': 0.9862346053123474},
 {'start': 154,
  'end': 171,
  'text': 'Dorotheenstraße 1',
  'label': 'address',
  'score': 0.8000809550285339},
 {'start': 273,
  'end': 287,
  'text': 'Joachim Gassen',
  'label': 'person',
  'score': 0.5068864822387695},
 {'start': 295,
  'end': 312,
  'text': '+49 30 2093-99440',
  'label': 'phone_number',
  'score': 0.6214548349380493},
 {'start': 318,
  'end': 335,
  'text': '+49 30 2093-99441',
  'label': 'phone_number',
  'score': 0.5376328825950623},
 {'start': 476,
  'end': 502,
  'text': 'German Research Foundation',
  'label': 'organisation',
  'score': 0.7187440395355225},
 {'start': 504,
  'end': 535,
  'text': 'Deutsche Forschungsgemeinschaft',
  'label': 'organisation',
  'score': 0.5554184913635254},
 {'start': 538,
  'end': 541,
  'text': 'DFG',
  'label': 'organ

In [78]:
unique_entities = { (entity['text'], entity['label']): entity for entity in entities }.values()
unique_entities = list(unique_entities)


from collections import defaultdict

# Create a defaultdict to handle multiple entities with the same label
label_name_dict = defaultdict(list)

for entity in unique_entities:
    label_name_dict[entity['label']].append(entity['text'])

# Convert defaultdict to a regular dict if necessary
label_name_dict = dict(label_name_dict)

# Print the result to verify
label_name_dict

{'city': ['BerlinUnter', 'Berlin'],
 'address': ['Dorotheenstraße 1'],
 'person': ['Joachim Gassen'],
 'phone_number': ['+49 30 2093-99440', '+49 30 2093-99441'],
 'organisation': ['German Research Foundation',
  'Deutsche Forschungsgemeinschaft',
  'DFG',
  'Institut für Rechnungswesen und Wirtschaftsprüfung',
  'Institute of Accounting and Auditing'],
 'course': ['Bachelorseminar (Deutsch)',
  'Accounting (Englisch)',
  'Bachelorseminar']}

In [None]:

from semantic_chunker import *
process_data_semantic(df_subset, model, labels)

In [8]:
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone
from langchain_community.retrievers import (
    PineconeHybridSearchRetriever,
)

bm25_encoder = BM25Okapi()

embeddings = SentenceTransformer("all-MiniLM-L6-v2", trust_remote_code=True)
pc = Pinecone("aea87181-4025-48d7-b46c-25e97349a06d")
index = pc.Index('all-mini-char-dim384-chunk512')


retriever = PineconeHybridSearchRetriever(
    embeddings=embeddings, sparse_encoder=bm25_encoder, index=index
)

TypeError: BM25Okapi.__init__() missing 1 required positional argument: 'corpus'

In [5]:

# Import necessary libraries
import os
import json
import pinecone
import pandas as pd
from typing import List, Dict, Any, Tuple, Callable
from dotenv import load_dotenv
from pinecone import Pinecone
import time
from sentence_transformers import SentenceTransformer
import numpy as np
from collections import defaultdict
from tqdm.notebook import tqdm

# Load environment variables
load_dotenv()

# Configuration for different API keys and their default embedding models
API_CONFIGS = {
    "all-mini-dotproduct": {
        "api_key": 'b4f8e710-fc88-4b6a-a2b8-433f60359b10',
        "default_embedding_model": "all-MiniLM-L6-v2"
    }

    
}
# Utility functions

def load_bm25_values(file_path: str) -> dict:
    with open(file_path, 'r') as f:
        return json.load(f)
    
def convert_entities_to_label_name_dict(entities: List[Dict[str, Any]]) -> Dict[str, List[str]]:
    """
    Converts a list of entity dictionaries into a dictionary where each label has a list of entity texts.
    """
    label_name_dict = defaultdict(set)
    for entity in entities:
        text = entity['text'].strip().lower()
        label_name_dict[entity['label']].add(text)
    return {k: list(v) for k, v in label_name_dict.items()}

def get_embedding_model(model_name: str) -> SentenceTransformer:
    if model_name == "Snowflake/snowflake-arctic-embed-l":
        return SentenceTransformer(model_name, trust_remote_code=True)
    else:
        return SentenceTransformer(model_name)

def convert_question_to_vector(embed_model: Any, query: str) -> List[float]:
    if hasattr(embed_model, 'encode'):
        return embed_model.encode(query).tolist()
    else:
        raise AttributeError("The provided model doesn't have an 'encode' method.")

def calculate_mrr(question_id: str, general_ids: List[str]) -> Tuple[int, float]:
    if question_id in general_ids:
        rank = general_ids.index(question_id) + 1
        reciprocal_rank = 1 / rank
    else:
        rank = 0
        reciprocal_rank = 0
    return rank, reciprocal_rank

def calculate_hit_at_k(question_id: str, general_ids: List[str], k: int) -> int:
    return int(question_id in general_ids[:k])

def hybrid_scale(dense, sparse, alpha: float):
    if alpha < 0 or alpha > 1:
        raise ValueError("Alpha must be between 0 and 1")
    hsparse = {
        'indices': sparse['indices'],
        'values':  [v * (1 - alpha) for v in sparse['values']]
    }
    hdense = [v * alpha for v in dense]
    return hdense, hsparse

def generate_sparse_vector(query: str, bm25_values: dict) -> Dict[str, Any]:
    query_terms = query.lower().split()
    vector = {}
    for i, term in enumerate(bm25_values['vocabulary']):
        if term in query_terms:
            vector[i] = bm25_values['idf'].get(term, 0) * (query_terms.count(term) * (bm25_values['k1'] + 1)) / (
                query_terms.count(term) + bm25_values['k1'] * (1 - bm25_values['b'] + bm25_values['b'] * len(query_terms) / bm25_values['avgdl'])
            )
    return {"indices": list(vector.keys()), "values": list(vector.values())}

# PineconeWrapper class

class PineconeWrapper:
    def __init__(self, index):
        self.index = index

    def hybrid_search_with_metadata(self, dense_vec, sparse_vec, filter_metadata, k):
        if not sparse_vec['indices']:
            sparse_vec['indices'] = [0]
            sparse_vec['values'] = [0.0]
        
        try:
            results = self.index.query(
                vector=dense_vec,
                sparse_vector=sparse_vec,
                filter=filter_metadata,
                top_k=k,
                include_metadata=True
            )
            return [(type('obj', (), {'metadata': item.metadata})(), item.score) for item in results.matches]
        except Exception as e:
            print(f"Error in hybrid_search_with_metadata: {str(e)}")
            print(f"Dense vector: {dense_vec[:5]}... (length: {len(dense_vec)})")
            print(f"Sparse vector indices: {sparse_vec['indices']}")
            print(f"Sparse vector values: {sparse_vec['values']}")
            print(f"Filter metadata: {filter_metadata}")
            raise

# Metadata filtering function

def create_metadata_filter(entities: Dict[str, List[str]]) -> Dict[str, Dict[str, List[str]]]:
    filter_metadata = {}
    for entity_type, entity_values in entities.items():
        if entity_values:
            filter_metadata[f"entities.{entity_type}"] = {"$in": entity_values}
    return filter_metadata

# Evaluation function
def evaluate_retriever_with_ner_and_metadata(qa_df: pd.DataFrame, docsearch: Any, convert_question_to_vector: Callable[[str], List[float]], 
                                             bm25_values: dict, k_values: List[int] = [1, 3, 5], alpha: float = 0.5) -> Dict[str, Any]:
    total_mrr = 0
    total_retrieval_time = 0
    hit_at_k = {k: 0 for k in k_values}
    entity_hit_at_k = defaultdict(lambda: {k: 0 for k in k_values})
    num_questions = len(qa_df)

    start_time = time.time()
    for i, (_, row) in enumerate(qa_df.iterrows(), 1):
        if i % 10 == 0:  # Print progress every 10 questions
            print(f"Processing question {i}/{num_questions}")
        
        question = row['question']
        question_id = row['id']
        question_entities = row.get('entities', {})

        try:
            query_start_time = time.time()
            dense_vec = convert_question_to_vector(question)
            sparse_vec = generate_sparse_vector(question, bm25_values)
            dense_vec, sparse_vec = hybrid_scale(dense_vec, sparse_vec, alpha)
            
            filter_metadata = create_metadata_filter(question_entities)
            search_results = docsearch.hybrid_search_with_metadata(dense_vec, sparse_vec, filter_metadata, k=max(k_values))
            
            query_end_time = time.time()
            retrieval_time = query_end_time - query_start_time
            total_retrieval_time += retrieval_time

            general_ids = [item[0].metadata['general_id'] for item in search_results]
            _, reciprocal_rank = calculate_mrr(question_id, general_ids)
            total_mrr += reciprocal_rank

            for k in k_values:
                hit = calculate_hit_at_k(question_id, general_ids, k)
                hit_at_k[k] += hit

                # Entity-specific evaluation
                if hit:
                    retrieved_entities = defaultdict(set)
                    for item in search_results[:k]:
                        chunk_entities = item[0].metadata.get('entities', {})
                        for entity_type, entities in chunk_entities.items():
                            retrieved_entities[entity_type].update(entities)
                    
                    for entity_type, entities in question_entities.items():
                        if any(entity in retrieved_entities.get(entity_type, set()) for entity in entities):
                            entity_hit_at_k[entity_type][k] += 1

        except Exception as e:
            print(f"Error processing question '{question}': {str(e)}")
            continue

    end_time = time.time()
    total_time = end_time - start_time
    print(f"Total evaluation time: {total_time:.2f} seconds")

    avg_mrr = total_mrr / num_questions
    avg_retrieval_time = total_retrieval_time / num_questions
    avg_hit_at_k = {k: hits / num_questions for k, hits in hit_at_k.items()}
    avg_entity_hit_at_k = {
        entity_type: {k: hits / num_questions for k, hits in k_hits.items()}
        for entity_type, k_hits in entity_hit_at_k.items()
    }

    return {
        "MRR": avg_mrr,
        "Avg Retrieval Time": avg_retrieval_time,
        "HitatK": avg_hit_at_k,
        "EntityHitatK": avg_entity_hit_at_k
    }
# Main execution function

def test_pinecone_index(qa_df: pd.DataFrame, api_key_name: str, index_name: str, bm25_values: dict, 
                        k_values: List[int] = [1, 3, 5], alpha_values: List[float] = [0, 0.2, 0.5, 0.8, 1]) -> Dict[str, Any]:
    results = {}
    
    config = API_CONFIGS[api_key_name]
    api_key = config['api_key']
    default_embedding_model = config['default_embedding_model']

    print(f"\nEvaluating index: {index_name}")

    try:
        pc = Pinecone(api_key=api_key)
        index = pc.Index(index_name)
        embed_model = get_embedding_model(default_embedding_model)
        docsearch = PineconeWrapper(index)

        def local_convert_question_to_vector(query: str) -> List[float]:
            return convert_question_to_vector(embed_model, query)

        for alpha in alpha_values:
            print(f"Evaluating with alpha = {alpha}")
            index_results = evaluate_retriever_with_ner_and_metadata(qa_df, docsearch, local_convert_question_to_vector, bm25_values, k_values, alpha)
            result_key = f"{api_key_name}_{index_name}_alpha{alpha}"
            results[result_key] = index_results

            print(f"Average MRR: {index_results['MRR']:.4f}")
            print(f"Average Retrieval Time: {index_results['Avg Retrieval Time']:.4f} seconds")
            for k, hit_rate in index_results['HitatK'].items():
                print(f"Hit@{k}: {hit_rate:.4f}")
            for entity_type, hits in index_results['EntityHitatK'].items():
                print(f"Entity type: {entity_type}")
                for k, hit_rate in hits.items():
                    print(f"  Hit@{k}: {hit_rate:.4f}")

    except Exception as e:
        print(f"Error occurred while evaluating index '{index_name}': {str(e)}")
        import traceback
        traceback.print_exc()

    return results

# Load and preprocess data
qa_df = pd.read_csv('/Users/s.konchakova/Thesis/assets/csv/qa_df.csv', index_col=0)

# Perform GLiNER NER on questions if not already done
if 'entities' not in qa_df.columns:
    from gliner import GLiNER
    ner_model = GLiNER.from_pretrained("urchade/gliner_small-v2.1")
    labels = ["person", "course", "date", "research_paper", "research_project", "teams", "city", "address", "organisation", "phone_number", "url", "other"]
    qa_df['entities'] = qa_df['question'].apply(lambda x: convert_entities_to_label_name_dict(ner_model.predict_entities(x, labels)))

# Load BM25 values
bm25_values = load_bm25_values('/Users/s.konchakova/Thesis/assets/bm25_values.json')  # Update this path

# Test the function with a specific index
api_key_name = "all-mini-dotproduct"  # Choose the API key you want to use
index_name = "all-mini-recursive-dim384-b512"  # Replace with the actual index name you want to test
alpha_values = [0.2, 0.5, 0.8]
results = test_pinecone_index(qa_df, api_key_name, index_name, bm25_values, alpha_values=alpha_values)

# Display results
for result_key, result in results.items():
    print(f"\nResults for {result_key}:")
    print(f"MRR: {result['MRR']:.4f}")
    print(f"Average Retrieval Time: {result['Avg Retrieval Time']:.4f} seconds")
    for k, hit_rate in result['HitatK'].items():
        print(f"Hit@{k}: {hit_rate:.4f}")
    for entity_type, hits in result['EntityHitatK'].items():
        print(f"Entity type: {entity_type}")
        for k, hit_rate in hits.items():
            print(f"  Hit@{k}: {hit_rate:.4f}")

Fetching 4 files: 100%|██████████| 4/4 [00:00<00:00, 32513.98it/s]
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



Evaluating index: all-mini-recursive-dim384-b512
Evaluating with alpha = 0.2
Processing question 10/22
Processing question 20/22
Total evaluation time: 5.78 seconds
Average MRR: 0.0455
Average Retrieval Time: 0.2626 seconds
Hit@1: 0.0455
Hit@3: 0.0455
Hit@5: 0.0455
Evaluating with alpha = 0.5
Processing question 10/22
Processing question 20/22
Total evaluation time: 5.14 seconds
Average MRR: 0.0455
Average Retrieval Time: 0.2332 seconds
Hit@1: 0.0455
Hit@3: 0.0455
Hit@5: 0.0455
Evaluating with alpha = 0.8
Processing question 10/22
Processing question 20/22
Total evaluation time: 4.02 seconds
Average MRR: 0.0455
Average Retrieval Time: 0.1824 seconds
Hit@1: 0.0455
Hit@3: 0.0455
Hit@5: 0.0455

Results for all-mini-dotproduct_all-mini-recursive-dim384-b512_alpha0.2:
MRR: 0.0455
Average Retrieval Time: 0.2626 seconds
Hit@1: 0.0455
Hit@3: 0.0455
Hit@5: 0.0455

Results for all-mini-dotproduct_all-mini-recursive-dim384-b512_alpha0.5:
MRR: 0.0455
Average Retrieval Time: 0.2332 seconds
Hit@1: 

In [20]:

def create_complex_filter(entities: Dict[str, List[str]]) -> Dict[str, List[Dict[str, Any]]]:
    """
    Create a complex filter dictionary based on the entities.
    
    Args:
    entities (Dict[str, List[str]]): A dictionary of entity types and their values.
    
    Returns:
    Dict[str, List[Dict[str, Any]]]: A complex filter dictionary for Pinecone query.
    """
    filter_conditions = []
    for entity_type, entity_values in entities.items():
        if entity_values:  # Only add non-empty entity lists to the filter
            filter_conditions.append({f"{entity_type}": {"$in": entity_values}})
    
    # Combine all conditions with $and
    if len(filter_conditions) > 1:
        return {"$or": filter_conditions}
    elif len(filter_conditions) == 1:
        return filter_conditions[0]
    else:
        return {}  # Return an empty filter if there are no conditio

create_complex_filter(qa_df['entities'].iloc[1])


pc = Pinecone(api_key='b4f8e710-fc88-4b6a-a2b8-433f60359b10')
index = pc.Index(index_name)
embeddings  = SentenceTransformer("all-MiniLM-L6-v2", trust_remote_code=True)

vector = convert_question_to_vector(embeddings, qa_df['question'].iloc[1])
sparse_vector = generate_sparse_vector(qa_df['question'].iloc[1], bm25_values)


index.query(top_k=5, vector=vector, sparse_vector=sparse_vector, include_metadata=True,
            filter = create_complex_filter(qa_df['entities'].iloc[1]))

{'matches': [{'id': '0b69687b08babda49107b9544d98cd4b_1',
              'metadata': {'chunk_size': 512.0,
                           'course': ['sfb-tr15',
                                      'the college portfolio problem'],
                           'date': ['november 6', 'october 16'],
                           'doc_type': 'recursive',
                           'general_id': '0b69687b08babda49107b9544d98cd4b',
                           'organisation': ['brown', 'boston u', 'penn state'],
                           'person': ['juan ortner',
                                      'roberto serrano',
                                      'nageeb ali',
                                      'sophie kreutzkamp'],
                           'project': 'all-mini-eucl',
                           'text': 'Up to 2015, the seminar series was part of '
                                   'the SFB-TR15 "Governance and Efficiency of '
                                   'Economic Systems" and w

In [21]:
qa_df

Unnamed: 0,id,question,answer,entities
2,b4d5781947103d717a948ec391fedd98,What are the key concepts and ideas presented ...,"The text ""nan"" presents the concept of nanotec...",{}
3,0b69687b08babda49107b9544d98cd4b,"Who will be speaking on October 16, 10:30 am a...",Nageeb Ali from Penn State will be speaking.,"{'date': ['october 16'], 'course': ['sfb-tr 15..."
5,3e78584e1eb99af377d3b48e0a394cc9,What is the prerequisite for Advanced Macroeco...,The prerequisite is Advanced Macroeconomic Ana...,{'course': ['advanced macroeconomic analysis i...
6,66f155e809a45f493153161960ac6fd4,What is the purpose of the Dr. Anja Schwerk co...,The purpose of the Dr. Anja Schwerk consultati...,{'person': ['dr. anja schwerk']}
9,c9087729e489b1a397ea50408d52818a,What are the main goals of the French-German c...,The main goals of the French-German cooperatio...,{'research_project': ['french-german cooperati...
12,b47a06290f4c8ad9e20c2bb34fee3066,"What is the new R package ""tram"" developed by ...","The text states that the new R package ""tram: ...","{'person': ['b. ripley', 'n. klein', 't. hotho..."
13,338e7b59446adeca6fc00d4bf6c2dae2,Who won the Research Prize of the School of Bu...,Roland Strausz,"{'person': ['who'], 'research_project': ['rese..."
16,399ab1af3f0fc0e82785ff94c917e249,What is Christopher Gerling's research focus?,His current research focus is on unstructured ...,{'person': ['christopher gerling']}
18,4b77cc04867ae67cf1dd2abb24783686,What kind of scholarship do Erasmus participan...,Participants of the Erasmus programme receive ...,{'organisation': ['erasmus']}
21,3661c6803d347e8d73af15a084bedd1b,What is required for a doctoral student to reg...,"To register for doctoral studies, a doctoral s...","{'person': ['doctoral student'], 'organisation..."


In [19]:

def create_complex_filter(entities: Dict[str, List[str]]) -> Dict[str, List[Dict[str, Any]]]:
    """
    Create a complex filter dictionary based on the entities.
    
    Args:
    entities (Dict[str, List[str]]): A dictionary of entity types and their values.
    
    Returns:
    Dict[str, List[Dict[str, Any]]]: A complex filter dictionary for Pinecone query.
    """
    filter_conditions = []
    for entity_type, entity_values in entities.items():
        if entity_values:  # Only add non-empty entity lists to the filter
            filter_conditions.append({f"{entity_type}": {"$in": entity_values}})
    
    # Combine all conditions with $and
    if len(filter_conditions) > 1:
        return {"$or": filter_conditions}
    elif len(filter_conditions) == 1:
        return filter_conditions[0]
    else:
        return {}  # Return an empty filter if there are no conditio

create_complex_filter(qa_df['entities'].iloc[1])

{'$or': [{'date': {'$in': ['october 16']}},
  {'course': {'$in': ['sfb-tr 15 seminar']}}]}

In [12]:
qa_df['entities'].iloc[1].keys()

dict_keys(['date', 'course'])

In [6]:
qa_df

Unnamed: 0,id,question,answer,entities
2,b4d5781947103d717a948ec391fedd98,What are the key concepts and ideas presented ...,"The text ""nan"" presents the concept of nanotec...",{}
3,0b69687b08babda49107b9544d98cd4b,"Who will be speaking on October 16, 10:30 am a...",Nageeb Ali from Penn State will be speaking.,"{'date': ['october 16'], 'course': ['sfb-tr 15..."
5,3e78584e1eb99af377d3b48e0a394cc9,What is the prerequisite for Advanced Macroeco...,The prerequisite is Advanced Macroeconomic Ana...,{'course': ['advanced macroeconomic analysis i...
6,66f155e809a45f493153161960ac6fd4,What is the purpose of the Dr. Anja Schwerk co...,The purpose of the Dr. Anja Schwerk consultati...,{'person': ['dr. anja schwerk']}
9,c9087729e489b1a397ea50408d52818a,What are the main goals of the French-German c...,The main goals of the French-German cooperatio...,{'research_project': ['french-german cooperati...
12,b47a06290f4c8ad9e20c2bb34fee3066,"What is the new R package ""tram"" developed by ...","The text states that the new R package ""tram: ...","{'person': ['b. ripley', 'n. klein', 't. hotho..."
13,338e7b59446adeca6fc00d4bf6c2dae2,Who won the Research Prize of the School of Bu...,Roland Strausz,"{'person': ['who'], 'research_project': ['rese..."
16,399ab1af3f0fc0e82785ff94c917e249,What is Christopher Gerling's research focus?,His current research focus is on unstructured ...,{'person': ['christopher gerling']}
18,4b77cc04867ae67cf1dd2abb24783686,What kind of scholarship do Erasmus participan...,Participants of the Erasmus programme receive ...,{'organisation': ['erasmus']}
21,3661c6803d347e8d73af15a084bedd1b,What is required for a doctoral student to reg...,"To register for doctoral studies, a doctoral s...","{'person': ['doctoral student'], 'organisation..."
