# INTERACTIVE HUMAN IN THE LOOP STORYTELLING

## Data Preprocessing

In [1]:
import pandas as pd

# Forward slashes are accepted on every OS (Windows included) and avoid the
# invalid "\c" escape sequence that the backslash spelling produced.
dataset_path = 'datasets/computer_science_synthetic_dataset.csv'
data = pd.read_csv(dataset_path)

# Quick sanity check of the raw columns (id / input / output).
print(data.head())

   id                                              input  \
0   1  what are the different ways to input data into...   
1   2  Can you please tell me about the basics of cre...   
2   3      how do i get started in android development ?   
3   4  What should I learn next after learning about ...   
4   5  What are the basic concepts of computer progra...   

                                              output  
0  There are several ways to input data into a pr...  
1  Binary search tree (BST) is a data structure t...  
2  Android development is a popular field in the ...  
3  You should learn about operating systems and p...  
4  Computer programming is a way of communicating...  


## Segment Sentences

In [2]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import re

In [3]:
# Download the NLTK "punkt" tokenizer data; when the package is already
# present locally the call just reports that it is up to date.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ange2\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
import pandas as pd
import numpy as np
import re
import faiss
from sentence_transformers import SentenceTransformer

# load dataset
def load_data(file_path):
    """Read the CSV file at ``file_path`` and return it as a pandas DataFrame."""
    return pd.read_csv(file_path)

# clean and normalize text
def clean_text(text):
    """Clean and normalize text.

    Steps, in order:
      1. Missing values (``None`` / float NaN) become the empty string.
      2. Everything else is converted with ``str()``.
      3. Characters other than word characters, whitespace, ``.``, ``,``
         and ``'`` are removed.
      4. Runs of whitespace collapse to a single space and the ends are
         stripped.

    Args:
        text: Any scalar value; typically a string read from the CSV.

    Returns:
        str: The cleaned text (possibly empty).
    """
    # pandas represents an empty CSV cell as NaN; str(NaN) would otherwise
    # inject the literal word "nan" into the corpus. NaN is the only float
    # that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text)
    # Strip punctuation BEFORE normalising whitespace: removing a character
    # such as "?" or "-" can leave a trailing or doubled space behind
    # ("development ?" -> "development "), which the next two steps tidy up.
    text = re.sub(r'[^\w\s\.\,\']', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# process dataset (input/output pairs)
def process_rows(df):
    """Clean every row and return a list of ``{'ID', 'Input', 'Output'}`` dicts.

    Args:
        df: DataFrame with ``id``, ``input`` and ``output`` columns.

    Returns:
        list[dict]: One dict per row, in row order, with keys
        ``'ID'`` (the row's ``id`` value, untouched), ``'Input'`` and
        ``'Output'`` (both passed through ``clean_text``).
    """
    # itertuples avoids building a Series per row (as iterrows does) and
    # keeps each column's own dtype instead of upcasting across the row.
    return [
        {
            'ID': row.id,
            'Input': clean_text(row.input),
            'Output': clean_text(row.output),
        }
        for row in df.itertuples(index=False)
    ]

# generate embeddings for input and output
def generate_embeddings(processed, model_name='all-MiniLM-L6-v2', batch_size=32):
    """Generate sentence embeddings using Sentence Transformers.

    Every item contributes two texts, its input followed by its output, so
    the result holds ``2 * len(processed)`` vectors.

    Args:
        processed: Iterable of dicts with ``'ID'``, ``'Input'`` and
            ``'Output'`` keys (the shape produced by ``process_rows``).
        model_name: Name of the Sentence Transformers model to load.
        batch_size: Number of texts handed to the model per forward pass.

    Returns:
        tuple: ``(embeddings, metadata)`` where ``embeddings`` is a 2-D
        float32 ``numpy.ndarray`` with one row per text and ``metadata`` is
        a list of ``{'ID', 'Type', 'Text'}`` dicts in the same order, so
        ``metadata[i]`` describes ``embeddings[i]``. For empty input the
        array has shape ``(0, 0)`` and no model is loaded.
    """
    texts = []
    metadata = []
    for item in processed:
        for io_type, text in (('input', item['Input']), ('output', item['Output'])):
            texts.append(text)
            metadata.append({
                'ID': item['ID'],
                'Type': io_type,
                'Text': text
            })

    if not texts:
        # Nothing to encode: skip the (expensive) model load entirely.
        return np.empty((0, 0), dtype=np.float32), metadata

    model = SentenceTransformer(model_name)
    # Encode the whole corpus in one call so the library can batch the work,
    # instead of one model invocation (and tensor -> numpy hop) per sentence.
    vectors = model.encode(texts, batch_size=batch_size, convert_to_numpy=True)

    # float32 is the dtype the FAISS index built downstream is fed with.
    embeddings = np.asarray(vectors, dtype=np.float32)
    return embeddings, metadata

# build FAISS index
def build_faiss_index(embeddings):
    """Build an in-memory FAISS L2 index over ``embeddings``.

    The index is only built and returned here; persisting it is a separate
    step (``faiss.write_index``).

    Args:
        embeddings: 2-D array-like of shape ``(n_vectors, dim)``. It is
            converted to a C-contiguous float32 array before being added,
            so float64 arrays or nested lists are accepted as well.

    Returns:
        faiss.IndexFlatL2: Index containing one entry per row, in row order.

    Raises:
        IndexError: If ``embeddings`` is not 2-D (there is no second axis
            to read the dimensionality from).
    """
    # FAISS is fed float32, C-contiguous data; this is a no-op when the
    # caller already supplies exactly that.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    dim = vectors.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(vectors)
    return index

# save metadata + index
def save_preprocessed_data(metadata, index, index_file='faiss_index',
                           metadata_file='metadata.csv'):
    """Save metadata and FAISS index to disk.

    Args:
        metadata: List of record dicts (or anything ``pd.DataFrame`` accepts);
            written as CSV without the DataFrame index column.
        index: FAISS index to persist.
        index_file: Destination path for the FAISS index.
        metadata_file: Destination path for the metadata CSV. Defaults to
            ``'metadata.csv'``, the name that used to be hard-coded.
    """
    pd.DataFrame(metadata).to_csv(metadata_file, index=False)
    faiss.write_index(index, index_file)

# main preprocessing pipeline
def preprocess_pipeline(file_path):
    """Run every preprocessing stage for the dataset at ``file_path``.

    Stages, each announced with a progress message: load the CSV, clean the
    rows, embed the texts, build the FAISS index, and write metadata plus
    index to their default locations. Returns nothing.
    """
    print("Loading data...")
    raw_frame = load_data(file_path)

    print("Cleaning rows...")
    cleaned_rows = process_rows(raw_frame)

    print("Generating embeddings...")
    vectors, records = generate_embeddings(cleaned_rows)

    print("Building FAISS index...")
    vector_index = build_faiss_index(vectors)

    print("Saving preprocessed data...")
    save_preprocessed_data(records, vector_index)

    print("Done!")

# Entry point: the guard is true when this cell runs in the notebook kernel
# (the recorded output shows the pipeline ran), so executing the cell runs the
# whole pipeline and (re)writes metadata.csv and the faiss_index file.
if __name__ == '__main__':
    # Relative path, resolved against the kernel's working directory.
    file_path = 'datasets/computer_science_synthetic_dataset.csv'
    preprocess_pipeline(file_path)


Loading data...
Cleaning rows...
Generating embeddings...
Building FAISS index...
Saving preprocessed data...
Done!


In [5]:
# Reload the metadata CSV written by the preprocessing step and preview the
# first rows (one row per embedded text: ID, Type, Text).
processed_data = pd.read_csv('metadata.csv')
processed_data.head()

Unnamed: 0,ID,Type,Text
0,1,input,what are the different ways to input data into...
1,1,output,There are several ways to input data into a pr...
2,2,input,Can you please tell me about the basics of cre...
3,2,output,Binary search tree BST is a data structure tha...
4,3,input,how do i get started in android development


In [8]:
# load the FAISS index from the default file name used by save_preprocessed_data
index = faiss.read_index('faiss_index')

# load metadata; rows were appended in the same order as the embeddings,
# so row i of this frame describes vector i of the index
metadata = pd.read_csv('metadata.csv')

In [9]:
from sentence_transformers import SentenceTransformer

# load embedding model; this must be the same model the index was built with
# (the default of generate_embeddings), otherwise distances are meaningless
model = SentenceTransformer('all-MiniLM-L6-v2')

# example query
query = "How do i set up an environment?"

# generate query embedding directly as a NumPy vector; no need to create a
# tensor, move it to the CPU and convert it afterwards
query_embedding = model.encode(query, convert_to_numpy=True)

In [10]:
# number of closest matches to retrieve
top_k = 10

# search FAISS; the single query vector is reshaped into a one-row batch
distances, indices = index.search(query_embedding.reshape(1, -1), top_k)

# loop through results (nearest first)
for i, (idx, distance) in enumerate(zip(indices[0], distances[0])):
    # FAISS pads the result with -1 when the index holds fewer than top_k
    # vectors; metadata.iloc[-1] would silently print the LAST row instead,
    # so stop at the first padding slot.
    if idx < 0:
        break

    row = metadata.iloc[idx]

    print(f"Result {i + 1}:")
    print(f"ID: {row['ID']}")
    print(f"Type: {row['Type']}")   # 'input' or 'output'
    print(f"Text: {row['Text']}")
    # NOTE: the index is built as IndexFlatL2, which reports squared L2
    # distances (smaller means more similar).
    print(f"Distance: {distance}")
    print()

Result 1:
ID: 19799
Type: input
Text: How do I make an application
Distance: 0.9935182332992554

Result 2:
ID: 15929
Type: input
Text: What are the steps to make a user interface
Distance: 1.0579031705856323

Result 3:
ID: 1193
Type: input
Text: What are the steps to create a desktop application
Distance: 1.0921013355255127

Result 4:
ID: 4507
Type: input
Text: How do I setup Linux
Distance: 1.0965988636016846

Result 5:
ID: 5035
Type: input
Text: what are the basic steps to set up a web host and a website
Distance: 1.1057419776916504

Result 6:
ID: 23782
Type: input
Text: What is an integrated development environment
Distance: 1.125197172164917

Result 7:
ID: 2657
Type: input
Text: What are the basic steps to creating a computer program
Distance: 1.1306798458099365

Result 8:
ID: 29176
Type: input
Text: How can one get started with installing a virtual environment for python on a macbook
Distance: 1.1330370903015137

Result 9:
ID: 7630
Type: input
Text: How do I start building web app