# INTERACTIVE HUMAN IN THE LOOP STORYTELLING

## Data Preprocessing

In [7]:
import pandas as pd

dataset_path = 'datasets\ROCStories_winter2017.csv'
data = pd.read_csv(dataset_path)

print(data.head())

                                storyid               storytitle  \
0  8bbe6d11-1e2e-413c-bf81-eaea05f4f1bd   David Drops the Weight   
1  0beabab2-fb49-460e-a6e6-f35a202e3348              Frustration   
2  87da1a22-df0b-410c-b186-439700b70ba6       Marcus Buys Khakis   
3  2d16bcd6-692a-4fc0-8e7c-4a6f81d9efa9       Different Opinions   
4  c71bb23b-7731-4233-8298-76ba6886cee1  Overcoming shortcomings   

                                           sentence1  \
0  David noticed he had put on a lot of weight re...   
1                       Tom had a very short temper.   
2  Marcus needed clothing for a business casual e...   
3  Bobby thought Bill should buy a trailer and ha...   
4          John was a pastor with a very bad memory.   

                                           sentence2  \
0  He examined his habits to try and figure out t...   
1               One day a guest made him very angry.   
2  All of his clothes were either too formal or t...   
3  Bill thought a truck would 

## Segement Sentences

In [2]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import re

In [None]:
# download nltk
nltk.download('punkt')

In [11]:
# load dataset
def load_data(file_path):
    """Load ROCStories dataset."""
    df = pd.read_csv(file_path)
    return df

# clean and normalize text
def clean_text(text):
    """Clean and normalize text."""
    text = re.sub(r'\s+', ' ', text)  # Remove excessive whitespace
    text = text.strip()  # Remove leading/trailing whitespace
    text = re.sub(r'[^\w\s\.\,\']', '', text)  # Remove special characters
    return text

# tokenization and story processing
def process_stories(df):
    """Process stories into a tokenized and cleaned format."""
    processed_stories = []
    for _, row in df.iterrows():
        story_id = row['storyid']
        story_title = clean_text(row['storytitle'])
        sentences = [clean_text(row[f'sentence{i}']) for i in range(1, 6)]
        processed_stories.append({'StoryID': story_id, 'StoryTitle': story_title, 'Sentences': sentences})
    return processed_stories

# generate embeddings
def generate_embeddings(processed_stories, model_name='all-MiniLM-L6-v2'):
    """Generate sentence embeddings using Sentence Transformers."""
    model = SentenceTransformer(model_name)
    embeddings = []
    metadata = []

    for story in processed_stories:
        story_id = story['StoryID']
        story_title = story['StoryTitle']
        sentences = story['Sentences']
        for i, sentence in enumerate(sentences):
            embedding = model.encode(sentence, convert_to_tensor=True).cpu().numpy()
            embeddings.append(embedding)
            metadata.append({
                'StoryID': story_id,
                'StoryTitle': story_title,
                'SentenceIndex': i,
                'Sentence': sentence
            })

    embeddings = np.array(embeddings)
    return embeddings, metadata

# store embeddings in FAISS
def build_faiss_index(embeddings):
    """Build and store FAISS index for retrieval."""
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    return index

# save preprocessed data
def save_preprocessed_data(metadata, index, index_file='faiss_index'):
    """Save metadata and FAISS index."""
    # save metadata
    pd.DataFrame(metadata).to_csv('metadata.csv', index=False)
    # save FAISS index
    faiss.write_index(index, index_file)

# main preprocessing pipeline
def preprocess_pipeline(file_path):
    """Complete preprocessing pipeline for ROCStories dataset."""
    print("Loading data...")
    df = load_data(file_path)

    print("Processing and cleaning stories...")
    processed_stories = process_stories(df)

    print("Generating embeddings...")
    embeddings, metadata = generate_embeddings(processed_stories)

    print("Building FAISS index...")
    index = build_faiss_index(embeddings)

    print("Saving preprocessed data...")
    save_preprocessed_data(metadata, index, index_file='faiss_index')  # pass FAISS index
    print("Preprocessing complete!")

if __name__ == '__main__':
    file_path = 'datasets\ROCStories_winter2017.csv'
    preprocess_pipeline(file_path)

Loading data...
Processing and cleaning stories...
Generating embeddings...
Building FAISS index...
Saving preprocessed data...
Preprocessing complete!


In [12]:
processed_data = pd.read_csv('metadata.csv')
processed_data.head()

Unnamed: 0,StoryID,StoryTitle,SentenceIndex,Sentence
0,8bbe6d11-1e2e-413c-bf81-eaea05f4f1bd,David Drops the Weight,0,David noticed he had put on a lot of weight re...
1,8bbe6d11-1e2e-413c-bf81-eaea05f4f1bd,David Drops the Weight,1,He examined his habits to try and figure out t...
2,8bbe6d11-1e2e-413c-bf81-eaea05f4f1bd,David Drops the Weight,2,He realized he'd been eating too much fast foo...
3,8bbe6d11-1e2e-413c-bf81-eaea05f4f1bd,David Drops the Weight,3,He stopped going to burger places and started ...
4,8bbe6d11-1e2e-413c-bf81-eaea05f4f1bd,David Drops the Weight,4,"After a few weeks, he started to feel much bet..."


In [27]:
import numpy as np

np.save('embeddings.npy', index)

In [13]:
# load the FAISS index
index = faiss.read_index('faiss_index')

# load metadata
metadata = pd.read_csv('metadata.csv')

In [31]:
from sentence_transformers import SentenceTransformer

# load embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# example query
query = "It's so good to be alive"

# generate query embedding
query_embedding = model.encode(query, convert_to_tensor=True).cpu().numpy()

In [32]:
# number of closest matches to retrieve
top_k = 10

# search
distances, indices = index.search(query_embedding.reshape(1, -1), top_k)

for i, idx in enumerate(indices[0]):
    print(f"Result {i + 1}:")
    print(f"Sentence: {metadata.iloc[idx]['Sentence']}")
    print(f"Story Title: {metadata.iloc[idx]['StoryTitle']}")
    print(f"Distance: {distances[0][i]}")
    print()

Result 1:
Sentence: Life felt so good, even though it didn't last long.
Story Title: Living my dream
Distance: 0.8068217635154724

Result 2:
Sentence: I began to wish I wasn't alive.
Story Title: Not the Same
Distance: 0.9622210264205933

Result 3:
Sentence: She felt so lucky and grateful to be alive
Story Title: Chemo
Distance: 1.0018548965454102

Result 4:
Sentence: They were luckily they made it alive.
Story Title: Hot air balloon fire
Distance: 1.0196583271026611

Result 5:
Sentence: His life was going great.
Story Title: Best man needed
Distance: 1.040922999382019

Result 6:
Sentence: I'm glad to be safely on the ground and for the trip to be over.
Story Title: Airplane ride
Distance: 1.0543153285980225

Result 7:
Sentence: I feel I am so very blessed in life.
Story Title: Volunteering
Distance: 1.0623960494995117

Result 8:
Sentence: It was alive so I went to save it too.
Story Title: Trip The Cat
Distance: 1.0711764097213745

Result 9:
Sentence: Luckily he was able to bring it t

In [33]:
import openai

# user query
user_query = "Write a story about overcoming challenges."

# retrieve similar sentences
query_embedding = model.encode(user_query, convert_to_tensor=True).cpu().numpy()
distances, indices = index.search(query_embedding.reshape(1, -1), top_k)

# collect retrieved sentences
retrieved_sentences = [metadata.iloc[idx]['Sentence'] for idx in indices[0]]

# combine retrieved context with the query
context = " ".join(retrieved_sentences)
enhanced_prompt = f"{context}\n\nUser's query: {user_query}"

# generate output using GPT-4
response = openai.ChatCompletion.create(
    model="gpt-4",
    messages=[{"role": "user", "content": enhanced_prompt}]
)

print(response['choices'][0]['message']['content'])

APIRemovedInV1: 

You tried to access openai.ChatCompletion, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742
