In [1]:
# script to turn extracted text files into locally stored vector stores

In [2]:
from dotenv import load_dotenv
import os

# Load the environment variables from the .env file
load_dotenv()

# Access environment variables
api_key = os.getenv("OPENAI_KEY")

In [10]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain.vectorstores.faiss import FAISS
from langchain.embeddings import OpenAIEmbeddings
import pickle

embeddings = OpenAIEmbeddings(openai_api_key=api_key)

import glob

folder_path = 'raw_data/itinerary_text'
file_list = glob.glob(folder_path + '/*')

for file_path in file_list:

    print(file_path[24:-4])
    # Loading data...
    loader = TextLoader(file_path)
    docs = loader.load()

    # Splitting text...
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500,
                                                   chunk_overlap = 50)
    all_splits = text_splitter.split_documents(docs)

    # Creating vectorstore...
    vectorstore = FAISS.from_documents(all_splits, embeddings)
    with open(f"processed/{file_path[24:-4]}_vectorstore.pkl", "wb") as f:
        pickle.dump(vectorstore, f)


Lima to La Paz: Sandboarding & Sunsets Lares Trek
Inca Discovery Inca Trail
The Lares Trek Lares Trek
Jordan Multisport 
Serengeti to Victoria Falls Overland: Night Stars & Spices 
Spain & Portugal: Flamenco & Tapas 
Explore Southern Africa 
Inca Discovery Plus Cusco Stay
Highlights of Jordan 
Highlights of Morocco
Classic Bali 
Iceland Northern Lights & Golden Circle 
Laos: Sunrises & Street Food 
The Lares Trek Cusco Stay
North & Central Vietnam: Hanoi, Hoi An & Countryside Adventures 
Everest Base Camp Trek 
Amazon to the Andes Cusco Stay
Golden Triangle 
Bali: Beaches & Boat Rides 
Egypt & Jordan Adventure 
Lima to Rio: Coast to Coast Inca Trail
Japan Express: Osaka to Tokyo 
The Inca Trail Cusco Stay
Inca Discovery Lares Trek
Lima to La Paz: Sandboarding & Sunsets Cusco Stay
Cambodia to Vietnam: Night Markets & Noodle-Making 
The Many Sides of Mexico: Puerto Vallarta to Oaxaca 
Inca Discovery Cusco Stay
The Inca Trail Lares Trek
Best of South Korea 
Highlights of Portugal 
Laos to

In [11]:
messages = [{'role': 'user', 'content': 'hi'}, {'role': 'assistant', 'content': 'Hello! How can I assist you today?'}, {'role': 'user', 'content': 'Who are you?'}, {'role': 'assistant', 'content': 'I am an AI language model created by OpenAI, called GPT-3. I am here to provide information, answer questions, and engage in conversation. How can I help you today?'}]

In [12]:
[{"role": m["role"], "content": m["content"]}for m in messages]

[{'role': 'user', 'content': 'hi'},
 {'role': 'assistant', 'content': 'Hello! How can I assist you today?'},
 {'role': 'user', 'content': 'Who are you?'},
 {'role': 'assistant',
  'content': 'I am an AI language model created by OpenAI, called GPT-3. I am here to provide information, answer questions, and engage in conversation. How can I help you today?'}]

In [15]:
conversation_history = []

for message in messages:
    if message['role'] == 'user':
        message['content'] = 'User: ' + message['content']
    elif message['role'] == 'assistant':
        message['content'] = 'Francis: ' + message['content']
    conversation_history.append(message['content'])

In [16]:
modified_data

['user: user: hi',
 'Francis: Francis: Hello! How can I assist you today?',
 'user: user: Who are you?',
 'Francis: Francis: I am an AI language model created by OpenAI, called GPT-3. I am here to provide information, answer questions, and engage in conversation. How can I help you today?']