In [2]:
"""
Title: Text Embedding and Similarity Analysis

Description:
This script processes and analyzes text data by generating embeddings and calculating cosine similarities 
using OpenAI's models. It includes functionalities for reading data, generating embeddings, computing cosine 
similarities, and searching for the most relevant answers and questions based on a given query.

Author: James Taylor
Date: 06/06/2024

Dependencies:
- numpy
- pandas
- openai
- IPython

Ensure you have the necessary dependencies installed:
pip install numpy pandas openai

Usage:
- Reads data from a CSV file containing precomputed embeddings.
- Initializes the OpenAI client using an API key stored in a text file.
- Defines functions for generating embeddings, calculating cosine similarities, and searching for similar answers and questions.
- Displays the top N most similar answers and questions based on a given query.
"""

import numpy as np
import pandas as pd
import openai
from IPython.display import display

# Read data from a CSV file
try:
    df = pd.read_csv('embedded.csv')
except ValueError as e:
    print(f"Error reading the file: {e}")
else:
    # Display data types of the DataFrame
    print(df.dtypes)

# Display the first few rows of the DataFrame
display(df.head())

# Read API key from a text file
with open('api_key.txt', 'r') as file:
    api_key = file.read().strip()

# Initialize the OpenAI client
openai.api_key = api_key

# Convert embeddings from string representation to numpy arrays
df['answer_embedding'] = df['answer_embedding'].apply(eval).apply(np.array)
df['question_embedding'] = df['question_embedding'].apply(eval).apply(np.array)

def get_embedding(text, model="text-embedding-3-small"):
    """
    Generate an embedding for a given text using a specified model.

    Parameters
    ----------
    text : str
        The input text to be converted into an embedding.
    model : str, optional
        The model to be used for generating the embedding. Default is "text-embedding-3-small".

    Returns
    -------
    list of float
        The embedding vector for the input text.
    """
    text = text.replace("\n", " ")
    return openai.embeddings.create(input=[text], model=model).data[0].embedding

def cosine_similarity(a, b):
    """
    Calculate the cosine similarity between two vectors.

    Parameters
    ----------
    a : list of float
        The first vector for comparison.
    b : list of float
        The second vector for comparison.

    Returns
    -------
    float
        The cosine similarity score between the two vectors, ranging from 0 to 1.
    """
    dot_product = sum(x * y for x, y in zip(a, b))
    magnitude_a = sum(x * x for x in a) ** 0.5
    magnitude_b = sum(y * y for y in b) ** 0.5
    return dot_product / (magnitude_a * magnitude_b)

def search_similar_answers(df, query, n=3, pprint=True):
    """
    Find and return the top N most similar answers based on a query.

    Parameters
    ----------
    df : pandas.DataFrame
        A DataFrame containing the review data. Must include a column 'answer_embedding'
        which contains the embedding vectors of the reviews.
    query : str
        The query to compare against the reviews.
    n : int, optional
        The number of top similar reviews to return. Default is 3.
    pprint : bool, optional
        If True, pretty print the top results. Default is True.

    Returns
    -------
    pandas.DataFrame
        A DataFrame containing the top N reviews with the highest similarity scores.
    """
    embedding = get_embedding(query, model='text-embedding-3-small')
    df['similarities_answers'] = df['answer_embedding'].apply(lambda x: cosine_similarity(x, embedding))
    res = df.sort_values('similarities_answers', ascending=False).head(n)
    if pprint:
        display(res)
    return res

def search_similar_questions(df, query, n=3, pprint=True):
    """
    Find and return the top N most similar questions based on a query.

    Parameters
    ----------
    df : pandas.DataFrame
        A DataFrame containing the question data. Must include a column 'question_embedding'
        which contains the embedding vectors of the questions.
    query : str
        The query to compare against the questions in the DataFrame.
    n : int, optional
        The number of top similar questions to return. Default is 3.
    pprint : bool, optional
        If True, pretty print the top results. Default is True.

    Returns
    -------
    pandas.DataFrame
        A DataFrame containing the top N questions with the highest similarity scores.
    """
    embedding = get_embedding(query, model='text-embedding-3-small')
    df['similarities_questions'] = df['question_embedding'].apply(lambda x: cosine_similarity(x, embedding))
    res = df.sort_values('similarities_questions', ascending=False).head(n)
    if pprint:
        display(res)
    return res

# Example usage
df_answer = search_similar_answers(df, 'open times', n=10)
df_question = search_similar_questions(df, 'open times', n=3)

display(df_answer.head(10))
display(df_question.head())

Question ID            int64
Question              object
Answer                object
answer_embedding      object
question_embedding    object
dtype: object


Unnamed: 0,Question ID,Question,Answer,answer_embedding,question_embedding
0,1,What is the current interest rate for savings?,The current interest rate for savings accounts...,"[-0.029849905520677567, -0.002606721827760339,...","[-0.02708962745964527, -0.019330546259880066, ..."
1,2,How can I open a checking account?,You can open a checking account by visiting an...,"[0.010874899104237556, 0.04530753940343857, 0....","[0.03132950887084007, 0.031158041208982468, 0...."
2,3,What is the minimum balance for a savings acco...,The minimum balance for a savings account is $...,"[0.029620982706546783, 0.01917717047035694, 0....","[0.027614394202828407, 0.018171781674027443, 0..."
3,4,How do I apply for a personal loan?,You can apply for a personal loan online throu...,"[-0.0037767095491290092, 0.015247618779540062,...","[-0.0032004239037632942, -0.002346499124541878..."
4,5,What documents are required to open an account?,"To open an account, you need a valid ID, proof...","[0.0847620889544487, 0.011813902296125889, 0.0...","[0.038944222033023834, 0.0715121254324913, 0.0..."


Unnamed: 0,Question ID,Question,Answer,answer_embedding,question_embedding,similarities_answers
8,9,What are the branch opening hours?,"Our branches are open from 9 AM to 5 PM, Monda...","[-0.01136218011379242, 0.0748688355088234, 0.0...","[-0.03722250834107399, 0.07355938851833344, 0....",0.29957
1,2,How can I open a checking account?,You can open a checking account by visiting an...,"[0.010874899104237556, 0.04530753940343857, 0....","[0.03132950887084007, 0.031158041208982468, 0....",0.234716
18,19,How do I close my account?,"To close your account, visit any of our branch...","[0.04966616630554199, 0.03897934779524803, 0.0...","[0.04915893077850342, 0.014077764004468918, 0....",0.187534
11,12,Can I get a loan to buy a car?,"Yes, we offer auto loans with competitive inte...","[-0.02668355219066143, 0.019259411841630936, 0...","[-0.002607405884191394, -0.03094622679054737, ...",0.1847
15,16,Can I get a statement copy?,"Yes, you can request a statement copy through ...","[0.031010989099740982, -0.023339280858635902, ...","[0.026227407157421112, -0.020656602457165718, ...",0.178007
13,14,How do I update my contact information?,You can update your contact information throug...,"[0.01906469650566578, -0.014860356226563454, 0...","[0.02334478124976158, -0.028389401733875275, 0...",0.169532
9,10,How can I check my account balance?,You can check your account balance through our...,"[0.03639216721057892, 0.0075601255521178246, 0...","[0.05246749520301819, 0.010690983384847641, 0....",0.169029
3,4,How do I apply for a personal loan?,You can apply for a personal loan online throu...,"[-0.0037767095491290092, 0.015247618779540062,...","[-0.0032004239037632942, -0.002346499124541878...",0.159263
4,5,What documents are required to open an account?,"To open an account, you need a valid ID, proof...","[0.0847620889544487, 0.011813902296125889, 0.0...","[0.038944222033023834, 0.0715121254324913, 0.0...",0.142831
5,6,Can I set up automatic bill payments?,"Yes, you can set up automatic bill payments th...","[0.016405565664172173, -0.02754642628133297, -...","[0.0043916646391153336, -0.03269881010055542, ...",0.136826


Unnamed: 0,Question ID,Question,Answer,answer_embedding,question_embedding,similarities_answers,similarities_questions
8,9,What are the branch opening hours?,"Our branches are open from 9 AM to 5 PM, Monda...","[-0.01136218011379242, 0.0748688355088234, 0.0...","[-0.03722250834107399, 0.07355938851833344, 0....",0.29957,0.374231
15,16,Can I get a statement copy?,"Yes, you can request a statement copy through ...","[0.031010989099740982, -0.023339280858635902, ...","[0.026227407157421112, -0.020656602457165718, ...",0.178007,0.211189
4,5,What documents are required to open an account?,"To open an account, you need a valid ID, proof...","[0.0847620889544487, 0.011813902296125889, 0.0...","[0.038944222033023834, 0.0715121254324913, 0.0...",0.142831,0.193357


Unnamed: 0,Question ID,Question,Answer,answer_embedding,question_embedding,similarities_answers
8,9,What are the branch opening hours?,"Our branches are open from 9 AM to 5 PM, Monda...","[-0.01136218011379242, 0.0748688355088234, 0.0...","[-0.03722250834107399, 0.07355938851833344, 0....",0.29957
1,2,How can I open a checking account?,You can open a checking account by visiting an...,"[0.010874899104237556, 0.04530753940343857, 0....","[0.03132950887084007, 0.031158041208982468, 0....",0.234716
18,19,How do I close my account?,"To close your account, visit any of our branch...","[0.04966616630554199, 0.03897934779524803, 0.0...","[0.04915893077850342, 0.014077764004468918, 0....",0.187534
11,12,Can I get a loan to buy a car?,"Yes, we offer auto loans with competitive inte...","[-0.02668355219066143, 0.019259411841630936, 0...","[-0.002607405884191394, -0.03094622679054737, ...",0.1847
15,16,Can I get a statement copy?,"Yes, you can request a statement copy through ...","[0.031010989099740982, -0.023339280858635902, ...","[0.026227407157421112, -0.020656602457165718, ...",0.178007
13,14,How do I update my contact information?,You can update your contact information throug...,"[0.01906469650566578, -0.014860356226563454, 0...","[0.02334478124976158, -0.028389401733875275, 0...",0.169532
9,10,How can I check my account balance?,You can check your account balance through our...,"[0.03639216721057892, 0.0075601255521178246, 0...","[0.05246749520301819, 0.010690983384847641, 0....",0.169029
3,4,How do I apply for a personal loan?,You can apply for a personal loan online throu...,"[-0.0037767095491290092, 0.015247618779540062,...","[-0.0032004239037632942, -0.002346499124541878...",0.159263
4,5,What documents are required to open an account?,"To open an account, you need a valid ID, proof...","[0.0847620889544487, 0.011813902296125889, 0.0...","[0.038944222033023834, 0.0715121254324913, 0.0...",0.142831
5,6,Can I set up automatic bill payments?,"Yes, you can set up automatic bill payments th...","[0.016405565664172173, -0.02754642628133297, -...","[0.0043916646391153336, -0.03269881010055542, ...",0.136826


Unnamed: 0,Question ID,Question,Answer,answer_embedding,question_embedding,similarities_answers,similarities_questions
8,9,What are the branch opening hours?,"Our branches are open from 9 AM to 5 PM, Monda...","[-0.01136218011379242, 0.0748688355088234, 0.0...","[-0.03722250834107399, 0.07355938851833344, 0....",0.29957,0.374231
15,16,Can I get a statement copy?,"Yes, you can request a statement copy through ...","[0.031010989099740982, -0.023339280858635902, ...","[0.026227407157421112, -0.020656602457165718, ...",0.178007,0.211189
4,5,What documents are required to open an account?,"To open an account, you need a valid ID, proof...","[0.0847620889544487, 0.011813902296125889, 0.0...","[0.038944222033023834, 0.0715121254324913, 0.0...",0.142831,0.193357
