In [52]:
from dotenv import load_dotenv
from openai import OpenAI
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
from langchain_openai import ChatOpenAI 
from langchain.document_loaders import DataFrameLoader
from langchain.prompts import PromptTemplate
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_core.messages import HumanMessage, SystemMessage
from langchain.evaluation import load_evaluator

from langchain.globals import set_debug

import pandas as pd
import langdetect as ld
import pinecone


import os

In [5]:
# Pull variables from a local .env file into the process environment, then
# read the OpenAI key from there so it never appears in the notebook itself.
load_dotenv()
openai_api_key = os.environ.get("OPENAI_API_KEY")
# Uncomment to turn on LangChain's debug output while developing.
# set_debug(True)

In [6]:
reviews_df = pd.read_csv("SPOTIFY_REVIEWS.csv", index_col=0)

In [135]:
reviews_df.head()

Unnamed: 0,review_id,pseudo_author_id,author_name,review_text,review_rating,review_likes,author_app_version,review_timestamp
0,14a011a8-7544-47b4-8480-c502af0ac26f,152618553977019693742,A Google user,Use it every day,5,1,1.1.0.91,2014-05-27 14:21:48
1,bfa8876b-470e-4640-83a7-77427f7f37e8,234382942865437071667,A Google user,"I enjoy the awesome UI of this app, and it has...",5,4,1.1.0.91,2014-05-27 14:36:02
2,70e8252f-058a-47d9-b066-df9e1571c970,174473604608358796368,A Google user,Love it! Especially the new design!,5,2,1.1.0.91,2014-05-27 14:40:01
3,672a155a-e81e-4d28-bdeb-a74c031bc072,286593453219054880269,A Google user,"Awesome UI, best music app out there!",5,1,1.1.0.91,2014-05-27 15:17:20
4,bbc1bf95-ed36-41a1-8b98-0f2e314caea5,167276875678680630145,A Google user,As a professional Android developer I'm glad t...,5,10,1.1.0.91,2014-05-27 15:26:48


In [136]:
reviews_df.shape

(3377423, 8)

Count the (approximate) number of words for each review

In [137]:
# Whitespace-separated token count, used as an approximate word count.
review_tokens = reviews_df['review_text'].str.split()
reviews_df['review_text_len'] = review_tokens.str.len()

For demo purposes, we will only use the reviews for the app version 8.8.xx.xxx. Additionally, concentrating on a specific version will enhance our comprehension of the data (considering that different versions may exhibit varying review distributions or points of emphasis).

In [142]:
# na=False makes rows with a missing app version evaluate to False,
# so they are excluded from the selection.
is_version_8_8 = reviews_df['author_app_version'].str.startswith('8.8', na=False)
reviews_8_8_df = reviews_df[is_version_8_8]
reviews_8_8_df.shape

(286853, 9)

To build our knowledge base, we will include only the reviews with at least 10 words and at least 3 likes. Longer reviews are more likely to contain detailed information, and reviews with more likes are more likely to be relevant to many users. Alternatively, we could use an LLM to filter out reviews that are unlikely to be useful for our analysis.

In [158]:
# Keep reviews that are both reasonably long (>= 10 words) and liked by
# several users (>= 3 likes). The copy lets later cells add columns freely.
is_long_enough = reviews_8_8_df['review_text_len'] >= 10
is_well_liked = reviews_8_8_df['review_likes'] >= 3
filtered_reviews_df = reviews_8_8_df[is_long_enough & is_well_liked].copy()

In [159]:
filtered_reviews_df.shape

(9415, 9)

Only include reviews that are in English

In [160]:
# langdetect is randomised unless seeded; fix the seed so that re-running the
# notebook assigns the same language to the same review.
ld.DetectorFactory.seed = 0


def _detect_language(text):
    """Return the language code langdetect assigns to ``text``, or None.

    None is returned when ``text`` is not a string or when langdetect raises
    ``LangDetectException`` for it. Any other exception is allowed to
    propagate instead of being silently swallowed.
    """
    if not isinstance(text, str):
        return None
    try:
        return ld.detect(text)
    except ld.LangDetectException:
        return None


# One column-wise assignment instead of a row-by-row ``.loc`` write.
filtered_reviews_df['language'] = filtered_reviews_df['review_text'].map(_detect_language)

In [161]:
# Reviews whose language could not be detected (None) are dropped as well.
is_english = filtered_reviews_df['language'] == 'en'
filtered_reviews_df = filtered_reviews_df[is_english]

In [162]:
filtered_reviews_df.shape

(9288, 10)

In [163]:
filtered_reviews_df.review_text_len.describe()

count    9288.000000
mean       64.987511
std        25.111043
min        10.000000
25%        47.000000
50%        70.000000
75%        86.000000
max       109.000000
Name: review_text_len, dtype: float64

Every review is short (at most 109 words), so we don't need to do any text splitting.

In [164]:
# Identifier and helper columns are dropped so that they do not end up in
# the document metadata.
columns_to_drop = ['review_id', 'pseudo_author_id', 'author_name', 'review_text_len', 'language']
filtered_reviews_df = filtered_reviews_df.drop(columns=columns_to_drop)

In [165]:
filtered_reviews_df.head()

Unnamed: 0,review_text,review_rating,review_likes,author_app_version,review_timestamp
2950346,It seems like the recent updates broke a few t...,1,146,8.8.0.347,2023-01-13 12:32:35
2950368,This app is amazing I really really like it si...,5,10,8.8.0.347,2023-01-13 13:01:42
2950617,I love Spotify because it's like taking all my...,5,21,8.8.0.347,2023-01-13 18:40:59
2950632,the last update is so bad we can't see preview...,1,3,8.8.0.347,2023-01-13 19:03:33
2950633,I use free Spotify & would rate 5 stars but in...,3,21,8.8.0.347,2023-01-13 19:03:40


Load the reviews in the dataframe into documents. We will use the review text as the document content and the review rating, review likes, app version, and review timestamp as the metadata.

In [166]:
loader = DataFrameLoader(filtered_reviews_df, 'review_text')

In [167]:
documents = loader.load()

Initialize the Pinecone client and create the index if it does not already exist.

In [8]:
# Connect to Pinecone. Both values are read from the environment; they are
# listed in the console at app.pinecone.io (the environment is shown next to
# the API key).
pinecone_api_key = os.getenv("PINECONE_API_KEY")
pinecone_environment = os.getenv("PINECONE_ENV")
pinecone.init(api_key=pinecone_api_key, environment=pinecone_environment)

index_name = "spotify-reviews"

# Create the index only when it does not exist yet.
# NOTE(review): dimension=768 presumably matches the output size of the
# embedding model used for this index - confirm before changing either.
existing_indexes = pinecone.list_indexes()
if index_name not in existing_indexes:
    pinecone.create_index(name=index_name, metric="cosine", dimension=768)

In [9]:
# Sentence-transformers model used to embed both the reviews and the queries.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)
# Wrap the existing Pinecone index as a LangChain vector store.
pinecone_index = Pinecone.from_existing_index(
    index_name,
    embeddings,
)

Add all the documents to the Pinecone index.

In [169]:
# Number of documents sent to the vector store per call.
UPLOAD_BATCH_SIZE = 100

# Use the dataframe index as the vector id so that re-running this cell (or
# retrying a failed batch) overwrites existing vectors instead of adding
# duplicates. Assumes the dataframe index is unique and that `documents`
# was loaded from `filtered_reviews_df` in row order.
doc_ids = [str(idx) for idx in filtered_reviews_df.index]
assert len(doc_ids) == len(documents), "documents and filtered_reviews_df are out of sync"

# Documents that could not be indexed even when retried on their own.
error_doc = []

for start in range(0, len(documents), UPLOAD_BATCH_SIZE):
    stop = start + UPLOAD_BATCH_SIZE
    batch_docs = documents[start:stop]
    batch_ids = doc_ids[start:stop]
    try:
        pinecone_index.add_documents(batch_docs, ids=batch_ids)
    except Exception as batch_error:
        # Retry one by one so a single bad document does not cost the batch.
        print(f"Batch {start}-{stop} failed ({batch_error!r}); retrying documents individually")
        for doc, doc_id in zip(batch_docs, batch_ids):
            try:
                pinecone_index.add_documents([doc], ids=[doc_id])
            except Exception as doc_error:
                error_doc.append(doc)
                print(f"Failed to index review {doc_id}: {doc_error!r}")

First, we check whether the question is flagged as inappropriate by the OpenAI Moderation API. We then use a MultiQueryRetriever to generate three different versions of the question, in order to increase the chance of finding relevant reviews through embedding similarity search. For each version of the question, n relevant reviews are retrieved, but the total number of documents returned can be less than 3*n because of duplicates. In our case, having some irrelevant reviews injected into the prompt is not a big issue, so we do not perform any reranking or contextual compression. Instead, we sort the retrieved reviews by their number of likes and keep those with the most likes, as they are more likely to be relevant to many users.

In [10]:
def _create_qa_chain(llm):
    """Creates a question answering chain with the given language model.

    The chain uses the "stuff" strategy with a custom prompt that takes two
    variables: ``context`` (the reviews) and ``question``. The prompt tells
    the model to answer only from the reviews and to say it does not know
    when none of them are relevant.

    Args:
        llm: The language model to use.

    Returns:
        BaseCombineDocumentChain: The question answering chain.
    """
    from langchain.chains.question_answering import load_qa_chain

    # The original wording ("none of the reviews are not relevant") was a
    # double negative that inverted the instruction; it now says "relevant".
    prompt_template = (
        "Below are some reviews for our music streaming application called Spotify. "
        "Answer the question in the end based on the provided reviews. "
        "If none of the reviews are relevant to the question, just say that you don't know, don't try to make up an answer. "
        "\n### \nReviews:  \n{context} \n### \nQuestion: \n{question}"
    )
    prompt = PromptTemplate(
        template=prompt_template, input_variables=["context", "question"]
    )
    qa_chain = load_qa_chain(llm, chain_type="stuff", prompt=prompt)
    return qa_chain

def _create_mq_retriever(llm, index, k=20):
    """Creates a multi-query retriever with the given language model and index.

    The retriever asks the language model to rephrase the user question into
    several queries, using a prompt tailored to Spotify reviews, and runs
    each query against the index.

    Args:
        llm: The language model used to generate the alternative queries.
        index: The vector store to search; must provide ``as_retriever``.
        k (int): The number of documents to retrieve per query.

    Returns:
        MultiQueryRetriever: The multi-query retriever.
    """
    # Pass the custom prompt through the ``prompt`` argument instead of
    # overwriting ``llm_chain.prompt.template`` after construction. The latter
    # depends on the chain exposing a mutable prompt and may modify the
    # library's default prompt object - NOTE(review): confirm for the pinned
    # LangChain version.
    query_prompt = PromptTemplate(
        input_variables=["question"],
        template='You are an AI language model assistant. Your task is to generate 3 different versions of queries from the user question to retrieve relevant reviews written for our music streaming application called Spotify. By generating multiple perspectives on the user question, your goal is to help the user overcome some of the limitations of distance-based similarity search. Provide these alternative queries separated by newlines. \n### User question: {question}',
    )

    return MultiQueryRetriever.from_llm(
        retriever=index.as_retriever(search_kwargs={"k": k}),
        llm=llm,
        prompt=query_prompt,
    )

def _flag_input(input):
    """Ask OpenAI's Moderation API whether a piece of text is inappropriate.

    A new ``OpenAI`` client is created on every call.

    Args:
        input (str): The text to screen.

    Returns:
        bool: The ``flagged`` value of the first moderation result.
    """
    moderation = OpenAI().moderations.create(input=input)
    first_result = moderation.results[0]
    return first_result.flagged

def _choose_top_reviews(docs, k):
    """Choose the top k reviews by likes.
    
    Args:
        docs (list): A list of documents.
        k (int): The number of documents to choose.
        
    Returns:
        list: A list of documents.
    """
    idx_likes = {idx: doc.metadata['review_likes'] for idx, doc in enumerate(docs)}
    
    sorted_idx_likes = {k: v for k, v in sorted(idx_likes.items(), key=lambda item: item[1], reverse=True)}
    top_k_idx = list(sorted_idx_likes.keys())[:k]
    
    return [docs[idx] for idx in top_k_idx]
    
def qa_pipeline(llm, index, question, k=40, retrieval_k=20):
    """Answer a question from the reviews stored in the index.

    Steps: screen the question with the moderation check, retrieve candidate
    reviews with a multi-query retriever, keep the ``k`` most-liked ones, and
    let the question answering chain synthesize an answer from them.

    Args:
        llm: The language model used for both query generation and answering.
        index: The vector store holding the reviews.
        question (str): The question to answer.
        k (int): The maximum number of reviews passed to the answering chain.
        retrieval_k (int): The number of reviews retrieved per generated query.

    Returns:
        dict | str: The chain result, which includes ``input_documents``,
        ``question`` and ``output_text``. If the question is flagged as
        inappropriate, a plain refusal message (str) is returned instead,
        so callers should check the type before indexing into the result.
    """
    # Screen first, so a rejected question costs no chain or retriever setup.
    if _flag_input(question):
        return "Your question contains inappropriate content. Please try again."

    qa_chain = _create_qa_chain(llm)
    retriever = _create_mq_retriever(llm, index, k=retrieval_k)

    docs = retriever.get_relevant_documents(question)
    top_reviews = _choose_top_reviews(docs, k)

    # return_only_outputs=False keeps the input documents in the result.
    result = qa_chain(
        {"input_documents": top_reviews, "question": question}, return_only_outputs=False
    )

    return result

We want to inject a larger number of reviews for open-ended questions such as:
1. What are the specific features or aspects that users appreciate the most in our application?
2. In comparison to our application, which music streaming platform are users most likely to compare ours with?
3. What are the primary reasons users express dissatisfaction with Spotify?
4. Can you identify emerging trends or patterns in recent user reviews that may impact our product strategy?

GPT-3.5 tends to give short responses when the number of input documents (reviews) is large. Therefore, we will use GPT-4.

In [11]:
llm = ChatOpenAI(model="gpt-4-1106-preview", openai_api_key=openai_api_key, temperature=0)

In [12]:
question1 = "What are the specific features or aspects that users appreciate the most in our application?"
result1 = qa_pipeline(llm, pinecone_index, question1)

  warn_deprecated(


In [21]:
print(result1['output_text'])

Based on the provided reviews, users appreciate the following features or aspects of the Spotify application:

1. Music Discovery: Users enjoy the recommendation algorithms, Discover Weekly, Release Radar playlists, and the AI DJ feature that help them find new music and artists that match their tastes.

2. Playlist Functionality: The ability to create, share, and collaborate on playlists is highly valued by users.

3. Offline Listening: The option to download playlists and listen to music offline is appreciated, especially for situations like road trips where there might be no signal.

4. User Interface: Some users find the interface user-friendly, elegant, and easy to navigate, contributing to a positive user experience.

5. Personalized Experience: Personalized playlists and recommendations that align with users' musical preferences are well-liked.

6. Cross-Device Integration: Users appreciate the ability to integrate and play music across different devices.

7. Global Music Select

In [257]:
question2 = "In comparison to our applciation, which music streaming platform are users most likely to compare ours with?"
result2 = qa_pipeline(llm, pinecone_index, question2)

In [258]:
print(result2['output_text'])

Based on the provided reviews, users are most likely to compare your application, Spotify, with the following music streaming platforms:

- Tidal
- Amazon Music
- YouTube Music
- Apple Music

These platforms are mentioned in various reviews as points of comparison for features such as audio quality, user interface, and music selection.


In [46]:
question3 = "What are the primary reasons users express dissatisfaction with Spotify?"
result3 = qa_pipeline(llm, pinecone_index, question3)

In [47]:
print(result3['output_text'])

The primary reasons users express dissatisfaction with Spotify, based on the provided reviews, include:

1. Bugs and Technical Issues: Users report frequent bugs, crashes, and technical glitches that disrupt the app's functionality, such as songs not playing, downloaded songs disappearing, and the app not opening across devices.

2. Ads: Non-premium users complain about an excessive number of ads that interrupt music playback and degrade the user experience. Some premium users also report receiving ads despite paying for the service.

3. Poor Customer Service: Users mention poor customer service experiences, with issues not being resolved even after contacting support, and feeling ignored after reporting problems.

4. Inconsistent Audio Quality: Some users experience varying audio quality, with songs sometimes playing clearly and other times sounding poor.

5. User Interface and Experience: Complaints about the user interface being counterintuitive, difficult to navigate, and the app m

In [None]:
question4 =  "Can you identify emerging trends or patterns in recent user reviews that may impact our product strategy?"
result4 = qa_pipeline(llm, pinecone_index, question4)

In [237]:
print(result4['output_text'])

Based on the provided user reviews, several emerging trends and patterns can be identified that may impact the product strategy for the music streaming application:

1. **User Interface (UI) and User Experience (UX) Concerns**: Many users are expressing dissatisfaction with the UI and UX changes, particularly the new TikTok-style interface, navigation difficulties, and the lack of customization options. Users find the new UI less user-friendly and more focused on visuals rather than functionality.

2. **Feature Requests and Bugs**: Users are requesting specific features such as the ability to swipe down to minimize the now playing screen, reordering songs in playlists, and better shuffle functionality. There are also complaints about bugs and issues with syncing between devices, auto-connect features, and problems with the app's stability and performance.

3. **Ad Frequency and Placement**: Users are frustrated with the frequency and intrusiveness of ads, especially in the free version

Evaluation method with pre-defined rubrics

In [43]:
def eval_with_rubric(llm, question, result):
    """Ask a chat model to grade a QA result against a fixed rubric.

    The model receives the question, the reviews that were used as context
    and the generated answer. It is asked whether the answer goes beyond the
    context, whether it is relevant, and whether it contradicts the context.

    Args:
        llm: Chat model; it is called directly with a list of messages.
        question (str): The question that was asked.
        result (dict): Must contain ``input_documents`` (objects with a
            ``page_content`` attribute) and ``output_text``.

    Returns:
        The value returned by ``llm(messages)``.
    """
    rubric_template = """
###
Question: {question}
###
Context: {context}
###
Answer: {answer}
###
Compare the answer with the provided context. Ignore any differences in style, grammar, or punctuation. Answer the following questions:
- Does the answer include any information outside the context provided? 
- Is the answer relevant to the question?
- Is there any disagreement between the answer and the context?"""

    # Reviews are separated by a blank line inside the prompt.
    review_context = "\n\n".join(doc.page_content for doc in result['input_documents'])
    rubric_prompt = rubric_template.format(
        question=question, context=review_context, answer=result['output_text']
    )

    grader_role = SystemMessage(content="You are an AI language model assistant. Your task is to evaluate how well the system answers a user question by looking at the context that the system uses to generate its response.")

    return llm([grader_role, HumanMessage(content=rubric_prompt)])

In [None]:
eval1 = eval_with_rubric(llm, question1, result1)

In [41]:
print(eval1.content)

- The answer does not include any information outside the context provided. It accurately summarizes the positive aspects of the Spotify application as mentioned in the user reviews.
- The answer is relevant to the question. It directly addresses the question by listing the specific features or aspects that users appreciate the most in the Spotify application.
- There is no disagreement between the answer and the context. The answer correctly reflects the sentiments expressed in the user reviews regarding the features and aspects they enjoy about Spotify.


In [48]:
eval3 = eval_with_rubric(llm, question3, result3)

In [49]:
print(eval3.content)

- The answer does not include any information outside the context provided. It accurately summarizes the complaints and issues mentioned in the user reviews.
- The answer is relevant to the question as it directly addresses the primary reasons for user dissatisfaction with Spotify, which is what the question asked for.
- There is no disagreement between the answer and the context. The answer correctly reflects the sentiments and specific points of dissatisfaction expressed by the users in the provided reviews.


Evaluation method by comparing with reference ideal answer

In [51]:
# Scoring rubric handed to the evaluator: anchor descriptions for the scores
# 1, 3, 5, 7 and 10, each phrased relative to a reference answer.
accuracy_rubric = """
Score 1: The answer is completely unrelated to the reference.
Score 3: The answer has minor relevance but does not align with the reference.
Score 5: The answer has moderate relevance but contains inaccuracies.
Score 7: The answer aligns with the reference but has minor errors or omissions.
Score 10: The answer is completely accurate and aligns perfectly with the reference."""
accuracy_criteria = {"accuracy": accuracy_rubric}

evaluator = load_evaluator("labeled_score_string", criteria=accuracy_criteria, llm=llm)

It's hard to come up with a reference answer for an open-ended question. Therefore, we will not use this method for our evaluation.