In [152]:
import os
import re
import json
import time
import pandas as pd 
import tiktoken
from openai import OpenAI
from urllib.parse import urlparse

# Parent directory of the current working directory; every data path built in
# this notebook is joined onto it, so results depend on where the kernel was started.
script_dir = os.path.dirname(os.getcwd())
# Module-level cl100k_base tokenizer. The token-estimate helpers defined below
# create their own local `encoding`, so they do not read this global.
encoding = tiktoken.get_encoding("cl100k_base")

In [21]:
# Configuration (defaults for the gpt-3.5-turbo based cells; the GPT-4 functions
# further down define their own local values under the same names).
MAX_TOKENS = 16385  # Max total tokens for gpt-3.5-turbo-0125
MAX_OUTPUT = 4096  # Passed as max_tokens on each completion request
MAX_INPUT = MAX_TOKENS - MAX_OUTPUT  # Token budget left for the request's input
MAX_ITERATIONS = 100  # Max number of iterations to prevent infinite loops
TOKENS_PER_MINUTE_LIMIT = 60000  # TPM rate limit
REQUESTS_PER_MINUTE_LIMIT = 500  # RPM rate limit
REQUESTS_PER_DAY_LIMIT = 10000  # NOTE(review): not referenced by any code visible here
model = "gpt-3.5-turbo-0125"  # Latest model update from 01/25/2024
input_cost = 0.0005  # Per 1k tokens
output_cost = 0.0015  # Per 1k tokens

In [7]:
def save_json(output_path, new_data):
    """Write ``new_data`` as indented JSON to ``output_path`` via a temp file.

    The data is first dumped to a sibling file named ``<stem>_temp.json`` and
    then moved over ``output_path`` with ``os.replace``, so a failed dump never
    leaves a half-written target file.

    Args:
        output_path: destination file path (normally ending in ``.json``).
        new_data: any JSON-serialisable object.
    """
    # Drop only a trailing ".json". str.strip(".json") would instead strip the
    # characters '.', 'j', 's', 'o', 'n' from BOTH ends of the path, mangling
    # names such as "notes.json" or relative paths such as "./out.json".
    suffix = ".json"
    stem = output_path[:-len(suffix)] if output_path.endswith(suffix) else output_path
    temp_file_path = stem + "_temp.json"

    with open(temp_file_path, 'w') as temp_file:
        json.dump(new_data, temp_file, indent=4)

    # Replace the old file with the new file
    os.replace(temp_file_path, output_path)


def get_websites():
    """Return the 'music_services' column of music_services.csv as a Python list."""
    csv_path = os.path.join(script_dir, "scraped_data", "company_data", "music_services.csv")
    return pd.read_csv(csv_path)['music_services'].tolist()


def estimate_num_tokens_from_str(string, model="gpt-3.5-turbo-0125"):
    """Count the tokens tiktoken produces for ``string`` under ``model``'s encoding.

    If tiktoken has no encoding registered for ``model`` (KeyError), a warning
    is printed and the cl100k_base encoding is used instead.
    """
    try:
        tokenizer = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: Encoding not found. Using cl100k_base encoding.")
        tokenizer = tiktoken.get_encoding("cl100k_base")
    return len(tokenizer.encode(string))


def estimate_num_tokens_from_msg(message, model="gpt-3.5-turbo-0125"):
    """Estimate the number of tokens used by one chat message.

    Args:
        message: dict of string values (e.g. {"role": ..., "content": ...}).
        model: model name used to look up the tiktoken encoding; falls back to
            cl100k_base (with a printed warning) when the lookup raises KeyError.

    Returns:
        Sum of the encoded length of every value, plus 1 if a "name" key is
        present, plus 6 tokens reserved for the message primer.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: Encoding not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")

    # Must be initialised on every path: it used to be set only inside the
    # except branch, so the normal path failed with UnboundLocalError.
    num_tokens = 0
    for key, value in message.items():
        num_tokens += len(encoding.encode(value))
        if key == "name":
            num_tokens += 1
    num_tokens += 6  # 6 tokens reserved for message primer
    return num_tokens

def get_cost(usage_obj, input_cost=0.0005, output_cost=0.0015):
    """Return the cost of one request given its usage and the per-1k-token prices.

    ``usage_obj`` must expose ``prompt_tokens`` and ``completion_tokens``.
    """
    prompt_charge = (usage_obj.prompt_tokens / 1000) * input_cost
    completion_charge = (usage_obj.completion_tokens / 1000) * output_cost
    return prompt_charge + completion_charge


def get_response(reviews, prompt):
    """Send ``prompt`` followed by the string form of ``reviews`` as one user message.

    Uses the module-level ``model`` and ``MAX_OUTPUT`` settings and returns the
    chat completion object unchanged.
    """
    user_message = {"role": "user", "content": prompt + f"{reviews}"}
    return OpenAI().chat.completions.create(
        model=model,
        messages=[user_message],
        max_tokens=MAX_OUTPUT,
        stop=None,
        n=1,
    )

In [8]:
def run_summarizer(reviews, company, prompt):
    """Fold a list of review texts into one running summary via repeated chat calls.

    Every request carries the summary produced so far plus as many not-yet-seen
    reviews as fit under MAX_INPUT (by estimated token count); the reply becomes
    the new running summary. The loop ends when all reviews have been consumed
    or after MAX_ITERATIONS passes.

    Args:
        reviews: list of review strings. Read by index; the list is not modified.
        company: label stored in the result under "company" (also used in warnings).
        prompt: instruction text placed ahead of the batch in every request.

    Returns:
        dict with "summary_id", "model", "created" (taken from the last
        response), "total_tokens_used", "cost", "company" and "summary";
        or an empty dict when no request produced a response.
    """
    summary = ""
    result = {}
    response_obj = None
    last_request_time = time.time()
    iterations, total_tokens_true, tokens_this_minute, total_cost = 0, 0, 0, 0
    next_index = 0  # index of the first review that has not been batched yet

    while next_index < len(reviews) and iterations < MAX_ITERATIONS:
        iterations += 1
        current_batch = []
        tokens_used = estimate_num_tokens_from_str(summary, model) + estimate_num_tokens_from_str(prompt, model) + 7  # +1 for 'role', +6 for message primer

        # Greedily add reviews while the estimated input stays under budget.
        while next_index < len(reviews):
            review_tokens = estimate_num_tokens_from_str(reviews[next_index], model)
            if tokens_used + review_tokens >= MAX_INPUT:
                break
            current_batch.append(reviews[next_index])
            tokens_used += review_tokens
            next_index += 1

        if not current_batch:
            # The next review does not fit even on its own. Skip it so the loop
            # makes progress instead of idling until MAX_ITERATIONS.
            print(f"Warning: skipping a review for {company} that exceeds the input token budget.")
            next_index += 1
            continue

        # Check if adding this request would exceed TPM limit
        current_time = time.time()
        if (tokens_this_minute + tokens_used) > TOKENS_PER_MINUTE_LIMIT:
            sleep_time = 60 - (current_time - last_request_time) + 2  # 2 sec buffer
            if sleep_time > 0:
                time.sleep(sleep_time)
            tokens_this_minute = 0  # Reset token count for the new minute
            last_request_time = time.time()  # Reset last request time

        # get_response takes (reviews, prompt); the company name is already
        # part of the prompt text supplied by the caller.
        response_obj = get_response([summary] + current_batch, prompt)
        if response_obj:
            batch_summary = response_obj.choices[0].message.content.strip()
            summary = batch_summary if batch_summary else summary
            request_tokens = response_obj.usage.total_tokens
            total_tokens_true += request_tokens
            # Only this request's usage counts toward the per-minute window
            # (adding the cumulative total would over-count and over-throttle).
            tokens_this_minute += request_tokens
            total_cost += get_cost(response_obj.usage)

    if response_obj:
        result = {
            "summary_id": response_obj.id,
            "model": response_obj.model,
            "created": response_obj.created,
            "total_tokens_used": total_tokens_true,
            "cost": total_cost,
            "company": company,
            "summary": summary,
        }
    return result


In [9]:
def _company_from_url(url):
    """Return the company label parsed from the URL's host, or None if the host does not match."""
    host_match = re.search(r"(?:www\.)?(.*?)\.\w+$", urlparse(url).netloc)
    return host_match.group(1) if host_match else None


def _summarize_sources(json_filename, input_dir_parts, text_column, build_prompt):
    """Summarize one kind of scraped text for every company that has a clean CSV.

    Args:
        json_filename: file name written under GPT_generated_data/raw_data.
        input_dir_parts: path components under scraped_data that hold the
            ``<company>_clean.csv`` files.
        text_column: CSV column containing the texts to summarize.
        build_prompt: callable mapping a company name to its prompt string.

    Returns:
        DataFrame with one row per company for which a summary was produced.
    """
    json_path = os.path.join(script_dir, "GPT_generated_data", "raw_data", json_filename)
    json_list = []

    for url in get_websites():
        company = _company_from_url(url)
        if company is None:
            print(f"Warning: could not derive a company name from {url!r}; skipping.")
            continue

        input_path = os.path.join(script_dir, "scraped_data", *input_dir_parts, f"{company}_clean.csv")
        if not os.path.exists(input_path):
            continue

        print(company)
        df = pd.read_csv(input_path, usecols=[text_column], lineterminator='\n')
        reviews = [text for text in df[text_column].dropna() if text.strip()]
        result = run_summarizer(reviews, company, build_prompt(company))
        # run_summarizer returns {} when it got no response; persisting that
        # placeholder would break readers that index entry["company"].
        if result:
            json_list.append(result)
            save_json(json_path, json_list)
        print(result)

    return pd.DataFrame(json_list)


def get_trustpilot_review_summary():
    """Summarize the cleaned TrustPilot reviews of every listed company.

    Results are saved to trustpilot_review_summary.json after each company and
    returned as a DataFrame.
    """
    def build_prompt(company):
        return f"Analyze these TrustPilot reviews about {company}. They are a music PR/playlist promotion company. Generate a paragraph-long summary that focuses on customer opinions and experiences without naming any other companies. Do not bring up industries or topics outside of music or music PR/playlist promotion. Highlight what customers appreciate and any concerns they have expressed. Use phrases like 'Customers appreciate...' and 'They mention concerns about...' to ensure the summary is customer-centric. Additionally, categorize the feedback into pros and cons without making up any information or referring to specific alternatives. Here are the reviews:"

    return _summarize_sources(
        "trustpilot_review_summary.json",
        ("trustpilot_data", "reviews"),
        "review_content",
        build_prompt,
    )


def get_reddit_submission_summary():
    """Summarize the cleaned Reddit submissions of every listed company.

    Results are saved to reddit_submission_summary.json after each company and
    returned as a DataFrame.
    """
    def build_prompt(company):
        return f"Analyze these Reddit posts about {company}. They are a music PR/playlist promotion company. Generate a paragraph-long summary that focuses on customer opinions and experiences without naming any other companies. Do not bring up industries or topics outside of music or music PR/playlist promotion. Highlight what customers appreciate and any concerns they have expressed. Use phrases like 'Customers appreciate...' and 'They mention concerns about...' (switch up the order occasionally) to ensure the summary is customer-centric. Additionally, categorize the feedback into pros and cons without making up any information or referring to specific alternatives. Here are the reviews:"

    return _summarize_sources(
        "reddit_submission_summary.json",
        ("reddit_data", "submissions", "clean"),
        "content_clean",
        build_prompt,
    )


def get_reddit_comment_summary():
    """Summarize the cleaned Reddit comments of every listed company.

    Results are saved to reddit_comment_summary.json after each company and
    returned as a DataFrame.
    """
    def build_prompt(company):
        return f"Analyze these Reddit comments about {company}. They are a music PR/playlist promotion company. Generate a paragraph-long summary that focuses on customer opinions and experiences without naming any other companies or competitors. Do not bring up industries or topics outside of music or music PR/playlist promotion. Highlight what customers appreciate and any concerns they have expressed. Use phrases like 'Customers appreciate...' and 'They mention concerns about...' (switch up the order occasionally) to ensure the summary is customer-centric. Additionally, categorize the feedback into pros and cons. If there is not enough material to create a sufficiently long summary, just make it shorter. Do not make up any information or mention anything that was not explicitly mentioned in the comments provided. Here are the comments:"

    return _summarize_sources(
        "reddit_comment_summary.json",
        ("reddit_data", "comments", "clean"),
        "body_clean",
        build_prompt,
    )

In [10]:
def get_keywords_from_summary(input_path, output_path):
    """Ask the model for positive/neutral/negative keywords for each stored summary.

    Args:
        input_path: JSON file holding a list of entries with "company" and
            "summary" keys.
        output_path: JSON file that receives the accumulated results; it is
            rewritten after every successful request.

    Entries lacking a company or a summary are skipped with a warning instead
    of raising KeyError.
    """
    json_list = []
    tokens_this_minute = 0
    last_request_time = time.time()
    client = OpenAI()  # one client is reused for every request

    with open(input_path, 'r') as file:
        data = json.load(file)

    for data_entry in data:
        company = data_entry.get("company")
        summary = data_entry.get("summary")
        if not company or not summary:
            print("Warning: skipping an entry without a company or summary.")
            continue

        keyword_prompt = f"""
        Given a summary of reviews about a music PR/playlist promotion company, called {company}, generate a short list of keywords that indicate a positive, neutral, or negative quality. If no keywords match a certain category, feel free to leave out that category. Do not make up any information or bring up keywords that were not explicitly mentioned in the summary.

        Example 1:
        Summary: Customers like the color of the television. They say that the colors are bold and realistic, with deep blacks. Customers also appreciate the picture quality, overall quality, and value of the product. However, some customers have different opinions on brightness, sound quality, performance, and ease of setup.
        Positive keywords: Picture quality, Quality, Color, Value
        Neutral keywords: Performance, Ease of setup, Brightness, Sound quality
        Negative keywords: N/A

        Example 2: 
        Summary: Customers like the appearance of the headphones. They say that the recycled plastic look has its own charm, and it doesn't make you look stupid. Customers also appreciate the ring shape and open design. The sound quality is better than expected, and the headphones are comfortable to wear. However, some customers are disappointed with stability. Customers have mixed opinions on battery life, quality, sound quality, performance, fit, and comfort.
        Positive keywords: Appearance
        Neutral keywords: Sound quality, Comfort, Fit, Battery life, Performance, Quality
        Negative keywords: Stability

        Here is the summary that you will be extracting keywords from: 
        "{summary}"
        """

        # Rate-limit estimate. The summary is counted on its own and again as
        # part of the prompt, so this errs on the high (conservative) side.
        tokens_used = estimate_num_tokens_from_str(summary, model) + estimate_num_tokens_from_str(keyword_prompt, model) + 7

        # Check if adding this request would exceed TPM limit
        current_time = time.time()
        if (tokens_this_minute + tokens_used) > TOKENS_PER_MINUTE_LIMIT:
            sleep_time = 60 - (current_time - last_request_time) + 2  # 2 sec buffer
            if sleep_time > 0:
                time.sleep(sleep_time)
            tokens_this_minute = 0  # Reset token count for the new minute
            last_request_time = time.time()  # Reset last request time

        response = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": keyword_prompt}],
                max_tokens=MAX_OUTPUT,
                stop=None,
                n=1
            )

        if response:
            keywords = response.choices[0].message.content.strip()
            total_tokens = response.usage.total_tokens
            total_cost = get_cost(response.usage)
            tokens_this_minute += total_tokens

            json_entry = {"summary_id": response.id,
                          "model": response.model,
                          "created": response.created,
                          "total_tokens_used": total_tokens, 
                          "cost": total_cost, 
                          "company": company, 
                          "summary": summary, 
                          "keywords": keywords}
            print(json_entry)

            # Persist after every entry so an interrupted run keeps its progress.
            json_list.append(json_entry)
            save_json(output_path, json_list)


## Refining prompts and models

In [90]:
def gpt_filtering():
    """First-pass GPT filter over the cleaned Reddit submissions and comments.

    For every site returned by get_websites(), each submission (title and
    content joined as "title: content") and each comment is sent to the model
    with a prompt asking whether the author actually used the service and
    shared an opinion. The stripped reply is stored in a new "gpt_filter"
    column and the frame is written to the matching ``train`` directory.

    Returns:
        (sub_df, comment_df) for the last site visited; either may be None,
        and both are None when no site was visited.
    """
    tokens_this_minute, requests_this_minute, tokens_used, total_cost, total_requests = 0, 0, 0, 0, 0
    last_request_time = time.time()
    # Bound before the loop so the final return cannot hit an unbound name
    # when the site list is empty.
    sub_df, comment_df = None, None
    client = OpenAI()  # one client is reused for every request
    websites = get_websites()
    for site in websites:
        host_match = re.search(r"(?:www\.)?(.*?)\.\w+$", urlparse(site).netloc)
        if host_match is None:
            print(f"Warning: could not derive a company name from {site!r}; skipping.")
            continue
        company = host_match.group(1)

        sub_df, comment_df = None, None
        submission_path = os.path.join(script_dir, "scraped_data", "reddit_data", "submissions", "clean", f"{company}_clean.csv")
        comment_path = os.path.join(script_dir, "scraped_data", "reddit_data", "comments", "clean", f"{company}_clean.csv")

        if os.path.exists(submission_path):
            sub_df = pd.read_csv(submission_path, lineterminator='\n')
            sub_df.loc[:, 'content_clean'] = sub_df['content_clean'].fillna('')
            # A missing title would turn the concatenated "combined" text into
            # NaN, which the token estimator cannot encode.
            sub_df.loc[:, 'title_clean'] = sub_df['title_clean'].fillna('')
        if os.path.exists(comment_path):
            comment_df = pd.read_csv(comment_path, lineterminator='\n')
            comment_df.loc[:, 'body_clean'] = comment_df['body_clean'].fillna('')

        # 0 = submissions, 1 = comments
        if (sub_df is not None) and (comment_df is not None):
            new_range = [0,1]
        elif sub_df is not None:
            new_range = [0]
        elif comment_df is not None:
            new_range = [1]
        else:
            continue

        for i in new_range:
            filtered_list = []
            if i == 0:
                sub_df.loc[:, "combined"] = sub_df["title_clean"] + ': ' + sub_df["content_clean"]
                column = sub_df["combined"]
            else:
                column = comment_df["body_clean"]

            for text_content in column:

                filter_prompt = f"""
                Given this text, did the author actually use the service “{company}” and shared their experience/opinion about it? Return “True” if yes and “False” if not. Please follow this exact format. 

                Do not answer “True” if the author is merely inquiring about the service, if their statement relies on hearsay, if they used the service but did not offer any opinion on it, if the text sounds like a guide, or if they are only advertising their code/links. 
                Additionally, answer “False” if the text appears to be about a topic unrelated to music careers, music PR, or playlist promotion.

                Text: {text_content}

                Answer:
                """

                # Rate-limit estimate. The text is counted on its own and again
                # inside the prompt, so this errs on the conservative side.
                estimate = estimate_num_tokens_from_str(text_content, model) + estimate_num_tokens_from_str(filter_prompt, model) + 7

                # Check if adding this request would exceed TPM limit
                current_time = time.time()
                if ((tokens_this_minute + estimate) > TOKENS_PER_MINUTE_LIMIT) or ((requests_this_minute + 1) > REQUESTS_PER_MINUTE_LIMIT):
                    sleep_time = 60 - (current_time - last_request_time) + 2  # 2 sec buffer
                    if sleep_time > 0:
                        time.sleep(sleep_time)
                    tokens_this_minute = 0  # Reset token count for the new minute
                    requests_this_minute = 0
                    last_request_time = time.time()  # Reset last request time

                response = client.chat.completions.create(
                        model=model,
                        messages=[{"role": "user", "content": filter_prompt}],
                        max_tokens=MAX_OUTPUT,
                        stop=None,
                        n=1
                    )

                if response:
                    total_cost += get_cost(response.usage)
                    tokens_this_minute += response.usage.total_tokens
                    requests_this_minute += 1
                    tokens_used += response.usage.total_tokens
                    filtered_list.append(response.choices[0].message.content.strip())
                    total_requests += 1
                    print(f"Estimated tokens used: {estimate}")
                    print(f"Actual tokens used: {response.usage.total_tokens}")
                    print(f"Total tokens used so far: {tokens_used}")
                    print(f"Total requests so far: {total_requests}")
                    print(f"Total cost so far: {total_cost}")
                    print()

            if i == 0:
                sub_df.loc[:, "gpt_filter"] = filtered_list
                sub_df.to_csv(os.path.join(script_dir, f"scraped_data/reddit_data/submissions/train/{company}_train.csv"), index=False)
            else:
                comment_df.loc[:, "gpt_filter"] = filtered_list
                comment_df.to_csv(os.path.join(script_dir, f"scraped_data/reddit_data/comments/train/{company}_train.csv"), index=False)

    return sub_df, comment_df

In [119]:
def gpt_filtering2(start_index=1):
    """Second-pass GPT-4 filter: keep only texts likely to be rated "helpful".

    Reads the ``*_train.csv`` files (which already carry a "gpt_filter"
    column), asks the model whether each text that passed the first filter
    would be considered a helpful review, stores the stripped reply in a
    "gpt_filter2" column and rewrites the same CSV. Rows whose first filter is
    false, or whose text is empty, get 'False' without an API call.

    Args:
        start_index: number of leading sites from get_websites() to skip.
            Defaults to 1, which preserves the previous hard-coded
            ``websites[1:]``. NOTE(review): that slice looks like a leftover
            from resuming a run -- pass 0 to process every site.

    Returns:
        (sub_df, comment_df) for the last site visited; either may be None,
        and both are None when no site was visited.
    """
    # Configuration (local overrides for gpt-4-0125-preview)
    MAX_OUTPUT = 4096
    TOKENS_PER_MINUTE_LIMIT = 300000  # TPM rate limit
    REQUESTS_PER_MINUTE_LIMIT = 500
    model = "gpt-4-0125-preview"  # Latest model update from 01/25/2024
    input_cost = 0.01  # Per 1k tokens
    output_cost = 0.03  # Per 1k tokens

    tokens_this_minute, requests_this_minute, tokens_used, total_cost, total_requests = 0, 0, 0, 0, 0
    last_request_time = time.time()
    # Bound before the loop so the final return cannot hit an unbound name.
    sub_df, comment_df = None, None
    client = OpenAI()  # one client is reused for every request
    websites = get_websites()
    websites = websites[start_index:]
    for site in websites:
        host_match = re.search(r"(?:www\.)?(.*?)\.\w+$", urlparse(site).netloc)
        if host_match is None:
            print(f"Warning: could not derive a company name from {site!r}; skipping.")
            continue
        company = host_match.group(1)

        sub_df, comment_df = None, None
        submission_path = os.path.join(script_dir, "scraped_data", "reddit_data", "submissions", "train", f"{company}_train.csv")
        comment_path = os.path.join(script_dir, "scraped_data", "reddit_data", "comments", "train", f"{company}_train.csv")

        if os.path.exists(submission_path):
            sub_df = pd.read_csv(submission_path, lineterminator='\n')
            sub_df.loc[:, 'combined'] = sub_df['combined'].fillna('')
        if os.path.exists(comment_path):
            comment_df = pd.read_csv(comment_path, lineterminator='\n')
            comment_df.loc[:, 'body_clean'] = comment_df['body_clean'].fillna('')

        # 0 = submissions, 1 = comments
        if (sub_df is not None) and (comment_df is not None):
            new_range = [0,1]
        elif sub_df is not None:
            new_range = [0]
        elif comment_df is not None:
            new_range = [1]
        else:
            continue

        print(company)

        for i in new_range:
            filtered_list = []
            if i == 0:
                column = sub_df["combined"]
                zipped_columns = zip(column, sub_df["gpt_filter"])
            else:
                column = comment_df["body_clean"]
                zipped_columns = zip(column, comment_df["gpt_filter"])

            for text_content, filter_value in zipped_columns:
                # The first-pass column holds raw model replies. pandas parses
                # it as bool only when every cell is exactly True/False;
                # otherwise the cells are strings, and "False" == False is
                # never true. Accept both spellings.
                first_pass_false = (filter_value == False) or (str(filter_value).strip().lower() == "false")
                if first_pass_false or text_content == "":
                    filtered_list.append('False')
                    continue

                filter_prompt = f"""
                Imagine that this review is a TrustPilot review for a music PR/playlist promotion service named “{company}”. A helpful review must elaborate on its effectiveness, usability, customer service, or any aspect that would help a reader make an informed decision. If multiple services are named, focus only on the part that is explicitly talking about {company}. An unhelpful review is vague, emotionally charged, biased, is poorly written, includes too much irrelevant content, or makes mostly broad and generalized statements. Return “True” if the review would most likely be rated “helpful” by other users and return “False” if the review would most likely be rated as “unhelpful” by other users. Follow this exact format.
                
                Review: {text_content}
                
                Answer: 
                """

                # Rate-limit estimate. The text is counted on its own and again
                # inside the prompt, so this errs on the conservative side.
                estimate = estimate_num_tokens_from_str(text_content, model) + estimate_num_tokens_from_str(filter_prompt, model) + 8

                # Check if adding this request would exceed TPM limit
                current_time = time.time()
                if ((tokens_this_minute + estimate) > TOKENS_PER_MINUTE_LIMIT) or ((requests_this_minute + 1) > REQUESTS_PER_MINUTE_LIMIT):
                    sleep_time = 60 - (current_time - last_request_time) + 2  # 2 sec buffer
                    if sleep_time > 0:
                        time.sleep(sleep_time)
                    tokens_this_minute = 0  # Reset token count for the new minute
                    requests_this_minute = 0
                    last_request_time = time.time()  # Reset last request time

                response = client.chat.completions.create(
                        model=model,
                        messages=[{"role": "user", "content": filter_prompt}],
                        max_tokens=MAX_OUTPUT,
                        stop=None,
                        n=1
                    )

                if response:
                    total_cost += get_cost(response.usage, input_cost, output_cost)
                    tokens_this_minute += response.usage.total_tokens
                    requests_this_minute += 1
                    tokens_used += response.usage.total_tokens
                    filtered_list.append(response.choices[0].message.content.strip())
                    total_requests += 1
                    print(f"Estimated input tokens used: {estimate}")
                    print(f"Actual input tokens used: {response.usage.prompt_tokens}")
                    print(f"Actual total tokens used: {response.usage.total_tokens}")
                    print(f"Total tokens used so far: {tokens_used}")
                    print(f"Total requests so far: {total_requests}")
                    print(f"Total cost so far: {total_cost}")
                    print()

            if i == 0:
                sub_df.loc[:, "gpt_filter2"] = filtered_list
                sub_df.to_csv(os.path.join(script_dir, f"scraped_data/reddit_data/submissions/train/{company}_train.csv"), index=False)
            else:
                comment_df.loc[:, "gpt_filter2"] = filtered_list
                comment_df.to_csv(os.path.join(script_dir, f"scraped_data/reddit_data/comments/train/{company}_train.csv"), index=False)

    return sub_df, comment_df

In [146]:
def get_gpt4turbo_summary(overwrite=False, url_list=None):
    """Generate a GPT-4 Turbo summary plus aspect keywords per company.

    For each company, the submissions and comments that passed the second GPT
    filter ("gpt_filter2") are combined and sent in a single request. The JSON
    reply is parsed and appended to reddit_summary_keywords.json, which is
    rewritten after every company.

    Args:
        overwrite: when False and the output JSON already exists, companies
            already present in it are skipped and new results are appended;
            when True the file's previous content is discarded.
        url_list: site URLs to process. Defaults to
            ["https://soundbetter.com/"], the value previously hard-coded here.

    Raises:
        ValueError: if the estimated input size for a company exceeds the
            model's input budget.
    """
    # OpenAI API Configuration
    MAX_TOKENS = 128000  # Max total tokens for gpt-4-0125-preview
    MAX_OUTPUT = 4096
    MAX_INPUT = MAX_TOKENS - MAX_OUTPUT
    TOKENS_PER_MINUTE_LIMIT = 300000  # TPM rate limit
    REQUESTS_PER_MINUTE_LIMIT = 500
    model = "gpt-4-0125-preview"  # Latest model update from 01/25/2024
    input_cost = 0.01  # Per 1k tokens
    output_cost = 0.03  # Per 1k tokens

    # Initializing variables
    last_request_time = time.time()
    total_tokens_this_session, tokens_this_minute, total_cost_this_session, requests_this_minute, total_requests_this_session = 0, 0, 0, 0, 0
    json_path = os.path.join(script_dir, "GPT_generated_data", "new_summaries", "raw_data", "reddit_summary_keywords.json")
    client = OpenAI()  # one client is reused for every request

    # Getting list of companies
    if url_list is None:
        url_list = ["https://soundbetter.com/"]
    company_names = []
    for url in url_list:
        host_match = re.search(r"(?:www\.)?(.*?)\.\w+$", urlparse(url).netloc)
        if host_match is None:
            print(f"Warning: could not derive a company name from {url!r}; skipping.")
            continue
        company_names.append(host_match.group(1))

    # If JSON already exists and overwrite is FALSE, load file
    if os.path.exists(json_path) and not overwrite:
        with open(json_path, 'r') as file:
            json_data = json.load(file)
        skip_list = [entry.get("company") for entry in json_data]  # Check which companies already have a summary
        company_names = [company for company in company_names if company not in skip_list]
    else:
        json_data = []

    for company in company_names:
        # Import reddit data sets and remove irrelevant rows
        submissions_filepath = os.path.join(script_dir, f"scraped_data/reddit_data/submissions/train/{company}_train.csv")
        comments_filepath = os.path.join(script_dir, f"scraped_data/reddit_data/comments/train/{company}_train.csv")
        if not (os.path.exists(submissions_filepath) and os.path.exists(comments_filepath)):
            print("DEBUG: continue")
            continue

        # The filter column holds raw model replies: it is bool when every cell
        # parsed as True/False and strings otherwise, so compare text forms.
        sub_df = pd.read_csv(submissions_filepath, usecols = ['combined', 'gpt_filter2'], lineterminator='\n')
        sub_df = sub_df[sub_df["gpt_filter2"].astype(str).str.strip().str.lower() == "true"]
        comment_df = pd.read_csv(comments_filepath, usecols = ['body_clean', 'gpt_filter2'], lineterminator='\n')
        comment_df = comment_df[comment_df["gpt_filter2"].astype(str).str.strip().str.lower() == "true"]

        # Combine all reddit posts and comments into a single list
        posts = list(sub_df['combined'].dropna())
        comments = list(comment_df['body_clean'].dropna())
        reviews = posts + comments

        prompt = f"""
        Analyze these Reddit posts and comments about a music PR/playlist promotion company named {company}. Generate a paragraph-long summary that focuses on customer opinions/concerns/experiences regarding {company}. Additionally, list out the most frequently mentioned aspects about the company, categorized as positive, negative, or neutral. Each aspect should be summarized in 1-2 words, ensuring that synonyms or similar variants are consolidated under a single term that best represents the sentiment expressed across mentions. For example, if "high costs" and "expensive" are used interchangeably but "high costs" is more common, use "high costs" for the negative aspects category. If an aspect could be interpreted in multiple ways (positive, negative, neutral), categorize it based on the overall sentiment it most commonly aligns with in the context of these reviews. Avoid listing the same aspect or closely related aspects (including synonyms or near-synonyms) in more than one category. Return a JSON object consisting of "summary", "positive_aspects", "negative_aspects", and "neutral_aspects".
        
        Please note:
        - Keep in mind that some users may refer to multiple different services in the same post. Thus, only consider parts of the text that are explicitly referring to {company}. Ignore mentions of other services or irrelevant discussions.
        - Do not mention other companies or services directly in your summary. 
        - Avoid fabricating information or introducing unrelated topics.

        Example output (formatted as a valid JSON):
        {{
            "summary": "Customers appreciate <company name> for its user-friendly platform, constructive feedback, relationship-building opportunities, organic stream growth, and playlist credibility. However, they raise concerns about high pricing, genre mismatches, limited reach for certain music types, and inconsistent campaign outcomes, including ineffective genre targeting and disappointing return on investment. Suggestions for improvement include refining the playlist matching process and enhancing the service to accommodate a broader range of music genres, aiming to increase successful playlist adds and exposure.",
            "positive_aspects": ["User-friendly platform", "Constructive feedback", "Relationship-building", "Playlist credibility"],
            "negative_aspects": ["Pricing", "Reach", "Campaign outcomes", "Genre targeting", "Engagement", "Return on investment"],
            "neutral_aspects": ["Playlist placements", "Stream growth", "Exposure"]
        }}
        
        Here are the posts you will be analyzing: {reviews}

        \n```json
        """

        # The prompt already embeds the reviews, so it is the whole input.
        estimate = estimate_num_tokens_from_str(prompt, model) + 7  # +1 for 'role', +6 for message primer

        # If estimated input tokens exceeds limit, abort
        if estimate > MAX_INPUT:
            raise ValueError(f"WARNING: Estimated number of input tokens for {company} is {estimate}, which potentially exceeds the limit of {MAX_INPUT} tokens.")

        # Check if this request would exceed TPM or RPM limit
        current_time = time.time()
        if ((tokens_this_minute + estimate) > TOKENS_PER_MINUTE_LIMIT) or ((requests_this_minute + 1) > REQUESTS_PER_MINUTE_LIMIT):
            sleep_time = 60 - (current_time - last_request_time) + 2  # 2 sec buffer
            if sleep_time > 0:
                time.sleep(sleep_time)
            tokens_this_minute = 0  # Reset token count for the new minute
            requests_this_minute = 0
            last_request_time = time.time()  # Reset last request time

        # Generate summary. Send the prompt as-is: appending the review list
        # again (as before) transmitted every review twice and placed text
        # after the closing ```json primer.
        response = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=MAX_OUTPUT,
                stop=None,
                n=1,
                response_format= {
                    "type": "json_object"
                    }
            )
        if not response:
            continue

        # Account for usage first so a reply that fails to parse is still counted.
        request_tokens = response.usage.total_tokens
        request_cost = get_cost(response.usage, input_cost, output_cost)
        tokens_this_minute += request_tokens
        total_tokens_this_session += request_tokens  # per-request usage, not the running minute total
        total_cost_this_session += request_cost
        requests_this_minute += 1
        total_requests_this_session += 1

        generated_json = response.choices[0].message.content
        print(generated_json)
        try:
            generated_json = json.loads(generated_json)
        except (json.JSONDecodeError, TypeError):
            # e.g. the reply was cut off at max_tokens
            print(f"WARNING: Response for {company} was not valid JSON; skipping.")
            continue

        summary = generated_json.get('summary') or ""
        generated_positive_keywords = generated_json.get('positive_aspects', [])
        generated_negative_keywords = generated_json.get('negative_aspects', [])
        generated_neutral_keywords = generated_json.get('neutral_aspects', [])

        # Save response object as JSON
        result = {
            "summary_id": response.id,
            "model": response.model,
            "created": response.created,
            "total_tokens_used": request_tokens,
            "cost": request_cost,
            "company": company,
            "summary": summary,
            "positive_keywords": generated_positive_keywords,
            "negative_keywords": generated_negative_keywords,
            "neutral_keywords": generated_neutral_keywords,
        }
        json_data.append(result)
        save_json(json_path, json_data)

        print(f"####### SUMMARY FOR {company} #######")
        print(summary)
        print("####### DEBUG PURPOSES #######")
        print(f"Estimated input tokens used: {estimate}") 
        print(f"Actual input tokens used: {response.usage.prompt_tokens}")
        print("####### CURRENT USAGE #######")
        print(f"Company: {company}")
        print(f"Tokens used for this summary: {request_tokens}")
        print(f"Cost of this summary: {request_cost}")
        print("####### SESSION STATS #######")
        print(f"Total tokens used so far: {total_tokens_this_session}")
        print(f"Total requests so far: {total_requests_this_session}")
        print(f"Total cost so far: {total_cost_this_session}")
        print()


In [145]:
# Run the summarisation defined above. For every response it receives, the
# function prints the raw JSON returned by the model, appends a result record
# to the JSON file via save_json, and prints per-company usage plus running
# session totals. The output captured below is truncated.
get_gpt4turbo_summary()

{
    "summary": "Soundbetter, a music PR/playlist promotion company, receives a mix of feedback from customers. While some users praise the platform for its functionality and the opportunities it offers for connection and collaboration with professionals, others express frustration with various aspects of the service. Issues cited include difficulties with the platform's user interface, lack of development or updates over the years, and challenges in the communication process between clients and service providers. Users also discuss the pricing, with opinions divided on whether the costs are justified by the benefits received. Despite these concerns, many still regard Soundbetter as a valuable resource for finding quality session musicians, producers, and engineers, highlighting its role in supporting projects ranging from single tracks to entire albums.",
    "positive_aspects": ["Professional connections", "High-quality service", "Platform functionality"],
    "negative_aspects": ["

## Getting Summaries

In [54]:
# Generate the Trustpilot review summaries and keep the returned object.
# The trailing bare expression displays it (rendered below as a table with
# one row per company).
trustpilot_summary_df = get_trustpilot_review_summary()
trustpilot_summary_df

playlistpush
estimate_num_tokens(reviews[0], encoding): 69
** Before sending request **
Total tokens used this entire session: 0
Tokens used this minute: 0
Estimated number of tokens used for current batch: 11182
Prompt: Analyze these TrustPilot reviews about playlistpush. They are a music PR/playlist promotion company. Generate a paragraph-long summary that focuses on customer opinions and experiences without naming any other companies. Do not bring up industries or topics outside of music or music PR/playlist promotion. Highlight what customers appreciate and any concerns they have expressed. Use phrases like 'Customers appreciate...' and 'They mention concerns about...' to ensure the summary is customer-centric. Additionally, categorize the feedback into pros and cons without making up any information or referring to specific alternatives. Here are the reviews:
** After sending request **
Estimated number of tokens used for current batch: 11182
Actual number of tokens used for curre

Unnamed: 0,summary_id,model,created,tokens_used,total_tokens_used,cost,company,summary
0,chatcmpl-8x5Gy6VY359YXgCEhEHsZuM5UzXjI,gpt-3.5-turbo-0125,1709092312,"CompletionUsage(completion_tokens=119, prompt_...",87782,$0.0,playlistpush,Customers appreciate PlaylistPush for its user...
1,chatcmpl-8x5H1vyQWkCZ0cGLT9ggSzq6rpqkc,gpt-3.5-turbo-0125,1709092315,"CompletionUsage(completion_tokens=68, prompt_t...",207,$0.0,indiemono,Customers appreciate the potential of the comp...
2,chatcmpl-8x5H6CgffjLLQgAIl7aNubsaJw4SZ,gpt-3.5-turbo-0125,1709092320,"CompletionUsage(completion_tokens=154, prompt_...",7481,$0.0,starlightpr1,Customers appreciate the comprehensive music p...
3,chatcmpl-8x5H95qmiJoZ8NyrWjP3PCenuIuMd,gpt-3.5-turbo-0125,1709092323,"CompletionUsage(completion_tokens=107, prompt_...",519,$0.0,planetarygroup,Customers appreciate the convenience of being ...
4,chatcmpl-8x5IJ49Xz2zzn2NWRZpHB3mOUyGXv,gpt-3.5-turbo-0125,1709092395,"CompletionUsage(completion_tokens=150, prompt_...",24823,$0.0,groover,Customers appreciate Groover's user-friendly p...
5,chatcmpl-8x5MVCdRTCsV8e3rmU0TrcUd5vlgh,gpt-3.5-turbo-0125,1709092655,"CompletionUsage(completion_tokens=190, prompt_...",64886,$0.0,soundcamps,Customers appreciate soundcamps for its user-f...
6,chatcmpl-8x5MZEbKeO7RDn4ttHrUwLdqoL4Io,gpt-3.5-turbo-0125,1709092659,"CompletionUsage(completion_tokens=111, prompt_...",3600,$0.0,indiemusicacademy,Customers appreciate the personalized approach...
7,chatcmpl-8x5MhinncbtnL7G0b8ydMDT7EuPdW,gpt-3.5-turbo-0125,1709092667,"CompletionUsage(completion_tokens=113, prompt_...",8688,$0.0,daimoon,Customers appreciate Daimoon for their efficie...
8,chatcmpl-8x5WhFqrY7ELh2s6M91nW5ypttKFN,gpt-3.5-turbo-0125,1709093287,"CompletionUsage(completion_tokens=166, prompt_...",14188,$0.0,boost-collective,Customers appreciate Boost Collective for thei...
9,chatcmpl-8x5ayflpxHfPCAhRbflxUKWOCJv9x,gpt-3.5-turbo-0125,1709093552,"CompletionUsage(completion_tokens=157, prompt_...",93904,$0.0,omarimc,Customers appreciate omarimc for their persona...


In [68]:
# Generate the Reddit submission summaries and keep the returned object.
# The trailing bare expression displays it (rendered below as a table with
# one row per company).
reddit_submission_summary_df = get_reddit_submission_summary()
reddit_submission_summary_df

playlistpush
{'summary_id': 'chatcmpl-8xKJbFMpC7PNhQFrQyxKCo8ASTvYY', 'model': 'gpt-3.5-turbo-0125', 'created': 1709150135, 'tokens_used': CompletionUsage(completion_tokens=143, prompt_tokens=10336, total_tokens=10479), 'total_tokens_used': 32285, 'cost': 0.0165585, 'company': 'playlistpush', 'summary': "Customers appreciate PlaylistPush for the potential to reach new listeners and expose their music to wider audiences. They value curated playlists and feedback from curators, noting the platform's supportive nature in promoting music and providing exposure opportunities. However, concerns have been raised about the effectiveness of playlist curation, high costs compared to results achieved, and the quality and relevance of feedback received. Some customers express disappointment and skepticism about the authenticity of services utilized. Despite mixed reviews, PlaylistPush is appreciated for its value and potential to access desired playlists for music promotion. Customers also mention

Unnamed: 0,summary_id,model,created,tokens_used,total_tokens_used,cost,company,summary
0,chatcmpl-8xKJbFMpC7PNhQFrQyxKCo8ASTvYY,gpt-3.5-turbo-0125,1709150135,"CompletionUsage(completion_tokens=143, prompt_...",32285,0.016559,playlistpush,Customers appreciate PlaylistPush for the pote...
1,chatcmpl-8xKJfE7GvgeWdxsbvilnv6DYjgfiC,gpt-3.5-turbo-0125,1709150139,"CompletionUsage(completion_tokens=130, prompt_...",8596,0.004428,indiemono,Customers appreciate the convenience and varie...
2,chatcmpl-8xKJqZiPXGlBLraJbBx1CV0EQWYjr,gpt-3.5-turbo-0125,1709150150,"CompletionUsage(completion_tokens=149, prompt_...",33074,0.016937,planetarygroup,Customers appreciate the services provided by ...
3,chatcmpl-8xKK8Bb5N8UTWcXc0DYcPixjuC5eW,gpt-3.5-turbo-0125,1709150168,"CompletionUsage(completion_tokens=138, prompt_...",26323,0.013595,groover,Customers appreciate Groover for its affordabl...
4,chatcmpl-8xKKKcN8rQUNByhxioAGgQzSbQtwt,gpt-3.5-turbo-0125,1709150180,"CompletionUsage(completion_tokens=155, prompt_...",11073,0.005691,soundcamps,Customers appreciate Soundcampaign for increas...
5,chatcmpl-8xKKOaq6TROtNdhfyde90AYtuKZAQ,gpt-3.5-turbo-0125,1709150184,"CompletionUsage(completion_tokens=118, prompt_...",1483,0.00086,indiemusicacademy,Customers appreciate the opportunity to promot...
6,chatcmpl-8xKKRo5cFQj8hGI15RqMgFh1Rlar3,gpt-3.5-turbo-0125,1709150187,"CompletionUsage(completion_tokens=121, prompt_...",3034,0.001638,daimoon,Customers appreciate the quick and effective d...
7,chatcmpl-8xKKUFKC9RWkDNke8OsVu58WWwuY7,gpt-3.5-turbo-0125,1709150190,"CompletionUsage(completion_tokens=142, prompt_...",1250,0.000767,boost-collective,Customers appreciate the variety of free vocal...
8,chatcmpl-8xKKXof36LO9HNtk4IDGrluvUEAPj,gpt-3.5-turbo-0125,1709150193,"CompletionUsage(completion_tokens=151, prompt_...",3459,0.00188,omarimc,Customers appreciate OmariMC for delivering on...
9,chatcmpl-8xKNoYy8orXhPZ4mhiIADtpBVNlil,gpt-3.5-turbo-0125,1709150396,"CompletionUsage(completion_tokens=154, prompt_...",203396,0.104482,submithub,Customers appreciate Submithub for providing v...


In [92]:
# Generate the Reddit comment summaries and keep the returned object.
# The trailing bare expression displays it (the captured run below contains
# a single row, for soundbetter).
reddit_comment_summary_df = get_reddit_comment_summary()
reddit_comment_summary_df

soundbetter
{'summary_id': 'chatcmpl-8xfrnboG06vVe4kRgjrZzPlg6uFOJ', 'model': 'gpt-3.5-turbo-0125', 'created': 1709232979, 'total_tokens_used': 174904, 'cost': 0.090004, 'company': 'soundbetter', 'summary': "Customers appreciate SoundBetter for its reliable platform that offers skilled music professionals for mixing, mastering, and collaborating. Users value the user-friendly interface, quality, professionalism, and affordability that SoundBetter provides. Many recommend SoundBetter to find experienced music professionals, highlighting pros such as the range of options, talent available, and connections made through the platform. However, concerns about pricing, visibility in a competitive space, and standing out in a crowded ecosystem have been expressed. Some worry about pricing competitiveness, user experience, and limitations in fulfilling client demands just to secure positive reviews. While there are mixed experiences regarding the number of gigs acquired, customers emphasize the

Unnamed: 0,summary_id,model,created,total_tokens_used,cost,company,summary
0,chatcmpl-8xfrnboG06vVe4kRgjrZzPlg6uFOJ,gpt-3.5-turbo-0125,1709232979,174904,0.090004,soundbetter,Customers appreciate SoundBetter for its relia...


## Generating keywords

In [79]:
# Directory that holds the GPT-generated JSON files used by the keyword cells.
keyword_path = os.path.join(script_dir, "GPT_generated_data")

# Keywords for the Reddit submission summaries.
# NOTE(review): presumably the first path is the summary file that is read and
# the second the keyword file that is written -- confirm against
# get_keywords_from_summary.
reddit_submission_keywords = get_keywords_from_summary(os.path.join(keyword_path, "reddit_submission_summary.json"), os.path.join(keyword_path, "reddit_submission_keywords.json"))

{'summary_id': 'chatcmpl-8xOzMkDTt4fewAJ4DsTCEMdUnynBw', 'model': 'gpt-3.5-turbo-0125', 'created': 1709168100, 'total_tokens_used': 536, 'cost': 0.00034700000000000003, 'company': 'playlistpush', 'original_summary': "Customers appreciate PlaylistPush for the potential to reach new listeners and expose their music to wider audiences. They value curated playlists and feedback from curators, noting the platform's supportive nature in promoting music and providing exposure opportunities. However, concerns have been raised about the effectiveness of playlist curation, high costs compared to results achieved, and the quality and relevance of feedback received. Some customers express disappointment and skepticism about the authenticity of services utilized. Despite mixed reviews, PlaylistPush is appreciated for its value and potential to access desired playlists for music promotion. Customers also mention experiences with other platforms like SubmitHub, Soundplate, and Yougrow, highlighting t

In [94]:
# Keywords for the Reddit comment summaries. Relies on `keyword_path` having
# been defined by the Reddit-submission keyword cell.
# NOTE(review): presumably (summary file read, keyword file written) -- confirm
# against get_keywords_from_summary.
reddit_comment_keywords = get_keywords_from_summary(os.path.join(keyword_path, "reddit_comment_summary.json"), os.path.join(keyword_path, "reddit_comment_keywords.json"))

{'summary_id': 'chatcmpl-8xfubghRByzMXYFTAL9MEcAcVcsqU', 'model': 'gpt-3.5-turbo-0125', 'created': 1709233153, 'total_tokens_used': 567, 'cost': 0.0003525, 'company': 'playlistpush', 'original_summary': "Customers appreciate Playlistpush for its reliable abilities to place songs on playlists and help artists gain authentic streams while upholding transparency and avoiding bot-driven playlists. They find it instrumental in enhancing music promotion efforts, embracing Campaign success, including real growth and engagement. Some express satisfaction in utilizing multiple platforms for music promotion. However, concerns arise over the high costs associated with services and inconsistencies in playlist placements. There is a call for caution regarding potentially subpar results for songs that are not top-notch, emphasizing the indispensable aspects of persistence and patience in the music industry. Despite challenges and growing options, Playlistpush is generally valued by customers for its

In [80]:
# Keywords for the Trustpilot review summaries. Relies on `keyword_path` having
# been defined by the Reddit-submission keyword cell.
# NOTE(review): presumably (summary file read, keyword file written) -- confirm
# against get_keywords_from_summary.
trustpilot_review_keywords = get_keywords_from_summary(os.path.join(keyword_path, "trustpilot_review_summary.json"), os.path.join(keyword_path, "trustpilot_review_keywords.json"))

{'summary_id': 'chatcmpl-8xP3ZRjqWIjzM0JD6NtWQhkXICmjD', 'model': 'gpt-3.5-turbo-0125', 'created': 1709168361, 'total_tokens_used': 514, 'cost': 0.00033800000000000003, 'company': 'playlistpush', 'original_summary': 'Customers appreciate PlaylistPush for its user-friendly platform, constructive feedback, relationship-building opportunities, organic stream increases, and playlist credibility. They mention concerns about pricing, genre mismatches, limited reach for specific music genres, and inconsistent campaign outcomes. Positive experiences range from playlist placements and stream growth to frustrations with high costs and ineffective genre targeting. Suggestions for improvements include refining the playlist matching process and enhancing the overall service to cater to a wider range of music genres. Some customers have seen successful results with playlist adds and increased exposure, while others have expressed disappointment in the lack of engagement and overall return on investm

## Formatting the data for better readability

In [149]:
def OLD_format_json_to_md_table(json_obj):
    """Render a legacy keyword record as a tab-indented Markdown table.

    ``json_obj['keywords']`` must be a string whose first three
    newline-separated lines hold, in this order, the positive, neutral and
    negative keywords as comma-and-space separated lists. The labels
    ``'Positive keywords: '``, ``'Neutral keywords: '`` and
    ``'Negative keywords: '`` are stripped from their respective lines; any
    further lines are ignored.

    Returns:
        str: Header row, separator row and one row per keyword index, each
        prefixed with a tab and terminated by a newline. Columns shorter than
        the longest one are padded with empty cells.

    Raises:
        KeyError: If ``json_obj`` has no ``'keywords'`` entry.
        IndexError: If the keywords string has fewer than three lines.
    """
    indent = "\t"
    labels = ('Positive keywords: ', 'Neutral keywords: ', 'Negative keywords: ')

    # One line per sentiment; positional, so line order in the input matters.
    raw_lines = json_obj['keywords'].split('\n')
    columns = [
        raw_lines[position].replace(label, '').split(', ')
        for position, label in enumerate(labels)
    ]

    # The tallest column decides how many body rows the table needs.
    row_count = max(len(column) for column in columns)

    pieces = [
        indent + "| Positive Keywords | Neutral Keywords | Negative Keywords |\n",
        indent + "|-------------------|------------------|-------------------|\n",
    ]
    for row in range(row_count):
        positive, neutral, negative = (
            column[row] if row < len(column) else "" for column in columns
        )
        pieces.append(indent + f"| {positive} | {neutral} | {negative} |\n")

    return "".join(pieces)


def OLD_format_json_to_md_document():
    """Build the legacy three-source Markdown report and write it to disk.

    Loads the Reddit-submission, Reddit-comment and Trustpilot keyword records
    from ``GPT_generated_data/raw_data`` (relative to ``script_dir``). For
    every URL returned by ``get_websites()`` it emits a ``## <company>``
    heading followed by one sub-section per record whose ``"company"`` field
    equals the name extracted from the URL; companies without any matching
    record get a placeholder line instead. The document is written to
    ``GPT_generated_data/final_document.md`` as UTF-8.

    Each sub-section is a ``- >`` list item with the record's summary, followed
    by the tab-indented table produced by ``OLD_format_json_to_md_table``.

    Returns:
        str: The complete Markdown document that was written.

    Raises:
        AttributeError: If a URL's network location does not match the
            company-name pattern (``re.search`` returns ``None``).
        KeyError: If a record lacks ``"company"``, ``"summary"`` or the
            ``"keywords"`` field the legacy table formatter reads.
    """
    main_path = os.path.join(script_dir, "GPT_generated_data", "raw_data")
    output_md_path = os.path.join(script_dir, "GPT_generated_data", "final_document.md")

    # (sub-section heading, source file), in the order the sub-sections are
    # emitted under each company.
    sources = (
        ("### Reddit submissions\n\n", "reddit_submission_keywords.json"),
        ("### Reddit comments\n\n", "reddit_comment_keywords.json"),
        ("### Trustpilot reviews\n\n", "trustpilot_review_keywords.json"),
    )
    sections = []
    for heading, filename in sources:
        with open(os.path.join(main_path, filename), 'r', encoding="utf-8") as file:
            sections.append((heading, json.load(file)))

    document = "# GPT Generated Summaries and Keywords\n\n"
    for url in get_websites():
        # Company name = host without an optional "www." prefix and without the
        # final ".tld" component.
        company = re.search(r"(?:www\.)?(.*?)\.\w+$", urlparse(url).netloc).group(1)
        document += f"## {company}\n\n"
        temp_document = ""
        for heading, records in sections:
            for obj in records:
                if company == obj["company"]:
                    temp_document += heading
                    # NOTE(review): keyword records printed elsewhere in this
                    # notebook show an 'original_summary' key -- confirm the
                    # files read here really carry 'summary'.
                    temp_document += '- >' + obj['summary'] + "\n\n"
                    # Use the legacy formatter: it parses the single
                    # 'keywords' string and tab-indents the table so it stays
                    # inside the list item. The current format_json_to_md_table
                    # expects separate *_keywords lists instead.
                    temp_document += OLD_format_json_to_md_table(obj) + "\n\n"
        if temp_document != "":
            document += temp_document
        else:
            # NOTE(review): "Insuffient" is misspelled; left unchanged because
            # it is part of the generated document's text.
            document += "Insuffient data.\n\n"
        document += "---\n\n"

    # Explicit UTF-8 so the result does not depend on the platform's default
    # encoding when summaries contain non-ASCII characters.
    with open(output_md_path, "w", encoding="utf-8") as file:
        file.write(document)
    return document


In [158]:
def format_json_to_md_table(json_obj):
    """Render the keyword lists of a summary record as a Markdown table.

    Reads ``json_obj['positive_keywords']``, ``json_obj['neutral_keywords']``
    and ``json_obj['negative_keywords']`` (indexable sequences) and lays them
    out side by side, one keyword per cell.

    Returns:
        str: Header row, separator row and one row per keyword index, each
        terminated by a newline. Columns shorter than the longest one are
        padded with empty cells.

    Raises:
        KeyError: If any of the three keyword entries is missing.
    """
    columns = (
        json_obj['positive_keywords'],
        json_obj['neutral_keywords'],
        json_obj['negative_keywords'],
    )

    # The tallest column decides how many body rows the table needs.
    row_count = max(len(column) for column in columns)

    pieces = [
        "| Positive Keywords | Neutral Keywords | Negative Keywords |\n",
        "|-------------------|------------------|-------------------|\n",
    ]
    for row in range(row_count):
        positive, neutral, negative = (
            column[row] if row < len(column) else "" for column in columns
        )
        pieces.append(f"| {positive} | {neutral} | {negative} |\n")

    return "".join(pieces)


def format_json_to_md_document():
    """Build the combined Reddit Markdown report and write it to disk.

    Loads the records in
    ``GPT_generated_data/new_summaries/raw_data/reddit_summary_keywords.json``
    (relative to ``script_dir``). For every URL returned by ``get_websites()``
    it emits a ``## <company>`` heading followed, for each record whose
    ``"company"`` field equals the name extracted from the URL, by the summary
    as a block quote and the keyword table from ``format_json_to_md_table``.
    Companies without a matching record get a placeholder line instead. The
    document is written to
    ``GPT_generated_data/new_summaries/final_document.md`` as UTF-8.

    Returns:
        str: The complete Markdown document that was written.

    Raises:
        AttributeError: If a URL's network location does not match the
            company-name pattern (``re.search`` returns ``None``).
        KeyError: If a record lacks ``"company"``, ``"summary"`` or one of the
            keyword lists the table formatter reads.
    """
    main_path = os.path.join(script_dir, "GPT_generated_data", "new_summaries", "raw_data")
    output_md_path = os.path.join(script_dir, "GPT_generated_data", "new_summaries", "final_document.md")

    with open(os.path.join(main_path, "reddit_summary_keywords.json"), 'r', encoding="utf-8") as file:
        reddit_summary_keywords = json.load(file)

    document = "# GPT-4-Turbo-0125 Generated Summaries and Keywords from Reddit Posts/Comments\n\n"
    for url in get_websites():
        # Company name = host without an optional "www." prefix and without the
        # final ".tld" component.
        company = re.search(r"(?:www\.)?(.*?)\.\w+$", urlparse(url).netloc).group(1)
        document += f"## {company}\n\n"
        temp_document = ""
        for json_obj in reddit_summary_keywords:
            if company == json_obj["company"]:
                temp_document += '>' + json_obj['summary'] + "\n\n"
                temp_document += format_json_to_md_table(json_obj) + "\n\n"
        if temp_document:
            document += temp_document
        else:
            # NOTE(review): "Insuffient" is misspelled; left unchanged because
            # it is part of the generated document's text.
            document += "Insuffient data.\n\n"
        document += "---\n\n"

    # Explicit UTF-8: the summaries contain non-ASCII punctuation (e.g. curly
    # apostrophes), so the platform default encoding must not be relied on.
    with open(output_md_path, "w", encoding="utf-8") as file:
        file.write(document)
    return document


In [157]:
# Write final_document.md and echo the returned Markdown string as the cell
# output.
# NOTE(review): the captured output below starts with a title ending in
# "(combined)", which the current title string in format_json_to_md_document
# does not contain -- the output looks stale; re-run after the definitions cell.
format_json_to_md_document()

"# GPT-4-Turbo-0125 Generated Summaries and Keywords from Reddit Posts/Comments (combined)\n\n## playlistpush\n\n>Customers share a mixed array of experiences and opinions concerning PlaylistPush, ranging from high appreciation for its potential to boost streams and the legitimacy of its curators to criticisms over the service’s pricing and campaign effectiveness. Positive testimonials highlight successful campaigns that significantly increased streams and followers, praising the company for its ability to facilitate real, organic engagement and high-quality curator reviews that suggest real listening took place. Contrarily, some users criticize the service for the high cost relative to the perceived value return, with particular note on experiences where the increase in streams or followers did not meet expectations, and concerns over inactive or bot followers in affiliated playlists. Customers value clear evidence of PlaylistPush’s active quality control measures, like banning underp

In [114]:
# Path of the Reddit-comment keyword records under GPT_generated_data/raw_data.
json_path = os.path.join(script_dir, "GPT_generated_data", "raw_data", "reddit_comment_keywords.json")

# Load the parsed JSON into `data` for inspection.
with open(json_path, 'r') as file:
    data = json.load(file)

## Example -- Summary of PlaylistPush Reddit posts

**Cost**: $0.004

**Prompt:** Analyze these Reddit posts about PlaylistPush. And generate a paragraph-long summary that focuses on customer opinions and experiences without naming any other companies. Highlight what customers appreciate and any concerns they have expressed. Use phrases like 'Customers appreciate...' and 'They mention concerns about...' to ensure the summary is customer-centric. Additionally, extract the top keywords from the summary and categorize them into a short list of pros and cons. Do not make up any information or referring to specific alternatives.

**Summary**: Customers appreciate PlaylistPush for its convenience and the potential to reach a broader audience. They like the quality of playlists and the opportunity to receive feedback on their tracks. Concerns have been raised about the high cost of campaigns, lack of transparency in the curator selection process, and the quality of the playlists and feedback received. Some customers feel that the results may not always be delivered as promised, and there is a risk of being placed in low-quality or inactive playlists. Overall, customers have mixed experiences with PlaylistPush, with some finding success and others facing challenges with the service.