In [12]:
import os
import re
import json
import time
import tiktoken
import pandas as pd
from urllib.parse import urlparse
from openai import AzureOpenAI

script_dir = os.path.dirname(os.getcwd())

In [13]:
# OpenAI API Configuration
MAX_TOKENS = 128000  # Max total tokens for gpt-4-0125-preview
MAX_OUTPUT = 4096
MAX_INPUT = MAX_TOKENS - MAX_OUTPUT
TOKENS_PER_MINUTE_LIMIT = 70000  # TPM rate limit
REQUESTS_PER_MINUTE_LIMIT = 470
model = "gpt-4-TPM-70k-RPM-420"  # model = "deployment_name".
input_cost = 0.01  # Per 1k tokens
output_cost = 0.03  # Per 1k tokens


# client = AzureOpenAI(
#     api_key=os.getenv("AZURE_OPENAI_API_KEY"),  
#     api_version="2024-02-15-preview",
#     azure_endpoint = os.getenv("AZURE_OPENAI_LANGUAGE_ENDPOINT")
#     )


# response = client.chat.completions.create(
#     model="gpt-4-turbo", # model = "deployment_name".
#     messages=[
#         {"role": "system", "content": "Assistant is a large language model trained by OpenAI."},
#         {"role": "user", "content": "Who were the founders of Microsoft?"}
#     ]
# )

# #print(response)
# print(response.model_dump_json(indent=2))
# print(response.choices[0].message.content)

In [15]:
def save_json(output_path, new_data):
    temp_file_path = output_path.strip(".json")+"_temp.json"
    with open(temp_file_path, 'w') as temp_file:
        json.dump(new_data, temp_file, indent=4)

    # Replace the old file with the new file
    os.replace(temp_file_path, output_path)


def get_websites():
    filepath = os.path.join(script_dir, "scraped_data", "company_data", "music_services2.csv")
    music_services = pd.read_csv(filepath)
    music_services = music_services['music_services'].tolist()
    return music_services


def estimate_num_tokens_from_str(string, model="gpt-4-turbo"):
    """Returns the number of tokens in a text string."""
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: Encoding not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")
    #print(string)
    num_tokens = len(encoding.encode(string))
    return num_tokens


def get_cost(usage_obj, input_cost=0.0005, output_cost=0.0015):
    return ((usage_obj.prompt_tokens/1000) * input_cost) + ((usage_obj.completion_tokens/1000) * output_cost)

In [20]:
def get_gpt4turbo_summary(overwrite=False):

    # Initializing variables
    last_request_time = time.time()
    total_tokens_this_session, tokens_this_minute, total_cost_this_session, requests_this_minute, total_requests_this_session = 0, 0, 0, 0, 0
    json_path = os.path.join(script_dir, "GPT_generated_data", "summary", "raw_data", "reddit_summary_keywords2.json")

    # Getting list of companies
    url_list = get_websites() ################################## edit this
    #url_list = ["https://soundbetter.com/"]
    company_names = [re.search(r"(?:www\.)?(.*?)\.\w+$", urlparse(url).netloc).group(1) for url in url_list]

    # If JSON already exists and overwrite is FALSE, load file
    if os.path.exists(json_path) and not overwrite:
        with open(json_path, 'r') as file:
            json_data = json.load(file)
        skip_list = [entry.get("company") for entry in json_data]  # Check which companies already have a summary
        company_names = [company for company in company_names if company not in skip_list]
    else:
        json_data = []
    
    # for company in company_names:
    #     # Import reddit data sets and remove irrelevant rows
    #     submissions_filepath = os.path.join(script_dir, f"scraped_data/reddit_data/submissions/train/{company}_train.csv")
    #     comments_filepath = os.path.join(script_dir, f"scraped_data/reddit_data/comments/train/{company}_train.csv")
    #     if os.path.exists(submissions_filepath) and os.path.exists(comments_filepath):
    #         sub_df = pd.read_csv(submissions_filepath, usecols = ['combined', 'gpt_filter2'], lineterminator='\n')
    #         sub_df = sub_df[sub_df["gpt_filter2"] == True]
    #         comment_df = pd.read_csv(comments_filepath, usecols = ['body_clean', 'gpt_filter2'], lineterminator='\n')
    #         comment_df = comment_df[comment_df["gpt_filter2"] == True]
    #     else:
    #         print("DEBUG: continue")
    #         continue
        
    for company in company_names:
        # Import reddit data sets and remove irrelevant rows
        submissions_filepath = os.path.join(script_dir, f"scraped_data/reddit_data/submissions/clean/{company}_clean.csv")
        comments_filepath = os.path.join(script_dir, f"scraped_data/reddit_data/comments/clean/{company}_clean.csv")
        if os.path.exists(submissions_filepath) and os.path.exists(comments_filepath):
            sub_df = pd.read_csv(submissions_filepath, lineterminator='\n')
            sub_df['combined'] = sub_df['title_clean'] + ': ' + sub_df['content_clean']
            sub_df = sub_df[['combined']]
            comment_df = pd.read_csv(comments_filepath, usecols = ['body_clean'], lineterminator='\n')
        else:
            #print("DEBUG: continue")
            continue

        # Combine all reddit posts and comments into a single list
        posts = [post for post in sub_df['combined'].dropna()]
        comments = [comment for comment in comment_df['body_clean'].dropna()]
        reviews = posts + comments

        summary = ""
        result = {}
        response = None
        prompt = f"""
        Analyze these Reddit posts and comments about a music PR/playlist promotion company named {company}. Generate a paragraph-long summary that focuses on customer opinions/concerns/experiences regarding {company}. Additionally, list out the most frequently mentioned aspects about the company, categorized as positive, negative, or neutral. Each aspect should be summarized in 1-2 words, ensuring that synonyms or similar variants are consolidated under a single term that best represents the sentiment expressed across mentions. For example, if "high costs" and "expensive" are used interchangeably but "high costs" is more common, use "high costs" for the negative aspects category. If an aspect could be interpreted in multiple ways (positive, negative, neutral), categorize it based on the overall sentiment it most commonly aligns with in the context of these reviews. Avoid listing the same aspect or closely related aspects (including synonyms or near-synonyms) in more than one category. Return a JSON object consisting of "summary", "positive_aspects", "negative_aspects", and "neutral_aspects".
        
        Please note:
        - Keep in mind that some users may refer to multiple different services in the same post. Thus, only consider parts of the text that are explicitly referring to {company}. Ignore mentions of other services or irrelevant discussions.
        - Do not mention other companies or services directly in your summary. 
        - Avoid fabricating information or introducing unrelated topics.

        Example output (formatted as a valid JSON):
        {{
            "summary": "Customers appreciate <company name> for its user-friendly platform, constructive feedback, relationship-building opportunities, organic stream growth, and playlist credibility. However, they raise concerns about high pricing, genre mismatches, limited reach for certain music types, and inconsistent campaign outcomes, including ineffective genre targeting and disappointing return on investment. Suggestions for improvement include refining the playlist matching process and enhancing the service to accommodate a broader range of music genres, aiming to increase successful playlist adds and exposure.",
            "positive_aspects": ["User-friendly platform", "Constructive feedback", "Relationship-building", "Playlist credibility"],
            "negative_aspects": ["Pricing", "Reach", "Campaign outcomes", "Genre targeting", "Engagement", "Return on investment"],
            "neutral_aspects": ["Playlist placements", "Stream growth", "Exposure"]
        }}
        
        Here are the posts you will be analyzing: {reviews}

        \n```json
        """

        # Calculating estimated number of tokens
        estimate = estimate_num_tokens_from_str(prompt, model) + 7  # +1 for 'role', +6 for message primer
        for review in reviews:
            estimate += estimate_num_tokens_from_str(review, model)

        # If estimated input tokens exceeds limit, throw warning
        if estimate > MAX_INPUT:
            raise ValueError(f"WARNING: Estimated number of input tokens for {company} is {estimate}, which potentially exceeds the limit of {MAX_INPUT} tokens.")

        # Check if this request would exceed TPM or RPM limit
        current_time = time.time()
        if ((tokens_this_minute + estimate) > TOKENS_PER_MINUTE_LIMIT) or ((requests_this_minute + 1) > REQUESTS_PER_MINUTE_LIMIT):
            sleep_time = 60 - (current_time - last_request_time) + 2  # 2 sec buffer
            if sleep_time > 0:
                time.sleep(sleep_time)
            tokens_this_minute = 0  # Reset token count for the new minute
            requests_this_minute = 0
            last_request_time = time.time()  # Reset last request time

        # Generate summary
        client = AzureOpenAI(
        api_key=os.getenv("AZURE_OPENAI_API_KEY"),  
        api_version="2024-02-15-preview",
        azure_endpoint = os.getenv("AZURE_OPENAI_LANGUAGE_ENDPOINT")
        )

        response = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt + f"{reviews}"}],
                max_tokens=MAX_OUTPUT,
                stop=None,
                n=1,
                response_format= {
                    "type": "json_object"
                    }
            )
        

        if response:
            generated_json = response.choices[0].message.content
            print(generated_json)
            generated_json = json.loads(generated_json)
            generated_summary = generated_json['summary']
            generated_positive_keywords = generated_json['positive_aspects']
            generated_negative_keywords = generated_json['negative_aspects']
            generated_neutral_keywords = generated_json['neutral_aspects']
            summary = generated_summary if generated_summary else summary

            tokens_this_minute += response.usage.total_tokens
            total_tokens_this_session += tokens_this_minute
            total_cost_this_session += get_cost(response.usage, input_cost, output_cost)
            requests_this_minute += 1
            total_requests_this_session += 1

        # Save response object as JSON
        if response:
            result = {
                "summary_id": response.id,
                "model": response.model,
                "created": response.created,
                "total_tokens_used": response.usage.total_tokens,
                "cost": get_cost(response.usage, input_cost, output_cost),
                "company": company,
                "summary": summary,
                "positive_keywords": generated_positive_keywords,
                "negative_keywords": generated_negative_keywords,
                "neutral_keywords": generated_neutral_keywords,
            }
            json_data.append(result)
            save_json(json_path, json_data)

            print(f"####### SUMMARY FOR {company} #######")
            print(summary)
            print("####### DEBUG PURPOSES #######")
            print(f"Estimated input tokens used: {estimate}") 
            print(f"Actual input tokens used: {response.usage.prompt_tokens}")
            print("####### CURRENT USAGE #######")
            print(f"Company: {company}")
            print(f"Tokens used for this summary: {response.usage.total_tokens}")
            print(f"Cost of this summary: {get_cost(response.usage, input_cost, output_cost)}")
            print("####### SESSION STATS #######")
            print(f"Total tokens used so far: {total_tokens_this_session}")
            print(f"Total requests so far: {total_requests_this_session}")
            print(f"Total cost so far: {total_cost_this_session}")
            print()


In [21]:
get_gpt4turbo_summary()

{
    "summary": "Broadjam is a mixed bag for its users, who express varied experiences and opinions. Some users find it to be a useful platform for submitting music to sync license opportunities and have achieved successful placements. There are mentions of utilizing Broadjam to connect with libraries, industry professionals, and even securing deals with TV shows like CSI. However, skepticism about the site's legitimacy is evident, with users referring to it as 'broadscam' and criticizing it for high submission costs without guaranteed follow-ups or placements. The interface is described as dated, and while it offers free song postings, the strategies for exposure are perceived as lackluster compared to other services. Overall, the sentiment leans towards a cautious approach when utilizing Broadjam, with a recommendation to explore more reputable publishers or directly approach music supervisors for better chances of success.",
    "positive_aspects": ["Successful placements", "Indust

ValueError: WARNING: Estimated number of input tokens for kwork is 329631, which potentially exceeds the limit of 123904 tokens.