# Requirements

In [None]:
!pip install openai

# Load Data

In [None]:
from google.colab import drive
import json

# Make Google Drive available to this runtime under /content/drive
drive.mount('/content/drive')

# Path of the conversation JSON file inside the mounted Drive
file_path = '/content/drive/My Drive/tegus_interview/ml_exercise_conversation.cleaned.json'

# Decode explicitly as UTF-8 so parsing never depends on the runtime's default encoding
with open(file_path, 'r', encoding='utf-8') as file:
    json_data = json.load(file)

In [3]:
# Plain-text file that holds the OpenAI API key
file_path = '/content/drive/My Drive/tegus_interview/secrets/openai_api_key.txt'

# 'utf-8-sig' decoding drops a leading byte-order mark if the file has one
with open(file_path, mode='r', encoding='utf-8-sig') as file:
    raw_key = file.read()

# Remove surrounding whitespace (e.g. a trailing newline) from the key
api_key = raw_key.strip()

# Get JSON Structure

In [None]:
def print_json_structure(data, indent=0):
    """Recursively print an outline of a parsed JSON value.

    A dictionary produces one line per key showing the type of its value.
    A list produces one summary line and is then described through its
    first element only. Any other value prints nothing.

    Args:
        data: The parsed JSON value to describe.
        indent: Number of spaces placed in front of each printed line;
            every nesting level adds four.
    """
    pad = ' ' * indent
    child_indent = indent + 4

    # Dictionaries: describe every key, then descend into its value
    if isinstance(data, dict):
        for key in data:
            value = data[key]
            print(pad + f"Key: {key}, Type: {type(value)}")
            print_json_structure(value, child_indent)  # recursive call
        return

    # Lists: summarise, then descend into the first element as a representative
    if isinstance(data, list):
        first_type = type(data[0]) if data else 'Empty List'
        print(pad + f"List containing {len(data)} items, Type of first item: {first_type}")
        if data:
            print_json_structure(data[0], child_indent)  # recursive call

# Print the outline of the loaded JSON data, starting at its top-level value (indent 0)
print_json_structure(json_data)


# Chop up Input to Digestible Q and A (No ML Used)

In [5]:
# Each entry of "utterances" provides a speaker label and a list of paragraphs
list_of_dialogue = json_data["utterances"]

# One "<role>: <text>" string per utterance, in the order they appear
paragraphs = []
for excerpt in list_of_dialogue:
    # A speaker label containing "Client" is labelled Interviewer; anything else Expert
    speaker = "Interviewer" if "Client" in excerpt["speaker"] else "Expert"
    body = ('--\n').join(excerpt["paragraphs"])
    paragraphs.append(speaker + ": " + body)


# Pull the speaker label off the front of a "<speaker>: <text>" string
def get_speaker(s):
    """Return the text before the first ':' in ``s`` (all of ``s`` if it has no ':')."""
    label, _, _ = s.partition(":")
    return label

# Build the chunks that are later analysed one at a time.
# Two neighbouring paragraphs from different speakers form one chunk. A paragraph
# that cannot be paired (its neighbour has the same speaker, or it is the last one)
# is kept as a chunk of its own, so no part of the conversation is dropped.
combined_paragraphs = []

position = 0
while position < len(paragraphs):
    current = paragraphs[position]
    has_next = position + 1 < len(paragraphs)

    if has_next and get_speaker(current) != get_speaker(paragraphs[position + 1]):
        # Different speakers: join the two paragraphs and step past both
        combined_paragraphs.append(current + "\n\n" + paragraphs[position + 1])
        position += 2
    else:
        # Same speaker twice in a row, or no paragraph left to pair with:
        # keep this one alone and try pairing again from the next paragraph
        combined_paragraphs.append(current)
        position += 1

# Find any companies and products mentioned. Assign competitor status

In [None]:
from openai import OpenAI

# Client used for the chat-completion requests below; api_key is loaded in an earlier cell
client = OpenAI(api_key=api_key)

# For every chunk of the conversation, ask the model which companies/products are
# mentioned and whether each one is a direct competitor of Snowflake
analysis1 = []
for q_and_a in combined_paragraphs:
    response = client.chat.completions.create(
        model="gpt-4-0613",
        messages=[
            {"role": "system", "content": "You are a helpful assistant"},
            {"role": "user", "content": q_and_a},
            {"role": "user", "content": "Return a list of any companies or products mentioned"},
            {"role": "user", "content": "Please assign each element of the list as either a direct competitor of Snowflake or not"},
            {"role": "user", "content": "Simply return {product/company} - Competitor/Not Competitor"},
        ],
    )
    rsp = response.choices[0].message.content
    analysis1.append(rsp)
    print(rsp)

# All per-chunk answers joined into one newline-separated text
comps = '\n'.join(analysis1)


# Have the model merge the repetitive per-chunk answers into a single list
organize_messages = [
    {"role": "system", "content": "You are a helpful assistant. You are going to receive Instructions and then Content"},
    {"role": "user", "content": "INSTRUCTIONS - I am going to send you text with repeating information. Use that text to create a list like this {company/product} - competitor/not competitor"},
    {"role": "user", "content": f"CONTENT - {comps}"},
]
response = client.chat.completions.create(model="gpt-4-0613", messages=organize_messages)
comps_organized = response.choices[0].message.content

# Second pass over the organized list: ask the model to drop repeats and entries
# that are not a company or a specific product, keeping the competitor labels
cleanup_messages = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "I'm going to send you a list of products/companies. Get rid of repeats"},
    {"role": "user", "content": comps_organized},
    {"role": "user", "content": "Get rid of things that aren't actually a company or a specific product (including that which is a category of tech)"},
    {"role": "user", "content": "Retain (but double-check) their classification as competitor or not"},
]
response = client.chat.completions.create(model="gpt-4-0613", messages=cleanup_messages)
comps_cleaned_and_organized = response.choices[0].message.content
print(comps_cleaned_and_organized)




# Further Verification

In [None]:
# Ask the model to remove parent companies of listed products, merge duplicate
# product names, and answer in a "{company} - YES/NO" format
verification_messages = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "I'm going to send you a list of products and companies mixed."},
    {"role": "user", "content": comps_cleaned_and_organized},
    {"role": "user", "content": "Delete items that are merely the parent company of another product on the list (i.e. REMOVE OpenAI if ChatGPT was in the list)"},
    {"role": "user", "content": "Print new list with those items removed."},
    {"role": "user", "content": "Also, combine products that refer to the same product (for example, you would combine BigMac and McDonalds BigMac)"},
    {"role": "user", "content": 'Please enter in format {company} - "YES" or "NO" for is a competitor'},
]
response = client.chat.completions.create(model="gpt-4-0613", messages=verification_messages)
final_companies_products = response.choices[0].message.content
print(final_companies_products)


# Split the model's "<name> - YES/NO" answer into competitors and non-competitors
competitors = []
non_competitors = []

for line in final_companies_products.strip().split('\n'):
    # Treat en/em dashes like a plain hyphen so one separator rule covers them all
    line = line.replace('\u2013', '-').replace('\u2014', '-').strip()

    # Prefer the spaced " - " separator so hyphens inside a name stay part of the name;
    # fall back to the first bare hyphen when the spaced form is absent
    separator = ' - ' if ' - ' in line else '-'
    name, found, verdict = line.partition(separator)
    if not found:
        # Blank line or free text without a verdict: nothing to classify
        continue

    # Remove list decoration: a leading bullet character or an "N." enumeration.
    # The text before the first dot is only dropped when it is purely numeric,
    # so names that contain a dot themselves are left intact.
    name = name.strip().lstrip('-*\u2022').strip()
    number, dot, remainder = name.partition('.')
    if dot and number.strip().isdigit():
        name = remainder.strip()

    # Snowflake itself is never listed as its own competitor or non-competitor
    if not name or "snowflake" in name.lower():
        continue

    # Only the verdict part is inspected, so a name that happens to contain
    # the letters "yes" is not misfiled as a competitor
    if 'yes' in verdict.lower():
        competitors.append(name)
    else:
        non_competitors.append(name)

competitors_string = str(competitors)
print(competitors_string)

# Importance Weighing

In [None]:
import ast

# Count how many times each competitor is mentioned across all question/answer chunks
competitors_freq = {competitor: 0 for competitor in competitors}
for q_and_a in combined_paragraphs:
    response = client.chat.completions.create(
        model="gpt-4-0613",
        messages=[
            {"role": "system", "content": "You are a helpful assistant. If requested to return a python object, only return the python object and no annotation"},
            {"role": "user", "content": f"Do you see the list here (called PROD_OPTIONS): {competitors_string}"},
            {"role": "user", "content": "You are to analyze the following interview to see if any elements of PROD_OPTIONS are mentioned directly or indirectly"},
            {"role": "user", "content": q_and_a},
            {"role": "user", "content": "Return any elements of PROD_OPTIONS that were mentioned and how many times"},
            {"role": "user", "content": "Make sure you only are returning items that are members of PROD_OPTIONS"},
            {"role": "user", "content": "Your only response should be the python dictionar requested"},
        ],
    )
    rsp = response.choices[0].message.content
    print(rsp)

    # Keep only the outermost {...} span; an answer without braces carries no dictionary
    start, end = rsp.find("{"), rsp.rfind("}")
    if start == -1 or end < start:
        continue

    # Model output is untrusted text: parse it as a literal instead of executing it
    # with eval(), and skip (rather than crash on) anything that is not a valid literal
    try:
        rsp_dict = ast.literal_eval(rsp[start:end + 1])
    except (ValueError, TypeError, SyntaxError):
        print(f"Skipping response that is not a valid dictionary literal: {rsp!r}")
        continue
    if not isinstance(rsp_dict, dict):
        continue

    for competitor, value in rsp_dict.items():
        # Count only known competitors, and only when the reported count is an integer
        if competitor in competitors_freq and isinstance(value, int):
            competitors_freq[competitor] += value


print(competitors_freq)

# Further Verify Non-Competitors

In [9]:
# Text form of the current non-competitor list, embedded in the prompt below
non_competitors_str = str(non_competitors)

# Ask the model to keep only entries that are actual tech companies or products
# (dropping concepts), and to answer with a bare Python list
filter_messages = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": f"I'm going to send you a list of items: {non_competitors_str}"},
    {"role": "user", "content": "Remove any from the list that are not actual companies or actual products related to tech"},
    {"role": "user", "content": "Return python list, no additional annotation or explanation"},
]
response = client.chat.completions.create(model="gpt-4-0613", messages=filter_messages)

# Parse the reply as a Python literal and replace the list with the filtered result
reply_text = response.choices[0].message.content
non_competitors = ast.literal_eval(reply_text)

# Final Printing

In [None]:
# Sort competitors by their value, highest first (reverse=True gives descending order)
sorted_competitors = sorted(competitors_freq.items(), key=lambda item: item[1], reverse=True)

def assign_ranks(sorted_competitors):
    """Attach a rank to every (item, value) pair of an already-sorted sequence.

    Consecutive pairs with equal values share the rank of the first pair in
    their run; the next distinct value takes its 1-based position as rank,
    so ranks after a tie skip ahead (e.g. 1, 1, 3).

    Args:
        sorted_competitors: Iterable of (item, value) pairs in final order.

    Returns:
        A list of (item, rank, "Competitor") tuples in the same order.
    """
    ranked = []
    last_value = None

    for position, (item, value) in enumerate(sorted_competitors, start=1):
        # A new value starts a new rank group at the current 1-based position
        if value != last_value:
            last_value, group_rank = value, position
        ranked.append((item, group_rank, "Competitor"))

    return ranked

# Rank the sorted competitors; equal values share a rank
ranked_competitors = assign_ranks(sorted_competitors)

# Competitor section: one "<item> - <rank> - <status>" line per entry
print("COMPETITORS")
for entry in ranked_competitors:
    item, rank, status = entry
    print(f"{item} - {rank} - {status}")

print()

# Non-competitor section: names only, no rank is printed for these
print("NON-COMPETITORS:")
for item in non_competitors:
    print(f"{item} - Non-Competitor")