In [None]:
pip install -q openai

In [None]:
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
import numpy as np
import pandas as pd
from ast import literal_eval

# These columns are run through `literal_eval` on load so that each cell
# becomes the Python literal its text spells out, not a plain string.
literal_columns = ('clean', 'bert_sim', 'tfidf_sim')
df = pd.read_csv(
    "search_results.csv",
    converters={column: literal_eval for column in literal_columns},
)

In [None]:
# Rows with at least LARGE_RESULT_SET scores are split into more clusters.
LARGE_RESULT_SET = 100
N_CLUSTERS_LARGE = 30
N_CLUSTERS_SMALL = 10


def cluster_sim_scores(sim_scores, random_state=0):
  """Cluster one row's similarity scores with 1-D k-means.

  Args:
    sim_scores: sequence of numeric scores for a single row.
    random_state: seed passed to KMeans so labels are reproducible.

  Returns:
    Array of cluster labels, one per score. Empty when `sim_scores` is empty.
  """
  X = np.array(sim_scores).reshape(-1, 1)
  if len(X) == 0:
    # KMeans cannot be fitted on zero samples; there is nothing to label.
    return np.array([], dtype=np.int32)

  wanted = N_CLUSTERS_LARGE if len(X) >= LARGE_RESULT_SET else N_CLUSTERS_SMALL
  # KMeans raises ValueError when n_samples < n_clusters, so short rows
  # get at most one cluster per score.
  n_clusters = min(wanted, len(X))
  kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, n_init="auto").fit(X)
  return kmeans.labels_


clusters = [cluster_sim_scores(sim_scores) for sim_scores in df.bert_sim]

df['k_mean_clusters'] = clusters

In [None]:
from openai import OpenAI
from collections import defaultdict
from tqdm import tqdm

# Clusters with fewer titles than this are not sent to the model.
MIN_CLUSTER_SIZE = 5

client = OpenAI()


def build_prompt(texts):
  """Return the question-generation prompt for one cluster of texts.

  `texts` is interpolated with its default string representation.
  """
  return f"""The task is to generate questions based on provided information.
    Given list of texts generate only two questions, no more than two questions. 
    Make questions variant.
    The questions should be what a user is looking for and not questions that seek to do fact-checking.

    Format output as a python list.

    Information:
    ```{texts}```"""


def ask_for_questions(texts):
  """Send one cluster to the chat model and return the raw reply text.

  The reply is returned as-is (a string); it is not parsed into a list here.
  """
  response = client.chat.completions.create(
    model="gpt-4",
    messages=[
      # The prompt is the request itself, so it is sent as a user turn;
      # an assistant turn would be read as the model's own earlier output.
      {"role": "user", "content": build_prompt(texts)}
    ]
  )
  return response.choices[0].message.content


gpt_responses = []
# zip() has no length, so pass the row count explicitly for a real progress bar.
for sentences, labels in tqdm(zip(df.clean, df.k_mean_clusters), total=len(df)):
  k_clusters = defaultdict(list)
  for res, label in zip(sentences, labels):
    # Assign title to corresponding cluster
    k_clusters[label].append(res)

  # One entry per cluster, in the order each label first appears in the row;
  # clusters below the size threshold are recorded as None.
  gpt_responses.append([
    ask_for_questions(titles) if len(titles) >= MIN_CLUSTER_SIZE else None
    for titles in k_clusters.values()
  ])

In [None]:
# Store the per-row list of model replies (one entry per cluster, None for
# skipped clusters) as a new column on the frame.
df['gpt_questions'] = gpt_responses