# Embedding

In [5]:
from openai import OpenAI
import pandas as pd
import re
import numpy as np
                                                                                                                                                                             
from dotenv import load_dotenv, find_dotenv
from pathlib import Path
load_dotenv(Path(".env"))

client = OpenAI()

def remove_html_tags(text):
    """Replace every ``<...>`` span in ``text`` with a single space.

    The match is non-greedy and ``.`` does not cross newlines, so a tag
    that is split over several lines is left untouched.

    Args:
        text: String that may contain HTML markup.

    Returns:
        A new string with each matched tag replaced by one space.
    """
    return re.sub(r'<.*?>', ' ', text)

def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding vector for ``text``, or ``None`` if the request fails.

    Newlines in ``text`` are replaced by spaces before the request is sent
    through the module-level ``client``.

    Args:
        text: String to embed.
        model: Name of the embedding model passed to the API.

    Returns:
        The ``embedding`` attribute of the first item in the response's
        ``data``, or ``None`` when the request raised an exception.
    """
    import logging  # local import so the function does not rely on another cell

    text = text.replace("\n", " ")
    try:
        response = client.embeddings.create(input=[text], model=model)
        return response.data[0].embedding
    except Exception as exc:
        # Best-effort: a failed request yields None so the caller can drop
        # the row. Catching Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit still stop a long embedding run.
        logging.getLogger(__name__).warning("Embedding request failed: %r", exc)
        return None

In [6]:
data_name = 'ds_exam'
# Read the raw data.
df = pd.read_csv(f'{data_name}.csv')
# Drop rows without a message. .copy() yields an independent frame so the
# column assignments below do not raise SettingWithCopyWarning.
df = df[df['message'].notna()].copy()
# Strip HTML tags from each message.
df['clear text'] = df['message'].apply(remove_html_tags)
# Embed the cleaned text. Previously the raw message was embedded and the
# cleaned column was computed but never used.
df['msg_embedding'] = df['clear text'].apply(
    lambda x: get_embedding(x, model='text-embedding-ada-002')
)
# get_embedding returns None when a request fails; drop those rows.
df = df[df['msg_embedding'].notna()]
# Cache the embeddings on disk for the cells that follow.
df.to_csv(f'embedded_{data_name}.csv', index=False)

# Read Embeddings

In [7]:
import pandas as pd
import numpy as np
# Reload the cached embeddings and keep only the rows that have one.
df = pd.read_csv(f'embedded_{data_name}.csv')[
    lambda frame: frame['msg_embedding'].notna()
]

In [8]:
import ast

from sklearn.manifold import TSNE
import plotly.express as px
import numpy as np


# After the CSV round-trip each embedding is a string such as
# "[0.01, -0.02, ...]". ast.literal_eval only accepts Python literals,
# whereas eval would execute any code that ended up in the file.
features = np.array(df.msg_embedding.apply(ast.literal_eval).to_list())

# Project the embeddings to two dimensions; the seed is fixed so the
# layout is the same on every run.
tsne = TSNE(n_components=2, random_state=0)
projections = tsne.fit_transform(features)

# Keep the 2-D coordinates next to each message for plotting.
df['x'] = projections[:, 0]
df['y'] = projections[:, 1]

# fig = px.scatter(
#     df, x='x', y='y',
#     hover_name="message"
# )
# fig.show()

## 1. Find the clusters using K-means

We show the simplest use of K-means. You can pick the number of clusters that fits your use case best.

In [11]:
from sklearn.cluster import KMeans

# Number of clusters to fit.
n_clusters = 3

# Fit K-means on the embedding vectors and take each row's cluster index
# in a single call; the fitted estimator stays available as `kmeans`.
kmeans = KMeans(n_clusters=n_clusters, init="k-means++", random_state=42)
labels = kmeans.fit_predict(features)

# Stored as strings so the plot treats clusters as discrete categories.
df["Cluster"] = labels.astype(str)





In [12]:
# Scatter the t-SNE coordinates, coloured by cluster; hovering over a
# point shows the underlying message.
fig = px.scatter(df, x='x', y='y', hover_name="message", color="Cluster", title="Data Structures Exam")
fig.show()

In [16]:
from openai import OpenAI
client = OpenAI()

# Number of messages sampled from each cluster and shown to the model.
rev_per_cluster = 5

# The instructions are the same for every cluster, so build them once.
system_prompt = 'You are a friendly and helpful teaching assistant in a data structures course, the course is in university level for computer science students. You are helping students with their questions, you can use material from Introduction to Algorithms book by Thomas H. Cormen.'

for i in range(n_clusters):
    print(f"Cluster {i} Theme:\n", end=" ")

    # Messages of this cluster with the "Title:" / "Content:" markers
    # collapsed. regex=False treats the patterns as literal text on every
    # pandas version.
    cluster_messages = (
        df[df.Cluster.astype(int) == i]
        .message.str.replace("Title: ", "", regex=False)
        .str.replace("\n\nContent: ", ":  ", regex=False)
    )
    # sample() raises ValueError when asked for more rows than exist, so
    # cap the sample size at the cluster size.
    sample_size = min(rev_per_cluster, len(cluster_messages))
    reviews = "\n".join(
        cluster_messages.sample(sample_size, random_state=42).values
    )
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            # Behavioural instructions belong in the "system" role; the
            # "assistant" role is for earlier model replies.
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f'What do the following students questions in data structures course have in common?\n\nStudent questions:\n"""\n{reviews}\n"""\n\nTheme:'},
        ],
        # temperature=0,
        # max_tokens=64,
        # top_p=1,
        # frequency_penalty=0,
        # presence_penalty=0,
    )
    # Print one sentence per line for readability.
    print(response.choices[0].message.content.replace(". ", ".\n"))


Cluster 0 Theme:
 The common theme in these student questions is that they all involve discussions or analysis related to time complexity and efficiency of operations in data structures.
Specifically, they are discussing the time complexity of various operations, such as insert and delete, in different data structures such as AVL trees and heap data structures.
The students are also comparing different approaches to solve a problem and discussing their own proposed solutions.
Cluster 1 Theme:
 The common theme in these students' questions is that they are all seeking clarification or explanations regarding certain concepts or algorithms in the data structures course.
They are asking for help in understanding specific parts of the solutions or implementations of various data structures and algorithms.
Cluster 2 Theme:
 The students' questions all revolve around the topic of binary trees and the possibility of reconstructing a binary tree uniquely from its pre-order traversal.
They are s