<a href="https://colab.research.google.com/github/raminass/tau-digital/blob/main/notebooks/cal_emb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Embedding

In [None]:
from openai import OpenAI
import pandas as pd
import re
import numpy as np
from bs4 import BeautifulSoup

from dotenv import load_dotenv, find_dotenv
from pathlib import Path
# Load environment variables from a local ".env" file (python-dotenv).
load_dotenv(Path(".env"))

# Shared OpenAI client used by get_embedding below.
# NOTE(review): presumably picks up its API key from the environment loaded above — confirm.
client = OpenAI()

def remove_html_tags(text):
    """Replace every HTML tag in ``text`` with a single space.

    Args:
        text: String that may contain HTML markup.

    Returns:
        The input with each ``<...>`` span replaced by one space.
    """
    # `[^>]*` (unlike `.*?`) also matches newlines, so tags whose attributes
    # wrap onto several lines are stripped as well.
    return re.sub(r'<[^>]*>', ' ', text)

def get_embedding(text, model="text-embedding-ada-002"):
    """Request an embedding vector for ``text`` from the OpenAI embeddings endpoint.

    Args:
        text: Input string; newlines are replaced by spaces before the request.
        model: Name of the embedding model to use.

    Returns:
        The embedding of the single input as returned by the client, or
        ``None`` if the request (or reading its response) raised an error.
    """
    text = text.replace("\n", " ")
    try:
        response = client.embeddings.create(input=[text], model=model)
        return response.data[0].embedding
    except Exception:
        # Best-effort: a failed request yields None so the caller can filter
        # the row out. `Exception` (not a bare except) lets KeyboardInterrupt
        # and SystemExit propagate, so a long run can still be stopped.
        return None

def html_to_text(html_code):
    """Return the text content of an HTML snippet.

    The markup is parsed with BeautifulSoup's 'html.parser'; the extracted
    text pieces are stripped and joined with newlines.
    """
    parsed = BeautifulSoup(html_code, 'html.parser')
    return parsed.get_text(separator='\n', strip=True)

In [None]:
data_name = 'calc_forum'
# Read the raw data
df = pd.read_csv(f'{data_name}.csv')
# Remove empty messages. `.copy()` makes the filtered frame independent, so the
# column assignments below do not raise SettingWithCopyWarning.
df = df[df['message'].notna()].copy()
# Clean the data: plain-text version of each HTML message
df['clear text'] = df['message'].apply(html_to_text)
# Get the embedding of each message.
# NOTE(review): this embeds the raw `message` column, not `clear text` —
# confirm that embedding the HTML markup is intended.
df['msg_embedding'] = df['message'].apply(get_embedding, model='text-embedding-ada-002')
# Remove rows whose embedding request failed (get_embedding returned None)
df = df[df['msg_embedding'].notna()]
# Save the embeddings
df.to_csv(f'embedded_{data_name}.csv', index=False)

# Read Embeddings

In [None]:
import pandas as pd
import numpy as np
# Reload the saved embeddings (uses `data_name` from the embedding cell above)
df = pd.read_csv(f'embedded_{data_name}.csv')
# Remove rows with missing embeddings. `.copy()` gives an independent frame, so
# later cells can add columns without SettingWithCopyWarning.
df = df[df['msg_embedding'].notna()].copy()

In [None]:
import ast

from sklearn.manifold import TSNE
import plotly.express as px
import numpy as np


# The CSV round-trip stores each embedding as the string form of a list.
# ast.literal_eval parses Python literals only, so (unlike eval) content in
# the CSV can never be executed as code.
features = np.array(df.msg_embedding.apply(ast.literal_eval).to_list())

# Project the embeddings to 2-D for plotting; fixed seed for reproducibility.
tsne = TSNE(n_components=2, random_state=0)
projections = tsne.fit_transform(features)

df['x'] = projections[:, 0]
df['y'] = projections[:, 1]

## 1. Find the clusters using K-means

We show the simplest use of K-means. You can pick the number of clusters that fits your use case best.

In [None]:
from sklearn.cluster import KMeans

# Number of clusters to fit.
n_clusters = 4

# fit() returns the estimator itself, so construction and fitting can be chained.
kmeans = KMeans(random_state=42, init="k-means++", n_clusters=n_clusters).fit(features)
labels = kmeans.labels_
# Store labels as strings — presumably so the plot treats clusters as discrete categories.
df["Cluster"] = labels.astype(str)





In [None]:
# Scatter plot of the 2-D projection, coloured by cluster label;
# hovering a point shows its cleaned message text.
fig = px.scatter(
    df,
    x='x',
    y='y',
    color="Cluster",
    hover_name="clear text",
    title="Calculus",
)
fig.show()

In [None]:
from openai import OpenAI
client = OpenAI()

# Maximum number of sample questions per cluster sent to the model.
rev_per_cluster = 10

for i in range(n_clusters):
    print(f"Cluster {i} Theme:\n", end=" ")

    cluster_messages = (
        df[df.Cluster.astype(int) == i]
        # regex=False: these are literal substrings, not patterns
        .message.str.replace("Title: ", "", regex=False)
        .str.replace("\n\nContent: ", ":  ", regex=False)
    )
    # Cap the sample size at the cluster size; sampling more rows than exist
    # (without replacement) raises ValueError for small clusters.
    sample_size = min(rev_per_cluster, len(cluster_messages))
    reviews = "\n".join(
        cluster_messages.sample(sample_size, random_state=42).values
    )
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            # Instructions for the model belong in the "system" role; the
            # "assistant" role denotes earlier replies of the model itself.
            {"role": "system", "content": 'You are a friendly and helpful teaching assistant in a calculus course, the course is in university level for computer science students. You are helping students with their questions, you can use material from R. Courant & F. John, Introduction to Calculus and Analysis I book.'},
            {"role": "user", "content": f'What do the following students questions in calculus course have in common?\n\nStudent questions:\n"""\n{reviews}\n"""\n\nTheme:'},
        ],
        # Sampling parameters (temperature, max_tokens, top_p, ...) are left at the API defaults.
    )
    # Print one sentence per line for readability
    print(response.choices[0].message.content.replace(". ", ".\n"))


Cluster 0 Theme: The common theme in these student questions is that they are all seeking clarification or further explanation on various concepts and problems related to calculus. These include understanding function properties, proving inequalities, understanding the intuition behind certain mathematical choices, dealing with sequences and series, and understanding continuity in metric spaces.
Cluster 1 Theme: The common theme in these student questions is that they are all related to understanding and solving problems in a calculus course. They involve seeking clarification on concepts, asking for help in problem-solving, and discussing course logistics.
Cluster 2 Theme: The common theme in these student questions is that they are all related to understanding and clarifying concepts, principles, or problems in a calculus course. They are seeking help with proofs, understanding the necessity of certain conditions in mathematical theorems, and asking for course materials and solutions