<a href="https://colab.research.google.com/github/raminass/tau-digital/blob/main/notebooks/assignment_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Env Setup

In [1]:
!pip install --upgrade openai --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m220.5/220.5 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.0/75.0 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.9/76.9 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
llmx 0.0.15a0 requires cohere, which is not installed.
llmx 0.0.15a0 requires tiktoken, which is not installed.[0m[31m
[0m

In [2]:
# API key handed to the OpenAI client below; exposed as a Colab form field and left empty in the saved notebook.
OPENAI_API_KEY = "" # @param {type:"string"}
from openai import OpenAI
import pandas as pd
import re
import numpy as np
from bs4 import BeautifulSoup

# Module-level client; get_embedding() reads this name when it requests embeddings.
client = OpenAI(api_key=OPENAI_API_KEY)

def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` span replaced by a single space.

    The match is non-greedy and ``.`` does not cross newlines, so a tag that
    is split over several lines is left untouched.
    """
    return re.sub('<.*?>', ' ', text)

def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding vector for ``text``, or ``None`` if none can be obtained.

    Args:
        text: String to embed. Newlines are replaced with spaces before the request.
        model: Name of the embedding model passed to the embeddings endpoint.

    Returns:
        The ``embedding`` of the first item in the response, or ``None`` when
        ``text`` is not a string or the request fails.
    """
    # Non-string input (for example None or a float NaN) has no .replace() and
    # nothing to embed; report it the same way as a failed request.
    if not isinstance(text, str):
        return None
    text = text.replace("\n", " ")
    try:
        return client.embeddings.create(input=[text], model=model).data[0].embedding
    except Exception:
        # Best-effort: a failed request yields None so the caller can filter the row out.
        # Catching Exception (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
        return None

def html_to_text(html_code):
    """Return the text content of ``html_code`` as parsed by BeautifulSoup's ``html.parser``.

    Text fragments are stripped of surrounding whitespace and joined with newlines.
    """
    return BeautifulSoup(html_code, 'html.parser').get_text(separator='\n', strip=True)


# Build Embedding

In [None]:
Raw_Data = "https://github.com/raminass/tau-digital/blob/main/data/bio_forum.csv?raw=true" # @param ["https://github.com/raminass/tau-digital/blob/main/data/bio_forum.csv?raw=true", "https://github.com/raminass/tau-digital/blob/main/data/calc_forum.csv?raw=true", "https://github.com/raminass/tau-digital/blob/main/data/ds_exam.csv?raw=true"]
# Dataset name: last URL path segment up to its first '.', e.g. "bio_forum"
data_name = Raw_Data.rsplit('/', 1)[-1].partition('.')[0]
# Load the raw export
df = pd.read_csv(Raw_Data)
# Plain-text version of each message (markup stripped by html_to_text)
df['clear_text'] = df['message'].apply(html_to_text)
# One embedding request per row.
# NOTE(review): this embeds the raw `message` column, not `clear_text` - confirm that is intended.
df['msg_embedding'] = df['message'].apply(get_embedding, model='text-embedding-ada-002')
# get_embedding returns None on failure; keep only the rows that received an embedding
df = df.dropna(subset=['msg_embedding'])
# Write the result next to the notebook as embedded_<data_name>.csv
df.to_csv(f'embedded_{data_name}.csv', index=False)

# TSNE + Kmeans

In [4]:
# @title
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import plotly.express as px
from sklearn.cluster import KMeans

def analyse_embd(embedding_file, n_clusters=4):
  """Project stored embeddings to 2-D with t-SNE, cluster them with k-means and plot the result.

  Args:
    embedding_file: Path or URL of a CSV that has a ``msg_embedding`` column (each
      cell is the text of a list literal) and a ``clear_text`` column used as hover text.
    n_clusters: Number of k-means clusters.

  Returns:
    ``(df, fig)``: the data frame with added ``x``, ``y`` and ``Cluster`` columns,
    and the Plotly scatter figure (which has already been shown).
  """
  import ast  # local import: keeps the function independent of which cells were run

  file_name = embedding_file.split('/')[-1].split('.')[0]
  df = pd.read_csv(embedding_file)
  # remove rows with None values; .copy() so the column assignments below
  # write to an independent frame instead of a slice of the loaded one
  df = df[df['msg_embedding'].notna()].copy()
  # get feature matrix from embeddings.
  # SECURITY: the CSV may come from a URL, so parse each cell with
  # ast.literal_eval (literals only) rather than eval (arbitrary code).
  features = np.array(df.msg_embedding.apply(ast.literal_eval).to_list())
  # project the embeddings to 2-D using t-SNE (used for plotting only)
  tsne = TSNE(n_components=2, random_state=0)
  projections = tsne.fit_transform(features)
  df['x'] = projections[:, 0]
  df['y'] = projections[:, 1]
  # k-means is fitted on the full embedding vectors, not on the 2-D projection
  kmeans = KMeans(n_clusters=n_clusters, init="k-means++", random_state=42)
  kmeans.fit(features)
  # store labels as strings so Plotly treats them as discrete categories
  df["Cluster"] = kmeans.labels_.astype(str)
  fig = px.scatter(
    df, x='x', y='y',
    hover_name="clear_text", color="Cluster", title=file_name
  )
  fig.show()
  return df, fig

## Biochemistry

In [8]:
# Form options list only the embedded_* files: analyse_embd needs the msg_embedding column.
embedding_file = "https://raw.githubusercontent.com/raminass/tau-digital/main/data/embedded_bio_forum.csv" # @param ["https://raw.githubusercontent.com/raminass/tau-digital/main/data/embedded_bio_forum.csv", "https://raw.githubusercontent.com/raminass/tau-digital/main/data/embedded_calc_forum.csv", "https://raw.githubusercontent.com/raminass/tau-digital/main/data/embedded_ds_exam.csv"]
df_bio, fig_bio = analyse_embd(embedding_file)





In [20]:
from openai import OpenAI
client = OpenAI(api_key=OPENAI_API_KEY)

# ref: https://cookbook.openai.com/examples/clustering
# For every cluster, sample a few student messages and ask the chat model
# what they have in common.
rev_per_cluster = 5
df = df_bio
# analyse_embd stores the labels as strings; convert once for the comparisons below
cluster_ids = df.Cluster.astype(int)
n_clusters = len(df.Cluster.unique())
for i in range(n_clusters):
    print(f"Cluster {i} Theme:\n", end=" ")

    # regex=False: the patterns are literal text, independent of the pandas default
    cluster_msgs = (
        df[cluster_ids == i]
        .message.str.replace("Title: ", "", regex=False)
        .str.replace("\n\nContent: ", ":  ", regex=False)
    )
    # Cap the sample size so a cluster with fewer than rev_per_cluster rows
    # does not make .sample() raise ValueError.
    sample_size = min(rev_per_cluster, len(cluster_msgs))
    reviews = "\n".join(cluster_msgs.sample(sample_size, random_state=41).values)

    # https://community.openai.com/t/cheat-sheet-mastering-temperature-and-top-p-in-chatgpt-api-a-few-tips-and-tricks-on-controlling-the-creativity-deterministic-output-of-prompt-responses/172683
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        # The persona/instruction text belongs in a "system" message; "assistant"
        # would present it as something the model itself said earlier.
        messages=[{"role": "system", "content": 'You are a friendly and helpful teaching assistant in a biochemistry course, the course is in university level for biology students. You are helping students with their questions, you can use material from Nelson & Cox / Lehninger - principles of biochemistry, 6’th edition'},
                {"role": "user", "content": f'What do the following students questions in biochemistry course have in common?\n\nStudent questions:\n"""\n{reviews}\n"""\n\nTheme:'},],
        temperature=0.7,
        max_tokens=64,
        top_p=0.5,
    )
    # Put each sentence of the answer on its own line
    print(response.choices[0].message.content.replace(". ", ".\n"))


Cluster 0 Theme:
 The common theme in these student questions is that they are seeking clarification or assistance with specific topics or concepts in biochemistry.
They are asking for help with understanding calculations, interpreting results, and reconciling discrepancies between what they have learned in class and what they are observing in their assignments or exams.
Cluster 1 Theme:
 The common theme among these student questions is that they are all related to specific topics or concepts in biochemistry.
The first question is about the splitting observed in a graph, the second question is about the role of low glucose affinity in the liver, the third question is about the addition of weak acid to mitochondria, and
Cluster 2 Theme:
 The common theme in these student questions is that they are seeking clarification or explanation on specific topics or questions related to biochemistry.
Some of the specific topics mentioned in the questions include the release of a protein from a me

## Calculus

In [19]:
# Form options list only the embedded_* files: analyse_embd needs the msg_embedding column.
embedding_file = "https://raw.githubusercontent.com/raminass/tau-digital/main/data/embedded_calc_forum.csv" # @param ["https://raw.githubusercontent.com/raminass/tau-digital/main/data/embedded_bio_forum.csv", "https://raw.githubusercontent.com/raminass/tau-digital/main/data/embedded_calc_forum.csv", "https://raw.githubusercontent.com/raminass/tau-digital/main/data/embedded_ds_exam.csv"]
df_calc, fig_calc = analyse_embd(embedding_file)





In [21]:
client = OpenAI(api_key=OPENAI_API_KEY)

# ref: https://cookbook.openai.com/examples/clustering
# For every cluster, sample a few student messages and ask the chat model
# what they have in common.
rev_per_cluster = 5
df = df_calc
# analyse_embd stores the labels as strings; convert once for the comparisons below
cluster_ids = df.Cluster.astype(int)
n_clusters = len(df.Cluster.unique())

for i in range(n_clusters):
    print(f"Cluster {i} Theme:\n", end=" ")

    # regex=False: the patterns are literal text, independent of the pandas default
    cluster_msgs = (
        df[cluster_ids == i]
        .message.str.replace("Title: ", "", regex=False)
        .str.replace("\n\nContent: ", ":  ", regex=False)
    )
    # Cap the sample size so a cluster with fewer than rev_per_cluster rows
    # does not make .sample() raise ValueError.
    sample_size = min(rev_per_cluster, len(cluster_msgs))
    reviews = "\n".join(cluster_msgs.sample(sample_size, random_state=42).values)

    # Sampling parameters (temperature, max_tokens, top_p, ...) are left at the API defaults.
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        # The persona/instruction text belongs in a "system" message; "assistant"
        # would present it as something the model itself said earlier.
        messages=[{"role": "system", "content": 'You are a friendly and helpful teaching assistant in a calculus course, the course is in university level for computer science students. You are helping students with their questions, you can use material from R. Courant & F. John, Introduction to Calculus and Analysis I book.'},
                {"role": "user", "content": f'What do the following students questions in calculus course have in common?\n\nStudent questions:\n"""\n{reviews}\n"""\n\nTheme:'},],
    )
    # Put each sentence of the answer on its own line
    print(response.choices[0].message.content.replace(". ", ".\n"))


Cluster 0 Theme:
 The common theme in these student questions is that they all involve understanding and applying concepts related to calculus and analysis.
Specifically, the questions involve topics such as the sum of functions, inequalities, Taylor series, and convergence of integrals.
Cluster 1 Theme:
 The common theme in these student questions is that they are not related to calculus or math.
They include unrelated text and even a link to a WhatsApp group.
Cluster 2 Theme:
 The common theme among these student questions is that they are not related to calculus or mathematical concepts.
Instead, they involve administrative inquiries (such as requesting a syllabus) or seeking feedback on personal work.
Cluster 3 Theme:
 The common theme in these student questions is that they are all related to specific mathematical concepts and techniques in calculus.
The questions involve topics such as limits, derivatives, convergence, and proof techniques.


## Data Structures Exam

In [6]:
# Choose the embeddings CSV via the Colab form, then run analyse_embd on it;
# the returned frame and figure are kept as df_ds / fig_ds for the next cell.
embedding_file = "https://raw.githubusercontent.com/raminass/tau-digital/main/data/embedded_ds_exam.csv" # @param ["https://raw.githubusercontent.com/raminass/tau-digital/main/data/embedded_bio_forum.csv", "https://raw.githubusercontent.com/raminass/tau-digital/main/data/embedded_calc_forum.csv", "https://raw.githubusercontent.com/raminass/tau-digital/main/data/embedded_ds_exam.csv"]
df_ds, fig_ds = analyse_embd(embedding_file)





In [7]:
from openai import OpenAI
client = OpenAI(api_key=OPENAI_API_KEY)

# ref: https://cookbook.openai.com/examples/clustering
# For every cluster, sample a few student messages and ask the chat model
# what they have in common.
rev_per_cluster = 5
df = df_ds
# analyse_embd stores the labels as strings; convert once for the comparisons below
cluster_ids = df.Cluster.astype(int)
n_clusters = len(df.Cluster.unique())

for i in range(n_clusters):
    print(f"Cluster {i} Theme:\n", end=" ")

    # regex=False: the patterns are literal text, independent of the pandas default
    cluster_msgs = (
        df[cluster_ids == i]
        .message.str.replace("Title: ", "", regex=False)
        .str.replace("\n\nContent: ", ":  ", regex=False)
    )
    # Cap the sample size so a cluster with fewer than rev_per_cluster rows
    # does not make .sample() raise ValueError.
    sample_size = min(rev_per_cluster, len(cluster_msgs))
    reviews = "\n".join(cluster_msgs.sample(sample_size, random_state=42).values)

    # Sampling parameters (temperature, max_tokens, top_p, ...) are left at the API defaults.
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        # The persona/instruction text belongs in a "system" message; "assistant"
        # would present it as something the model itself said earlier.
        messages=[{"role": "system", "content": 'You are a friendly and helpful teaching assistant in a data structures course, the course is in university level for computer science students. You are helping students with their questions, you can use material from Introduction to Algorithms book by Thomas H. Cormen.'},
                {"role": "user", "content": f'What do the following students questions in data structures course have in common?\n\nStudent questions:\n"""\n{reviews}\n"""\n\nTheme:'},],
    )
    # Put each sentence of the answer on its own line
    print(response.choices[0].message.content.replace(". ", ".\n"))


Cluster 0 Theme:
 The common theme among these student questions is that they are not related to data structures.
They seem to be unrelated statements or comments that may have been mistakenly submitted as questions.
Cluster 1 Theme:
 The common theme in these student questions is that they involve analysis and discussion of various algorithms and data structures.
The students are trying to understand and analyze different aspects of the algorithms, such as time complexity, correctness, and efficiency.
They also ask for feedback on their proposed solutions and seek clarification on certain concepts.
Cluster 2 Theme:
 The common theme in these student questions is that they all relate to specific topics or concepts in data structures.
They are seeking clarification or guidance on certain aspects such as understanding the complexity of an algorithm (amortized complexity), the correct usage of certain methods (such as 'get' and 'init'), or even requesting assistance with a specific questi