# Topic Modelling

## Load the Data


Load the data, and preprocess to remove null values.

In [2]:
import pandas as pd

# Read spreadsheet columns D:E of sheet 'RKT(1)' and label them as the full
# criterion text and its simplified version.
rubric_data_raw = pd.read_excel('../../raw_data/rubic_style.xlsx', sheet_name='RKT(1)', 
                            usecols='D:E', names=['clear_criteria', 'simplified_criteria'])

# Preview the first rows to check that both columns were picked up.
print(rubric_data_raw.head())

                                      clear_criteria  \
0  The Document should have title page (essential...   
1            Existing Title Page Section (essential)   
2                  Existing Name of the organization   
3  Existing Name of the internee, Student ID and ...   
4  Existing Submission date of the internship report   

                                 simplified_criteria  
0  *Existing Title Page Section\n*Existing Name o...  
1                                            No Need  
2                                            No Need  
3                                            No Need  
4                                            No Need  


In [3]:
# Read column B of the same sheet: the node id of each rubric row
# (values such as 1.0, 1.1, 1.2 in the preview).
node_ids = pd.read_excel('../../raw_data/rubic_style.xlsx', sheet_name='RKT(1)', 
                            usecols='B', names=['node_id'])
node_ids.head()

Unnamed: 0,node_id
0,1.0
1,1.1
2,1.2
3,1.3
4,1.4


In [4]:
# Put the node ids next to the criteria columns. axis=1 concatenation aligns
# rows on the index; both frames were read from the same sheet, so presumably
# row i of each describes the same rubric entry — TODO confirm.
rubric_data = pd.concat([node_ids, rubric_data_raw], axis=1)
rubric_data.head()

Unnamed: 0,node_id,clear_criteria,simplified_criteria
0,1.0,The Document should have title page (essential...,*Existing Title Page Section\n*Existing Name o...
1,1.1,Existing Title Page Section (essential),No Need
2,1.2,Existing Name of the organization,No Need
3,1.3,"Existing Name of the internee, Student ID and ...",No Need
4,1.4,Existing Submission date of the internship report,No Need


In [5]:
rubric_data.clear_criteria

0      The Document should have title page (essential...
1                Existing Title Page Section (essential)
2                      Existing Name of the organization
3      Existing Name of the internee, Student ID and ...
4      Existing Submission date of the internship report
                             ...                        
114    It is required there is harmony between  the c...
115    The document should have the recommendation se...
116    The document should have the recommendation se...
117    The recommendation section content describe su...
118    The document should have Rerences Section (ess...
Name: clear_criteria, Length: 119, dtype: object

In [6]:
# Drop rows that have no criterion text (119 rows before, 118 after, per the
# recorded outputs). The index is not reset, so the remaining rows keep their
# original labels.
rubric_data = rubric_data.dropna(subset=['clear_criteria'])
# Replace every remaining missing value with the 'No Need' marker.
# NOTE(review): fillna is applied to all columns, so a missing node_id would
# also become 'No Need' — confirm this is intended.
rubric_data = rubric_data.fillna('No Need')


In [7]:
rubric_data.shape

(118, 3)

## First approach: Using the OpenAI Assistants API

### Initialize GPT Assistant
First, create the GPT assistant with instructions refined for our needs.

In [8]:
from dotenv import load_dotenv
load_dotenv()  # Load environment variables
import os

def save_to_env_file(key, value):
    """Append one ``key=value`` line to the ``.env`` file in the working directory.

    The file is opened in append mode, so it is created when missing and
    existing entries are left untouched.
    """
    with open(".env", mode="a") as env_file:
        entry = f"{key}={value}\n"
        env_file.write(entry)

In [9]:
from openai import OpenAI
# Create the OpenAI client (no explicit arguments are passed).
client = OpenAI()

# Look up the id of a previously created assistant; os.getenv returns None when
# the variable is not set.
ASSISTANT_ID = os.getenv("ASSISTANT_ID")
# NOTE(review): echoing the id stores it in the saved notebook output.
ASSISTANT_ID


'asst_cEVEIn3qBEnNxyGmIvSDPWUW'

In [10]:
# Create the assistant only when no id was found in the environment, then
# append the new id to .env and print it.
if not ASSISTANT_ID:
    assistant = client.beta.assistants.create(
    name="RubricBot",
    instructions="""Extract bullet points from the following text, ensuring all points are at the same level 
    and retain the original content without summarization or paraphrasing.""",  # Use the refined instructions
    model="gpt-4-0125-preview"
    )
    # Keep the id in this session and persist it via save_to_env_file.
    ASSISTANT_ID = assistant.id
    save_to_env_file("ASSISTANT_ID", ASSISTANT_ID)
    print(f"Assistant ID: {ASSISTANT_ID}")

Create a thread

In [11]:
# Look up the id of a previously created thread; None when the variable is not set.
THREAD_ID = os.getenv("THREAD_ID")
# NOTE(review): echoing the id stores it in the saved notebook output.
THREAD_ID

'thread_tZ8cTL0KgqOHRirjtVNIqYoL'

In [12]:
# Create a thread only when no id was found in the environment, then append
# the new id to .env and print it.
if not THREAD_ID:
    thread = client.beta.threads.create()
    THREAD_ID = thread.id
    save_to_env_file("THREAD_ID", THREAD_ID)
    print(f"Thread ID: {THREAD_ID}")

Function to add message to thread

In [13]:
def add_message_to_thread(message):
    """Add `message` to the thread identified by THREAD_ID with role "user".

    Returns whatever the messages-create call returns.
    """
    messages_api = client.beta.threads.messages
    return messages_api.create(thread_id=THREAD_ID, role="user", content=message)

Function to make a run (with streaming)

In [14]:
from typing_extensions import override
from openai import AssistantEventHandler
 
# First, we create a EventHandler class to define
# how we want to handle the events in the response stream.
 
class EventHandler(AssistantEventHandler):
    """Event handler that echoes text and code-interpreter events to stdout."""

    @override
    def on_text_created(self, text) -> None:
        # Open a new "assistant >" line and leave the cursor on it.
        print("\nassistant > ", end="", flush=True)

    @override
    def on_text_delta(self, delta, snapshot):
        # Print the delta's value without adding a newline.
        print(delta.value, end="", flush=True)

    def on_tool_call_created(self, tool_call):
        # Print the type of the tool call on its own line.
        print(f"\nassistant > {tool_call.type}\n", flush=True)

    def on_tool_call_delta(self, delta, snapshot):
        # Only deltas of type 'code_interpreter' are echoed.
        if delta.type != 'code_interpreter':
            return

        interpreter = delta.code_interpreter
        if interpreter.input:
            print(interpreter.input, end="", flush=True)

        if not interpreter.outputs:
            return

        print("\n\noutput >", flush=True)
        for item in interpreter.outputs:
            # Only outputs of type "logs" are printed.
            if item.type == "logs":
                print(f"\n{item.logs}", flush=True)
 
# Then, we use the `create_and_stream` SDK helper 
# with the `EventHandler` class to create the Run 
# and stream the response.

def create_and_stream(instructions):
    """Run the assistant on the current thread and stream the reply to stdout.

    Args:
        instructions: Instructions passed to the run.

    Returns:
        The text of the message(s) produced by this run, joined with newlines
        (an empty string when the run produced no text). Previously the
        function returned None, so callers that stored the result
        (``assistant_output = create_and_stream(...)``) captured nothing.
    """
    with client.beta.threads.runs.create_and_stream(
        thread_id=THREAD_ID,
        assistant_id=ASSISTANT_ID,
        instructions=instructions,
        event_handler=EventHandler(),
    ) as stream:
        stream.until_done()
        # Collect the messages accumulated by the handler during this run.
        final_messages = stream.get_final_messages()

    # Each message holds a list of content blocks; keep only the text ones.
    return "\n".join(
        block.text.value
        for message in final_messages
        for block in message.content
        if block.type == "text"
    )

### Give the Assistant some examples

In [15]:
from sklearn.model_selection import train_test_split
# Inputs: every column except the simplified text (node_id and clear_criteria).
X = rubric_data.drop(columns=['simplified_criteria'])
# Targets: the simplified text for each row.
y = rubric_data['simplified_criteria']

# Split in half without shuffling, so the first 59 rows become the "train" examples.
# NOTE(review): random_state presumably has no effect when shuffle=False — confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42, shuffle=False)

In [16]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(59, 2) (59, 2) (59,) (59,)


In [17]:
def add_context_messages(X_series, y_series):
    """Post every (input, expected output) pair to the thread as a worked example.

    Each message is also printed before it is sent.

    Args:
        X_series: DataFrame whose rows unpack into ``(node_id, clear_criteria)``.
        y_series: Series of expected simplified outputs that shares
            ``X_series``'s index labels.
    """
    for index, input_sample in X_series.iterrows():
        # Rows are unpacked positionally; only the criterion text is sent.
        _node_id, clear_criteria = input_sample
        # Explicit label lookup keeps each target paired with its row even when
        # the index does not start at 0 (e.g. after dropna or a split).
        simplified_output = y_series.loc[index]
        message = f"**Example:**\n\n**Input:** {clear_criteria}\n**Output:** {simplified_output}\n"
        print(message)
        add_message_to_thread(message)

In [18]:
# X_train and y_train come from the same split and share index labels, so the
# per-row target lookup inside add_context_messages works without reset_index.
add_context_messages(X_train, y_train)


**Example:**

**Input:** The Document should have title page (essential). 
(No title page, then -10%)
The title page of the report must include:
a. Name of the organization
b. Name of the internee, Student ID and session
c. Submission date of the internship report
d. Name of the University
**Output:** *Existing Title Page Section
*Existing Name of the organization
*Existing Name of the internee, Student ID and session
*Existing Submission date of the internship report
*Existing  Name of the University


**Example:**

**Input:** Existing Title Page Section (essential)
**Output:** No Need

**Example:**

**Input:** Existing Name of the organization
**Output:** No Need

**Example:**

**Input:** Existing Name of the internee, Student ID and session
**Output:** No Need

**Example:**

**Input:** Existing Submission date of the internship report
**Output:** No Need

**Example:**

**Input:** Existing Name of the University
**Output:** No Need

**Example:**

**Input:** The document shoud have ac

In [19]:
# Base extraction instructions (same wording as the assistant's instructions).
# NOTE(review): none of the visible cells passes this variable to a run; they
# use a string literal or task_instruction instead — confirm it is still needed.
instructions = """Extract bullet points from the following text, 
ensuring all points are at the same level and 
retain the original content without summarization or paraphrasing."""

In [20]:

# Start a run so the assistant processes the example messages on the thread.
# NOTE(review): the recorded reply is just "No Need" — confirm this priming
# step has the intended effect.
create_and_stream(instructions="Learn these examples")


assistant > No Need

### Get the Assistant answer

#### Get a random sample rubric

In [21]:
import random


# Draw a random position within the test split.
# NOTE(review): the generator is not seeded, and the next cell overwrites this
# value with a fixed index.
test_index = random.randint(0, len(X_test)-1)

In [22]:
# Use a fixed position in the test split instead of the random draw.
test_index = 40

In [39]:
# Hand-written criterion used as the model input for this experiment.
criteria = """
for day 1-3 describe your emotion. for each days describe the mental health status, the weight, height.
"""
# Alternative input: the criterion at test_index in the test split.
# criteria = X_test.iloc[test_index].clear_criteria
# NOTE(review): the target is still taken from the test split at test_index, so
# it does not correspond to the hand-written criterion above.
target_criteria = y_test.iloc[test_index]
print('input - criteria:', criteria)
print('====================')
print('output:', target_criteria)
print('====================')
# The prompt sent to the thread is the criterion prefixed with " Input:\n".
prompt = " Input:\n" + criteria
print('prompt:', prompt)

input - criteria: 
for day 1-3 describe your emotion. for each days describe the mental health status, the weight, height.

output: No Need
prompt:  Input:

for day 1-3 describe your emotion. for each days describe the mental health status, the weight, height.



Get the model's simplified version of the criteria

In [40]:
# Send the prompt to the thread as a user message.
add_message_to_thread(prompt)
# Instructions for the run: flat bullet list, original wording kept, and the
# literal answer "No Need" when there is only one point to extract.
task_instruction = """
Task: List the following points as individual bullet points at the same level, 
and retain the original content without summarization or paraphrasing.
Crucial: All bullet points must be at the same level. Absolutely no nested bullet points are allowed.
Special Case: If there is only one point to extract, 
output "No Need" instead of a bullet point.

**Desired Output Format:**

* Bullet point 1
* Bullet point 2
* ..."""

In [41]:
# Run the assistant on the thread with the task instructions; the reply is
# streamed to the cell output.
assistant_output = create_and_stream(instructions=task_instruction)


assistant > * For day 1, describe your emotion.
  * Describe the mental health status.
  * Describe the weight.
  * Describe the height.
* For day 2, describe your emotion.
  * Describe the mental health status.
  * Describe the weight.
  * Describe the height.
* For day 3, describe your emotion.
  * Describe the mental health status.
  * Describe the weight.
  * Describe the height.

Output of Gemini:
 
 * The Document should have a Work Sample Section
 * Work sample Section should cover 2 example of the student's work (such as news stories, articles, interviews, projects etc.)
* work Example 1 And work Example 2 sould cover these content:
short description of your role in that work sample
how you used the sample. 

* The Document should have a Work Sample Section
* Work sample Section should cover 2 example of the student's work (such as news stories, articles, interviews, projects etc.)
* work Example 1 And work Example 2 sould cover these content:
short description of your role in that work sample
how you used the sample.

In [26]:
from IPython.display import display  # For displaying rich HTML outputs


# One-row table comparing the input, the desired output and an assistant answer.
# NOTE(review): the assistant answer is a hard-coded literal rather than a value
# captured from a run, and its ending ("organizationnt_output") looks like a
# paste artifact — confirm what was meant here.
data = {
    'Input': [criteria],
    'Desired Output': [target_criteria],
    'Assistant Output': ["""- Essential Document Components:
  - Critical Analysis Section:
    - Coverage of theoretical concepts applied during the internship in the organization
    - Execution of an overall analysis of the organizationnt_output"""]
}

df = pd.DataFrame(data)

# Render the table with the notebook's rich display.
display(df) 

Unnamed: 0,Input,Desired Output,Assistant Output
0,"\n""The Document should have a Work Sample Sect...",No Need,- Essential Document Components:\n - Critical...


#### Custom sample criteria

In [27]:
# Custom criterion text (the surrounding double quotes are part of the string).
sample_criteria = """
"The Document should have a Work Sample Section(essential)
Work sample Section should cover 2 example of the student's work (such as news stories, articles, interviews, projects etc.)
work Example 1 And work Example 2 sould cover these content:
short description of your role in that work sample
how you used the sample.
"
"""

# NOTE(review): this prefix is " Input\n", while the earlier prompt used
# " Input:\n" (with a colon) — confirm the difference is intentional.
prompt = " Input\n" + sample_criteria


In [28]:
# Send the custom sample to the thread as a user message.
add_message_to_thread(prompt)


Message(id='msg_Nk67ZVaPdq06FXOZpBz3YbWs', assistant_id=None, completed_at=None, content=[TextContentBlock(text=Text(annotations=[], value=' Input\n\n"The Document should have a Work Sample Section(essential)\nWork sample Section should cover 2 example of the student\'s work (such as news stories, articles, interviews, projects etc.)\nwork Example 1 And work Example 2 sould cover these content:\nshort description of your role in that work sample\nhow you used the sample.\n"\n'), type='text')], created_at=1712122915, file_ids=[], incomplete_at=None, incomplete_details=None, metadata={}, object='thread.message', role='user', run_id=None, status=None, thread_id='thread_tZ8cTL0KgqOHRirjtVNIqYoL')

In [29]:
# Run the assistant on the thread again with the same task instructions.
create_and_stream(instructions=task_instruction)


assistant > * The Document should have a Work Sample Section (essential).
* Work Sample Section should cover 2 examples of the student's work (such as news stories, articles, interviews, projects etc.).
* Work Example 1 and Work Example 2 should cover these contents:
  * Short description of your role in that work sample.
  * How you used the sample.

## Second Approach


In [30]:
# Example input: the internship-report guideline text, with numbered section headings ("1. ...", "2. ...")
input_text = """
1. Title Page (No title page, then -10%)
The title page of the report will include:
  a. Name of the organization
  b. Name of the internee, Student ID and session
  c. Submission date of the internship report
  d. Name of the University
  e. MQ logo

2. Acknowledgment (No title page, then -10%) 
In this section you acknowledge the help and support of all the people who helped you in completion of 
your internship and internship report.

3. Executive Summary (Mark 2.5%) 
Executive summary previews every section of the report in a short form. It can be called as micro image 
of the report. It helps the reader to get a quick glance at the report before reading it in detail. Everything 
important that you have done, discovered and concluded should be mentioned but briefly and concisely. 

4. Table of contents (No title page, then -10%) 
List the important headings and sub headings in the report with page numbers.  Also make a separate list 
of tables and figures in the table of contents if you have used any. 
5. Overview of the Organization [word limit: min. 500 words] (Mark 2.5%) 
a. Brief history  
b. Introduction of the organization 
c. Policy of the organization 
d. Competitors  
6. Organizational Structure [word limit: min. 500 words] (Mark 2.5%) 
a. Organizational Hierarchy chartp 
b. Number of employees 
c. Main offices 
d. Introduction of all the departments 
e. Comments on the organizational structure  

7. Plan of your internship program [word limit: min. 300 words] (Mark 2.5%) 
a. A brief introduction of the branch/ area office of the organization where you did your 
internship 
b. Starting and ending dates of your internship 
c. Part-time or Full-time 
d. Names of the departments in which you got training and the duration of your training 
8. Training Program [word limit: min. 2000 words] (Mark 10%) 
a. Detailed description of the operations/activities performed by the department(s) you worked in. 
b. Master of IT students (Cyber, networking, management): Detailed description of the task(s) 
assigned to you 
c. Master of Data Science Students: detailed description of the project assigned. 
9. Reflective Journal Entries [word limit: min. 2550 words] (Mark 50%) 
a. In reflective journal writing, student will reflect on all activities during Each Week of internship 
in that organization and then will enter in reflective journal on weekly basis, for 13 weeks. 
b. Entry for a single week should be very comprehensive and should include all important 
happenings of that particular week. A comprehensive journal not only includes information on 
assignments and tasks you are given, but also your impression of the organization and the staff at 
your internship. 
c. In case of any leave or holiday students will also mention it in reflective journal along with 
reason for observing that leave. 
Tips for writing Reflective journal 
The following is a helpful formula for reflective journal writing. 
Formula D-I-E-P 
 
D – Describe objectively what happened 
 
 Answer the question, “What did you, see, read, hear etc? 
 
I – Interpret the events 
 
 Explain what you saw and heard;  
 Your new insights;  
 Your connections with other learning, your feelings etc;  
 Your hypotheses; your conclusions; 
 Answer the question what might this mean? 
 
E – Evaluate the effectiveness and efficiency of what was observed 
 
 Make judgments clearly connected to observations made. Evaluation answers 
the question, “What is your opinion about what you observed or 
experienced? Why?  
 
P – Plan how this information will be useful to you  
 
 What are your recommendations? (Be concrete) 
 
Consider:  In what ways this learning experience will serve you in your future? 
 
Remember your Journal Entries, attempt to:  
a. Analyze your own performance as a learner 
b. Evaluate your gains in understanding and completing tasks 
c. Verbalize how you feel about your learning 
d. Make connections with other experiences, ideas 
e. Demonstrate transfer of learning 
f. Integrate the concepts taught in courses 
 
10. Work Samples [word limit for this section is not specified as it depends on the nature of work 
sample] (Mark 7.5%) 
Compile at least 2 samples of your work during your internship. Some examples of work samples include: 
news stories, articles, interviews, projects etc. Each work sample should have a short description of your 
role in that work sample or how you used the sample. 
 
11. Critical Analysis [word limit: min. 1500 words] (Mark 7.5%) 
Relate the theoretical concepts with your practical experience during your internship in the organization. 
Execute an overall analysis of the organization. 
 
12. SWOT Analysis (word limit: min. 1000 words) (Mark 7.5%) 
Clearly describe all the strengths, weaknesses, opportunities and threats of the organization where you have 
done internship. Remember that strengths and weaknesses are internal to the organization and represent its 
culture while opportunities and threats correspond to the environment outside the organization.  
Strengths are those qualities which distinguish or give an edge to the organization over other organizations. 
Weaknesses  are  the  attributes  of  an  organization  that  are  harmful  in  achieving  the  objectives  of  an  
organization. 
Opportunities are the external factors that are helpful in achieving the objectives of the organization. 
Threats are the external factors which could damage the business performance of the organization. 
 
13. Conclusion [word limit: min. 350 words] SWOT Analysis (Mark 2.5%) 
In this section you are required to describe the organization according to your evaluation/assessment in 
the light of critical and SWOT analyses. 
 
14. Recommendation [word limit: min. 300 words] (Mark 2.5%) 
In this section you are required to suggest solutions for all the problems or discrepancies (you have pointed 
out in critical/ SWOT analysis) found in the organization. 
 
15. References & Sources (Mark 2.5%) 
In this section, provide all the references and sources in APA format that you have used for data collection 
in your Internship Report.
"""

In [31]:
def create_nodes(input_text):
    """Split numbered guideline text into one text node per numbered section.

    A line of the form ``"<integer>. <rest>"`` starts a new node whose text
    begins with ``<rest>``. Every other non-blank line is appended to the most
    recently started node. Lines are stripped of surrounding whitespace and
    each stored line ends with a newline.

    Args:
      input_text: The text describing the sections and their requirements.

    Returns:
      A dictionary mapping integer node IDs to their accumulated text. If the
      same ID appears twice, the later section replaces the earlier one.
      Non-blank lines that appear before the first numbered heading belong to
      no section and are skipped.
    """
    nodes = {}
    current_node_id = None  # no section has been opened yet

    for raw_line in input_text.splitlines():
        line = raw_line.strip()
        if not line:
            continue

        # A heading is "<digits>." at the start of the line. A sentence that
        # merely starts with a digit and contains a period further on (e.g.
        # "2 samples of your work. ...") is ordinary body text, not a heading.
        prefix, dot, remainder = line.partition(".")
        prefix = prefix.strip()

        if dot and prefix.isdecimal():
            current_node_id = int(prefix)
            # Start the node with the heading text that follows the number.
            nodes[current_node_id] = remainder.strip() + "\n"
        elif current_node_id is not None:
            nodes[current_node_id] += line + "\n"
        # Otherwise no section is open yet: skip the line instead of raising
        # KeyError on a node that does not exist.

    return nodes

In [32]:
# Parse the guideline text and print each node id followed by its text.
nodes = create_nodes(input_text)
for node_id, node_text in nodes.items():
    print(f"Node {node_id}:\n{node_text}\n")    

Node 1:
Title Page (No title page, then -10%)
The title page of the report will include:
a. Name of the organization
b. Name of the internee, Student ID and session
c. Submission date of the internship report
d. Name of the University
e. MQ logo


Node 2:
Acknowledgment (No title page, then -10%)
In this section you acknowledge the help and support of all the people who helped you in completion of
your internship and internship report.


Node 3:
Executive Summary (Mark 2.5%)
Executive summary previews every section of the report in a short form. It can be called as micro image
of the report. It helps the reader to get a quick glance at the report before reading it in detail. Everything
important that you have done, discovered and concluded should be mentioned but briefly and concisely.


Node 4:
Table of contents (No title page, then -10%)
List the important headings and sub headings in the report with page numbers.  Also make a separate list
of tables and figures in the table of conte

In [33]:
print(nodes)

{1: 'Title Page (No title page, then -10%)\nThe title page of the report will include:\na. Name of the organization\nb. Name of the internee, Student ID and session\nc. Submission date of the internship report\nd. Name of the University\ne. MQ logo\n', 2: 'Acknowledgment (No title page, then -10%)\nIn this section you acknowledge the help and support of all the people who helped you in completion of\nyour internship and internship report.\n', 3: 'Executive Summary (Mark 2.5%)\nExecutive summary previews every section of the report in a short form. It can be called as micro image\nof the report. It helps the reader to get a quick glance at the report before reading it in detail. Everything\nimportant that you have done, discovered and concluded should be mentioned but briefly and concisely.\n', 4: 'Table of contents (No title page, then -10%)\nList the important headings and sub headings in the report with page numbers.  Also make a separate list\nof tables and figures in the table of c

In [34]:
rubric_data

Unnamed: 0,node_id,clear_criteria,simplified_criteria
0,1,The Document should have title page (essential...,*Existing Title Page Section\n*Existing Name o...
1,1.1,Existing Title Page Section (essential),No Need
2,1.2,Existing Name of the organization,No Need
3,1.3,"Existing Name of the internee, Student ID and ...",No Need
4,1.4,Existing Submission date of the internship report,No Need
...,...,...,...
114,13.3,It is required there is harmony between the c...,No Need
115,14,The document should have the recommendation se...,*The document should have the recommendation s...
116,14.1,The document should have the recommendation se...,No Need
117,14.2,The recommendation section content describe su...,No Need


"for all subsections related to weeks from 1 to 13, the  subsections should cover related information about 1- at least one assignment or tasks or activities 2- at least one happenings or events or observation 3- at least one connection and Communication expreince with others and organizations 

For all subsection week 1 to week 13 the information related to 1- assignment or tasks or activities 2- happenings or events or observation 3- connection and Communication expreince with others and organizations  should cover these contents:

for all tasks and assignemnt and activities, describe : 
- Evaluate the effectiveness and efficiency of the tasks or assignments or activities
-Describe objectively, hypotheses of the tasks or assignments or activities
-Describe conclusions of the Students about the tasks or assignments or activities 
-Describe in what ways the learning experience from doing the tasks or assignments or activitie will serve student in their future
-Analyze the student's own performance as a learner during doing of the tasks or assignments or activities
-Evaluate student's gains in understanding and completing tasks or assignments or activitie
-Plan how the laearning from doing tasks or assignments or activities will be useful to students
- What are the student's recommendations about the tasks or assignments or activitie
-how the student feel about the learning during doing the tasks or assignments or activitie
-Demonstrate transfer of learning during doing the tasks or assignments or activitie

for all events or happening or obeservation:
- answer “What did you, see, read, hear etc? related to events or happening or obeservation
- describe Objectivity andhypotheses of events or happening or obeservation
- Describe the student's conclusions of events or happening or obeservation
- Answer the question what might the events or happening or obeservation mean? 
- Describe the student's judgments clearly connected to events or happening or obeservation
- Describe the student's opinion about what you observed or
experienced about events or happening or obeservation
- Describe In what ways this learning experience related to events or happening or obeservation will serve the student in hisor her future
-Analyze the student's own performance as a learner about events or happening or obeservation
- Describe how the information related to events or happening or obeservation will be useful to students
- What are the student's recommendations about events or happening or obeservation
- Describe how the student's feel about her/his learning related to events or happening or obeservation
-Demonstrate transfer of learning related to events or happening or obeservation

For all connection and Communication expreince with others and organizations:
- Evaluate the effectiveness and efficiency of connection and Communication expreince with others and organizations
- Describe the student's conclusions about connection and Communication expreince with others and organizations 
- Describe the student's feelings about connection and Communication expreince with others and organizations
-Describe In what ways the learning experience related to connection and Communication expreince with others and organizations will serve the student in his/her future
- Analyze the student's own performance as a learner relted to connection and Communication expreince with others and organizations
-Plan how the information related to connection and Communication expreince with others and organizations will be useful to the students
-Describe the student's recommendations about the connection and Communication expreince with others and organizations
-Demonstrate transfer of learning relted to connection and Communication expreince with others and organizations"

"* Subsections week 1 should cover related information about 1- at least one assignment or tasks or activities 2- at least one happenings or events or observation 3- at least one connection and Communication expreince with others and organizations 

For subsection week 1 the information related to 1- assignment or tasks or activities 2- happenings or events or observation 3- connection and Communication expreince with others and organizations  should cover these contents:
for all tasks and assignemnt and activities, describe : 
------------relted criteria ----------
for all events or happening or obeservation:
------------relted criteria ----------
For all connection and Communication expreince with others and organizations
------------relted criteria ----------


* Subsections week 2
------Same criteria-----

* Subsections week 3
------Same criteria-----

* Subsections week 4
------Same criteria-----

* Subsections week 5
------Same criteria-----

* Subsections week 6
------Same criteria-----

* Subsections week 7
------Same criteria-----

* Subsections week 8
------Same criteria-----

* Subsections week 9
------Same criteria-----

* Subsections week 10
------Same criteria-----

* Subsections week 11
------Same criteria-----

* Subsections week 12
------Same criteria-----

* Subsections week 13
------Same criteria-----
* Subsections week 3
------Same criteria-----"