In [1]:
from dotenv import load_dotenv

In [2]:
load_dotenv()

True

In [3]:
import getpass
import os

if not os.environ.get("OPENAI_API_KEY"):
  os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

from langchain_openai import ChatOpenAI

model = ChatOpenAI(model="gpt-4o-mini", temperature=0.3)



## PROMPT ENGINEERING

In [4]:
from langchain.prompts import ChatPromptTemplate

#### Role

In [5]:
Role = """An Educational AI model designed to Rephrase text"""

#### Task

In [6]:
Task = """To convert the text such that your response """

#### Style / Tag / Keywords

1. less technical terms

In [7]:
style = """should be simpler and use fewer technical terms compared to the original input"""

2. Easier language

In [8]:
style = """is in a simple and Easy to understand language"""

3. More Pointers

In [9]:
style = """should include additional points or details, expanding the content to increase its overall length and provide more information"""

<p style = "color: red">My Insights starts now</p>

4. Mention the topics covered in the text answer in the ending of the response  --- tobe worked on later

In [10]:
style = """include a summary that explicitly lists the topics covered in the text at the end of the response."""

5. Use Mnemonics or Memory Aids  -- not feasible  -- in the initial phase

In [11]:
style = """Include simple mnemonics or memory aids to help the student remember key concepts."""

6. Use Analogy

In [12]:
style = """explains the text including an analogy of general everyday activities one can easily infer """

7. Include examples

In [13]:
style = """Ensure the inclusion of examples if not already provided, with the quantity ranging from 2 to 5, wherever feasible."""

8. Visual Representation --- Not feasible
9. Highlight practical applications or real-world relevance of the concepts covered in the text.
10. tag - theoritical  --- numerical -- 

#### Instructions

In [14]:
instructions = (
    "Rephrase the provided text such that it is Concise while retaining its original meaning. \n"
    "The generated rephrased answer word length should be around that of the given content. \n"
    "Ensure the rephrased response remains in the context of the provided content without introducing unrelated information. \n"
)

#### Ouput_Format

In [15]:
output_format = (
    "Provide the output in JSON format with the following keys:\n"
    "- `input_text`: The original input text.\n"
    "- `rephrased_text_response`: The rephrased version of the input text."
    "Ensure there is no additional text, commentary, or formatting outside the JSON structure."

)

<h3 style="color: green">TEMPLATE STRING</h3>

template_string = (
    "You're an Educational AI model designed to Rephrase text, which has all the knowledge of the undergraduate Bachelors of Technology Course."
    "Your task is rephrase the given text based on the your {style} --<p style ="color : red"> yaha pe bhi test karna hai,</p> the context, and the given guidelines. \n\n"
    "Your writing style : {style}. "
    "Follow these instructions: {instructions}\n"
    "Text: ```{text}```\n"
    "{output_format}"
)

In [16]:
template_string = (
    "You're an Educational AI model designed to Rephrase text, which has all the knowledge of the undergraduate Bachelors of Technology Course."
    "Your task is rephrase the given text based on the your style, the context, and the given guidelines. \n\n"
    "Your writing style : {style}. \n\n"
    "Follow these instructions to generate a good quality rephrased answer: {instructions}\n"
    "Given content to be used for rephrasing: ```{text}```\n\n"
    "{output_format} \n\n"
    "Ensure that you follow all the above guidelines and rules to generate the rephrased content. Any deviation from these guidelines \
    will result in rephrased content not meeting the education standards required for this exercise."

)

In [17]:
prompt_template_1 = ChatPromptTemplate.from_template(template_string)


In [18]:
print(prompt_template_1)

input_variables=['instructions', 'output_format', 'style', 'text'] input_types={} partial_variables={} messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['instructions', 'output_format', 'style', 'text'], input_types={}, partial_variables={}, template="You're an Educational AI model designed to Rephrase text, which has all the knowledge of the undergraduate Bachelors of Technology Course.Your task is rephrase the given text based on the your style, the context, and the given guidelines. \n\nYour writing style : {style}. \n\nFollow these instructions to generate a good quality rephrased answer: {instructions}\nGiven content to be used for rephrasing: ```{text}```\n\n{output_format} \n\nEnsure that you follow all the above guidelines and rules to generate the rephrased content. Any deviation from these guidelines     will result in rephrased content not meeting the education standards required for this exercise."), additional_kwargs={})]


<!-- messages = prompt_template_1.format_messages(text=input_text, Task=Task, Role=Role, style=style, instructions=instructions, output_format=output_format)
 -->

<h1 style = "color : blue; font-family : verdana">RUNNING ON THE FINAL SAMPLE OF PME DATA</h1>

## Importing the csv

In [20]:
import pandas as pd

In [21]:
df = pd.read_csv('genSum.csv')

In [22]:
df.head()

Unnamed: 0,year,branch,subject,unit,summary
0,4,Computer Science Engineering,PME,Unit 1,"{'qid': 'que1', 'summarizedAnswer': '{\n ""s..."
1,4,Computer Science Engineering,PME,Unit 1,"{'qid': 'que2', 'summarizedAnswer': '{\n ""s..."
2,4,Computer Science Engineering,PME,Unit 1,"{'qid': 'que3', 'summarizedAnswer': '{\n ""s..."
3,4,Computer Science Engineering,PME,Unit 1,"{'qid': 'que4', 'summarizedAnswer': '```json\n..."
4,4,Computer Science Engineering,PME,Unit 1,"{'qid': 'que5', 'summarizedAnswer': '```json\n..."


## Data cleaning and Preprocessing

In [23]:
# converting the string to dictionary of the summary column
import ast
df['summarized_ans'] = df['summary'].apply(ast.literal_eval)

In [24]:
df.head()

Unnamed: 0,year,branch,subject,unit,summary,summarized_ans
0,4,Computer Science Engineering,PME,Unit 1,"{'qid': 'que1', 'summarizedAnswer': '{\n ""s...","{'qid': 'que1', 'summarizedAnswer': '{  ""su..."
1,4,Computer Science Engineering,PME,Unit 1,"{'qid': 'que2', 'summarizedAnswer': '{\n ""s...","{'qid': 'que2', 'summarizedAnswer': '{  ""su..."
2,4,Computer Science Engineering,PME,Unit 1,"{'qid': 'que3', 'summarizedAnswer': '{\n ""s...","{'qid': 'que3', 'summarizedAnswer': '{  ""su..."
3,4,Computer Science Engineering,PME,Unit 1,"{'qid': 'que4', 'summarizedAnswer': '```json\n...","{'qid': 'que4', 'summarizedAnswer': '```json {..."
4,4,Computer Science Engineering,PME,Unit 1,"{'qid': 'que5', 'summarizedAnswer': '```json\n...","{'qid': 'que5', 'summarizedAnswer': '```json {..."


In [25]:
print(len(df.summarized_ans))

18


In [26]:
for i in df.summarized_ans:
    print(type(i))

<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'dict'>


## making a list of all these dicts of summarized ans to pass it as input text for the model

In [27]:
summarizedAnswerList = []
for i in df.summarized_ans:
    summarizedAnswerList.append(i['summarizedAnswer'])

In [28]:
print(summarizedAnswerList)

['{\n    "summarized_answer": "An entrepreneur is a business founder who identifies opportunities, assumes risks, and efficiently uses resources to generate profit and contribute to economic growth. Key characteristics include: \\n\\n- **Innovation**: Introducing new or improved products/services.\\n- **Risk-taking**: Willingness to accept financial, professional, and personal risks.\\n- **Vision and Goal Orientation**: Clear long-term goals drive decision-making.\\n- **Resilience**: Overcoming hurdles and staying motivated.\\n- **Self-Confidence**: Belief in achieving goals and mobilizing resources.\\n- **Flexibility**: Adapting strategies to changing market conditions.\\n- **Leadership Skills**: Inspiring teams and creating a constructive work environment.\\n- **Persistence**: Continually striving to achieve goals despite obstacles.\\n- **Opportunity Identification**: Seeing prospects where others see problems.\\n- **Financial Acumen**: Understanding budgeting, forecasting, and resou

In [29]:
print(len(summarizedAnswerList))

18


## getting REPHRASED outputs

In [38]:
def parse_Response(response):
   if isinstance(response, str):
       # print(response)
       start_index = response.find("{")
       end_index = response.rfind("}")
       if start_index != -1 and end_index != -1:
           valid_json_content = response[start_index : end_index + 1]
           try:
               JSON_response = json.loads(valid_json_content.replace("\n", ""))
               # append_list_to_file(JSON_response)
               return JSON_response
           except json.JSONDecodeError as e:
               print(f"Error decoding JSON response: {e.__class__.__name__} - {e}\n\n Still trying to work on particular exceptions ...")
               print("Actual Content: ", valid_json_content)
       else:
           print("No valid JSON content found in the response.")
       # time.sleep(50)
   elif isinstance(response, dict):
       return response
   else:
       print("No response message found", type(response))


### database 

1. Motor
2. Pymongo

In [30]:
rephrasedList = []
for summary in summarizedAnswerList:
    particular_message = prompt_template_1.format_messages(text=summary, Task=Task, Role=Role, style=style, instructions=instructions, output_format=output_format)
    response = model.invoke(particular_message)
    res_content = response.content
    rephrasedList.append(res_content)

In [32]:
response

AIMessage(content='```json\n{\n    "input_text": "Problems faced by EDPs include: \\n\\n- **Limited Awareness**: Rural individuals often lack awareness of EDPs, missing out on supportive services for business enhancement.\\n- **Inadequate Funding**: Budget constraints limit training quality, reach, and access to digital tools, affecting program effectiveness.\\n- **Lack of Skills**: Skills provided may not align with industry demands, lacking emerging trends and technologies.\\n- **Poor After-Training Support**: Insufficient post-training support leads to challenges in sustaining businesses due to lack of mentoring and advisory services.\\n- **Regulatory Challenges**: New entrepreneurs face compliance, licensing, and tax issues, especially those from rural backgrounds.\\n- **Negligence of Customized Training**: Standardized training fails to meet specific needs, such as those of women balancing work and family or rural entrepreneurs needing local market information.",\n    "rephrased_t

In [31]:
print(rephrasedList)

['```json\n{\n    "input_text": "An entrepreneur is a business founder who identifies opportunities, assumes risks, and efficiently uses resources to generate profit and contribute to economic growth. Key characteristics include: \\n\\n- **Innovation**: Introducing new or improved products/services.\\n- **Risk-taking**: Willingness to accept financial, professional, and personal risks.\\n- **Vision and Goal Orientation**: Clear long-term goals drive decision-making.\\n- **Resilience**: Overcoming hurdles and staying motivated.\\n- **Self-Confidence**: Belief in achieving goals and mobilizing resources.\\n- **Flexibility**: Adapting strategies to changing market conditions.\\n- **Leadership Skills**: Inspiring teams and creating a constructive work environment.\\n- **Persistence**: Continually striving to achieve goals despite obstacles.\\n- **Opportunity Identification**: Seeing prospects where others see problems.\\n- **Financial Acumen**: Understanding budgeting, forecasting, and res

In [33]:
count = 1
for i in rephrasedList:
    print(count)
    count += 1
    print(i[:10])
    print(i[-10:])
    print("END \n")

1
```json
{

ss."
}
```
END 

2
```json
{

ly."
}
```
END 

3
```json
{

ds."
}
```
END 

4
```json
{

ks."
}
```
END 

5
```json
{

my."
}
```
END 

6
```json
{

ss."
}
```
END 

7
```json
{

ls."
}
```
END 

8
```json
{

es."
}
```
END 

9
```json
{

es."
}
```
END 

10
```json
{

rs."
}
```
END 

11
```json
{

es."
}
```
END 

12
```json
{

as."
}
```
END 

13
```json
{

es."
}
```
END 

14
```json
{

rs."
}
```
END 

15
```json
{

es."
}
```
END 

16
```json
{

es."
}
```
END 

17
```json
{

es."
}
```
END 

18
```json
{

ts."
}
```
END 



from above it can be seen that there are some outputs which include the substring ```json in the starting \
this could affect the output parsed quality

In [34]:
back_count = 0
for i in rephrasedList:
    if i[0] == '`':
        back_count += 1

print(back_count)

18


In [35]:
ind = 0
for i in rephrasedList[0]:
    if i == '{' :
        break
    else:
        ind+= 1

print(ind)

8


## parsing the rephrased outputs into json format

#### First I will remove the substring ```json from starting and from last if it is present

In [36]:
rephrasedList_2 = []
for res in rephrasedList:
    if res[0] == '`':
        rephrasedList_2.append(res[8:-4])
    else:
        rephrasedList_2.append(res)

In [37]:
for i in rephrasedList_2:
    print(i[:10])
    print(i[-10:])

{
    "inp
siness."
}
{
    "inp
tively."
}
{
    "inp
emands."
}
{
    "inp
 risks."
}
{
    "inp
conomy."
}
{
    "inp
uccess."
}
{
    "inp
 goals."
}
{
    "inp
ources."
}
{
    "inp
ctives."
}
{
    "inp
eavors."
}
{
    "inp
cesses."
}
{
    "inp
 ideas."
}
{
    "inp
 sales."
}
{
    "inp
eavors."
}
{
    "inp
lenges."
}
{
    "inp
prises."
}
{
    "inp
stries."
}
{
    "inp
sights."
}


In [None]:
print(len(rephrasedList_2))

In [None]:
print(rephrasedList_2[0])

### now I will parse this cleaned strings into json format

In [None]:
final_list = []

In [None]:
count = 0

for rephrasedAns in rephrasedList_2:
    try:
        parsed_rephrased_ans = json.loads(rephrasedAns)
        final_list.append(parsed_rephrased_ans)
    except json.JSONDecodeError as e:
        print("Failed to parse JSON", e)
        count += 1

print(count)

In [None]:
print(final_list)

In [None]:
for i in final_list:
    print(type(i))

In [None]:
summAnsList = []
rephrasedAnsList = []
for i in final_list:
    summAnsList.append(i['input_text'])
    rephrasedAnsList.append(i['rephrased_text_response'])

In [None]:
final_df = pd.DataFrame({'Summary' : summAnsList,
                         'Rephrased Ans' : rephrasedAnsList
                        })

In [None]:
final_df.head()

In [None]:
final_df.to_csv('sample_1.csv', index=False)