In [2]:
import pandas as pd
df = pd.read_csv('data_preprocessed.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,_id,text,label,label_int,belongs_to,is_processed,gemini_reason,gemini_confidence,gemini_prediction
0,0,f7700f82-5425-4dcc-965f-13b20b13b19d,"We present two experiments, one small prelimin...",m,2,,True,This paragraph describes the experimental setu...,This paragraph describes the experimental setu...,2.0
1,1,393fb8b1-5b1b-4ef0-8499-fd8303f84624,The two adversarial cases considered in this w...,m,2,,True,This paragraph describes the specific adversar...,This paragraph describes the specific adversar...,2.0
2,2,a9aa6217-a201-4463-a5e6-08fe23bfc901,With modern technology having reached a stage ...,i,1,,True,The paragraph sets the stage for the research ...,The paragraph sets the stage for the research ...,0.0
3,3,276590e3-98b1-4b78-b3b9-d3a3dd75c6cc,Systems are generally considered as complex if...,r,3,,True,The paragraph discusses the implications of th...,The paragraph discusses the implications of th...,1.0
4,4,798d3d4c-278f-4749-a8b7-e2dbfd64e6e9,Near-infrared detection experimentation will h...,w,4,,True,The paragraph discusses potential future resea...,The paragraph discusses potential future resea...,1.0


In [5]:
df['gemini_prediction'].value_counts()

gemini_prediction
 2.0    7587
 0.0    5298
 3.0    4850
 1.0    4339
 4.0    2763
-1.0      93
Name: count, dtype: int64

In [6]:
df['label_int'].value_counts()

label_int
2    4992
1    4992
3    4992
4    4992
0    4992
Name: count, dtype: int64

In [7]:
from dotenv import load_dotenv
from langchain.globals import set_llm_cache
from langchain_community.cache import InMemoryCache
from langchain_core.prompts import PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.output_parsers import PydanticOutputParser
from enum import Enum, IntEnum


class ParagraphType(IntEnum):
    i = 0 
    d = 1
    m = 2
    r = 3
    w = 4
    o = -1

class Prediction(BaseModel):
    belongs_to:ParagraphType = Field(description="An integer representing the type of paragraph. 0 for Introduction, 1 for Discussion, 2 for Methodology, 3 for  Results, 4 for Related Work,  -1 for Outlier.")
    reason:str = Field(description="The reason for the prediction.")




    

load_dotenv()

#os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google AI API key: ")

print("init model-----------------------------")
model:ChatGoogleGenerativeAI = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=3,
generation_config={"response_mime_type": "application/json"}
    
)
#set_llm_cache(InMemoryCache())

parser = PydanticOutputParser(pydantic_object=Prediction)

prompt = PromptTemplate(
    template="You're an Expert in IMRAD research papers.\n Given the paragraph extracted from an IMRAD paper. Determine to which IMRAD section it belongs to.\n Paragraph: \n {text}\n {format_instructions} \n",
    input_variables=["text"],
    partial_variables={"format_instructions": parser.get_format_instructions()},

)


chain = prompt | model | parser

AttributeError: module 'sqlalchemy.sql' has no attribute 'base'

In [68]:
print(parser.get_format_instructions())

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"properties": {"belongs_to": {"description": "An integer representing the type of paragraph. 0 for Introduction, 1 for Discussion, 2 for Methodology, 3 for  Results, 4 for Related Work,  -1 for Outlier.", "allOf": [{"$ref": "#/definitions/ParagraphType"}]}, "reason": {"title": "Reason", "description": "The reason for the prediction.", "type": "string"}}, "required": ["belongs_to", "reason"], "definitions": {"ParagraphType": {"title": "ParagraphType", "description": "An enumeration.", "enum": [0, 1, 2, 3, 4, -1], "type": "integer"}}}
```


In [69]:
print(prompt.invoke({'text':df['text'][0]}))

text='You\'re an Expert in IMRAD research papers.\n Given the paragraph extracted from an IMRAD paper. Determine to which IMRAD section it belongs to.\n Paragraph: \n We present two experiments, one small preliminary experiment trained on two\nmonths of data to compare our approach to Bayesian networks; and another\nlarge-scale experiment trained on two and a half years of data and tested on\nthree months of data.This distinction is made due to the conclusive\nresults of the preliminary experiment, and computation considerations for the\ntraining of Bayesian networks.\nOur model is coded in PyTorch, and will be available online in the final\nversion of the paper, as described in Appendix\xa0REF .\n\n The output should be formatted as a JSON instance that conforms to the JSON schema below.\n\nAs an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}\nthe object {"foo": ["

In [56]:
df['text'][0]

'We present two experiments, one small preliminary experiment trained on two\nmonths of data to compare our approach to Bayesian networks; and another\nlarge-scale experiment trained on two and a half years of data and tested on\nthree months of data.This distinction is made due to the conclusive\nresults of the preliminary experiment, and computation considerations for the\ntraining of Bayesian networks.\nOur model is coded in PyTorch, and will be available online in the final\nversion of the paper, as described in Appendix\xa0REF .\n'

In [71]:
print(chain.invoke({'text':df['text'][1]}))

belongs_to=<ParagraphType.m: 2> reason='This paragraph describes the specific adversarial cases (injection of ADD and JMP instructions) and their respective impacts on CPU cycles. This level of technical detail strongly suggests it belongs to the Methodology section, where the experimental setup and procedures are explained.'


In [73]:
#df['belongs_to'] = None
#df['is_processed'] = False
#df['gemini_reason'] = None
#df['gemini_confidence'] = None

In [89]:
df['is_processed'].value_counts()

is_processed
True     24930
False       30
Name: count, dtype: int64

In [80]:
# save the data
df.to_csv('data_preprocessed.csv', index=False)

In [90]:
import time


for i in range(0,len(df)):
    
    
    if(df['is_processed'][i] == True):
        print(f'row {i} already processed ............................skipping')
        continue
    try:
        result:Prediction = chain.invoke({'text':df['text'][i]}) 
        print(f'row {i} processed')
        print(result)
         
        df.at[i,'is_processed'] = True
        df.at[i,'gemini_prediction'] =  result.belongs_to
        df.at[i,'gemini_reason'] =result.reason 

    except Exception as e:
        print(f'error processing row {i}')
        print(e)
        continue
        # use at 
    if(  i%100 == 0):
        print(f'processed {i} rows')
        print('saving data')
        df.to_csv('data_preprocessed.csv',index=False)


row 0 already processed ............................skipping
row 1 already processed ............................skipping
row 2 already processed ............................skipping
row 3 already processed ............................skipping
row 4 already processed ............................skipping
row 5 already processed ............................skipping
row 6 already processed ............................skipping
row 7 already processed ............................skipping
row 8 already processed ............................skipping
row 9 already processed ............................skipping
row 10 already processed ............................skipping
row 11 already processed ............................skipping
row 12 already processed ............................skipping
row 13 already processed ............................skipping
row 14 already processed ............................skipping
row 15 already processed ............................skipping
row 16 already pro