In [1]:
import pandas as pd
df = pd.read_csv('data_preprocessed.csv')

In [10]:
df.head()

Unnamed: 0.1,Unnamed: 0,_id,text,label,label_int,belongs_to,is_processed,gemini_reason,gemini_confidence,gemini_prediction
0,0,f7700f82-5425-4dcc-965f-13b20b13b19d,"We present two experiments, one small prelimin...",m,2,,True,This paragraph describes the experimental setu...,This paragraph describes the experimental setu...,2.0
1,1,393fb8b1-5b1b-4ef0-8499-fd8303f84624,The two adversarial cases considered in this w...,m,2,,True,This paragraph describes the specific adversar...,This paragraph describes the specific adversar...,2.0
2,2,a9aa6217-a201-4463-a5e6-08fe23bfc901,With modern technology having reached a stage ...,i,1,,True,The paragraph sets the stage for the research ...,The paragraph sets the stage for the research ...,0.0
3,3,276590e3-98b1-4b78-b3b9-d3a3dd75c6cc,Systems are generally considered as complex if...,r,3,,True,The paragraph discusses the implications of th...,The paragraph discusses the implications of th...,1.0
4,4,798d3d4c-278f-4749-a8b7-e2dbfd64e6e9,Near-infrared detection experimentation will h...,w,4,,True,The paragraph discusses potential future resea...,The paragraph discusses potential future resea...,1.0


In [11]:
df['gemini_prediction'].value_counts()

gemini_prediction
 2.0    7587
 0.0    5298
 3.0    4850
 1.0    4339
 4.0    2763
-1.0      93
Name: count, dtype: int64

In [5]:
df_filtered = df[df['gemini_prediction'] != -1]

In [12]:
df_filtered['gemini_prediction'].value_counts()


gemini_prediction
2.0    7587
0.0    5298
3.0    4850
1.0    4339
4.0    2763
Name: count, dtype: int64

In [13]:
df = df_filtered

In [27]:

from dotenv import load_dotenv
from langchain.globals import set_llm_cache
from langchain_community.cache import InMemoryCache
from langchain_core.prompts import PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.output_parsers import PydanticOutputParser
from enum import Enum, IntEnum





class Prediction(BaseModel):
    text: str = Field(title="Generated Paragraph", description="The Paragraph generated by the model")
    

load_dotenv()

#os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google AI API key: ")

print("init model-----------------------------")
model:ChatGoogleGenerativeAI = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    temperature=1,
    max_tokens=None,
    timeout=None,
    max_retries=3,
generation_config={"response_mime_type": "application/json"}
    
)
#set_llm_cache(InMemoryCache())

parser = PydanticOutputParser(pydantic_object=Prediction)

prompt = PromptTemplate(
    template="You're an Expert in IMRAD research papers.\n Given the paragraph extracted from an IMRAD paper and its corresponding class (the IMRAD section  the paragraph belongs to). Generate a similar paragraph belonging to the same class.\n Paragraph: \n {text}\n. Paragraph class: {section} \n. {format_instructions} \n",
    input_variables=["text","section"],
    partial_variables={"format_instructions": parser.get_format_instructions()},

)


chain = prompt | model | parser

def augment_paragraph(paragraph:str,section_id:int):
    dict = {
    0:"Introduction",
    1:"Discussion",
    2:"Methodology",
    3:"Results",
    4:"Related Work"
    }
    try:
        return chain.invoke({'section':dict[section_id],'text':paragraph})
    except: 
      return -1

init model-----------------------------


In [28]:
df['gemini_prediction'].value_counts()

gemini_prediction
2.0    7587
0.0    5298
3.0    4850
1.0    4339
4.0    2763
Name: count, dtype: int64

In [30]:
import pandas as pd
import os

# Path to the CSV file
csv_file_path = 'augmented_data.csv'

# Check if the file already exists
if os.path.exists(csv_file_path):
    # Load the existing data
    augmented_df = pd.read_csv(csv_file_path)
    # Determine the last processed index
    last_processed_index = augmented_df.index[-1] if not augmented_df.empty else -1
else:
    # Create an empty DataFrame if the file doesn't exist
    augmented_df = pd.DataFrame(columns=['gemini_prediction', 'text'])
    last_processed_index = -1

# Initialize an empty list to collect new rows
augmented_data = []

# Iterate over the DataFrame starting from the next index after the last processed one
for index, row in df.iterrows():
    if index <= last_processed_index:
        continue  # Skip already processed rows

    class_label = row['gemini_prediction']
    text = row['text']

    augmented_text = augment_paragraph(text, class_label)

    # Append the dictionary directly to the list
    augmented_data.append({'gemini_prediction': class_label, 'text': augmented_text})

    print({'index': index, 'class': class_label, 'augmented_text': augmented_text})

    if index % 50 == 0:
        # Convert the list to a DataFrame and append it to the existing CSV
        new_augmented_df = pd.DataFrame(augmented_data, columns=['gemini_prediction', 'text'])
        new_augmented_df.to_csv(csv_file_path, mode='a', header=False, index=False)
        # Clear the list after saving
        augmented_data = []

# Save any remaining data after the loop
if len(augmented_data) > 0:
    new_augmented_df = pd.DataFrame(augmented_data, columns=['gemini_prediction', 'text'])
    new_augmented_df.to_csv(csv_file_path, mode='a', header=False, index=False)


{'index': 0, 'class': 2.0, 'augmented_text': Prediction(text='To assess the efficacy of our proposed methodology, we conducted two separate experiments. The first experiment, a pilot study, involved a smaller dataset spanning a period of six months. This pilot study served to validate the feasibility of our approach and to compare its performance against traditional machine learning techniques. The second experiment, a larger-scale evaluation, utilized a dataset encompassing a period of three years. This experiment aimed to rigorously test the generalizability and robustness of our methodology under real-world conditions. The distinction between these two experiments was driven by the need to ensure comprehensive evaluation across different data scales and complexities.')}
{'index': 1, 'class': 2.0, 'augmented_text': -1}
{'index': 2, 'class': 0.0, 'augmented_text': Prediction(text='The increasing popularity of mobile devices and their integration into everyday life has created a unique

Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised DeadlineExceeded: 504 Deadline Exceeded.


{'index': 741, 'class': 0.0, 'augmented_text': -1}
{'index': 742, 'class': 3.0, 'augmented_text': -1}
{'index': 743, 'class': 0.0, 'augmented_text': Prediction(text="This paper addresses the challenge of efficiently updating large-scale language models across distributed training environments. We introduce a novel approach, termed Adaptive Gradient Compression (AGC), which dynamically adjusts the compression level of gradients based on the model's sensitivity to specific updates.  We demonstrate the effectiveness of AGC through extensive experimentation on various natural language processing tasks, including machine translation and text summarization, showcasing significant reductions in communication overhead while maintaining performance parity with standard gradient descent.")}
{'index': 744, 'class': 0.0, 'augmented_text': Prediction(text='This paper presents a novel approach for image segmentation based on a deep learning framework. The proposed method leverages the power of convo