In [2]:
import json
import os
import time
from string import Template

import google.generativeai as genai
# Read the Gemini API key from the environment rather than committing it to the
# notebook. A missing variable fails loudly here (KeyError) instead of at the
# first request.
# NOTE(review): the key that used to be hardcoded on this line has been exposed
# and should be revoked and rotated.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
model = genai.GenerativeModel('gemini-pro')

# Helper functions: build the prompt, query the model, and parse its JSON reply
def generate_prompt(introduction):
  """Build the move / sub-move classification prompt for one introduction.

  Args:
    introduction: Text of the introduction to analyse. It is inserted verbatim
      in place of the ``$introduction`` placeholder.

  Returns:
    The full prompt string: the task description, the list of moves and
    sub-moves, the introduction text and the expected JSON output schema.
  """
  return Template("""
Task: Analyze the following text excerpt from an IMRAD introduction and identify each sentence's corresponding Move and Sub-move:

Move 1: Establishing a Research Territory
a) Show that the research area is important, problematic, or relevant in some way
b) Introduce and review previous research in the field
Move 2: Establishing a Niche
a) Claim something is wrong with the previous research
b) Highlight a gap in the field
c) Raise a question where research in field is unclear
d) Contribute something additional to the field
Move 3: Occupying the Niche
a) Outline your purposes and state the nature of your research
b) State your hypothesis or research question you seek to answer
c) Share your findings (not typical for introduction)
d) Elaborate on the value of your research
e) Outline the structure that the research paper will follow
Input:

$introduction

Output Schema:
{
"output_sentences": [
  {
    "sentence": "Sentence 1 text here",
    "move": "Move number between (1,2,3)",
    "sub_move": "Sub-move letter (Move 1 : a,b . Move 2: a,b,c,d . Move 3 : a,b,c,d,e)"
  },

  ... and so on for all sentences

]
}

Important: return the json output only without any extra text.

  """).substitute(introduction=introduction)

def generate_move_sub_moves_raw(introduction):
  """Ask the model to label one introduction and return its raw text reply.

  Args:
    introduction: Text of the introduction to analyse.

  Returns:
    The text of the model's response, or the sentinel string "Error" if the
    request (or reading the response text) raised an exception.
  """
  prompt = generate_prompt(introduction)
  try:
    completion_moves = model.generate_content(prompt)
    return completion_moves.text
  except Exception as e:
    # Best-effort by design: report the failure and hand back the sentinel so
    # a long batch run keeps going instead of aborting on one bad request.
    print(e)
    return "Error"

def get_json_parsable_string(raw_text):
  """Extract the outermost ``{...}`` span from a model reply.

  Anything before the first "{" and after the last "}" (for example a
  Markdown code fence around the JSON) is discarded.

  Args:
    raw_text: Raw text returned by the model.

  Returns:
    The substring from the first "{" to the last "}" inclusive, or an empty
    string when the text contains no such span.
  """
  start_index = raw_text.find("{")
  end_index = raw_text.rfind("}")
  # No opening brace, or no closing brace after it: nothing that could be JSON.
  if start_index == -1 or end_index < start_index:
    return ""
  # +1 because the slice end is exclusive and the closing brace must be kept.
  return raw_text[start_index:end_index + 1]

def parse_to_json(raw_text):
  """Decode the JSON object embedded in a raw model reply.

  Returns the decoded object, or the string "Error" when the input is already
  the "Error" sentinel or when decoding fails (the exception is printed).
  """
  failure = "Error"
  if raw_text == failure:
    # Upstream request already failed; propagate the sentinel unchanged.
    return failure

  candidate = get_json_parsable_string(raw_text)
  try:
    decoded = json.loads(candidate)
  except Exception as err:
    print(err)
    return failure
  else:
    return decoded

In [3]:

import pandas as pd

# Load the introductions to label; the path is relative to the notebook's
# working directory. The loop below reads each row's 'text' column.
df = pd.read_csv('./introductions_training.csv')

In [3]:
# Preview the first rows of the loaded introductions.
df.head()

Unnamed: 0.1,Unnamed: 0,text
0,0,Food recommendation has become an essential me...
1,1,We introduce and discuss the FEO that extends ...
2,9,Photolithography is one of the most important ...
3,10,With the technological development of the semi...
4,11,"To further improve the performance of SMC, fra..."


In [4]:
# Smoke test: send the first introduction to the model and keep the raw reply.
raw_text = generate_move_sub_moves_raw(df['text'][0])

# Prints the decoded result, or the string "Error" if the request or parsing failed.
print(parse_to_json(raw_text))

{'output_sentences': [{'sentence': 'Food recommendation has become an essential method to help users adopt healthy dietary habits\xa0[1]}.', 'move': '1', 'sub_move': 'a'}, {'sentence': 'The task of computationally providing food and diet recommendations is challenging, as thousands of food items/ingredients have to be collected, combined in innovative ways, and reasoned over\xa0[2]}.', 'move': '1', 'sub_move': 'a'}, {'sentence': 'Furthermore, there are many facets to the foods we consume, such as our ethnic identities, socio-demographic backgrounds, life-long preferences, all of which can inform our perspectives about the food we choose to consume to lead healthy lives.', 'move': '1', 'sub_move': 'a'}, {'sentence': 'Food recommendation can get even more complicated when the food options available to an individual are further constrained because of a group setting\n(e.g., the seafood allergy of one family member may preclude recipes including shrimp to be recommended to the whole group)

In [4]:
# prompt: go through the df rows , generate the moves , submoves store them in a results list

def _save_checkpoint(index, results, not_processed):
  """Write the current batch and the cumulative failure list, tagged with `index`."""
  with open(f'./processed_{index}.json', 'w') as file:
    json.dump(results, file)
  # save meta data about the loop progress in a progress-meta-data-{index}.json file
  with open(f'./progress-meta-data-{index}.json', 'w') as file:
    json.dump({'not_processed': not_processed}, file)


results = []          # parsed outputs gathered since the last checkpoint
not_processed = []    # indexes whose request or parsing failed (cumulative)
i = 0                 # rows handled in this run; drives the checkpoint cadence
start_index = 37094   # resume point: rows with a smaller index are skipped
last_done_index = None   # last row that was fully handled
last_saved_index = None  # last row at which a checkpoint was written

try:
  for index, row in df.iterrows():
    if(index < start_index):
      continue
    raw_text = generate_move_sub_moves_raw(row['text'])
    parsed_json = parse_to_json(raw_text)

    if parsed_json != "Error":
      parsed_json['introduction'] = row['text']
      results.append(parsed_json)
    else:
      not_processed.append(index)
    last_done_index = index

    # save the results array into the drive as a json object with the name processed_{index}.json every 30 indexes
    print(f"introduction number {index} is processed , unproccessed {len(not_processed)}")
    if i % 30 == 0:
      _save_checkpoint(index, results, not_processed)
      last_saved_index = index
      results = []
    i += 1
    # Pause between requests to avoid hammering the API.
    time.sleep(3)
finally:
  # Flush the tail: without this, everything gathered after the last
  # multiple-of-30 checkpoint is lost when the loop ends or is interrupted
  # (e.g. KeyboardInterrupt). `results` is left populated for inspection.
  if last_done_index is not None and last_done_index != last_saved_index:
    _save_checkpoint(last_done_index, results, not_processed)




introduction number 37094 is processed , unproccessed 0


KeyboardInterrupt: 