<a href="https://colab.research.google.com/github/rishantmallick/conference-predictor/blob/main/KDSH_conference_selector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install langchain
!pip install langgraph
!pip install  openai
!pip install PyPDF2
!pip install scikit-learn
!pip install -U langchain-tavily
!pip install textstat
!pip install -U langchain-community
!pip install arxiv --upgrade
!pip install Pydantic
!pip install langchain_core
!pip install langchain_openai
!pip install serpapi
!pip install google-search-results
!pip install scikit-learn
!pip install gensim
!pip install nltk
!pip install pypdf
!pip install numpy
!pip install --force-reinstall --no-cache-dir numpy gensim
!pip install faiss-cpu
!pip install tiktoken
!pip install langchain_google_vertexai
!pip install google-generativeai
!pip install langchain_google_genai

In [None]:
import openai
import os
from openai import OpenAI
import PyPDF2
import re
import langchain
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_google_genai import ChatGoogleGenerativeAI
import seaborn as sns
import matplotlib.pyplot as plt
from langchain.text_splitter import RecursiveCharacterTextSplitter
from google.colab import userdata
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import accuracy_score,f1_score,confusion_matrix
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_tavily import TavilySearch
from dotenv import load_dotenv
# Load variables from a local .env file, if one exists, into the process environment.
load_dotenv()
# Copy the Google and Tavily API keys from google.colab `userdata` into environment
# variables (presumably where the Gemini and Tavily clients created later look for them).
os.environ["GOOGLE_API_KEY"] = userdata.get('GOOGLE_API_KEY_1')
os.environ["TAVILY_API_KEY"] = userdata.get('TAVILY_KEY')
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Download the NLTK resources: the stop-word list, the punkt tokenizer data and WordNet.
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
import pydantic
import json
import faiss
from pydantic import BaseModel,Field
from langchain_core.tools import tool
from langgraph.graph import StateGraph, START, END
from langgraph.graph.message import add_messages
from langgraph.prebuilt import ToolNode, tools_condition
from langchain_openai  import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_core.messages import BaseMessage
from typing import Annotated, Any
from typing_extensions import TypedDict
from langchain.vectorstores import FAISS

# Chat model shared by both graphs; temperature 0 is requested for the scoring prompts.
llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro", temperature=0)
# Tavily web-search tool, configured for up to five results per query.
tavily = TavilySearch(max_results=5,search_depth="advanced",format="json")

# Tools exposed to the model; only the Tavily search tool is registered.
# NOTE(review): `serpapi` is imported here but not used in the visible code.

import serpapi
tools = [tavily]
from langchain_core.messages import AIMessage,ToolMessage
# Model variant that can emit tool calls for the tools listed above.
llm_with_tools=llm.bind_tools(tools=tools)
class Researchpaper(TypedDict):
  """State dictionary passed between the nodes of the paper-scoring graph."""
  # Message history; annotated with LangGraph's `add_messages` so updates are merged into the list.
  messages:Annotated[list,add_messages]
  # Full text of the paper; sent as the user message by `initial_analysis`.
  paper_content: str
  # Parsed JSON tool output stored by `process_results`.
  search_results: list
  # Score name -> float mapping produced by `finalscore`.
  scores: dict
  # Initialised to " " by callers; not read anywhere in the visible code.
  red_result : str
# Builder for the scoring graph; nodes and edges are added further down the file.
graph_builder = StateGraph(Researchpaper)

def parse_scores(response: str) -> dict:
  """Parse ``name: value`` lines from the scoring reply into a dict of floats.

  Keys and values are stripped of surrounding whitespace, so ``total : 61.5``
  (the spelling used in the scoring prompt) yields the key ``"total"``.
  Lines that contain no colon, or whose value is not a number, are skipped
  instead of aborting the whole parse.

  Args:
    response: Raw text returned by the model.

  Returns:
    Mapping of score name to float value, in the order the lines appear.
  """
  scores = {}
  for line in response.strip().splitlines():
    # partition() splits on the first colon only and tolerates "key : value".
    key, sep, value = line.partition(":")
    if not sep:
      continue
    try:
      scores[key.strip()] = float(value.strip())
    except ValueError:
      # Non-numeric line (heading, explanation, blank value): ignore it.
      continue
  return scores

def initial_analysis(paper:Researchpaper)->Researchpaper:
  """Ask the tool-enabled model to propose search queries for the paper.

  Sends the paper text together with a fixed system instruction to
  `llm_with_tools`, prints the reply and stores it under ``paper["messages"]``.

  Args:
    paper: Graph state; only ``paper_content`` is read.

  Returns:
    The same state dict, with ``messages`` set to the model reply.
  """
  system_prompt = """Analyze the research paper and generate search queries to:
        1. Find similar papers for methodology verification
        2. Find cited references for fact-checking
        3. Find competing approaches for comparison"""
  chat = [
      {"role": "system", "content": system_prompt},
      {"role": "user", "content": paper["paper_content"]},
  ]

  reply = llm_with_tools.invoke(chat)
  print(reply)

  # Hand the reply back through the state so the next node can act on it.
  paper["messages"] = reply
  return paper

def process_results(paper:Researchpaper)->Researchpaper:
    """Copy the search tool's JSON output from the last message into the state.

    Args:
        paper: Graph state; the last entry of ``messages`` is inspected.

    Returns:
        The same state dict. ``search_results`` is replaced by the decoded
        JSON when the last message is a ``ToolMessage``; otherwise the state
        is returned untouched after printing a notice.
    """
    latest = paper["messages"][-1]
    print(latest)

    # Only tool output carries search results; anything else is reported and skipped.
    if not isinstance(latest, ToolMessage):
        print("Unexpected message type:", type(latest))
        return paper

    # The tool result is stored as a JSON string in the message content.
    paper["search_results"] = json.loads(latest.content)
    return paper



def finalscore(state:Researchpaper)->Researchpaper:
    """Ask the model to score the paper and return the parsed scores.

    The system message embeds the collected search results; the user message
    carries the paper text, the search results again, and the exact output
    format expected by `parse_scores`.

    Args:
        state: Graph state; ``paper_content`` and ``search_results`` are read.

    Returns:
        A partial state update of the form ``{"scores": {...}}``.
    """
    system_text = f"""Analyze the paper using these steps:
        1. Cross-verify methodology with {state["search_results"] }
        2. Check factual claims against found references
        3. Compare results with similar papers
        4. Score each category (0-10) with explanations
        5. all of these parameters methodology, technical soundness, factual correctness, rigorousity, novelty, results, clarity, and ethics
        should be on a 0–10 scale
        6. total should be sum of the values of all the parameters and should be on a 0-80 scale
        7. the score should be floating point number rounded upto two decimal places."""

    user_text = f'''Paper content:\n{state["paper_content"]}\n\nReferences:\n{state["search_results"]}
                     This is the format you must follow for scoring:
                     methodology: X
                     technical_soundness: X
                     factual_correctness: X
                     rigorousity: X
                     novelty: X
                     results: X
                     clarity: X
                     ethics: X
                     total : X
                     Display only the scores in this format.'''

    # Plain `llm` (no tools bound) is used here: only a text answer is wanted.
    verdict = llm.invoke([
        {"role": "system", "content": system_text},
        {"role": "user", "content": user_text},
    ])
    print(verdict)
    return {"scores": parse_scores(verdict.content)}
# Register the four nodes of the scoring graph.
for _node_name, _node_action in (
    ("analyze_paper", initial_analysis),
    ("process_results", process_results),
    ("tools", ToolNode([tavily])),
    ("finalscore", finalscore),
):
    graph_builder.add_node(_node_name, _node_action)

# Linear pipeline: analyze_paper -> tools -> process_results -> finalscore -> END.
graph_builder.set_entry_point("analyze_paper")
_scoring_path = ["analyze_paper", "tools", "process_results", "finalscore", END]
for _source, _target in zip(_scoring_path, _scoring_path[1:]):
    graph_builder.add_edge(_source, _target)

graph = graph_builder.compile()


In [None]:
# Mount Google Drive inside the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
# Install gdown and download the shared Drive folder referenced by the URL below.
# NOTE(review): presumably this creates /content/Reference, which the loading loop reads - confirm.
!pip install gdown
!gdown --folder "https://drive.google.com/drive/folders/1-658SR6wI7EBthpHFJDHmJ_0iyaubU-f"
import numpy as np
def conference_teller(fname):
  """Return the conference label for a reference-paper file name.

  R006/R007 map to CVPR, R008/R009 to EMNLP, R010/R011 to KDD and
  R012/R013 to NeurIPS; every other name maps to TMLR.
  """
  venue_files = (
      ("CVPR", ("R006.pdf", "R007.pdf")),
      ("EMNLP", ("R008.pdf", "R009.pdf")),
      ("KDD", ("R010.pdf", "R011.pdf")),
      ("NeurIPS", ("R012.pdf", "R013.pdf")),
  )
  for venue, file_names in venue_files:
    if fname in file_names:
      return venue
  return "TMLR"
# Load the 15 reference papers (R001-R005 from Non-Publishable, R006-R015 from
# Publishable/<conference>), collect chunks of the publishable ones for the
# vector store, and score every paper with the scoring graph.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
all_chunks = []   # chunks of the publishable papers, indexed later
dicd = []         # one score dict per paper, in file order R001..R015
for i in range(1,16):
    fname = f"R{i:03}.pdf"
    is_publishable = i >= 6
    if is_publishable:
        base_dir = "/content/Reference/Publishable"
        conference = conference_teller(fname)
        file_path = os.path.join(base_dir, conference, fname)
    else:
        base_dir = "/content/Reference/Non-Publishable"
        file_path = os.path.join(base_dir, fname)

    loader = PyPDFLoader(file_path)
    docs = loader.load()
    for doc in docs:
        doc.metadata["source_file"] = fname
        if is_publishable:
            doc.metadata["conference"] = conference

    # Full paper text: all page contents concatenated after a single leading space.
    docs1 = " " + "".join(doc.page_content for doc in docs)

    # Only publishable papers are chunked for retrieval.
    if is_publishable:
        chunks = splitter.split_documents(docs)
        all_chunks.extend(chunks)

    inputs = {
        "messages": [],
        "paper_content": docs1,
        "search_results": [],
        "scores": {},
        "red_result": " ",
    }

    # Run the graph step by step and remember the last emitted step. Resetting
    # `final_output` per paper prevents a previous paper's result from being
    # reused if the stream yields nothing.
    final_output = None
    for step in graph.stream(inputs):
        print(step)
        final_output = step

    if final_output is None or "finalscore" not in final_output:
        raise RuntimeError(f"Scoring graph produced no 'finalscore' output for {fname}")
    dicd.append(final_output["finalscore"]["scores"])


In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

# Column order of the feature matrix: the eight rubric scores followed by the total.
feature_keys = ('methodology', 'technical_soundness', 'factual_correctness', 'rigorousity',
                'novelty', 'results', 'clarity', 'ethics', 'total')
data = np.array([[dic[key] for key in feature_keys] for dic in dicd])
# Labels follow the scoring order: the first five papers are 0, the next ten are 1.
label = np.array([0] * 5 + [1] * 10)

X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.2, random_state=42)

# Random forest: fit, predict on the held-out split and report metrics.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)
print(y_pred)
accuracy = accuracy_score(y_test, y_pred)
F1_score = f1_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
for metric in (F1_score, accuracy, cm):
    print(metric)

# Confusion-matrix heat map for the random forest.
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# Gaussian naive Bayes on the same split, for comparison.
model = GaussianNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
import xml.etree.ElementTree as ET
def xml_to_readable_string(xml_data):
    """Render an arXiv Atom feed as a human-readable text report.

    Missing elements, empty element text and entries without a PDF link are
    rendered as ``N/A`` instead of raising.

    Args:
        xml_data: Atom XML as a string.

    Returns:
        A multi-line string: a header with the feed title and result counts,
        followed by one section per entry (title, authors, dates, PDF link,
        abstract, categories).

    Raises:
        xml.etree.ElementTree.ParseError: if the XML cannot be parsed even
            after whitespace normalisation.
    """
    xml_data = xml_data.strip()  # Remove leading/trailing whitespace

    # Drop a leading Byte Order Mark, plus any whitespace that followed it.
    if xml_data.startswith('\ufeff'):
        xml_data = xml_data[1:].lstrip()

    try:
        root = ET.fromstring(xml_data)
    except ET.ParseError:
        # Second attempt: collapse all whitespace runs to single spaces.
        xml_data = ' '.join(xml_data.split())
        root = ET.fromstring(xml_data)

    # Namespaces used by the feed
    ns = {
        'atom': 'http://www.w3.org/2005/Atom',
        'arxiv': 'http://arxiv.org/schemas/atom',
        'opensearch': 'http://a9.com/-/spec/opensearch/1.1/'
    }

    def text_of(parent, path, strip=False):
        # Text of the first matching child, or 'N/A' when it is absent or empty.
        node = parent.find(path, ns)
        if node is None or node.text is None:
            return 'N/A'
        return node.text.strip() if strip else node.text

    total_results = text_of(root, 'opensearch:totalResults')

    # Search metadata header
    result = [
        "=== ARXIV SEARCH RESULTS ===",
        f"Query: {text_of(root, 'atom:title')}",
        f"Total Results: {total_results}",
        f"Showing: {text_of(root, 'opensearch:itemsPerPage')} of {total_results}",
        "\n",
    ]

    # One section per entry
    for entry in root.findall('atom:entry', ns):
        authors = [text_of(author, 'atom:name') for author in entry.findall('atom:author', ns)]
        # First link whose type is application/pdf; 'N/A' when the entry has none.
        pdf_link = next(
            (link.get('href') for link in entry.findall('atom:link', ns)
             if link.get('type') == 'application/pdf'),
            'N/A',
        )
        categories = [
            term for term in (cat.get('term') for cat in entry.findall('atom:category', ns))
            if term is not None
        ]

        result.append(f"Title: {text_of(entry, 'atom:title', strip=True)}")
        result.append(f"Authors: {', '.join(authors)}")
        result.append(f"Published: {text_of(entry, 'atom:published')}")
        result.append(f"Updated: {text_of(entry, 'atom:updated')}")
        result.append(f"PDF: {pdf_link}")
        result.append(f"\nAbstract:\n{text_of(entry, 'atom:summary', strip=True)}\n")
        result.append(f"Categories: {', '.join(categories)}\n")
        result.append("="*50 + "\n")

    return '\n'.join(result)
def extract_url(xmll):
  """Return the PDF URLs listed in an Atom feed.

  A link is taken when its ``title`` attribute is ``pdf`` and its ``rel``
  attribute is ``related``. URLs are returned in document order.
  """
  import xml.etree.ElementTree as ET
  # Atom elements live in this namespace, so lookups must be qualified.
  atom_ns = {'atom': 'http://www.w3.org/2005/Atom'}
  root = ET.fromstring(xmll)
  return [
      link.get('href')
      for entry in root.findall('atom:entry', atom_ns)
      for link in entry.findall('atom:link', atom_ns)
      if link.get('title') == 'pdf' and link.get('rel') == 'related'
  ]

def extract_rel(xmll):
  """Extract title, abstract and first category of every entry in an Atom feed.

  Args:
    xmll: Atom XML as a string.

  Returns:
    A tuple ``(titles, abstracts, categories)`` of three equally long lists,
    one item per entry in document order. A missing or empty title/summary,
    or a missing category, is reported as ``'N/A'`` so the lists stay aligned.
  """
  import xml.etree.ElementTree as ET
  # Namespace for arXiv Atom feed
  ns = {'atom': 'http://www.w3.org/2005/Atom'}

  def entry_text(entry, tag):
    # Stripped text of the child element, or 'N/A' when absent or empty.
    node = entry.find(tag, ns)
    if node is None or node.text is None:
      return 'N/A'
    return node.text.strip()

  titles, abstracts, cats = [], [], []
  for entry in ET.fromstring(xmll).findall('atom:entry', ns):
    titles.append(entry_text(entry, 'atom:title'))
    abstracts.append(entry_text(entry, 'atom:summary'))
    # Only the first category element is used; its value is the 'term' attribute.
    category_elem = entry.find('atom:category', ns)
    cats.append(category_elem.get('term', 'N/A') if category_elem is not None else 'N/A')
  return (titles, abstracts, cats)

def clean_research_paper(text):
    """Cut trailing sections off a paper's text and normalise blank lines.

    The text is truncated at the first case-insensitive occurrence of each of
    the words "references", "appendix", "conclusion", "conclusions" and
    "acknowledgements" (whole-word match, everything after it is dropped).
    Runs of blank lines are then reduced to a single blank line.
    """
    # Applied in this order; each pattern removes from the keyword to the end of the text.
    cutoff_patterns = (
        r'(?i)\breferences\b.*',
        r'(?i)\bappendix\b.*',
        r'(?i)\bconclusion\b.*',
        r'(?i)\bconclusions\b.*',
        r'(?i)\backnowledgements\b.*',
    )
    for pattern in cutoff_patterns:
        text = re.sub(pattern, '', text, flags=re.DOTALL)

    # Collapse blank-line runs (possibly containing spaces) to exactly one blank line.
    return re.sub(r'\n\s*\n', '\n\n', text)

def complete_operation(pdf_url,ctext):
  """Download PDFs and score each one's TF-IDF cosine similarity to `ctext`.

  Each URL is fetched, saved to the working directory as ``<last path
  segment>.pdf``, read with PyPDF2, cleaned with `clean_research_paper` and
  compared with `ctext` after lower-casing, stop-word removal and
  lemmatisation.

  Args:
    pdf_url: Iterable of PDF URLs.
    ctext: Text of the paper to compare against.

  Returns:
    A list with one dict per successfully downloaded PDF, in URL order, each
    holding the key ``'similarity_ score'``. URLs that cannot be fetched
    (HTTP error, connection failure, timeout) are skipped, so the list can be
    shorter than `pdf_url`.
  """
  import requests
  from pathlib import Path
  from sklearn.feature_extraction.text import TfidfVectorizer
  from sklearn.metrics.pairwise import cosine_similarity

  # Built once: neither depends on the document being processed.
  stop_words = set(stopwords.words('english'))
  lemmatizer = WordNetLemmatizer()

  def preprocess(text):
    # Lowercase and keep letters/whitespace only, then drop stop words and lemmatise.
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    words = [lemmatizer.lemmatize(w) for w in text.split() if w not in stop_words]
    return " ".join(words)

  docs = []
  for url in pdf_url:
    try:
      # A timeout keeps one unresponsive host from hanging the whole run.
      response = requests.get(url, timeout=30)
      response.raise_for_status()  # Raises HTTPError for bad status
    except requests.RequestException as e:
      # RequestException also covers connection errors and timeouts, not just HTTP errors.
      print(f"Skipping {url}: {e}")
      continue

    filename = Path(url).name + ".pdf"  # e.g., "1710.02318v1.pdf"
    with open(filename, "wb") as f:
      f.write(response.content)

    text = " "
    with open(filename, "rb") as f:
      reader = PyPDF2.PdfReader(f)
      for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
          # Remove a trailing page number, then separate pages with a newline.
          page_text = re.sub(r'\s*\d+\s*$', ' ', page_text)
          text += page_text + "\n"
    docs.append(clean_research_paper(text))

  vectorizer = TfidfVectorizer()
  processed1 = preprocess(ctext)
  process = []
  for doc in docs:
    processed2 = preprocess(doc)
    # The vectorizer is refitted on each pair, so scores are pairwise, not corpus-wide.
    tfidf_matrix = vectorizer.fit_transform([processed1, processed2])
    similarity = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]
    print(f"Cosine Similarity: {similarity:.2f}")
    process.append({'similarity_ score': similarity})
  return process
def extract_title(text):
    """Return the text that precedes the first whole word "Abstract".

    The match is case-insensitive and may span several lines; whitespace runs
    in the result are collapsed to single spaces. Returns ``None`` when the
    word does not occur.
    """
    found = re.search(r'^(.*?)\bAbstract\b', text, re.DOTALL | re.IGNORECASE)
    if found is None:
        return None
    # split()/join() flattens line breaks and repeated spaces in one pass.
    return ' '.join(found.group(1).split()).strip()
def arxiv_query_url_from_title(title: str, max_results: int = 5, search_field: str = "all") -> str:
  """Build an arXiv API query URL that searches `search_field` for `title`.

  The title is stripped and URL-encoded with ``quote_plus`` (spaces become
  ``+``) before being placed in the ``search_query`` parameter.
  """
  search_term = urllib.parse.quote_plus(title.strip())
  endpoint = "http://export.arxiv.org/api/query"
  return f"{endpoint}?search_query={search_field}:{search_term}&max_results={max_results}"

In [None]:
import urllib, urllib.request
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import re # Import the re module

# Use your already-configured GOOGLE_API_KEY (env or secret manager)
embedding = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",          # or "text-embedding-004"
    task_type="RETRIEVAL_DOCUMENT"         # or "RETRIEVAL_QUERY"
)
# NOTE(review): `dim` is not used by any visible code; confirm it matches the
# embedding model before relying on it.
dim = 1536
# Index the chunks of the publishable reference papers, persist the index and
# reload it from disk.
vectorstore = FAISS.from_documents(all_chunks, embedding,distance_strategy="DOT_PRODUCT")
vectorstore.save_local("/content/faiss_index")
vectorstore = FAISS.load_local("/content/faiss_index", embedding,allow_dangerous_deserialization=True)
# The number of results must be passed through `search_kwargs`; a bare `k=3`
# keyword is not a retriever field and is not applied to the search.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})
class ResearchPaper(TypedDict):
  """State dictionary passed between the nodes of the conference-prediction graph."""
  # Full paper text; read as the query by `retrieve_step` and `result_fetching`.
  messages: str
  # Per-paper dicts (similarity score, title, abstract, category) set by `result_fetching`.
  arxiv_results : list
  # Not read or written by any node in the visible code.
  reason: str
  # Documents returned by the FAISS retriever, set by `retrieve_step`.
  retrieved_docs: list
  # Model reply naming the predicted conference, set by `llms`.
  conference:str
def retrieve_step(state):
    """Fetch reference chunks similar to the paper and store them in the state.

    The paper text in ``state["messages"]`` is collapsed to single spaces,
    stripped of non-ASCII characters and passed through
    `clean_research_paper` before being sent to the retriever. The result is
    written to ``state["retrieved_docs"]`` and the state is returned.
    """
    single_spaced = re.sub(r'\s+', ' ', state["messages"]).strip()
    ascii_only = re.sub(r'[^\x00-\x7F]+', '', single_spaced)
    search_text = clean_research_paper(ascii_only)
    state["retrieved_docs"] = retriever.invoke(search_text)
    return state

def result_fetching(state):
  """Look the paper up on arXiv and attach similarity-scored matches to the state.

  The title is taken from the text before the word "Abstract", the arXiv API
  is queried for it, the PDFs of the returned entries are compared with the
  paper via `complete_operation`, and each result dict is extended with
  ``title``, ``abstract`` and ``category``.

  Args:
    state: Graph state; ``messages`` holds the paper text.

  Returns:
    The same state with ``arxiv_results`` set. It is an empty list when no
    title can be extracted.
  """
  paper_text = state['messages']
  title = extract_title(paper_text)
  if not title:
    # No "Abstract" marker (or nothing before it): there is nothing to search for.
    print("No title found; skipping arXiv lookup")
    state["arxiv_results"] = []
    return state

  url = arxiv_query_url_from_title(title, max_results=5, search_field="all")
  # Context manager closes the HTTP response once the body has been read.
  with urllib.request.urlopen(url) as data:
    feed_xml = data.read().decode('utf-8')

  pdf_url = extract_url(feed_xml)
  titles, abstracts, cats = extract_rel(feed_xml)
  process = complete_operation(pdf_url, paper_text)
  # NOTE(review): results are paired with metadata by position. complete_operation
  # skips URLs it cannot download, so pairs can shift after a skipped entry.
  for proc, tit, abstract, cat in zip(process, titles, abstracts, cats):
    proc['title'] = tit
    proc['abstract'] = abstract
    proc['category'] = cat
  state["arxiv_results"] = process
  return state
def llms(state):
  """Ask the model to name the conference for the paper.

  The prompt embeds the retrieved reference chunks and the arXiv results
  already stored in the state.

  Args:
    state: Graph state; ``retrieved_docs`` and ``arxiv_results`` are read.

  Returns:
    The same state with ``conference`` set to the model reply, with
    surrounding whitespace removed when the reply is a string.
  """
  docs = state["retrieved_docs"]
  docss = state["arxiv_results"]
  system_msg = '''from the given information  predict the conference out of TMLR,KDD,EMNLP,CVPR,NeurIPS and also give tell why this conference.

                  '''
  user_msg = f'''first analyze {docs} and then analyze {docss} and then predict the conference .
                 display the name of the conference only.
                  '''
  response = llm.invoke([{"role": "system", "content": system_msg},
                         {"role": "user",   "content": user_msg}])
  # Callers compare the name with exact strings such as "CVPR"; a trailing
  # newline or padding in the reply would make every comparison fail.
  answer = response.content
  state["conference"] = answer.strip() if isinstance(answer, str) else answer

  return state
# Conference-prediction graph: retrieve_step -> result_fetching -> llms -> END.
graphs_builder = StateGraph(ResearchPaper)
for _stage_name, _stage_action in (
    ("retrieve_step", retrieve_step),
    ("llms", llms),
    ("result_fetching", result_fetching),
):
    graphs_builder.add_node(_stage_name, _stage_action)
graphs_builder.set_entry_point("retrieve_step")

_prediction_path = ["retrieve_step", "result_fetching", "llms", END]
for _origin, _destination in zip(_prediction_path, _prediction_path[1:]):
    graphs_builder.add_edge(_origin, _destination)
graphs = graphs_builder.compile()
def get_reason_text(conf): # Renamed from reason
  """Return the canned rationale text for a conference name.

  The name is matched ignoring case and surrounding whitespace, so a model
  reply such as ``"CVPR\\n"`` or ``" cvpr "`` still selects the CVPR text.
  Anything that is not CVPR, KDD, TMLR or EMNLP (including non-string
  values) gets the NeurIPS text.
  """
  key = conf.strip().casefold() if isinstance(conf, str) else ""
  if(key=="cvpr"):
    dfg = '''focuses on computer vision tasks: image classification, object detection, segmentation, 3D reconstruction, video analysis, etc.
            It proposes new vision models or datasets.It includes strong empirical results on vision benchmarks (like ImageNet, COCO, etc.).'''
  elif(key=="kdd"):
    dfg = '''It emphasizes data mining, large-scale data analysis, or real-world data applications.
           It often bridges theory and applied machine learning for massive datasets.
           Strong industrial or data-driven focus is common. '''
  elif(key=="tmlr"):
    dfg = '''It is a general machine learning paper with strong theoretical or empirical contributions.'''
  elif(key=="emnlp"):
    dfg = '''It focuses on natural language processing: syntax, semantics, text generation, sentiment analysis, etc.
             It has strong empirical grounding (usually large-scale experiments on NLP tasks).
             Often includes new datasets, benchmarks, or language models.  '''
  else :
    # Default branch: used for NeurIPS and for any unrecognised name.
    dfg = ''' It contributes to foundational machine learning, AI, or optimization.
             It can be theoretical or applied but must show novelty and general relevance.
              It may cross disciplines: neuroscience, cognitive science, economics, etc.  '''
  return dfg

In [None]:
# Result accumulators. They are created once, before the loop, so that
# entries appended for one paper are not wiped when the next paper is processed.
name =[]
conference = []
conference_reasons = [] # Renamed from reasons
# Score papers P001-P010, classify each as publishable or not, and ask the
# conference graph for a venue when the classifier predicts publishable.
for i in range(1,11):
  base_dir = "/content"
  fname = f"P{i:03}.pdf"
  loader = PyPDFLoader(os.path.join(base_dir, fname))
  docs = loader.load()
  # Full paper text: all page contents concatenated after a single leading space.
  docs2 = " " + "".join(doc.page_content for doc in docs)
  for step in graph.stream({
      "messages": [],
      "paper_content": docs2,
      "search_results": [],
      "scores": {},
      "red_result": " ",
  }):
    print(step)
  # The last streamed step is the `finalscore` node's output.
  div = step["finalscore"]["scores"]
  data = np.array([div[key] for key in (
      'methodology', 'technical_soundness', 'factual_correctness', 'rigorousity',
      'novelty', 'results', 'clarity', 'ethics', 'total')])
  rty = rf_classifier.predict([data])
  if rty == 0:
    # Class 0 is the non-publishable label used when training the classifier.
    conf = "NA"
  else:
    dfg = graphs.invoke({
        "messages": docs2,
        "arxiv_results": [],
        "imp_points": " ",
        "retrieved_docs": [],
        "conference": " ",
    })
    conf = dfg["conference"]
    conference_reasons.append(get_reason_text(conf)) # Updated function call
  print(f"{fname} belongs to {conf}")

# Score papers P012-P015, classify each one, and query the conference graph
# for those predicted publishable.
for i in range(12,16):
  base_dir = "/content"
  fname = f"P{i:03}.pdf"
  loader = PyPDFLoader(os.path.join(base_dir, fname))
  docs = loader.load()
  # Full paper text: all page contents concatenated after a single leading space.
  docs2 = " " + "".join(doc.page_content for doc in docs)
  for step in graph.stream({
      "messages": [],
      "paper_content": docs2,
      "search_results": [],
      "scores": {},
      "red_result": " ",
  }):
    print(step)
  # The last streamed step is the `finalscore` node's output.
  div = step["finalscore"]["scores"]
  data = np.array([div[key] for key in (
      'methodology', 'technical_soundness', 'factual_correctness', 'rigorousity',
      'novelty', 'results', 'clarity', 'ethics', 'total')])
  rty = rf_classifier.predict([data])
  if rty == 0:
    conf = "NA"
  else:
    dfg = graphs.invoke({
        "messages": docs2,
        "arxiv_results": [],
        "imp_points": " ",
        "retrieved_docs": [],
        "conference": " ",
    })
    conf = dfg["conference"]
    conference_reasons.append(get_reason_text(conf)) # Updated function call
  print(f"{fname} belongs to {conf}")


In [None]:
# Score papers P016-P017, classify each one, and query the conference graph
# for those predicted publishable.
for i in range(16,18):
  base_dir = "/content"
  fname = f"P{i:03}.pdf"
  loader = PyPDFLoader(os.path.join(base_dir, fname))
  docs = loader.load()
  # Full paper text: all page contents concatenated after a single leading space.
  docs2 = " " + "".join(doc.page_content for doc in docs)
  for step in graph.stream({
      "messages": [],
      "paper_content": docs2,
      "search_results": [],
      "scores": {},
      "red_result": " ",
  }):
    print(step)
  # The last streamed step is the `finalscore` node's output.
  div = step["finalscore"]["scores"]
  data = np.array([div[key] for key in (
      'methodology', 'technical_soundness', 'factual_correctness', 'rigorousity',
      'novelty', 'results', 'clarity', 'ethics', 'total')])
  rty = rf_classifier.predict([data])
  if rty == 0:
    conf = "NA"
  else:
    dfg = graphs.invoke({
        "messages": docs2,
        "arxiv_results": [],
        "imp_points": " ",
        "retrieved_docs": [],
        "conference": " ",
    })
    conf = dfg["conference"]
    conference_reasons.append(get_reason_text(conf)) # Updated function call
  print(f"{fname} belongs to {conf}")

In [None]:
# Score paper P019, classify it, and query the conference graph if it is
# predicted publishable.
for i in range(19,20):
  base_dir = "/content"
  fname = f"P{i:03}.pdf"
  loader = PyPDFLoader(os.path.join(base_dir, fname))
  docs = loader.load()
  # Full paper text: all page contents concatenated after a single leading space.
  docs2 = " " + "".join(doc.page_content for doc in docs)
  for step in graph.stream({
      "messages": [],
      "paper_content": docs2,
      "search_results": [],
      "scores": {},
      "red_result": " ",
  }):
    print(step)
  # The last streamed step is the `finalscore` node's output.
  div = step["finalscore"]["scores"]
  data = np.array([div[key] for key in (
      'methodology', 'technical_soundness', 'factual_correctness', 'rigorousity',
      'novelty', 'results', 'clarity', 'ethics', 'total')])
  rty = rf_classifier.predict([data])
  if rty == 0:
    conf = "NA"
  else:
    dfg = graphs.invoke({
        "messages": docs2,
        "arxiv_results": [],
        "imp_points": " ",
        "retrieved_docs": [],
        "conference": " ",
    })
    conf = dfg["conference"]
    conference_reasons.append(get_reason_text(conf)) # Updated function call
  print(f"{fname} belongs to {conf}")

In [None]:
# Score papers P020-P028, classify each one, and record file name, predicted
# conference and rationale. The accumulators `name`, `conference` and
# `conference_reasons` are the lists created by the first inference cell; they
# are deliberately not re-created here, so results from every paper are kept.
for i in range(20,29):
  base_dir = "/content"
  fname = f"P{i:03}.pdf"
  loader = PyPDFLoader(os.path.join(base_dir, fname))
  docs = loader.load()
  # Full paper text: all page contents concatenated after a single leading space.
  docs2 = " " + "".join(doc.page_content for doc in docs)
  for step in graph.stream({
      "messages": [],
      "paper_content": docs2,
      "search_results": [],
      "scores": {},
      "red_result": " ",
  }):
    print(step)
  # The last streamed step is the `finalscore` node's output.
  div = step["finalscore"]["scores"]
  data = np.array([div[key] for key in (
      'methodology', 'technical_soundness', 'factual_correctness', 'rigorousity',
      'novelty', 'results', 'clarity', 'ethics', 'total')])
  rty = rf_classifier.predict([data])
  if rty == 0:
    conf = "NA"
  else:
    dfg = graphs.invoke({
        "messages": docs2,
        "arxiv_results": [],
        "imp_points": " ",
        "retrieved_docs": [],
        "conference": " ",
    })
    conf = dfg["conference"]
    conference_reasons.append(get_reason_text(conf)) # Updated function call
  name.append(fname)
  conference.append(conf)
  print(f"{fname} belongs to {conf}")

In [None]:
# Score papers P030-P050, classify each one, and query the conference graph
# for those predicted publishable. `conference_reasons` is the list created by
# the first inference cell; it is deliberately not re-created inside the loop,
# so rationales from earlier papers are kept.
for i in range(30,51):
  base_dir = "/content"
  fname = f"P{i:03}.pdf"
  loader = PyPDFLoader(os.path.join(base_dir, fname))
  docs = loader.load()
  # Full paper text: all page contents concatenated after a single leading space.
  docs2 = " " + "".join(doc.page_content for doc in docs)
  for step in graph.stream({
      "messages": [],
      "paper_content": docs2,
      "search_results": [],
      "scores": {},
      "red_result": " ",
  }):
    print(step)
  # The last streamed step is the `finalscore` node's output.
  div = step["finalscore"]["scores"]
  data = np.array([div[key] for key in (
      'methodology', 'technical_soundness', 'factual_correctness', 'rigorousity',
      'novelty', 'results', 'clarity', 'ethics', 'total')])
  rty = rf_classifier.predict([data])
  if rty == 0:
    conf = "NA"
  else:
    dfg = graphs.invoke({
        "messages": docs2,
        "arxiv_results": [],
        "imp_points": " ",
        "retrieved_docs": [],
        "conference": " ",
    })
    conf = dfg["conference"]
    conference_reasons.append(get_reason_text(conf)) # Updated function call
  print(f"{fname} belongs to {conf}")

In [None]:
# Score papers P051-P055, classify each one, and query the conference graph
# for those predicted publishable. (No index needs skipping in this range.)
for i in range(51,56):
  base_dir = "/content"
  fname = f"P{i:03}.pdf"
  loader = PyPDFLoader(os.path.join(base_dir, fname))
  docs = loader.load()
  # Full paper text: all page contents concatenated after a single leading space.
  docs2 = " " + "".join(doc.page_content for doc in docs)
  for step in graph.stream({
      "messages": [],
      "paper_content": docs2,
      "search_results": [],
      "scores": {},
      "red_result": " ",
  }):
    print(step)
  # The last streamed step is the `finalscore` node's output.
  div = step["finalscore"]["scores"]
  data = np.array([div[key] for key in (
      'methodology', 'technical_soundness', 'factual_correctness', 'rigorousity',
      'novelty', 'results', 'clarity', 'ethics', 'total')])
  rty = rf_classifier.predict([data])
  if rty == 0:
    conf = "NA"
  else:
    dfg = graphs.invoke({
        "messages": docs2,
        "arxiv_results": [],
        "imp_points": " ",
        "retrieved_docs": [],
        "conference": " ",
    })
    conf = dfg["conference"]
    conference_reasons.append(get_reason_text(conf)) # Updated function call
  print(f"{fname} belongs to {conf}")

In [None]:
# Score papers P056-P090 (P060 and P078 are skipped), classify each one, and
# query the conference graph for those predicted publishable.
for i in range(56,91):
  if i in (60, 78):
    continue
  base_dir = "/content"
  fname = f"P{i:03}.pdf"
  loader = PyPDFLoader(os.path.join(base_dir, fname))
  docs = loader.load()
  # Full paper text: all page contents concatenated after a single leading space.
  docs2 = " " + "".join(doc.page_content for doc in docs)
  for step in graph.stream({
      "messages": [],
      "paper_content": docs2,
      "search_results": [],
      "scores": {},
      "red_result": " ",
  }):
    print(step)
  # The last streamed step is the `finalscore` node's output.
  div = step["finalscore"]["scores"]
  data = np.array([div[key] for key in (
      'methodology', 'technical_soundness', 'factual_correctness', 'rigorousity',
      'novelty', 'results', 'clarity', 'ethics', 'total')])
  rty = rf_classifier.predict([data])
  if rty == 0:
    conf = "NA"
  else:
    dfg = graphs.invoke({
        "messages": docs2,
        "arxiv_results": [],
        "imp_points": " ",
        "retrieved_docs": [],
        "conference": " ",
    })
    conf = dfg["conference"]
    conference_reasons.append(get_reason_text(conf)) # Updated function call
  print(f"{fname} belongs to {conf}")


In [None]:
# Score papers P092-P135, classify each one, and query the conference graph
# for those predicted publishable.
for i in range(92,136):
  base_dir = "/content"
  fname = f"P{i:03}.pdf"
  loader = PyPDFLoader(os.path.join(base_dir, fname))
  docs = loader.load()
  # Full paper text: all page contents concatenated after a single leading space.
  docs2 = " " + "".join(doc.page_content for doc in docs)
  for step in graph.stream({
      "messages": [],
      "paper_content": docs2,
      "search_results": [],
      "scores": {},
      "red_result": " ",
  }):
    print(step)
  # The last streamed step is the `finalscore` node's output.
  div = step["finalscore"]["scores"]
  data = np.array([div[key] for key in (
      'methodology', 'technical_soundness', 'factual_correctness', 'rigorousity',
      'novelty', 'results', 'clarity', 'ethics', 'total')])
  rty = rf_classifier.predict([data])
  if rty == 0:
    conf = "NA"
  else:
    dfg = graphs.invoke({
        "messages": docs2,
        "arxiv_results": [],
        "imp_points": " ",
        "retrieved_docs": [],
        "conference": " ",
    })
    conf = dfg["conference"]
    conference_reasons.append(get_reason_text(conf)) # Updated function call
  print(f"{fname} belongs to {conf}")

In [None]:
# Score papers P029 and P060, classify each one, and query the conference
# graph for those predicted publishable. No rationale text is recorded here.
for i in [29,60]:
  base_dir = "/content"
  fname = f"P{i:03}.pdf"
  loader = PyPDFLoader(os.path.join(base_dir, fname))
  docs = loader.load()
  # Full paper text: all page contents concatenated after a single leading space.
  docs2 = " " + "".join(doc.page_content for doc in docs)
  for step in graph.stream({
      "messages": [],
      "paper_content": docs2,
      "search_results": [],
      "scores": {},
      "red_result": " ",
  }):
    print(step)
  # The last streamed step is the `finalscore` node's output.
  div = step["finalscore"]["scores"]
  data = np.array([div[key] for key in (
      'methodology', 'technical_soundness', 'factual_correctness', 'rigorousity',
      'novelty', 'results', 'clarity', 'ethics', 'total')])
  rty = rf_classifier.predict([data])
  if rty == 0:
    conf = "NA"
  else:
    dfg = graphs.invoke({
        "messages": docs2,
        "arxiv_results": [],
        "imp_points": " ",
        "retrieved_docs": [],
        "conference": " ",
    })
    conf = dfg["conference"]
  print(f"{fname} belongs to {conf}")