In [1]:
# Section check

import PyPDF2
import re

# Maps each canonical section name to the keywords whose presence in the
# paper text counts as evidence that the section exists. Matching against
# these keywords is done in check_section_presence.
SECTION_SYNONYMS = {
    "abstract": [
        "abstract",
        "summary",
    ],
    "methodology": [
        "methodology",
        "approach",
        "methods",
        "experimental setup",
        "methodologies",
        "method",
    ],
    "results": [
        "results",
        "findings",
        "outcome",
        "discussion",
        "conclusions",
        "conclusion",
    ],
}

def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, joined with newline characters.

    Args:
        file_path: Path of the PDF file to open (opened in binary mode).

    Returns:
        str: The per-page texts joined by a single newline. A page that yields
        no text contributes an empty string. If the file cannot be opened or
        parsed, the error is printed and an empty string is returned.
    """
    try:
        with open(file_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            # Coerce a falsy result (None / "") to "" so that one page without
            # extractable text cannot make the join below raise and thereby
            # discard the text of every other page.
            page_texts = [page.extract_text() or "" for page in reader.pages]
        return "\n".join(page_texts)
    except Exception as e:
        # Deliberately best-effort: callers receive "" instead of an exception.
        print(f"Error reading PDF: {e}")
        return ""

def check_section_presence(text, section_synonyms):
    """Flag which sections are mentioned in ``text``.

    A section counts as present when at least one of its synonyms occurs in
    the text as a whole term (case-insensitive), i.e. not directly preceded
    or followed by a word character.

    Args:
        text: The document text to search. Empty or None means nothing is found.
        section_synonyms: Mapping of section name -> iterable of synonym strings.

    Returns:
        dict: Section name -> 1 if any synonym was found, otherwise 0.
    """
    section_presence = {section: 0 for section in section_synonyms}
    if not text:
        # Nothing to search (e.g. the PDF could not be read).
        return section_presence

    for section, synonyms in section_synonyms.items():
        # Synonyms are literal phrases, so escape them before building the
        # pattern. Lookarounds are used instead of \b so that synonyms which
        # start or end with punctuation still match; for plain words the two
        # forms are equivalent.
        if any(
            re.search(rf"(?<!\w){re.escape(synonym)}(?!\w)", text, re.IGNORECASE)
            for synonym in synonyms
        ):
            section_presence[section] = 1
    return section_presence

def evaluate_scientific_rigor(section_presence):
    """Collapse per-section flags into a single completeness score.

    Args:
        section_presence: Mapping of section name -> presence flag.

    Returns:
        int: 1 when every flag equals 1 (also for an empty mapping), else 0.
    """
    for flag in section_presence.values():
        if not flag == 1:
            return 0  # Incomplete
    return 1  # Complete

# NOTE(review): hard-coded absolute Windows path; this cell only works on a
# machine where this exact file exists.
pdf_path = r"C:\Users\Radhika\Downloads\labelled papers\R002.pdf"
# extract_text_from_pdf returns "" when the file cannot be read, in which case
# every section below is reported as absent.
text = extract_text_from_pdf(pdf_path)

# Check for the presence of sections
section_presence = check_section_presence(text, SECTION_SYNONYMS)

# Evaluate scientific rigor
rigor_score = evaluate_scientific_rigor(section_presence)

# Output results
print("Section Presence:", section_presence)
print("Scientific Rigor Score (1 = Complete, 0 = Incomplete):", rigor_score)


Section Presence: {'abstract': 1, 'methodology': 1, 'results': 1}
Scientific Rigor Score (1 = Complete, 0 = Incomplete): 1


In [2]:
# Readability check

import textstat
def clean_text(text):
    """Normalize whitespace in ``text``.

    Every run of whitespace characters is replaced by a single space and
    leading/trailing whitespace is removed.

    Args:
        text: The string to normalize.

    Returns:
        str: The normalized string.
    """
    return re.sub(r'\s+', ' ', text).strip()

def calculate_readability_scores(text):
    """Compute three readability metrics for ``text`` using ``textstat``.

    Args:
        text: The text to score.

    Returns:
        dict: Keys "flesch_reading_ease", "gunning_fog" and
        "automated_readability_index", each mapped to the value returned by
        the textstat function of the same name.
    """
    # Each result key is also the name of the textstat function that produces it.
    metric_names = (
        "flesch_reading_ease",
        "gunning_fog",
        "automated_readability_index",
    )
    return {metric: getattr(textstat, metric)(text) for metric in metric_names}

def evaluate_readability_threshold(score, threshold=50):
    """Classify a readability score against a minimum threshold.

    Args:
        score: The score to check.
        threshold: Minimum passing value (inclusive). Defaults to 50.

    Returns:
        str: "Pass" when ``score >= threshold``, otherwise "Fail".
    """
    if score >= threshold:
        return "Pass"
    return "Fail"


# NOTE(review): hard-coded absolute Windows path, repeated from the first cell.
pdf_path = r"C:\Users\Radhika\Downloads\labelled papers\R002.pdf"  

# Step 1: Extract text from PDF
# Relies on extract_text_from_pdf (and the `re` import used by clean_text)
# from the first cell, so that cell must be run before this one.
text = extract_text_from_pdf(pdf_path)  
text = clean_text(text)

# Step 2: Calculate readability scores
readability_scores = calculate_readability_scores(text)

# Step 3: Evaluate readability against the threshold
# Only the Flesch score is compared with the threshold (default 50); the other
# two metrics are printed but not evaluated.
flesch_result = evaluate_readability_threshold(readability_scores["flesch_reading_ease"])

# Output results
print("Readability Scores:")
print(f"Flesch Reading Ease: {readability_scores['flesch_reading_ease']}")
print(f"Gunning Fog Index: {readability_scores['gunning_fog']}")
print(f"Automated Readability Index: {readability_scores['automated_readability_index']}")
print(f"Flesch Reading Ease Result: {flesch_result}")

Readability Scores:
Flesch Reading Ease: 12.03
Gunning Fog Index: 19.22
Automated Readability Index: 26.5
Flesch Reading Ease Result: Fail


In [None]:
from transformers import pipeline
import re
import PyPDF2

# Load a pre-trained model for sentence embeddings
# NOTE(review): `nlp` is not referenced by any of the code visible in this
# notebook; the load runs every time this cell executes. Confirm whether it is
# still needed.
nlp = pipeline("feature-extraction", model="sentence-transformers/all-MiniLM-L6-v2")

def extract_paragraphs_from_pdf(file_path):
    """
    Extract paragraphs from a PDF file.

    The text of all pages is joined with newline characters and then split
    wherever a blank line occurs (a line that is empty or holds only
    whitespace).

    Args:
        file_path: Path of the PDF file to open (opened in binary mode).

    Returns:
        list: Non-empty paragraphs with surrounding whitespace stripped. If the
        file cannot be opened or parsed, the error is printed and an empty
        list is returned.
    """
    try:
        with open(file_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            # Coerce a falsy result (None / "") to "" so that one page without
            # extractable text cannot make the join below raise and thereby
            # discard every paragraph of the document.
            page_texts = [page.extract_text() or "" for page in reader.pages]
        # Treat whitespace-only lines as paragraph breaks too, not just "\n\n".
        paragraphs = re.split(r"\n\s*\n", "\n".join(page_texts))
        return [para.strip() for para in paragraphs if para.strip()]
    except Exception as e:
        # Deliberately best-effort: callers receive [] instead of an exception.
        print(f"Error reading PDF: {e}")
        return []

def split_into_sentences(paragraph):
    """
    Split a paragraph into sentences using regex.

    A split happens at whitespace that follows ".", "?" or "!", except when
    the preceding characters look like an abbreviation such as "e.g." or a
    capitalised two-letter form such as "Dr.". The whole run of whitespace
    after the terminator is consumed, so sentences carry no leading blanks.

    Args:
        paragraph (str): The text to split.

    Returns:
        list: The sentences in order; empty pieces are dropped, so an empty
        paragraph yields an empty list.
    """
    pieces = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=[.?!])\s+', paragraph)
    return [piece for piece in pieces if piece]

def evaluate_pdf_content(pdf_path):
    """
    Break a PDF into numbered paragraphs, each split into sentences.

    Args:
        pdf_path (str): Path to the input PDF.

    Returns:
        list: One dict per paragraph, in document order, with the keys
        "Paragraph" (a label "Paragraph N", N starting at 1) and
        "Sentences" (the result of split_into_sentences for that paragraph).
    """
    return [
        {
            "Paragraph": f"Paragraph {number}",
            "Sentences": split_into_sentences(body),
        }
        for number, body in enumerate(extract_paragraphs_from_pdf(pdf_path), start=1)
    ]

# Example Usage
# NOTE(review): hard-coded absolute Windows path, same file as the earlier cells.
pdf_path = r"C:\Users\Radhika\Downloads\labelled papers\R002.pdf"  # Replace with the path to your PDF
# evaluate_pdf_content yields an empty list when the PDF cannot be read, in
# which case only the header line below is printed.
processed_content = evaluate_pdf_content(pdf_path)

# Output processed content
# Prints every sentence of every paragraph, one line per sentence.
print("Processed PDF Content:")
for content in processed_content:
    print(f"{content['Paragraph']}:")
    for sentence in content["Sentences"]:
        print(f"  - {sentence}")


Device set to use cpu


: 

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Sample Data (replace this with your actual dataset)
# Each row represents a paper with features (scores from checks) and label (publishable or not)
# These are hard-coded placeholder values (15 rows: 10 labelled 1, 5 labelled 0);
# nothing here is computed from the PDF checks in the earlier cells.
# NOTE(review): the feature values below lie between 0 and 1, whereas the
# earlier cells print a 0/1 section score and raw readability values (e.g.
# 12.03) - confirm how real scores are meant to be scaled before use.
data = {
    "section_check": [0.9, 0.85, 0.7, 0.95, 0.4, 0.6, 0.8, 0.9, 0.7, 0.5, 0.65, 0.75, 0.88, 0.92, 0.3],
    "readability_score": [0.8, 0.82, 0.75, 0.9, 0.4, 0.6, 0.78, 0.88, 0.65, 0.5, 0.6, 0.7, 0.85, 0.91, 0.35],
    "coherence_score": [0.85, 0.87, 0.72, 0.93, 0.42, 0.62, 0.81, 0.89, 0.68, 0.55, 0.67, 0.78, 0.86, 0.94, 0.4],
    "novelty_score": [0.92, 0.88, 0.8, 0.96, 0.45, 0.65, 0.83, 0.9, 0.7, 0.52, 0.63, 0.74, 0.9, 0.95, 0.38],
    "label": [1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0],  # 1: Publishable, 0: Not Publishable
}

# Convert data to a pandas DataFrame
df = pd.DataFrame(data)

# Separate features (X) and labels (y)
X = df.drop("label", axis=1)
y = df["label"]

# Split the data into training and testing sets
# 30% of the rows are held out; random_state fixes the shuffle so the split is
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a Logistic Regression model
# Uses the estimator's default settings.
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate Accuracy
accuracy = accuracy_score(y_test, y_pred)

# Print Results
print("Accuracy of Logistic Regression Model:", accuracy)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Optional: Check model coefficients to understand feature importance
# The table pairs each feature column with its fitted coefficient
# (model.coef_[0]) and is sorted from largest to smallest coefficient.
feature_importance = pd.DataFrame({
    "Feature": X.columns,
    "Coefficient": model.coef_[0]
}).sort_values(by="Coefficient", ascending=False)

print("\nFeature Importance:")
print(feature_importance)
