<a href="https://colab.research.google.com/github/springboardmentor3847a-cloud/AI-System-to-Automatically-Review-and-Summarize-Research-Papers-/blob/sravanipemmasani/Module_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
!pip install semanticscholar python-dotenv requests -q

In [2]:
import json
import os
from semanticscholar import SemanticScholar
from dotenv import load_dotenv

In [3]:
def setup_api_key():
    """Set up API key either from .env file or directly"""
    # Method 1: Try loading from .env file
    load_dotenv()
    API_KEY = os.getenv("SEMANTIC_SCHOLAR_API_KEY")

    # Method 2: If not in .env, use direct key
    if not API_KEY:
        # Create .env file with your API key
        with open(".env", "w") as f:
            f.write("SEMANTIC_SCHOLAR_API_KEY=83rBkeaXb14D8vGpXJezU6nrCFFmyn5L8RCvT9MM\n")
        load_dotenv()
        API_KEY = os.getenv("SEMANTIC_SCHOLAR_API_KEY")

    # Initialize Semantic Scholar
    if API_KEY:
        sch = SemanticScholar(api_key=API_KEY)
        print("Semantic Scholar initialized with API key")
    else:
        sch = SemanticScholar()
        print(" Using Semantic Scholar without API key (limited rate)")

    return sch


In [15]:
def search_papers(topic, limit=20):
    """
    Search Semantic Scholar for papers on a given topic
    Returns: Dictionary with search results
    """
    print(f"\n Searching for papers on: '{topic}'")
    print(f"   Requesting {limit} papers from Semantic Scholar...")

    sch = setup_api_key()

    try:
        # Search for papers
        results = sch.search_paper(
            query=topic,
            limit=limit,
            fields=["paperId", "title", "abstract", "year", "authors",
                   "citationCount", "openAccessPdf", "url", "venue"]
        )

        papers = []
        for paper in results:
            paper_data = {
                "title": paper.title,
                "authors": [author['name'] for author in paper.authors] if paper.authors else [],
                "year": paper.year,
                "paperId": paper.paperId,
                "abstract": paper.abstract[:300] + "..." if paper.abstract else "No abstract available",
                "citationCount": paper.citationCount,
                "venue": paper.venue if hasattr(paper, 'venue') else None,
                "url": paper.url,
                "pdf_url": paper.openAccessPdf['url'] if paper.openAccessPdf else None,
                "has_pdf": bool(paper.openAccessPdf)
            }
            papers.append(paper_data)

        # Calculate statistics
        papers_with_pdf = sum(1 for p in papers if p["has_pdf"])

        print(f"Search complete!")
        print(f"   Total papers found: {len(papers)}")
        print(f"   Papers with PDF available: {papers_with_pdf}")

        return {
            "topic": topic,
            "search_timestamp": "timestamp_placeholder",
            "total_results": len(papers),
            "papers_with_pdf": papers_with_pdf,
            "papers": papers
        }

    except Exception as e:
        print(f" Error searching papers: {e}")
        return None

In [16]:
def save_search_results(data, filename=None):
    """
    Save search results to JSON file
    """
    if not filename:
        # Create filename from topic
        safe_topic = "".join(c for c in data["topic"] if c.isalnum() or c == " ").replace(" ", "_")
        filename = f"paper_search_results_{safe_topic}.json"

    os.makedirs("data/search_results", exist_ok=True)
    filepath = os.path.join("data/search_results", filename)

    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)

    print(f" Search results saved to: {filepath}")
    return filepath

In [17]:
import pandas as pd

def display_results_table(data):
    """
    Convert search results into a clean, readable table using pandas.
    """
    if not data or "papers" not in data:
        print("No data to display in table")
        return

    table_data = []

    for p in data["papers"]:
        table_data.append({
            "Title": p["title"],
            "Authors": ", ".join(p["authors"][:3]) + ("..." if len(p["authors"]) > 3 else ""),
            "Year": p["year"],
            "Citations": p["citationCount"],
            "PDF": "Yes" if p["has_pdf"] else "No",
            "Venue": p["venue"]
        })

    df = pd.DataFrame(table_data)

    print("\n" + "="*80)
    print("TABLE VIEW OF RESULTS")
    print("="*80)
    display(df)

    return df


In [20]:
def main_search():
    """
    Main function for Module 1: Get topic and search for papers
    """
    print("\n" + "="*80)
    print("MODULE 1: TOPIC INPUT & PAPER SEARCH")
    print("="*80)

    # Get topic from user
    topic = input("\nEnter research topic: ").strip()
    if not topic:
        topic = "machine learning"  # Default topic

    # Search for papers
    results = search_papers(topic, limit=20)

    if results:
        # Save results
        save_path = save_search_results(results)

        # Display results
        display_results_table(results)

        print(f"\n Module 1 complete! Results saved to: {save_path}")

        return results, save_path
    else:
        print(" No results found. Please try a different topic.")
        return None, None

# Run Module 1 directly if needed
if __name__ == "__main__":
    main_search()


MODULE 1: TOPIC INPUT & PAPER SEARCH

Enter research topic: Alzheimer Detection and Classification Using SVM

 Searching for papers on: 'Alzheimer Detection and Classification Using SVM'
   Requesting 20 papers from Semantic Scholar...
Semantic Scholar initialized with API key
Search complete!
   Total papers found: 1000
   Papers with PDF available: 1000
 Search results saved to: data/search_results/paper_search_results_Alzheimer_Detection_and_Classification_Using_SVM.json

TABLE VIEW OF RESULTS


Unnamed: 0,Title,Authors,Year,Citations,PDF,Venue
0,Alzheimer Detection and Classification Using S...,"Sanchit Vashisht, Bhanu Sharma",2024.0,1,Yes,2024 IEEE International Conference on Informat...
1,Alzheimer Disease Detection of 3D-CNN with SE-...,Et. al R. Hemalatha,2023.0,1,Yes,International Journal on Recent and Innovation...
2,Application of Convolutional Neural Networks f...,"Kumar Swarnkar, Dr.Rajkumar Jhapte, Dr. Abhish...",2024.0,2,Yes,Journal of Electrical Systems
3,Using SVM for Alzheimer’s Disease detection fr...,"R. Kumari, Shivani Goel, Subhranil Das",2022.0,4,Yes,2022 IEEE 21st Mediterranean Electrotechnical ...
4,Speech-based detection of multi-class Alzheime...,"Tripti Tripathi, Rakesh Kumar",2023.0,22,Yes,International Journal of Data Science and Anal...
...,...,...,...,...,...,...
995,Functional and operatorial statistics,"S. Dabo‐Niang, F. Ferraty",2008.0,39,Yes,
996,Multivariate profiling of neurodegeneration-as...,"S. K. Kumarasamy, Yunshi Wang, Vignesh Viswana...",2008.0,3,Yes,BioData Mining
997,"Evolutionary Multi-Criterion Optimization, 5th...",,2009.0,6,Yes,International Conference on Evolutionary Multi...
998,Artificial Neural Networks: Biological Inspira...,"Wlodzislaw Duch, J. Kacprzyk, E. Oja...",2005.0,70,Yes,International Conference on Artificial Neural ...



 Module 1 complete! Results saved to: data/search_results/paper_search_results_Alzheimer_Detection_and_Classification_Using_SVM.json


In [9]:
!pip install ImageHash


Collecting ImageHash
  Downloading ImageHash-4.3.2-py2.py3-none-any.whl.metadata (8.4 kB)
Downloading ImageHash-4.3.2-py2.py3-none-any.whl (296 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.7/296.7 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ImageHash
Successfully installed ImageHash-4.3.2


In [13]:
# removing the duplicates form the dataset
import os
import imagehash
from PIL import Image
from glob import glob

def remove_duplicate_images(folder_paths):
    """
    Remove duplicate images from given folders based on perceptual hash.
    Keeps the first image and deletes subsequent duplicates.
    """
    seen_hashes = {}
    removed = 0

    for folder in folder_paths:
        print(f"\nChecking folder: {folder}")

        image_files = glob(os.path.join(folder, "*"))

        for img_path in image_files:
            try:
                img = Image.open(img_path)

                # Compute perceptual hash
                img_hash = imagehash.phash(img)

                if img_hash in seen_hashes:
                    # Duplicate found → remove file
                    os.remove(img_path)
                    removed += 1
                    print(f"Removed duplicate: {img_path}")
                else:
                    seen_hashes[img_hash] = img_path

            except Exception as e:
                print(f"Error reading {img_path}: {e}")

    print(f"\nTotal duplicates removed: {removed}")


# Folders to check
folders = [
    "/content/dataset/Combined Dataset/Very_Mild_Demented",
    "/content/dataset/Combined Dataset/Mild_Demented",
    "/content/dataset/Combined Dataset/Moderate_Demented",
    "/content/dataset/Combined Dataset/Non_Demented"
]

# Call the function
remove_duplicate_images(folders)



Checking folder: /content/dataset/Combined Dataset/Very_Mild_Demented

Checking folder: /content/dataset/Combined Dataset/Mild_Demented

Checking folder: /content/dataset/Combined Dataset/Moderate_Demented

Checking folder: /content/dataset/Combined Dataset/Non_Demented

Total duplicates removed: 0
