# Table 3 code: scrape article data from Google Scholar (via SerpApi)

In [None]:
import requests
import pandas as pd
import threading
import re
from datetime import datetime
import os

# API key for SerpApi.
# SECURITY: the key used to be hard-coded on this line. A key that has been
# committed to a notebook/repository should be treated as leaked and rotated.
# It is now read from the SERPAPI_API_KEY environment variable instead.
serpapi_api_key = os.environ.get("SERPAPI_API_KEY", "")
if not serpapi_api_key:
    print("Warning: the SERPAPI_API_KEY environment variable is not set.")

# Shared list for Google Scholar articles
articles_google_scholar = []

# Define the output directory
output_dir = r"Your Directory"

os.makedirs(output_dir, exist_ok=True)  # Create the output directory if it doesn't exist

# Lock for thread-safe operations on shared variables
lock = threading.Lock()

# Function to fetch Google Scholar pages, capped at `max_results` articles
def fetch_google_scholar_pages():
    """Fetch Google Scholar results through SerpApi and save them as a CSV.

    Pages through the results for a fixed boolean query, ten results at a
    time, appending one dict per article to the module-level
    ``articles_google_scholar`` list. Paging stops when ``max_results``
    articles were collected, a page comes back without ``organic_results``,
    the HTTP status is not 200, or the request itself fails.

    Whatever was collected is then written to
    ``<output_dir>/TIP_Google_Scholar_Results.csv``.

    Returns:
        None. Results are exposed through ``articles_google_scholar`` and
        the CSV file.
    """
    global articles_google_scholar

    print("Fetching article data from Google Scholar...")

    base_url = "https://serpapi.com/search.json"

    query = '(("Large Language Models" OR "LLM" OR "Artificial Intelligence") AND ("Software Project Management" OR "Software Engineering Management" OR "Software Project Development" OR "IT Project Management" OR "Software Lifecycle Management" OR "Project Management Framework"))'

    # A four-digit year starting with 19 or 20, e.g. "... - Journal, 2021 - site".
    # The previous pattern was malformed and never matched, so every record
    # silently received the current year.
    year_pattern = re.compile(r'\b(?:19|20)\d{2}\b')

    start = 0
    page_size = 10  # Number of articles per page
    max_results = 1000  # Maximum number of articles to fetch
    total_fetched = 0  # Counter for articles fetched

    while total_fetched < max_results:
        params = {
            "engine": "google_scholar",
            "q": query,
            "api_key": serpapi_api_key,
            "start": start,
            "as_vis": 1,  # Exclude citation-only entries from the results
        }

        try:
            # A timeout keeps a stalled connection from hanging the run.
            response = requests.get(base_url, params=params, timeout=30)
            if response.status_code != 200:
                print(f"Failed to fetch data for start={start} from Google Scholar: {response.status_code}, {response.text}")
                break
            results = response.json()
        except (requests.RequestException, ValueError) as exc:
            # Stop paging but still save what was collected so far.
            print(f"Request failed for start={start} from Google Scholar: {exc}")
            break

        organic_results = results.get('organic_results')
        if not organic_results:
            print("No more results available.")
            break

        # Never take more than the remaining budget from this page.
        for article in organic_results[:max_results - total_fetched]:
            title = article.get('title', 'No title available')
            link = article.get('link', 'No link available')
            snippet = article.get('snippet', 'No snippet available')  # Use snippet as a proxy for abstract
            authors_summary = article.get('publication_info', {}).get('summary', 'Not provided')
            year_match = year_pattern.search(authors_summary)
            # NOTE(review): falling back to the current year is kept from the
            # original code; such rows are indistinguishable from real
            # current-year publications.
            year = int(year_match.group(0)) if year_match else datetime.now().year

            articles_google_scholar.append({
                'title': title,
                'authors': authors_summary,
                'source': 'Google Scholar via SerpApi',
                'year': year,
                'doi': link,  # Google Scholar doesn't provide DOIs, using link instead
                'abstract': snippet  # Ensure snippet is taken directly from Google Scholar result
            })

            total_fetched += 1

        start += page_size  # Move to the next page of results

    with lock:
        df_google_scholar = pd.DataFrame(articles_google_scholar)
        df_google_scholar.to_csv(os.path.join(output_dir, "TIP_Google_Scholar_Results.csv"), index=False)

    print("Google Scholar data fetched and saved successfully.")

# Running the fetch using threading
# The worker thread is started and joined immediately, so this cell blocks
# until fetch_google_scholar_pages() has returned.
thread = threading.Thread(target=fetch_google_scholar_pages)
thread.start()
thread.join()

Fetching article data from Google Scholar...
No more results available.
Google Scholar data fetched and saved successfully.


# Table 4 code to clean Table 3 data

In [None]:
import pandas as pd
import re
import os


def filter_dois(input_csv_path, output_dir):
    """Split scraped articles by link type and write the cleaned CSV files.

    The ``doi`` column of the input actually holds the article link. Rows are
    classified in three steps:

    1. Links containing 'book', 'report', 'chapter', 'thesis' or 'news'
       (case-insensitive) are removed.
    2. Remaining rows are split by whether the link contains a DOI
       (``10.<registrant>/``).
    3. Rows without a DOI whose link has a ``.gov`` host label, or a host
       ending in ``.uk``, ``.au`` or ``.ca``, are removed as well.

    Four files are written to ``output_dir``: rows with DOIs, cleaned rows
    without DOIs, all removed rows, and the combination of the first two.

    Args:
        input_csv_path: Path of the CSV to read; must have a ``doi`` column.
        output_dir: Directory for the output files (created if missing).
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Define output file paths
    output_with_dois_csv_path = os.path.join(output_dir, "TIP_google_DOIs.csv")
    output_without_dois_csv_path = os.path.join(output_dir, "TIP_google_Without_DOIs.csv")
    output_removed_csv_path = os.path.join(output_dir, "TIP_google_Removed_articles.csv")
    output_filtered_csv_path = os.path.join(output_dir, "TIP_google_combined.csv")

    # Load the CSV file into a DataFrame
    df = pd.read_csv(input_csv_path)

    # Missing links become empty strings so they match none of the patterns
    # (same outcome as the former na=False) and .str works for any dtype.
    links = df['doi'].fillna('').astype(str)

    # Filter out entries containing 'book', 'report', 'chapter', 'thesis', or 'news' in the 'doi' column
    removal_pattern = r'book|report|chapter|thesis|news'
    is_excluded_type = links.str.contains(removal_pattern, case=False)
    removed_df = df[is_excluded_type]

    # A DOI is "10." + a 4-9 digit registrant code + "/". Requiring the full
    # prefix avoids treating any stray "10." in a URL as a DOI.
    doi_pattern = r'10\.\d{4,9}/'
    has_doi = links.str.contains(doi_pattern)
    with_dois_df = df[~is_excluded_type & has_doi]
    without_dois_df = df[~is_excluded_type & ~has_doi]

    # Save the filtered DataFrame with DOIs to a new CSV file
    with_dois_df.to_csv(output_with_dois_csv_path, index=False)

    # Further filter out entries with government URLs in the remaining entries without DOIs.
    # The lookaheads anchor each suffix to the end of a host label, so
    # "www.cambridge.org" is no longer caught by ".ca" nor ".authors" by ".au".
    gov_pattern = r'\.gov(?=[./:?#]|$)|\.(?:uk|au|ca)(?=[/:?#]|$)'
    is_gov = links.str.contains(gov_pattern, case=False)
    removed_gov_df = df[~is_excluded_type & ~has_doi & is_gov]
    cleaned_without_dois_df = df[~is_excluded_type & ~has_doi & ~is_gov]

    # Combine all removed entries into a single DataFrame
    all_removed_df = pd.concat([removed_df, removed_gov_df])

    # Save the remaining entries without DOIs to a new CSV file
    cleaned_without_dois_df.to_csv(output_without_dois_csv_path, index=False)

    # Save all removed entries to a new CSV file
    all_removed_df.to_csv(output_removed_csv_path, index=False)

    # Combine entries with DOIs and cleaned entries without DOIs
    filtered_articles_df = pd.concat([with_dois_df, cleaned_without_dois_df])

    # Save the combined filtered articles to a new CSV file
    filtered_articles_df.to_csv(output_filtered_csv_path, index=False)

    print(f"Entries with DOIs saved to {output_with_dois_csv_path}")
    print(f"Entries without DOIs saved to {output_without_dois_csv_path}")
    print(f"Removed entries saved to {output_removed_csv_path}")
    print(f"Combined DOI/without DOI articles saved to {output_filtered_csv_path}")


# Define the input file path
# Placeholder path: point it at the TIP_Google_Scholar_Results.csv file
# written by the scraping step.
input_csv_path = r"/content/Your_Directory/TIP_Google_Scholar_Results.csv"
# NOTE: this rebinds the module-level `output_dir` used by the scraping step.
output_dir = r"Cleaned"

# Call the function to filter entries with and without DOIs, and remove specific entries
filter_dois(input_csv_path, output_dir)

Entries with DOIs saved to Cleaned/TIP_google_DOIs.csv
Entries without DOIs saved to Cleaned/TIP_google_Without_DOIs.csv
Removed entries saved to Cleaned/TIP_google_Removed_articles.csv
Combined DOI/without DOI articles saved to Cleaned/TIP_google_combined.csv


**Code to scrape full abstracts for the Google Scholar data.**

In [None]:
!pip install openpyxl beautifulsoup4



In [None]:
import pandas as pd
import requests
import re
import time
from bs4 import BeautifulSoup
from urllib.parse import quote
import os

# Configuration
# input_file: Excel workbook read by enrich_abstracts(); it must contain the
#             columns 'Title', 'Abstract' and 'DOI'.
# output_file: Excel workbook the enriched table is written to.
input_file = r"Full_Abstract_scraping_for_google.xlsx"
output_file = r"Full_Abstracts.xlsx"

# Function to extract DOI from URL
def extract_doi(url):
    """Extract a DOI from a URL if present.

    Publisher URLs often append path or file suffixes to the DOI
    (``.../10.1108/abc/full/html``, ``.../10.1007/xyz.pdf``). Those suffixes
    are not part of the DOI, so they are trimmed from the match.

    Args:
        url: The article link; may be NaN/None or 'No link available'.

    Returns:
        The DOI string, or None when the input is missing or has no DOI.
    """
    if pd.isna(url) or url == 'No link available':
        return None
    doi_pattern = r'10\.\d{4,9}/[-._;()/:A-Z0-9]+'
    match = re.search(doi_pattern, str(url), re.IGNORECASE)
    if not match:
        return None

    doi = match.group(0)
    # Drop trailing URL artefacts: landing-page path segments, file
    # extensions and a trailing slash, possibly several in a row.
    url_suffix = r'(?:/(?:full|html|abstract|pdf|epdf|meta)|\.(?:pdf|html?)|/)+$'
    trimmed = re.sub(url_suffix, '', doi, flags=re.IGNORECASE)
    # Only accept the trimmed form if it is still a complete DOI; otherwise
    # the "suffix" was the DOI's own suffix and the raw match is kept.
    if re.fullmatch(doi_pattern, trimmed, re.IGNORECASE):
        return trimmed
    return doi

# Function to get abstract from CrossRef using DOI
def get_abstract_from_crossref(doi):
    """Look up a DOI on the CrossRef works API and return its abstract.

    Markup tags inside the abstract are removed and the text is stripped.
    Returns None when the status is not 200, the record has no abstract, or
    any exception occurs (the exception is printed, not raised).
    """
    try:
        endpoint = "https://api.crossref.org/works/" + quote(doi, safe='')
        reply = requests.get(
            endpoint,
            headers={'User-Agent': 'Mozilla/5.0 (Educational Research; mailto:research@example.com)'},
            timeout=10,
        )
        if reply.status_code != 200:
            return None

        raw_abstract = reply.json().get('message', {}).get('abstract', '')
        if not raw_abstract:
            return None

        # Remove XML tags if present
        return re.sub(r'<[^>]+>', '', raw_abstract).strip()
    except Exception as e:
        print(f"  CrossRef error for DOI {doi}: {e}")
    return None

# Function to get abstract from Semantic Scholar
def get_abstract_from_semantic_scholar(title):
    """Fetch an abstract from the Semantic Scholar search API by paper title.

    Only the top search hit is requested. Because a relevance search always
    returns *something*, the hit is accepted only when its title resembles
    the requested one; otherwise a generic title could silently pick up the
    abstract of an unrelated paper.

    Args:
        title: Paper title to search for.

    Returns:
        The stripped abstract when the hit matches and its abstract is
        longer than 100 characters, otherwise None. Exceptions are printed
        and result in None.
    """
    import difflib  # local import: only needed for the title comparison

    def _normalize(text):
        # Case-fold and collapse punctuation/whitespace so that formatting
        # differences and "…" truncation markers do not affect matching.
        return re.sub(r'[\W_]+', ' ', str(text).lower()).strip()

    try:
        url = "https://api.semanticscholar.org/graph/v1/paper/search"
        params = {
            'query': title,
            'limit': 1,
            'fields': 'abstract,title'
        }
        headers = {'User-Agent': 'Mozilla/5.0 (Educational Research)'}

        response = requests.get(url, params=params, headers=headers, timeout=10)

        if response.status_code == 200:
            papers = response.json().get('data') or []
            if papers:
                paper = papers[0]
                wanted = _normalize(title)
                found = _normalize(paper.get('title') or '')

                same_paper = False
                if wanted and found:
                    ratio = difflib.SequenceMatcher(None, wanted, found, autojunk=False).ratio()
                    # Containment covers titles the search engine truncated;
                    # the length floor keeps very short titles from matching
                    # everything. Both thresholds are heuristics.
                    truncated_match = len(wanted) >= 20 and wanted in found
                    same_paper = ratio >= 0.85 or truncated_match

                if not same_paper:
                    print("  Semantic Scholar top hit has a different title, ignoring")
                    return None

                abstract = paper.get('abstract') or ''
                if len(abstract) > 100:
                    return abstract.strip()
    except Exception as e:
        print(f"  Semantic Scholar error: {e}")
    return None

# Function to scrape abstract from paper URL
def scrape_abstract_from_url(url):
    """Download the paper page and pull an abstract out of its HTML.

    Candidate locations are tried in order; the first one whose text is
    longer than 100 characters is returned. Returns None when the status is
    not 200, nothing suitable is found, or an exception occurs (printed).
    """
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    abstract_like = re.compile(r'abstract', re.I)
    # (tag name, attribute filter) pairs: metadata tags first, then
    # containers whose class or id mentions "abstract".
    candidates = [
        ('meta', {'name': 'citation_abstract'}),
        ('meta', {'name': 'description'}),
        ('meta', {'property': 'og:description'}),
        ('meta', {'name': 'DC.Description'}),
        ('div', {'class': abstract_like}),
        ('section', {'class': abstract_like}),
        ('div', {'id': abstract_like}),
        ('section', {'id': abstract_like}),
    ]

    try:
        page = requests.get(url, headers=browser_headers, timeout=15)
        if page.status_code != 200:
            return None

        soup = BeautifulSoup(page.content, 'html.parser')
        for tag_name, tag_attrs in candidates:
            node = soup.find(name=tag_name, attrs=tag_attrs)
            if not node:
                continue

            if node.name == 'meta':
                text = node.get('content', '')
            else:
                # Remove common abstract headers
                text = re.sub(r'^abstract:?\s*', '', node.get_text(strip=True), flags=re.IGNORECASE)

            if text and len(text) > 100:
                return text
    except Exception as e:
        print(f"  Scraping error: {e}")
    return None

# Function to get full abstract using multiple methods
def get_full_abstract(row):
    """Try several sources in turn to find a fuller abstract for one article.

    Order: CrossRef (when the link contains a DOI), Semantic Scholar (by
    title), then scraping the link itself. A candidate is accepted only if
    it is longer than the current abstract. Empty spreadsheet cells
    (NaN/None) are treated as missing rather than as the text "nan".

    Args:
        row: Mapping with the keys 'Title', 'DOI' (the article link) and
            'Abstract'.

    Returns:
        Tuple ``(abstract, source_label)``. When nothing better is found the
        original 'Abstract' value is returned unchanged.
    """

    title = row['Title']
    url = row['DOI']
    current_abstract = row['Abstract']

    # Normalise possibly-missing cells once, up front.
    title_text = '' if pd.isna(title) else str(title).strip()
    has_url = (not pd.isna(url)) and bool(str(url).strip()) and url != 'No link available'
    current_len = 0 if pd.isna(current_abstract) else len(str(current_abstract))

    print(f"\nProcessing: {title_text[:60]}...")
    print(f"  Current abstract length: {current_len} chars")

    # If current abstract is already long, keep it
    if current_len > 200:
        print(f"  ✓ Already has a good abstract, skipping")
        return current_abstract, "Original (sufficient)"

    # Method 1: Try DOI via CrossRef
    doi = extract_doi(url) if has_url else None
    if doi:
        print(f"  Trying CrossRef with DOI: {doi}")
        abstract = get_abstract_from_crossref(doi)
        if abstract and len(abstract) > current_len:
            print(f"  ✓ Found via CrossRef ({len(abstract)} chars)")
            return abstract, "CrossRef"
        time.sleep(0.5)  # pause between API calls

    # Method 2: Try Semantic Scholar (pointless without a title to search for)
    if title_text:
        print(f"  Trying Semantic Scholar...")
        abstract = get_abstract_from_semantic_scholar(title_text)
        if abstract and len(abstract) > current_len:
            print(f"  ✓ Found via Semantic Scholar ({len(abstract)} chars)")
            return abstract, "Semantic Scholar"
        time.sleep(0.5)  # pause between API calls

    # Method 3: Try scraping the URL
    if has_url:
        print(f"  Trying to scrape URL...")
        abstract = scrape_abstract_from_url(url)
        if abstract and len(abstract) > current_len:
            print(f"  ✓ Found by scraping ({len(abstract)} chars)")
            return abstract, "Scraped from URL"
        time.sleep(1)  # pause between page fetches

    # Fallback to original
    print(f"  ⚠ Keeping original abstract")
    return current_abstract, "Original (no better found)"

# Main processing function
def enrich_abstracts():
    """Read the input workbook, enrich every abstract and save the result.

    Reads ``input_file``, requires the columns 'Title', 'Abstract' and
    'DOI', and adds two columns: 'Full_Abstract' (best abstract found) and
    'Abstract_Source' (where it came from, or "Error" when processing the
    row raised). The table is written to ``output_file`` and summary
    statistics are printed.

    Returns:
        None. Returns early, without writing, if a required column is missing.
    """
    print("="*70)
    print("ABSTRACT ENRICHMENT TOOL")
    print("="*70)

    # Read the Excel file
    print(f"\nReading Excel file: {input_file}")
    df = pd.read_excel(input_file)
    total = len(df)
    print(f"Found {total} articles")

    # Check required columns
    required_columns = ['Title', 'Abstract', 'DOI']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        print(f"Error: Missing columns: {missing_columns}")
        print(f"Available columns: {list(df.columns)}")
        return

    # Add new columns for enriched data
    df['Full_Abstract'] = ''
    df['Abstract_Source'] = ''

    # Process each row. The progress counter is positional so it stays
    # correct even if the index is not 0..n-1.
    for position, (idx, row) in enumerate(df.iterrows(), start=1):
        print(f"\n[{position}/{total}]", end=" ")
        try:
            full_abstract, source = get_full_abstract(row)
            df.at[idx, 'Full_Abstract'] = full_abstract
            df.at[idx, 'Abstract_Source'] = source
        except Exception as e:
            # Best effort: one bad row must not abort the whole run.
            print(f"  ✗ Error: {e}")
            df.at[idx, 'Full_Abstract'] = row['Abstract']
            df.at[idx, 'Abstract_Source'] = "Error"

    # Save the enriched data to Excel
    df.to_excel(output_file, index=False)

    # Print statistics
    print("\n" + "="*70)
    print("ENRICHMENT STATISTICS")
    print("="*70)
    print("\nAbstract Sources:")
    print(df['Abstract_Source'].value_counts())

    original_lengths = df['Abstract'].str.len()
    enriched_lengths = df['Full_Abstract'].str.len()

    print("\nAbstract Length Statistics:")
    print(f"Original average: {original_lengths.mean():.0f} characters")
    print(f"Enriched average: {enriched_lengths.mean():.0f} characters")

    # Count improvements (kept local: no scratch column is added to df)
    improved_count = (enriched_lengths > original_lengths).sum()
    # Guard against an empty workbook, which used to raise ZeroDivisionError.
    improved_share = improved_count / total * 100 if total else 0.0
    print(f"\nImproved abstracts: {improved_count} out of {total} ({improved_share:.1f}%)")

    print(f"\n✓ Enriched data saved to: {output_file}")
    print("="*70)

# Run the enrichment
if __name__ == "__main__":
    import traceback

    try:
        enrich_abstracts()
    except FileNotFoundError:
        # Reported separately so the user gets a hint about the path setting.
        print(f"Error: Could not find input file: {input_file}")
        print("Please update the 'input_file' path in the script.")
    except Exception as err:
        # Anything else: show the message followed by the full traceback.
        print(f"Error: {err}")
        traceback.print_exc()

ABSTRACT ENRICHMENT TOOL

Reading Excel file: Full_Abstract_scraping_for_google.xlsx
Found 510 articles

[1/510] 
Processing: Towards LLM-augmented multiagent systems for agile software ...
  Current abstract length: 180 chars
  Trying CrossRef with DOI: 10.1145/3691620.3695336
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[2/510] 
Processing: Keynote on Augmented Agile: Human-centred AI-assisted Softwa...
  Current abstract length: 190 chars
  Trying CrossRef with DOI: 10.1145/3643665.3648567
  Trying Semantic Scholar...
  ✓ Found via Semantic Scholar (1011 chars)

[3/510] 
Processing: Future trends in IT project management–large organizations p...
  Current abstract length: 180 chars
  Trying Semantic Scholar...
  ✓ Found via Semantic Scholar (1370 chars)

[4/510] 
Processing: Enhancing software effort estimation through reinforcement l...
  Current abstract length: 177 chars
  Trying CrossRef with DOI: 10.1108/ijmpb-03-2024-0065/full/html
  Tr



  ⚠ Keeping original abstract

[30/510] 
Processing: Quantitative software project management with mixed data: A ...
  Current abstract length: 177 chars
  Trying CrossRef with DOI: 10.1002/smr.2348
  ✓ Found via CrossRef (1247 chars)

[31/510] 
Processing: Proposition of the project management framework through inte...
  Current abstract length: 184 chars
  Trying CrossRef with DOI: 10.1504/IJPD.2009.024183
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[32/510] 
Processing: Innovative governance strategy to enhance the performance an...
  Current abstract length: 191 chars
  Trying CrossRef with DOI: 10.1504/IJPOM.2022.124128
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[33/510] 
Processing: Automation in Software Project Management...
  Current abstract length: 175 chars
  Trying CrossRef with DOI: 10.1108/978-1-80455-143-120231009/full/html
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keep



  ⚠ Keeping original abstract

[62/510] 
Processing: Research on blended learning mode based on network...
  Current abstract length: 180 chars
  Trying CrossRef with DOI: 10.1145/3447490.3447498
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[63/510] 
Processing: Requirement phase effort estimation using software metrics...
  Current abstract length: 189 chars
  Trying CrossRef with DOI: 10.1145/1741906.1742244
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[64/510] 
Processing: Beyond implementation: Driving CRM solution success in the e...
  Current abstract length: 179 chars
  Trying CrossRef with DOI: 10.1177/20438869251360807
  ✓ Found via CrossRef (1094 chars)

[65/510] 
Processing: Project scheduling conflict identification and resolution us...
  Current abstract length: 188 chars
  Trying CrossRef with DOI: 10.1007/s11235-011-9426-3
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping o



  ⚠ Keeping original abstract

[88/510] 
Processing: George Mason University...
  Current abstract length: 171 chars
  Trying CrossRef with DOI: 10.1007/978-3-642-77393-8_11.pdf
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[89/510] 
Processing: Introduction to the Methods Employed...
  Current abstract length: 181 chars
  Trying CrossRef with DOI: 10.1002/9781118574829.ch2
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[90/510] 
Processing: Scalability and ai: An insight on software project managemen...
  Current abstract length: 186 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (927 chars)

[91/510] 
Processing: Agile Project Management Using Large Language Models...
  Current abstract length: 173 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (1304 chars)

[92/510] 
Processing: AI in software engineering: a survey on project manageme



  ⚠ Keeping original abstract

[126/510] 
Processing: Organising Projects for Responsible Use of Generative Artifi...
  Current abstract length: 194 chars
  Trying Semantic Scholar...
  Trying to scrape URL...




  ⚠ Keeping original abstract

[127/510] 
Processing: AI-augmented agile project management in engineering: A fram...
  Current abstract length: 196 chars
  Trying Semantic Scholar...
  ✓ Found via Semantic Scholar (950 chars)

[128/510] 
Processing: Application of Natural Language Processing Techniques in Agi...
  Current abstract length: 188 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (1446 chars)

[129/510] 
Processing: CONSTRUCTION OF MODELS AND APPLICATION OF SYNCRETIC INNOVATI...
  Current abstract length: 182 chars
  Trying Semantic Scholar...
  ✓ Found via Semantic Scholar (1974 chars)

[130/510] 
Processing: Will Artificial Intelligence Replace Project Managers? A Cas...
  Current abstract length: 194 chars
  Trying Semantic Scholar...
  Trying to scrape URL...




  ⚠ Keeping original abstract

[131/510] 
Processing: Artificial Intelligence in Project Management: Project Manag...
  Current abstract length: 189 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[132/510] 
Processing: Agile & AI in IT Project Management-How AI Can Drive IT Proj...
  Current abstract length: 201 chars
  ✓ Already has a good abstract, skipping

[133/510] 
Processing: Using agents for distributed software project management...
  Current abstract length: 185 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (693 chars)

[134/510] 
Processing: Explainable artificial intelligence (XAI) in project managem...
  Current abstract length: 188 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (2064 chars)

[135/510] 
Processing: … for Project Management of Developing a Safety-Oriented Log...
  Current abstract length: 188 chars
  Trying Semantic Scholar...
  Trying to sc



  ⚠ Keeping original abstract

[136/510] 
Processing: AI-Driven Sustainable Project Management Framework...
  Current abstract length: 183 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[137/510] 
Processing: A formal model for software project management...
  Current abstract length: 183 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[138/510] 
Processing: Advancements in Artificial Intelligence for Automated Softwa...
  Current abstract length: 186 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (1062 chars)

[139/510] 
Processing: Software Project Management vs INGO Project Management Tools...
  Current abstract length: 175 chars
  Trying Semantic Scholar...
  Trying to scrape URL...




  ⚠ Keeping original abstract

[140/510] 
Processing: Optimizing IT Governance and Project Management in Software ...
  Current abstract length: 180 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (1282 chars)

[141/510] 
Processing: Examine the Potential of Artificial Intelligence in Enhancin...
  Current abstract length: 183 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[142/510] 
Processing: Enhancing team diversity with generative ai: A novel project...
  Current abstract length: 178 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (1068 chars)

[143/510] 
Processing: Using GenAI in IT project management: case studies, insights...
  Current abstract length: 190 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[144/510] 
Processing: Towards a dynamic ontology based software project management...
  Current abstract length: 179 c



  ⚠ Keeping original abstract

[147/510] 
Processing: A conceptual framework for applying artificial intelligence ...
  Current abstract length: 192 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (975 chars)

[148/510] 
Processing: AI driven strategic decision-making in IT project management...
  Current abstract length: 187 chars
  Trying Semantic Scholar...
  ✓ Found via Semantic Scholar (2088 chars)

[149/510] 
Processing: Importance of artificial intelligence in technology project ...
  Current abstract length: 185 chars
  Trying Semantic Scholar...
  ✓ Found via Semantic Scholar (1743 chars)

[150/510] 
Processing: Learning Software Project Management From Analyzing Q&A's in...
  Current abstract length: 175 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (1324 chars)

[151/510] 
Processing: Enhancing Risk Identification in Software Project Management...
  Current abstract length: 182 chars
  Trying Semantic Sc



  ⚠ Keeping original abstract

[152/510] 
Processing: Artificial Intelligence in Cloud Security: Project Managemen...
  Current abstract length: 198 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[153/510] 
Processing: ADAPTING SOFTWARE PROJECT MANAGEMENT FOR SUSTAINABILITY, ENV...
  Current abstract length: 193 chars
  Trying Semantic Scholar...
  Trying to scrape URL...




  ⚠ Keeping original abstract

[154/510] 
Processing: Ethical and Legal Considerations of AI in IT Project Managem...
  Current abstract length: 190 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (1234 chars)

[155/510] 
Processing: OPTIMIZING RESOURCE ALLOCATION AND SCHEDULING STRATEGIES IN ...
  Current abstract length: 183 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (2126 chars)

[156/510] 
Processing: AI-Driven Project Management Systems: Enhancing IT Project E...
  Current abstract length: 182 chars
  Trying Semantic Scholar...
  ✓ Found via Semantic Scholar (1274 chars)

[157/510] 
Processing: Navigating the Future of IT Project Management: From Global ...
  Current abstract length: 187 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (1047 chars)

[158/510] 
Processing: The impact of artificial intelligence on project management:...
  Current abstract length: 182 chars
  



  ⚠ Keeping original abstract

[160/510] 
Processing: An LLM-based multi-agent framework for agile effort estimati...
  Current abstract length: 183 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (1186 chars)

[161/510] 
Processing: On Characteristics and Process Requirements of Artificial In...
  Current abstract length: 184 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (908 chars)

[162/510] 
Processing: GenAI-Enabled Backlog Grooming in Agile Software Projects: A...
  Current abstract length: 178 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (1009 chars)

[163/510] 
Processing: Exploring the Landscape of Multicriteria Decision Making in ...
  Current abstract length: 177 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[164/510] 
Processing: Symbio Coding: An Educational Testbed for AI Agent Simulatio...
  Current abstract length: 1



  ⚠ Keeping original abstract

[174/510] 
Processing: The influence of agile methodology (Scrum) on software proje...
  Current abstract length: 183 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (754 chars)

[175/510] 
Processing: Using AI to develop a framework to prevent employees from mi...
  Current abstract length: 192 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[176/510] 
Processing: Augmented agile: Human-centered ai-assisted software managem...
  Current abstract length: 190 chars
  Trying Semantic Scholar...
  ✓ Found via Semantic Scholar (624 chars)

[177/510] 
Processing: Impact of artificial intelligence (AI) on information techno...
  Current abstract length: 191 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[178/510] 
Processing: A Business Process Model for Augmented Artificial Intelligen...
  Current abstract length: 166 chars
  Trying Seman



  ⚠ Keeping original abstract

[179/510] 
Processing: Project Management Framework for Digitalization of Logistics...
  Current abstract length: 195 chars
  Trying Semantic Scholar...
  ✓ Found via Semantic Scholar (1243 chars)

[180/510] 
Processing: Big data and cloud computing in IT project management: A fra...
  Current abstract length: 184 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[181/510] 
Processing: Exploration of artificial intelligence utilization potential...
  Current abstract length: 190 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (615 chars)

[182/510] 
Processing: An experience-based approach to software project management...
  Current abstract length: 182 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (1146 chars)

[183/510] 
Processing: The last state of artificial intelligence in project managem...
  Current abstract length: 194 chars
  Trying 



  ⚠ Keeping original abstract

[200/510] 
Processing: Setting the Stage...
  Current abstract length: 195 chars
  Trying Semantic Scholar...
  Trying to scrape URL...




  ⚠ Keeping original abstract

[201/510] 
Processing: Agentic ai software engineer: Programming with trust...
  Current abstract length: 184 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (748 chars)

[202/510] 
Processing: AI-Driven Budget Estimation in End-User Software Engineering...
  Current abstract length: 195 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[203/510] 
Processing: Perspectives on the adherance to scrum rules in software pro...
  Current abstract length: 192 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (248 chars)

[204/510] 
Processing: Modern convergent challenges in IT project management for CR...
  Current abstract length: 181 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[205/510] 
Processing: Estimation issues in software project management...
  Current abstract length: 181 chars
  Trying Semantic



  ⚠ Keeping original abstract

[206/510] 
Processing: The Influence of Artificial Intelligence on Project Manageme...
  Current abstract length: 186 chars
  Trying Semantic Scholar...
  Trying to scrape URL...




  ⚠ Keeping original abstract

[207/510] 
Processing: Prototyping an ethical issue decision support tool for user ...
  Current abstract length: 185 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (221 chars)

[208/510] 
Processing: Evolutionary Pathway: Agile Frameworks In It Project Managem...
  Current abstract length: 192 chars
  Trying Semantic Scholar...
  ✓ Found via Semantic Scholar (1365 chars)

[209/510] 
Processing: Combining techniques to optimize effort predictions in softw...
  Current abstract length: 186 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[210/510] 
Processing: Potential implications of artificial intelligence for projec...
  Current abstract length: 197 chars
  Trying Semantic Scholar...
  Trying to scrape URL...




  ⚠ Keeping original abstract

[211/510] 
Processing: The Influence of Kanban Agile Methodology on Software Projec...
  Current abstract length: 178 chars
  Trying Semantic Scholar...
  ✓ Found via Semantic Scholar (1412 chars)

[212/510] 
Processing: Applying Lean to improve software project management educati...
  Current abstract length: 178 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (992 chars)

[213/510] 
Processing: Software Project Management Challenges with Information Secu...
  Current abstract length: 185 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[214/510] 
Processing: Cybersecurity Transformation: Cyber-Resilient IT Project Man...
  Current abstract length: 186 chars
  Trying Semantic Scholar...
  ✓ Found via Semantic Scholar (1538 chars)

[215/510] 
Processing: Monolithic ontological methodology (MOM): An effective softw...
  Current abstract length: 179 chars
  Trying Semantic Scholar



  ⚠ Keeping original abstract

[220/510] 
Processing: Innovation in IT project management for banking systems...
  Current abstract length: 186 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[221/510] 
Processing: Enhancing healthcare outcomes via agile IT project managemen...
  Current abstract length: 172 chars
  Trying Semantic Scholar...
  Trying to scrape URL...




  ⚠ Keeping original abstract

[222/510] 
Processing: Critical IT project management competencies: Aligning instru...
  Current abstract length: 188 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[223/510] 
Processing: A Study on the effectiveness of intelligent decision support...
  Current abstract length: 185 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (248 chars)

[224/510] 
Processing: Context-aware mobile assistant agents in software project ma...
  Current abstract length: 197 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (810 chars)

[225/510] 
Processing: An integrating machine learning algorithm and simulation met...
  Current abstract length: 182 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  Scraping error: HTTPSConnectionPool(host='journals.shahed.ac.ir', port=443): Max retries exceeded with url: /article_3542.html (Caused by ConnectTimeo



  ⚠ Keeping original abstract

[233/510] 
Processing: PM-Net: a software project management representation model...
  Current abstract length: 185 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[234/510] 
Processing: A soft computing framework to evaluate the efficacy of softw...
  Current abstract length: 182 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (744 chars)

[235/510] 
Processing: An IT Project Management Framework for Assessing the Dynamis...
  Current abstract length: 187 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[236/510] 
Processing: Excellence in IT Project Management: Firing Agile Silver Bul...
  Current abstract length: 185 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[237/510] 
Processing: An evaluation of expert systems for software engineering man...
  Current abstract length: 181 chars
  T



  ⚠ Keeping original abstract

[250/510] 
Processing: Application of the cognitive approach for IT project managem...
  Current abstract length: 179 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (519 chars)

[251/510] 
Processing: Green Risk Management: Integrating Sustainability into IT Pr...
  Current abstract length: 184 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[252/510] 
Processing: BUSINESS ETHICS AND IT PROJECT MANAGEMENT IN THE PROCESS OF ...
  Current abstract length: 187 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (2064 chars)

[253/510] 
Processing: Behavioral pattern recognition and knowledge extraction for ...
  Current abstract length: 183 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (696 chars)

[254/510] 
Processing: An integrated multi-agent-based simulation approach to suppo...
  Current abstract length: 18



  ⚠ Keeping original abstract

[259/510] 
Processing: Object-oriented database support for software project manage...
  Current abstract length: 179 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[260/510] 
Processing: IT project management: Lessons learned from project retrospe...
  Current abstract length: 179 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[261/510] 
Processing: Higher-order learning outcomes in an undergraduate IT projec...
  Current abstract length: 177 chars
  Trying Semantic Scholar...
  ✓ Found via Semantic Scholar (1072 chars)

[262/510] 
Processing: Design and development of an expert system based quality ass...
  Current abstract length: 188 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[263/510] 
Processing: Optimizing software project management staffing and work-for...
  Current abstract length: 177 chars
  Trying Semantic



  ⚠ Keeping original abstract

[265/510] 
Processing: Consulting the oracle: A future role for expert systems in I...
  Current abstract length: 180 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[266/510] 
Processing: Experiential and Peer Learning in an IT Project Management C...
  Current abstract length: 189 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[267/510] 
Processing: Fuzzy Logic System for IT Project Management....
  Current abstract length: 184 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (246 chars)

[268/510] 
Processing: BASIC TECHNOLOGIES AND TECHNIQUES ML/AI FOR IMPROVING PHYSIC...
  Current abstract length: 192 chars
  Trying Semantic Scholar...
  Trying to scrape URL...




  ⚠ Keeping original abstract

[269/510] 
Processing: Adapting IT Project Management Practices to Globally Distrib...
  Current abstract length: 182 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[270/510] 
Processing: The potential of artificial intelligence in it project portf...
  Current abstract length: 196 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (1029 chars)

[271/510] 
Processing: An Ontology Model for Software Measurement in Software Proje...
  Current abstract length: 177 chars
  Trying Semantic Scholar...
  Trying to scrape URL...




  ⚠ Keeping original abstract

[272/510] 
Processing: Adaptive Hybrid Software Project Management in Bangladesh's ...
  Current abstract length: 181 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[273/510] 
Processing: Digital Transformation in the USA Leveraging AI and Business...
  Current abstract length: 189 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (1721 chars)

[274/510] 
Processing: An object-oriented formal model for software project managem...
  Current abstract length: 185 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (1267 chars)

[275/510] 
Processing: Fuzzy Expert-COCOMO risk assessment and effort contingency m...
  Current abstract length: 165 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[276/510] 
Processing: Defect classification as problem classification for quality ...
  Current abstract length: 188 c



  ⚠ Keeping original abstract

[278/510] 
Processing: Software project management for software assurance...
  Current abstract length: 184 chars
  Trying Semantic Scholar...
  Trying to scrape URL...




  ⚠ Keeping original abstract

[279/510] 
Processing: Applying gamification for mindset changing in automotive sof...
  Current abstract length: 186 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[280/510] 
Processing: Agile Methodologies in Software Project Management: Benefits...
  Current abstract length: 178 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (1090 chars)

[281/510] 
Processing: Learning Software Project Management on the Web: The Impact ...
  Current abstract length: 185 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (960 chars)

[282/510] 
Processing: CS1021–SOFTWARE PROJECT MANAGEMENT...
  Current abstract length: 180 chars
  Trying Semantic Scholar...
  Trying to scrape URL...




  ⚠ Keeping original abstract

[283/510] 
Processing: The client as a real option in successful software project m...
  Current abstract length: 192 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (1240 chars)

[284/510] 
Processing: Inspirational intuition and innovation in it project managem...
  Current abstract length: 194 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (246 chars)

[285/510] 
Processing: Methods for Assessing the Effectiveness of IT Project Manage...
  Current abstract length: 185 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[286/510] 
Processing: Software project management net: a new methodology on softwa...
  Current abstract length: 180 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[287/510] 
Processing: Committee member...
  Current abstract length: 189 chars
  Trying Semantic Scholar...
  Trying to



  ⚠ Keeping original abstract

[293/510] 
Processing: Modern Metrics (MM): Software size estimation using function...
  Current abstract length: 179 chars
  Trying Semantic Scholar...
  Trying to scrape URL...




  ⚠ Keeping original abstract

[294/510] 
Processing: Teaching simulations supported by artificial intelligence in...
  Current abstract length: 182 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[295/510] 
Processing: Large language models for issue report classification...
  Current abstract length: 188 chars
  Trying Semantic Scholar...
  Trying to scrape URL...




  ⚠ Keeping original abstract

[296/510] 
Processing: Predictive Maintenance Strategies for Heat Exchangers Applie...
  Current abstract length: 183 chars
  Trying Semantic Scholar...
  ✓ Found via Semantic Scholar (939 chars)

[297/510] 
Processing: Performance evaluation of IT project management in developin...
  Current abstract length: 188 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (1145 chars)

[298/510] 
Processing: Implementing Artificial Intelligence in IT Management: Oppor...
  Current abstract length: 189 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (1304 chars)

[299/510] 
Processing: A model for enhancing software project management using soft...
  Current abstract length: 180 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[300/510] 
Processing: WAYS TO IMPROVE THE EFFICIENCY OF IT PROJECT MANAGEMENT PROC...
  Current abstract length: 190 chars
  Trying



  ⚠ Keeping original abstract

[305/510] 
Processing: Towards Next Generation Project Management Simulation...
  Current abstract length: 185 chars
  Trying Semantic Scholar...
  Trying to scrape URL...




  ⚠ Keeping original abstract

[306/510] 
Processing: Software Team Formation for software project management by N...
  Current abstract length: 181 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[307/510] 
Processing: Rank–...
  Current abstract length: 197 chars
  Trying Semantic Scholar...
  Trying to scrape URL...




  ⚠ Keeping original abstract

[308/510] 
Processing: Enhanced software project management by application of metri...
  Current abstract length: 170 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (547 chars)

[309/510] 
Processing: Machine Learning applied to manage effort estimation and req...
  Current abstract length: 188 chars
  Trying Semantic Scholar...
  Trying to scrape URL...




  ⚠ Keeping original abstract

[310/510] 
Processing: IT project success: The evaluation of 142 success factors by...
  Current abstract length: 190 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[311/510] 
Processing: Project of an information system for restaurant business man...
  Current abstract length: 188 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[312/510] 
Processing: The project management practice of the development of the ap...
  Current abstract length: 170 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (505 chars)

[313/510] 
Processing: An Ant Colony Optimization Algorithm for Software Project Ma...
  Current abstract length: 180 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (463 chars)

[314/510] 
Processing: The state of knowledge and the use of creativity methods in ...
  Current abstract length: 184 cha



  ⚠ Keeping original abstract

[320/510] 
Processing: How Artificial Intelligence Influences Project Management...
  Current abstract length: 188 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[321/510] 
Processing: A novel prototype tool for intelligent software project sche...
  Current abstract length: 183 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (1353 chars)

[322/510] 
Processing: Software project schedule management using machine learning ...
  Current abstract length: 194 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[323/510] 
Processing: Investigating Critical Success Factors for Effective Managem...
  Current abstract length: 191 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (1401 chars)

[324/510] 
Processing: Coupling Artificial Intelligence and a System Dynamics Simul...
  Current abstract length: 180 char



  ⚠ Keeping original abstract

[334/510] 
Processing: Role of artificial intelligence in business process transfor...
  Current abstract length: 172 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[335/510] 
Processing: Web Application for Simulation of Agile Software Projects–St...
  Current abstract length: 192 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (989 chars)

[336/510] 
Processing: AI in Risk Management...
  Current abstract length: 189 chars
  Trying Semantic Scholar...
  ✓ Found via Semantic Scholar (2042 chars)

[337/510] 
Processing: Contructing and researching on software engineering course g...
  Current abstract length: 182 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (731 chars)

[338/510] 
Processing: Explicit specification framework to manage software project ...
  Current abstract length: 185 chars
  Trying Semantic Scholar...
  Trying to scrape 



  ⚠ Keeping original abstract

[349/510] 
Processing: Consolidation of the ifm with the jssp through neural networ...
  Current abstract length: 181 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (827 chars)

[350/510] 
Processing: A SWOT Analysis of the Role of Artificial Intelligence in Pr...
  Current abstract length: 185 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[351/510] 
Processing: AI-Driven Project Management for Construction SMEs: A Framew...
  Current abstract length: 180 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (1102 chars)

[352/510] 
Processing: Recommendation system for it software project planning: a hy...
  Current abstract length: 190 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (840 chars)

[353/510] 
Processing: Automatic Software Defect Assignment Method Based on Deep Le...
  Current abstract length: 18



  ⚠ Keeping original abstract

[355/510] 
Processing: Software project control: an experimental investigation of j...
  Current abstract length: 185 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (1081 chars)

[356/510] 
Processing: Software engineering expert system for global development...
  Current abstract length: 178 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (460 chars)

[357/510] 
Processing: GENERATIVE AI IN ENGINEERING EDUCATION FOR LEARNING AND SUPP...
  Current abstract length: 193 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (2299 chars)

[358/510] 
Processing: Software project risk management modeling with neural networ...
  Current abstract length: 186 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (1182 chars)

[359/510] 
Processing: Re-imagining Digital Transformation in the United States: Ha...
  Current abstract length:



  ⚠ Keeping original abstract

[365/510] 
Processing: Open source software to teach technology entrepreneurship co...
  Current abstract length: 188 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (1663 chars)

[366/510] 
Processing: Bibliometric analysis of the application of artificial intel...
  Current abstract length: 185 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[367/510] 
Processing: An investigation of optimal project scheduling and team staf...
  Current abstract length: 189 chars
  Trying Semantic Scholar...
  ✓ Found via Semantic Scholar (1198 chars)

[368/510] 
Processing: Examining the Impact of Artificial Intelligence Adoption on ...
  Current abstract length: 186 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (1576 chars)

[369/510] 
Processing: Automatic Creation of a Countermeasure Plan against Process ...
  Current abstract length: 185 chars
  Tryin



  ⚠ Keeping original abstract

[370/510] 
Processing: Analyzing and Resolving Issues in Software Project Risk Mana...
  Current abstract length: 181 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[371/510] 
Processing: EFFECT OF SOFT SKILLS ON SUCCESS OF INFORMATION TECHNOLOGY P...
  Current abstract length: 186 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[372/510] 
Processing: CombiningKnowledge Based System withInformation Technology b...
  Current abstract length: 188 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (242 chars)

[373/510] 
Processing: DIGITALIZATION OF PROJECT MANAGEMENT IN THE IT SECTOR: CASE ...
  Current abstract length: 183 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (836 chars)

[374/510] 
Processing: Data science and artificial intelligence in project manageme...
  Current abstract length: 194 cha



  ⚠ Keeping original abstract

[375/510] 
Processing: The Role of Artificial Intelligence in Managing Scientific R...
  Current abstract length: 179 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[376/510] 
Processing: Advancing lean construction through Artificial Intelligence:...
  Current abstract length: 192 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[377/510] 
Processing: Analysis of project management software...
  Current abstract length: 191 chars
  Trying Semantic Scholar...
  Trying to scrape URL...




  ⚠ Keeping original abstract

[378/510] 
Processing: Project Quality Management for the HL-LHC IT String...
  Current abstract length: 177 chars
  Trying Semantic Scholar...
  ✓ Found via Semantic Scholar (1228 chars)

[379/510] 
Processing: MOK, Heng Ngee; and LAUW, Hady Wirawan. Towards automated sl...
  Current abstract length: 189 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[380/510] 
Processing: Estudio del Uso de Herramientas de IA para la Gestión de Pro...
  Current abstract length: 182 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (1092 chars)

[381/510] 
Processing: Using blended project-based learning to teach project manage...
  Current abstract length: 183 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[382/510] 
Processing: An intelligent model for software project risk prediction...
  Current abstract length: 189 chars
  Trying Semantic Schola



  ⚠ Keeping original abstract

[386/510] 
Processing: Inferring Soft Skills from Developer Interactions: A Data-Dr...
  Current abstract length: 193 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[387/510] 
Processing: Refining the Scrum Paradigm: A Comprehensive Research of Sof...
  Current abstract length: 173 chars
  Trying Semantic Scholar...
  Trying to scrape URL...




  ⚠ Keeping original abstract

[388/510] 
Processing: Implementing artificial intelligence tools for risk manageme...
  Current abstract length: 199 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (1578 chars)

[389/510] 
Processing: Delving into human factors through lstm by navigating enviro...
  Current abstract length: 188 chars
  Trying Semantic Scholar...
  ✓ Found via Semantic Scholar (1322 chars)

[390/510] 
Processing: Generative job recommendations with large language model...
  Current abstract length: 169 chars
  Trying Semantic Scholar...
  ✓ Found via Semantic Scholar (1884 chars)

[391/510] 
Processing: Leveraging generative AI tools for UX design in lean and agi...
  Current abstract length: 183 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (842 chars)

[392/510] 
Processing: Bridging Disciplines through Visualization: Managing a VR Mu...
  Current abstract length: 194 chars
  Trying Semantic Schola



  ⚠ Keeping original abstract

[393/510] 
Processing: A 4-Layered Plan-Driven Model (4LPdM) to Improve Software De...
  Current abstract length: 175 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[394/510] 
Processing: Digital Twins and BIM Technologies for Industrial Systems Ma...
  Current abstract length: 196 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (1281 chars)

[395/510] 
Processing: On multi-criteria decision-making methods in finance using e...
  Current abstract length: 179 chars
  Trying Semantic Scholar...
  ✓ Found via Semantic Scholar (1107 chars)

[396/510] 
Processing: A new organizational chassis for artificial intelligence-exp...
  Current abstract length: 192 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[397/510] 
Processing: Representing IT projects risk management best practices as a...
  Current abstract length: 178 chars
  Trying Sem



  ⚠ Keeping original abstract

[403/510] 
Processing: INFORMATION TECHNOLOGIES...
  Current abstract length: 196 chars
  Trying Semantic Scholar...
  Trying to scrape URL...




  ⚠ Keeping original abstract

[404/510] 
Processing: UNVEILING KEY FACTORS FOR SUCCESSFUL IT PROJECTS IN KAZAKHST...
  Current abstract length: 182 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[405/510] 
Processing: Artificial Intelligence in Information Technology Tools and ...
  Current abstract length: 187 chars
  Trying Semantic Scholar...
  Trying to scrape URL...




  ⚠ Keeping original abstract

[406/510] 
Processing: Project planning and risk management as a success factor for...
  Current abstract length: 181 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[407/510] 
Processing: Methodological Approach for Choosing Type of IT Projects Man...
  Current abstract length: 177 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (660 chars)

[408/510] 
Processing: Project Management in the Age of Artificial Intelligence...
  Current abstract length: 195 chars
  Trying Semantic Scholar...
  Trying to scrape URL...




  ⚠ Keeping original abstract

[409/510] 
Processing: Odessa II Mechnikov National University, Odesa, Ukraine e-ma...
  Current abstract length: 181 chars
  Trying Semantic Scholar...
  Trying to scrape URL...




  ⚠ Keeping original abstract

[410/510] 
Processing: Strategizing VR Integration in Business and Education: Exten...
  Current abstract length: 178 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[411/510] 
Processing: Multi-Criteria Resource Allocation Approach to Artificial In...
  Current abstract length: 189 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[412/510] 
Processing: Single goal set: A new paradigm for IT megaproject success...
  Current abstract length: 184 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (828 chars)

[413/510] 
Processing: The effects of project management certification on the tripl...
  Current abstract length: 181 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[414/510] 
Processing: A Proportional Analysis Study on Function Point Based Softwa...
  Current abstract length: 184 chars
  T



  ⚠ Keeping original abstract

[416/510] 
Processing: Modelling Software Development Effort Using Data-Driven Mode...
  Current abstract length: 191 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[417/510] 
Processing: Multi-Agent Spiral Software Engineering: a lakatosian approa...
  Current abstract length: 178 chars
  Trying Semantic Scholar...
  Trying to scrape URL...




  ⚠ Keeping original abstract

[418/510] 
Processing: Improving Dijkstra's algorithm for Estimating Project Charac...
  Current abstract length: 186 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (1344 chars)

[419/510] 
Processing: Role of Green Project Management on Construction Industry...
  Current abstract length: 195 chars
  Trying Semantic Scholar...
  Trying to scrape URL...




  ⚠ Keeping original abstract

[420/510] 
Processing: Constraint-based recommendation for software project effort ...
  Current abstract length: 176 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[421/510] 
Processing: A Lakatosian Approach to Agent Software Engineering...
  Current abstract length: 178 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (242 chars)

[422/510] 
Processing: The evident use of evidence theory in big data analytics usi...
  Current abstract length: 192 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (1076 chars)

[423/510] 
Processing: AI AND BUSINESS ANALYTICS IN POST-PANDEMIC US DIGITAL TRANSF...
  Current abstract length: 184 chars
  Trying Semantic Scholar...
  Trying to scrape URL...




  ⚠ Keeping original abstract

[424/510] 
Processing: A hybrid fuzzy-Ann approach For software effort estimation...
  Current abstract length: 192 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (249 chars)

[425/510] 
Processing: Research on Measuring Method of Function Size of Information...
  Current abstract length: 187 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (574 chars)

[426/510] 
Processing: Digital management methodology for building production optim...
  Current abstract length: 180 chars
  Trying Semantic Scholar...
  ✓ Found via Semantic Scholar (1410 chars)

[427/510] 
Processing: Question Crafting System for Personalized Learning using Lar...
  Current abstract length: 180 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (1025 chars)

[428/510] 
Processing: INTEGRATING NETWORK INTRUSION DETECTIONWITH MACHINE LEARNING...
  Current abstract length: 188 chars
  Tryi



  ⚠ Keeping original abstract

[444/510] 
Processing: Measurement, Analysis with Visualization for better Reliabil...
  Current abstract length: 185 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (248 chars)

[445/510] 
Processing: A thorough Examination of a Software Effort Estimating Model...
  Current abstract length: 185 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[446/510] 
Processing: Knowledge acquisition for an integrated project management s...
  Current abstract length: 185 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[447/510] 
Processing: Perspectives on a Cybersecurity Governance Framework Integra...
  Current abstract length: 178 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (1260 chars)

[448/510] 
Processing: Predicting the Early Stage Software Development Effort using...
  Current abstract length: 191 ch



  ⚠ Keeping original abstract

[450/510] 
Processing: Business viability of potential software projects using arti...
  Current abstract length: 198 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (250 chars)

[451/510] 
Processing: Development of Simple Effort Estimation Model based on Fuzzy...
  Current abstract length: 185 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (247 chars)

[452/510] 
Processing: Implementation Framework for a Maritime Transport Navigation...
  Current abstract length: 177 chars
  Trying Semantic Scholar...
  Trying to scrape URL...




  ⚠ Keeping original abstract

[453/510] 
Processing: LOCAL SENTIMENTS, GLOBAL IMPACT: THE CASE OF TIMISOARA AS TH...
  Current abstract length: 180 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[454/510] 
Processing: A novel case base reasoning and frequent pattern based decis...
  Current abstract length: 183 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  Scraping error: HTTPSConnectionPool(host='www.karlancer.com', port=443): Max retries exceeded with url: /api/file/1616065000.pdf (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7bc7f6ae7740>, 'Connection to www.karlancer.com timed out. (connect timeout=15)'))
  ⚠ Keeping original abstract

[455/510] 
Processing: A Preliminary Performance Eualuation Of Machinelearing Algor...
  Current abstract length: 185 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (249 chars)

[456/510] 
Processing: Improving the Accur



  ⚠ Keeping original abstract

[460/510] 
Processing: Quality of project management education and training program...
  Current abstract length: 181 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[461/510] 
Processing: Innovation Practice and Ability Cultivation of Software Engi...
  Current abstract length: 185 chars
  Trying Semantic Scholar...
  ✓ Found via Semantic Scholar (788 chars)

[462/510] 
Processing: A novel game theoretic algorithm for project selection under...
  Current abstract length: 191 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (1122 chars)

[463/510] 
Processing: Theorizing IT project success: Direct and indirect effects i...
  Current abstract length: 188 chars
  Trying Semantic Scholar...
  ✓ Found via Semantic Scholar (1357 chars)

[464/510] 
Processing: Artificial Intelligence: an astrodynamicits's perspective...
  Current abstract length: 183 chars
  Trying Semantic Scholar...



  ⚠ Keeping original abstract

[475/510] 
Processing: New direction of nuclear code development: artificial intell...
  Current abstract length: 186 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[476/510] 
Processing: Positive psychology in information technology project manage...
  Current abstract length: 190 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[477/510] 
Processing: Rightsizing project management for libraries...
  Current abstract length: 175 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (677 chars)

[478/510] 
Processing: Application of Improved CNN Technology in Medical Imaging Co...
  Current abstract length: 186 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (1193 chars)

[479/510] 
Processing: ARTIFICIAL INTELLIGENCE IN DERMATOLOGY: CURRENT APPLICATIONS...
  Current abstract length: 194 chars
  Trying Sem



  ⚠ Keeping original abstract

[485/510] 
Processing: An Iterative Pixel-Based Dimensional Voting Model for High S...
  Current abstract length: 192 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (1564 chars)

[486/510] 
Processing: Integrating uncertainty in software effort estimation using ...
  Current abstract length: 195 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[487/510] 
Processing: POSSIBILITIES OF APPLICATION OF ARTIFICIAL INTELLIGENCE IN B...
  Current abstract length: 192 chars
  Trying Semantic Scholar...
  Trying to scrape URL...




  ⚠ Keeping original abstract

[488/510] 
Processing: e-Waste Management Awareness Program in Solomon Island: A Pr...
  Current abstract length: 179 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[489/510] 
Processing: Lean IT with value stream mapping analysis: A case study in ...
  Current abstract length: 188 chars
  Trying Semantic Scholar...
  ✓ Found via Semantic Scholar (1084 chars)

[490/510] 
Processing: Research on software quality assurance based on software qua...
  Current abstract length: 189 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (482 chars)

[491/510] 
Processing: Efficient software test management system based on microserv...
  Current abstract length: 181 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (606 chars)

[492/510] 
Processing: Replacing project managers in information technology project...
  Current abstract length: 188 chars
  Trying 



  ⚠ Keeping original abstract

[498/510] 
Processing: Search UTHealth Houston...
  Current abstract length: 185 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (632 chars)

[499/510] 
Processing: Integrated clinician decision supporting system for pneumoni...
  Current abstract length: 191 chars
  Trying Semantic Scholar...
  ✓ Found via Semantic Scholar (1190 chars)

[500/510] 
Processing: Privacy Concerns and Data Donations: Do Societal Benefits Ma...
  Current abstract length: 175 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[501/510] 
Processing: THE SCOPE OPTIMIZATION OF SOFTWARE PROJECTS USING MODERN SOF...
  Current abstract length: 174 chars
  Trying Semantic Scholar...
  Trying to scrape URL...




  ⚠ Keeping original abstract

[502/510] 
Processing: Measuring the productivity of computer systems development a...
  Current abstract length: 192 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (449 chars)

[503/510] 
Processing: Large-Scale Agile Frameworks: Agile Frameworks, Agile Infras...
  Current abstract length: 188 chars
  Trying Semantic Scholar...
  Trying to scrape URL...




  ⚠ Keeping original abstract

[504/510] 
Processing: Software risk management...
  Current abstract length: 171 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[505/510] 
Processing: Risk Management Models of a Project for the Implementation o...
  Current abstract length: 175 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[506/510] 
Processing: Beyond the Plan: The Psychology of Project Management...
  Current abstract length: 182 chars
  Trying Semantic Scholar...
  ✓ Found via Semantic Scholar (1538 chars)

[507/510] 
Processing: PENINGKATAN KOMPETENSI TEKNIK JARINGAN KOMPUTER DAN TELEKOMU...
  Current abstract length: 193 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ Keeping original abstract

[508/510] 
Processing: The Evolution of Project Management: Embracing AI, Sustainab...
  Current abstract length: 189 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ⚠ K



  ⚠ Keeping original abstract

[510/510] 
Processing: Software Selection by Using the Takagi-Sugeno Algorithm on t...
  Current abstract length: 182 chars
  Trying Semantic Scholar...
  Trying to scrape URL...
  ✓ Found by scraping (763 chars)

ENRICHMENT STATISTICS

Abstract Sources:
Abstract_Source
Original (no better found)    268
Scraped from URL              166
Semantic Scholar               54
CrossRef                       19
Original (sufficient)           3
Name: count, dtype: int64

Abstract Length Statistics:
Original average: 184 characters
Enriched average: 636 characters

Improved abstracts: 238 out of 510 (46.7%)

✓ Enriched data saved to: Full_Abstracts.xlsx


In [None]:
pwd

'/content'

# **Data concatenation code from SPARK Table 5**

code to convert wos.xls to wos.csv

In [None]:
import pandas as pd

# Source workbook and destination CSV, both relative to the working directory
xls_file, csv_file = "wos.xls", "wos.csv"

# Load the workbook; with no sheet specified only the first sheet is read
df = pd.read_excel(xls_file)

# Write the table as UTF-8 CSV, leaving the DataFrame index out of the file
df.to_csv(path_or_buf=csv_file, encoding='utf-8', index=False)

print(f"Converted {xls_file} to {csv_file}")


Converted wos.xls to wos.csv


code to count total meta-data values.

In [None]:
!rm *.csv

In [None]:
import os
import pandas as pd

directory = r"/content"

def combine_csv_files(directory):
    """Print a per-file report for every CSV in *directory* and sanity-check the merge.

    For each file whose name ends in ``.csv`` the function prints its shape,
    its column names, the number of rows that are not entirely empty and the
    number of fully duplicated rows. It then concatenates all files in memory
    and prints the combined row count next to the sum of the per-file counts.
    Nothing is written to disk and nothing is returned.

    Args:
        directory: Folder to scan (sub-folders are not visited).
    """
    all_dataframes = []
    total_expected = 0

    # os.listdir returns names in arbitrary order; sort so the report is stable across runs.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.csv'):
            continue

        file_path = os.path.join(directory, filename)
        print(f"\n--- Processing {filename} ---")

        # Read and analyze each file
        df = pd.read_csv(file_path)
        print(f"Shape: {df.shape}")
        print(f"Columns: {list(df.columns)}")
        print(f"Non-null rows: {df.dropna(how='all').shape[0]}")
        print(f"Duplicates within file: {df.duplicated().sum()}")

        total_expected += len(df)
        all_dataframes.append(df)

    print("\n=== SUMMARY ===")
    print(f"Total expected: {total_expected}")
    print(f"Files processed: {len(all_dataframes)}")

    if not all_dataframes:
        # pd.concat raises ValueError on an empty list, so stop with a clear message instead.
        print("No CSV files found; nothing to combine.")
        return

    # Combine without any modifications
    combined_df = pd.concat(all_dataframes, ignore_index=True)
    print(f"Actual combined: {len(combined_df)}")
    print(f"Difference: {len(combined_df) - total_expected}")

combine_csv_files(directory)


--- Processing scopus_SPARK.csv ---
Shape: (4083, 5)
Columns: ['title', 'authors', 'doi', 'year', 'abstract']
Non-null rows: 4083
Duplicates within file: 0

--- Processing ACM_SPARK.csv ---
Shape: (7272, 5)
Columns: ['title', 'authors', 'doi', 'year', 'abstract']
Non-null rows: 7272
Duplicates within file: 0

--- Processing dblp-spark.csv ---
Shape: (711, 5)
Columns: ['title', 'authors', 'doi', 'year', 'abstract']
Non-null rows: 711
Duplicates within file: 0

--- Processing google-spark.csv ---
Shape: (789, 5)
Columns: ['title', 'authors', 'doi', 'year', 'abstract']
Non-null rows: 789
Duplicates within file: 12

--- Processing science_direct-spark.csv ---
Shape: (50, 5)
Columns: ['title', 'authors', 'doi', 'year', 'abstract']
Non-null rows: 50
Duplicates within file: 0

--- Processing IEEE_SPARK.csv ---
Shape: (4536, 5)
Columns: ['title', 'authors', 'doi', 'year', 'abstract']
Non-null rows: 4536
Duplicates within file: 0

--- Processing springer-spark.csv ---
Shape: (546, 5)
Columns: 

run code below for table 5

In [None]:
import os
import pandas as pd

# Specify the directory where the CSV files are stored
directory = r"/content"

# Function to combine all CSV files in the specified directory
def combine_csv_files(directory):
    """Merge every source CSV in *directory* into ``Combined_Results.csv``.

    All files whose name ends in ``.csv`` are read and concatenated row-wise
    (the files are assumed to share the same columns). The result is written
    to ``Combined_Results.csv`` inside the same folder.

    The output file itself is never used as an input: it lives in the folder
    being scanned, so without this exclusion a second run would append the
    previous result to the new one and duplicate every row.

    Args:
        directory: Folder that holds the per-database CSV exports.
    """
    output_name = 'Combined_Results.csv'
    output_path = os.path.join(directory, output_name)

    all_dataframes = []
    # Sorted so the row order of the combined file does not depend on the
    # arbitrary order in which os.listdir reports the files.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.csv') or filename == output_name:
            continue
        all_dataframes.append(pd.read_csv(os.path.join(directory, filename)))

    if not all_dataframes:
        # pd.concat raises ValueError on an empty list; report and stop instead.
        print(f"No CSV files found in {directory}; nothing to combine.")
        return

    combined_df = pd.concat(all_dataframes, ignore_index=True)
    combined_df.to_csv(output_path, index=False)

    print(f"Combined results saved to {output_path}")

# Run the function to combine the CSV files
combine_csv_files(directory)

Combined results saved to /content/Combined_Results.csv


# **SPARK deduplication from Table 6**

Deduplication is based on:
1. DOI matching
2. Fuzzy matching of titles (similarity above 90) between articles with the same publication year

In [None]:
!pip install fuzzywuzzy
!pip install python-Levenshtein

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0
Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.27.1 (from python-Levenshtein)
  Downloading levenshtein-0.27.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.1->python-Levenshtein)
  Downloading rapidfuzz-3.14.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.27.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (159 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m159.9/159.9 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ra

In [None]:
import pandas as pd
import os
import re
from fuzzywuzzy import fuzz

# Define paths: the combined input file and the folder that receives both result files
output_dir = r"/content"
screening_dir = os.path.join(output_dir, "Duplicates_Removed")
os.makedirs(screening_dir, exist_ok=True)
combined_path = os.path.join(output_dir, "Combined_Results.csv")
duplicates_removed_path = os.path.join(screening_dir, "Duplicates_Removed.csv")
no_duplicates_path = os.path.join(screening_dir, "No_Duplicates.csv")

# Load data
try:
    df_combined = pd.read_csv(combined_path)
except FileNotFoundError as e:
    print(f"Error loading file: {e}")
    # exit() is the interactive-shell helper injected by the site module and is
    # not meant for programs; raising SystemExit stops the run explicitly and
    # signals failure with a non-zero status.
    raise SystemExit(1) from e
else:
    print(f"Loaded {len(df_combined)} articles.")

# Preprocess and normalize data
def preprocess_text(text):
    """Build a comparison key from *text*.

    The value is converted with ``str()``, every character that is not an
    ASCII letter, digit or whitespace is deleted, and the result is
    lower-cased and stripped of surrounding whitespace.
    """
    as_string = str(text)
    kept = re.sub(r'[^a-zA-Z0-9\s]', '', as_string)
    return kept.lower().strip()

# Title key used for fuzzy comparison: punctuation deleted, lower-cased, trimmed (see preprocess_text).
df_combined['Normalized Title'] = df_combined['title'].apply(preprocess_text)
# Author key: keep only ASCII letters and whitespace, lower-case, split on single spaces,
# sort the tokens and join them with no separator, so the key ignores token order.
# A missing value goes through str() first, so NaN produces the key "nan".
# NOTE(review): is_duplicate in this cell does not read this column.
df_combined['Normalized Authors'] = df_combined['authors'].apply(lambda x: ''.join(sorted(re.sub(r'[^A-Za-z\s]', '', str(x)).lower().split(' '))))

# Fuzzy matching and debugging
def is_duplicate(row, existing):
    """Decide whether *row* repeats an article already present in *existing*.

    Two passes are made over *existing*, in this order:

    1. DOI match. DOIs are compared after trimming whitespace and
       lower-casing, because DOI names are case-insensitive. A missing or
       blank DOI never matches anything.
    2. Fuzzy title match. The records must have the same ``year`` and a
       ``fuzz.ratio`` above 90 between their ``Normalized Title`` values.

    Args:
        row: Mapping-like record (e.g. a pandas Series) with the keys
            ``doi``, ``year`` and ``Normalized Title``.
        existing: Iterable of records with the same keys.

    Returns:
        ``(True, "DOI Match")``, ``(True, "Fuzzy Title Match")`` or
        ``(False, "")``.
    """
    def _doi_key(value):
        # Missing (None/NaN) and blank DOIs yield None so they never compare equal.
        if pd.isna(value):
            return None
        return str(value).strip().lower() or None

    row_doi = _doi_key(row['doi'])
    if row_doi is not None:
        for exist in existing:
            if _doi_key(exist['doi']) == row_doi:
                return True, "DOI Match"

    row_year = row['year']
    row_title = row['Normalized Title']
    for exist in existing:
        # Year first: it is a cheap comparison, so the costly fuzzy ratio only
        # runs for same-year candidates. The boolean result is unchanged.
        if row_year == exist['year'] and fuzz.ratio(row_title, exist['Normalized Title']) > 90:
            return True, "Fuzzy Title Match"

    return False, ""

# Accumulators: records kept, records dropped, and an audit trail for the dropped ones
unique_articles, duplicates, duplicates_info = [], [], []

# Walk the combined table in row order; the first occurrence of an article is the one kept
for _, article in df_combined.iterrows():
    found, reason = is_duplicate(article, unique_articles)
    if not found:
        unique_articles.append(article)
        continue

    duplicates.append(article)
    duplicates_info.append({"row": article.to_dict(), "Identification Method": reason})
    print(f"Duplicate found: {article['title']} with method: {reason}")

# Write the kept records and the audit trail, then report the totals
for records, destination in (
    (unique_articles, no_duplicates_path),
    (duplicates_info, duplicates_removed_path),
):
    pd.DataFrame(records).to_csv(destination, index=False)

print("Files saved: No duplicates and duplicates files.")
print(f"{len(duplicates)} duplicates were identified and removed.")

Loaded 20082 articles.
Duplicate found: Leveraging Machine Learning for Enhanced Bug Triaging in Open-Source Software Projects with method: Fuzzy Title Match
Duplicate found: Bibliographic review of AI applied to project management and its analysis in the context of the metalworking industry with method: DOI Match
Duplicate found: POLARIS: A framework to guide the development of Trustworthy AI systems with method: Fuzzy Title Match
Duplicate found: CodeFuse-13B: A Pretrained Multi-Lingual Code Large Language Model with method: DOI Match
Duplicate found: Lessons from Building StackSpot Al: A Contextualized AI Coding Assistant with method: DOI Match
Duplicate found: Resolving Code Review Comments with Machine Learning with method: DOI Match
Duplicate found: An Industry Case Study on Adoption of AI-based Programming Assistants with method: DOI Match
Duplicate found: Application of machine learning on software quality assurance and testing: A chronological survey with method: Fuzzy Title M

In [None]:
!rm -rf Duplicates_Removed/

# **SPARK Keywords Screening (Table 7)**

Phase-I

In [None]:
import pandas as pd
import os
import re

# Deduplicated article list that is screened in this phase
input_csv_path = r"/content/Duplicates_Removed/No_Duplicates.csv"

# Screening results go into a "Keyword_Filtered" folder next to the input file
keyword_filtered_dir = os.path.join(
    os.path.split(input_csv_path)[0],
    "Keyword_Filtered",
)
os.makedirs(keyword_filtered_dir, exist_ok=True)

# Define the preprocessing function for text
def preprocess_text(text):
    if not isinstance(text, str):
        return ""  # Return an empty string if the text is NaN or non-string

    text = text.lower()  # Convert to lowercase
    pattern = re.compile(r'[^a-zA-Z0-9\s]+', re.IGNORECASE)
    return pattern.sub(' ', text)  # Replace special characters with a space

# Define the function to filter articles by keywords
def filter_articles_by_keywords(input_csv_path, output_csv_path,
                               and_keywords=None, or_keywords=None, exclude_keywords=None):

    df = pd.read_csv(input_csv_path)

    df['title'] = df['title'].apply(preprocess_text)
    df['abstract'] = df['abstract'].apply(preprocess_text)

    if and_keywords is None:
        and_keywords = []

    if or_keywords is None:
        or_keywords = []

    if exclude_keywords is None:
        exclude_keywords = []

    and_patterns = [re.compile(rf'{keyword}', re.IGNORECASE) for keyword in and_keywords]

    if and_patterns:
        and_mask = pd.Series(True, index=df.index)

        for pattern in and_patterns:
            and_mask &= (df['title'].str.contains(pattern, na=False) |
                        df['abstract'].str.contains(pattern, na=False))

        df = df[and_mask]

    or_patterns = [re.compile(rf'{keyword}', re.IGNORECASE) for keyword in or_keywords]

    if or_patterns:
        or_mask = pd.Series(False, index=df.index)

        for pattern in or_patterns:
            or_mask |= (df['title'].str.contains(pattern, na=False) |
                       df['abstract'].str.contains(pattern, na=False))

        df = df[or_mask]

    exclude_patterns = [re.compile(rf'{keyword}', re.IGNORECASE) for keyword in exclude_keywords]

    if exclude_patterns:
        exclude_mask = pd.Series(False, index=df.index)

        for pattern in exclude_patterns:
            exclude_mask |= (df['title'].str.contains(pattern, na=False) |
                           df['abstract'].str.contains(pattern, na=False))

        df = df[~exclude_mask]

    df.to_csv(output_csv_path, index=False)

    print(f"Filtered articles saved to {output_csv_path}")

# Define the output file path within the same directory as the input file
output_csv_path = os.path.join(keyword_filtered_dir, "Screened.csv")

# Define the keywords for filtering
# and_keywords = ["software project management", "review"]
# AND stage: both words must occur somewhere in the title or the abstract.
and_keywords = ["project", "management"]

# OR stage: a record is kept when ANY single keyword below occurs in the title or
# the abstract. The list holds three vocabularies (project management, AI/LLM,
# review/survey) in one flat list, so a hit from one vocabulary is enough.
# NOTE(review): keywords are matched as case-insensitive substrings, so short
# entries such as "AI" also match inside longer words (e.g. "maintain").
or_keywords = ["software project management", "software project", "project management",
            "IT project management", "software engineering management", "project performance",
            "SPM", "PMBOK", "software development management",
            "agile project management", "scrum management", "DevOps management",
            "requirements management", "project requirements management",
            "risk management", "quality management", "team management",
            "resource management", "project schedule management",
            "software process management", "waterfall project management",
            "agile management", "waterfall management", "project governance",
            "software project planning", "project planning", "project management tool",
            "stakeholder management", "procurement management", "schedule management",
            "scope management", "project integration", "project integration management",
            "cost management", "project cost management", "project quality management",
            "project resource management", "communication management", "project communication management",
            "project risk management",


            "large language model", "LLM", "generative AI", "GenAI",
            "language models", "transformer models", "deep learning",
            "machine learning", "neural networks", "artificial intelligence",
            "AI", "NLP models", "GPT", "BERT", "ChatGPT", "OpenAI",
            "Anthropic", "Claude", "Bard", "Gemini", "natural language processing",
            "conversational AI", "text generation", "language understanding",
            "pre-trained models", "foundation models", "automated reasoning",
            "natural language generation", "language model", "transformer",
            "attention mechanism", "fine-tuning", "prompt engineering",
            "few-shot learning", "zero-shot learning", "transfer learning",
            "neural language model", "autoregressive model", "seq2seq",
            "encoder-decoder", "LSTM", "GRU", "RNN", "CNN", "generation models",


            "systematic literature review", "systematic review", "literature review",
            "mapping study", "survey paper", "short review", "review", "survey",
            "meta-analysis", "scoping review", "narrative review", "critical review",
            "state-of-the-art", "bibliometric analysis", "content analysis",
            "thematic analysis", "qualitative review", "quantitative review",
            "comprehensive review", "overview", "research synthesis",
            "evidence synthesis", "research review", "academic review",
            "scientific review", "technical review", "comparative study",
            "analytical review", "descriptive review", "exploratory review"]

# Exclusion stage is disabled in this phase (an empty list skips it).
exclude_keywords = []

# Call the function to filter articles and save the result
filter_articles_by_keywords(input_csv_path, output_csv_path, and_keywords,
                           or_keywords, exclude_keywords)

Filtered articles saved to /content/Duplicates_Removed/Keyword_Filtered/Screened.csv


# **SPARK Keywords Screening (Phase II)**

In [None]:
import pandas as pd
import os
import re

# Deduplicated article list that is screened in this phase
input_csv_path = r"/content/Duplicates_Removed/No_Duplicates.csv"

# Screening results go into a "Keyword_Filtered" folder next to the input file
keyword_filtered_dir = os.path.join(
    os.path.split(input_csv_path)[0],
    "Keyword_Filtered",
)
os.makedirs(keyword_filtered_dir, exist_ok=True)

# Define the preprocessing function for text
def preprocess_text(text):
    if not isinstance(text, str):
        return ""  # Return an empty string if the text is NaN or non-string

    text = text.lower()  # Convert to lowercase
    pattern = re.compile(r'[^a-zA-Z0-9\s]+', re.IGNORECASE)
    return pattern.sub(' ', text)  # Replace special characters with a space

# Define the function to filter articles by keywords
def filter_articles_by_keywords(input_csv_path, output_csv_path,
                               and_keywords=None, or_keywords=None, exclude_keywords=None):

    df = pd.read_csv(input_csv_path)

    df['title'] = df['title'].apply(preprocess_text)
    df['abstract'] = df['abstract'].apply(preprocess_text)

    if and_keywords is None:
        and_keywords = []

    if or_keywords is None:
        or_keywords = []

    if exclude_keywords is None:
        exclude_keywords = []

    and_patterns = [re.compile(rf'{keyword}', re.IGNORECASE) for keyword in and_keywords]

    if and_patterns:
        and_mask = pd.Series(True, index=df.index)

        for pattern in and_patterns:
            and_mask &= (df['title'].str.contains(pattern, na=False) |
                        df['abstract'].str.contains(pattern, na=False))

        df = df[and_mask]

    or_patterns = [re.compile(rf'{keyword}', re.IGNORECASE) for keyword in or_keywords]

    if or_patterns:
        or_mask = pd.Series(False, index=df.index)

        for pattern in or_patterns:
            or_mask |= (df['title'].str.contains(pattern, na=False) |
                       df['abstract'].str.contains(pattern, na=False))

        df = df[or_mask]

    exclude_patterns = [re.compile(rf'{keyword}', re.IGNORECASE) for keyword in exclude_keywords]

    if exclude_patterns:
        exclude_mask = pd.Series(False, index=df.index)

        for pattern in exclude_patterns:
            exclude_mask |= (df['title'].str.contains(pattern, na=False) |
                           df['abstract'].str.contains(pattern, na=False))

        df = df[~exclude_mask]

    df.to_csv(output_csv_path, index=False)

    print(f"Filtered articles saved to {output_csv_path}")

# Define the output file path within the same directory as the input file
# NOTE(review): this is the same Screened.csv path that the Phase-I cell writes,
# and the input is again No_Duplicates.csv, so this run replaces the Phase-I file
# rather than refining it.
output_csv_path = os.path.join(keyword_filtered_dir, "Screened.csv")

# Define the keywords for filtering
# and_keywords = ["software project management", "review"]
# AND stage: both words must occur somewhere in the title or the abstract.
and_keywords = ["project", "management"]

# OR stage: a record is kept when ANY single keyword below occurs in the title or
# the abstract. The AI/LLM terms and the software-project-management terms share
# one flat list, so a hit from either group is enough.
# NOTE(review): keywords are matched as case-insensitive substrings, so short
# entries such as "AI" or "LM" also match inside longer words.
or_keywords = [
    "LLM", "Large Language Model", "Language Model", "LM", "PLM", "Pre-trained Language Model",
    "Pretrained Model", "Pretraining", "Pre-training", "Natural Language Processing", "NLP",
    "Artificial Intelligence", "AI", "Transformer", "Sequence Model", "Attention Model",
    "Transfer Learning", "Generative AI", "GPT", "ChatGPT", "Codex", "BERT", "T5", "Gemini",
    "Mistral", "LLaMA",

    "SPM", "Software Project Management", "Software Development Project Management",
    "Software Engineering Project Management", "software engineering management",
    "software project", "IT project management", "IT project", "software development project",
    "Software Process Management", "Software Lifecycle Management", "Agile Project Management",
    "Hybrid Project Management", "Software Project Scheduling Problem", "SPSP",
    "Software Development", "Software Lifecycle", "Software Life Cycle", "SDLC",
    "Requirements Management", "PMBOK"

    # The keyword groups below are commented out and therefore not part of the filter.
    # "PMBOK", "Project Management Body of Knowledge", "project management framework", "PM framework",
    # "project management standard", "Project Management Practices", "Project Management Institute",
    # "Integration", "Scope", "Schedule", "Cost", "Quality", "Resource", "Communications", "Risk",
    # "Stakeholder", "Procurement",

    # "support", "practice", "mechanism", "automation", "communication", "information processing",
    # "outcome", "research design", "empirical", "case study", "survey", "interview", "effectiveness",
    # "measurement", "benefit", "challenge", "limitation", "evaluate", "integrate", "predict", "forecast",
    # "assessment", "original studies", "prediction", "validation", "verification", "approach", "process",
    # "deploy", "implementation", "detection", "identification", "analysis", "classification", "monitoring",
    # "error", "solution"
]

# Exclusion stage: a record is dropped when any of these occurs in the title or abstract.
# "review" alone already covers "systematic literature review", and as a substring
# it also hits words such as "reviewed".
exclude_keywords = ["review", "systematic literature review", "SLR", "tertiary study"]

# Call the function to filter articles and save the result
filter_articles_by_keywords(input_csv_path, output_csv_path, and_keywords,
                           or_keywords, exclude_keywords)

Filtered articles saved to /content/Duplicates_Removed/Keyword_Filtered/Screened.csv


## **Extracting non-processed meta-data for Title and Abstract Screening**

After SPARK processing, all metadata is in lowercase, and stopwords and punctuation have been removed. The code below therefore retrieves the original metadata from the dataset file that contains the original (non-processed) data.

In [None]:
import pandas as pd
import os
import numpy as np

# Input tables, relative to the working directory
original_csv_path = r"No_Duplicates.csv"  # file that holds the original (non-processed) metadata
filtered_csv_path = r"128_papers.csv"  # screened papers whose original metadata is wanted

# Results go into an "Original_Metadata" folder next to the filtered file
output_dir = os.path.join(os.path.dirname(filtered_csv_path), "Original_Metadata")
os.makedirs(output_dir, exist_ok=True)
output_csv_path = os.path.join(output_dir, "Screened_Original_Metadata.csv")

print("Loading files...")
print("-" * 50)

# Read the original table first, then the filtered one
df_original, df_filtered = (
    pd.read_csv(path) for path in (original_csv_path, filtered_csv_path)
)

for label, frame in (("Original", df_original), ("Filtered", df_filtered)):
    print(f"{label} file: {len(frame)} papers")
print("-" * 50)

# Add normalized columns to original dataframe for matching
def preprocess_text(text):
    """Return a normalised form of *text* for title matching.

    Strings are lower-cased, each run of characters other than ASCII letters,
    digits and whitespace is replaced by one space, and surrounding
    whitespace is removed. Anything that is not a string yields "".
    """
    if isinstance(text, str):
        replaced = re.sub(r'[^a-zA-Z0-9\s]+', ' ', text.lower(), flags=re.IGNORECASE)
        return replaced.strip()
    return ""

def _author_key(raw):
    """Letters-only, lower-cased author tokens, sorted and joined without a separator."""
    letters_only = re.sub(r'[^A-Za-z\s]', '', str(raw)).lower()
    return ''.join(sorted(letters_only.split(' ')))


# Matching keys on the original table
df_original['Normalized Title'] = df_original['title'].apply(preprocess_text)
df_original['Normalized Authors'] = df_original['authors'].apply(_author_key)


# How many records lack a DOI on each side
original_missing_doi = df_original['doi'].isna().sum()
filtered_missing_doi = df_filtered['doi'].isna().sum()

print(f"Missing DOIs in original file: {original_missing_doi}")
print(f"Missing DOIs in filtered file: {filtered_missing_doi}")
print("-" * 50)

# Several matching approaches are tried one after another
print("Attempting to match papers...")

# Result columns: every paper starts as 'unmatched' with empty original fields
df_filtered['match_source'] = 'unmatched'
for new_column in ('original_title', 'original_abstract', 'original_authors', 'original_doi'):
    df_filtered[new_column] = None

# Approach 1: look each filtered paper up by its DOI.
# doi_mapping maps DOI -> {'title', 'abstract', 'authors'} for original rows that have a DOI.
doi_mapping = (
    df_original.loc[df_original['doi'].notna(), ['doi', 'title', 'abstract', 'authors']]
    .set_index('doi')
    .to_dict('index')
)

matched_by_doi = 0
for idx, doi in df_filtered['doi'].items():
    # Papers without a DOI, or with a DOI unknown to the original table, are left for later approaches
    if pd.isna(doi) or doi not in doi_mapping:
        continue

    source = doi_mapping[doi]
    df_filtered.at[idx, 'original_title'] = source['title']
    df_filtered.at[idx, 'original_abstract'] = source['abstract']
    df_filtered.at[idx, 'original_authors'] = source['authors']
    df_filtered.at[idx, 'original_doi'] = doi
    df_filtered.at[idx, 'match_source'] = 'doi'
    matched_by_doi += 1

print(f"✓ Matched by DOI: {matched_by_doi}")

# Approach 2: Match by normalized title for papers without DOI or unmatched.
#
# The title key is rebuilt from the 'title' column on BOTH sides with the same
# normaliser. Relying on a 'Normalized Title' column stored in the filtered file
# is fragile: the deduplication cell builds that column by deleting punctuation,
# while this cell's preprocess_text replaces punctuation with a space, so the two
# keys differ for any title that contains punctuation; rows where the stored
# column is empty would be skipped altogether.
def _title_key(value):
    """Normalise a title and collapse whitespace; missing titles give ''."""
    return ' '.join(preprocess_text(value).split())


# Fall back to the stored column only when the filtered file has no 'title' column.
_title_source = 'title' if 'title' in df_filtered.columns else 'Normalized Title'
filtered_title_keys = df_filtered[_title_source].apply(_title_key)

unmatched_mask_title = (df_filtered['match_source'] == 'unmatched') & (filtered_title_keys != '')
unmatched_count_title = unmatched_mask_title.sum()

if unmatched_count_title > 0:
    print(f"Attempting to match {unmatched_count_title} papers by normalized title...")

    # Group original records by title key (key -> list of records, in file order)
    original_grouped_by_title = {}
    original_records = df_original[['title', 'abstract', 'authors', 'doi']].to_dict('records')
    for record, title_key in zip(original_records, df_original['title'].apply(_title_key)):
        if title_key:  # an empty key would pair unrelated untitled records
            original_grouped_by_title.setdefault(title_key, []).append(record)

    matched_by_title = 0
    for idx in df_filtered.index[unmatched_mask_title]:
        candidates = original_grouped_by_title.get(filtered_title_keys.at[idx])
        if not candidates:
            continue

        # Take the first match found (assuming normalized title is a strong indicator)
        original_record = candidates[0]
        df_filtered.at[idx, 'original_title'] = original_record.get('title')
        df_filtered.at[idx, 'original_abstract'] = original_record.get('abstract')
        df_filtered.at[idx, 'original_authors'] = original_record.get('authors')
        df_filtered.at[idx, 'original_doi'] = original_record.get('doi')  # Capture original DOI
        df_filtered.at[idx, 'match_source'] = 'normalized_title'
        matched_by_title += 1

    print(f"✓ Matched by Normalized Title: {matched_by_title}")


# Approach 3: Match by normalized authors + year for remaining unmatched
def _year_key(value):
    """Render a year as a plain integer string.

    pandas reads a year column as float when it contains missing values, so the
    same year can be 2020 in one file and 2020.0 in the other; str() would then
    give "2020" and "2020.0" and the composite keys would never match.
    """
    number = pd.to_numeric(value, errors='coerce')
    if pd.isna(number) or not float(number).is_integer():
        return str(value)
    return str(int(number))


unmatched_mask_author_year = (df_filtered['match_source'] == 'unmatched') & (df_filtered['Normalized Authors'].notna()) & (df_filtered['year'].notna())
unmatched_count_author_year = unmatched_mask_author_year.sum()


if unmatched_count_author_year > 0:
    print(f"Attempting to match {unmatched_count_author_year} papers by authors + year...")

    # Create composite key and group original data
    df_original['author_year_key'] = df_original['Normalized Authors'].astype(str) + '_' + df_original['year'].apply(_year_key)
    original_grouped_by_author_year = df_original.groupby('author_year_key')[['title', 'abstract', 'authors', 'doi']].apply(lambda x: x.to_dict('records')).to_dict()


    matched_by_author_year = 0
    for idx, row in df_filtered[unmatched_mask_author_year].iterrows():
        key = str(row['Normalized Authors']) + '_' + _year_key(row['year'])
        if key in original_grouped_by_author_year:
            # Take the first match found.
            # NOTE(review): the same authors can publish several papers in one
            # year, so this pairing may pick a different paper; check these rows.
            original_record = original_grouped_by_author_year[key][0]
            df_filtered.at[idx, 'original_title'] = original_record.get('title')
            df_filtered.at[idx, 'original_abstract'] = original_record.get('abstract')
            df_filtered.at[idx, 'original_authors'] = original_record.get('authors')
            df_filtered.at[idx, 'original_doi'] = original_record.get('doi')  # Capture original DOI
            df_filtered.at[idx, 'match_source'] = 'authors_year'
            matched_by_author_year += 1

    print(f"✓ Matched by Authors + Year: {matched_by_author_year}")

separator = "-" * 50
print(separator)

# Totals over the match_source column
is_matched = df_filtered['match_source'] != 'unmatched'
total_matched = is_matched.sum()
total_unmatched = (~is_matched).sum()

print("\nFinal Matching Results:")
print(f"Total papers in filtered file: {len(df_filtered)}")
print(f"Successfully matched: {total_matched}")
print(f"Unmatched: {total_unmatched}")
print(separator)

# Write the filtered table, now carrying the original_* columns and match_source
df_filtered.to_csv(output_csv_path, index=False)

print("✓ Successfully extracted original metadata!")
print(f"✓ Saved to: {output_csv_path}")
print(separator)

# Show up to three matched papers as a quick visual check
print("\nSample of extracted papers (first 3 successfully matched):")
print(separator)
matched_samples = df_filtered[is_matched].head(3)
if matched_samples.empty:
    print("No papers were successfully matched.")
    print(separator)
else:
    for _, sample in matched_samples.iterrows():
        print(f"\nOriginal Title: {sample['original_title']}")
        print(f"Original Authors: {sample['original_authors']}")
        print(f"Original Year: {sample['year']}")  # year is taken from the filtered row itself
        print(f"Original DOI: {sample['original_doi']}")
        print(f"Original Abstract: {str(sample['original_abstract'])[:150]}...")  # first 150 characters
        print(f"Matched by: {sample['match_source']}")
        print(separator)

# Closing summary
print("\nSummary Statistics:")
print(f"Total papers in filtered file: {len(df_filtered)}")
print(f"Total successfully matched: {total_matched}")
print(f"Total unmatched: {total_unmatched}")
print(separator)

if total_matched >= len(df_filtered):
    print("\n✓ All filtered papers were successfully matched to the original data!")
else:
    print("\n⚠ Warning: Some papers from filtered file were not found in original file or could not be matched!")

Loading files...
--------------------------------------------------
Original file: 16008 papers
Filtered file: 128 papers
--------------------------------------------------
Missing DOIs in original file: 619
Missing DOIs in filtered file: 9
--------------------------------------------------
Attempting to match papers...
✓ Matched by DOI: 113
--------------------------------------------------

Final Matching Results:
Total papers in filtered file: 128
Successfully matched: 113
Unmatched: 15
--------------------------------------------------
✓ Successfully extracted original metadata!
✓ Saved to: Original_Metadata/Screened_Original_Metadata.csv
--------------------------------------------------

Sample of extracted papers (first 3 successfully matched):
--------------------------------------------------

Original Title: Enhancing software effort estimation with random forest tuning and adaptive decision strategies
Original Authors: Priya Varshini, A.G.; Anitha Kumari, K.; Ramakrishnan, S

In [None]:
!rm -rf Keyword_Filtered/
#

# Comparing SPARK results before and after query modifications

In [None]:
import pandas as pd
import numpy as np

# Load the two screening exports being compared. The file names live in
# variables so the progress messages always name the files that were actually
# read (the File 2 label was previously misspelled).
file1_name = 'screened_1152.csv'
file2_name = 'screened_1180.csv'
df1 = pd.read_csv(file1_name)
df2 = pd.read_csv(file2_name)

print("=" * 80)
print("CSV FILES COMPARISON ANALYSIS")
print("=" * 80)

# Basic file information
print(f"\nFile 1 ({file1_name}): {len(df1)} records")
print(f"File 2 ({file2_name}): {len(df2)} records")

# Create unique identifiers for comparison
# Primary: DOI, Fallback: Normalized Title + Normalized Authors
def create_key(row):
    """Build the comparison key for one record.

    The key is ``doi:<doi>`` when the row has a non-blank DOI. Otherwise it is
    ``<title>|||<authors>``, taken from the 'Normalized Title' /
    'Normalized Authors' fields when present and from 'title' / 'authors'
    otherwise. All parts are stripped and lower-cased.

    Args:
        row: a mapping-like record supporting ``.get`` (e.g. a DataFrame row).

    Returns:
        The key string. A record with no DOI, title or authors yields exactly
        ``'|||'``, which is the value the caller filters out as "empty".
    """
    def _clean(value):
        # Missing cells arrive as NaN/None; str() would turn them into the
        # literal text "nan"/"None" and the empty-key filter would miss them.
        if pd.isna(value):
            return ''
        return str(value).strip().lower()

    doi = _clean(row.get('doi'))
    if doi:
        return f"doi:{doi}"

    title = _clean(row.get('Normalized Title', row.get('title', '')))
    authors = _clean(row.get('Normalized Authors', row.get('authors', '')))
    return f"{title}|||{authors}"

# Tag every row of both tables with its comparison key.
for frame in (df1, df2):
    frame['_key'] = frame.apply(create_key, axis=1)

# A key that is only the bare separator means the row had nothing to compare on.
df1_clean = df1.loc[df1['_key'].ne('|||')]
df2_clean = df2.loc[df2['_key'].ne('|||')]

# Set algebra on the keys: shared, only-in-1, only-in-2.
keys1 = set(df1_clean['_key'])
keys2 = set(df2_clean['_key'])

duplicate_keys = keys1 & keys2
unique_to_file1_keys = keys1.difference(keys2)
unique_to_file2_keys = keys2.difference(keys1)

# Map the key sets back onto rows (shared rows are reported from File 1).
file1_keys = df1_clean['_key']
file2_keys = df2_clean['_key']
duplicates = df1_clean[file1_keys.isin(duplicate_keys)]
unique_to_file1 = df1_clean[file1_keys.isin(unique_to_file1_keys)]
unique_to_file2 = df2_clean[file2_keys.isin(unique_to_file2_keys)]

# Headline numbers of the comparison.
rule = "=" * 80
print("\n" + rule)
print("RESULTS")
print(rule)

n_file1 = len(df1_clean)
n_file2 = len(df2_clean)
n_shared_rows = len(duplicates)
total_unique_keys = len(duplicate_keys) + len(unique_to_file1_keys) + len(unique_to_file2_keys)

print(f"\nTotal records in File 1 (cleaned): {n_file1}")
print(f"Total records in File 2 (cleaned): {n_file2}")
print(f"\nDuplicates (in both files): {n_shared_rows}")
print(f"Unique to File 1 only: {len(unique_to_file1)}")
print(f"Unique to File 2 only: {len(unique_to_file2)}")
print(f"\nTotal unique records across both files: {total_unique_keys}")

# Shares are only reported for non-empty tables, which avoids dividing by zero.
if n_file1 > 0:
    share_of_file1 = n_shared_rows / n_file1 * 100
    print(f"\nDuplicates as % of File 1: {share_of_file1:.2f}%")
if n_file2 > 0:
    share_of_file2 = n_shared_rows / n_file2 * 100
    print(f"Duplicates as % of File 2: {share_of_file2:.2f}%")

# Columns shown in every sample listing. Defined before the listings so the
# "unique to File 1/2" sections still work when there are no duplicates
# (previously the name only existed inside the duplicates branch, which made
# the later sections fail with NameError in that case).
sample_cols = ['title', 'authors', 'year', 'doi']

# One pass per listing: heading, records to sample, message when there are none.
for heading, sample_frame, empty_message in (
    ("SAMPLE DUPLICATES (first 5)", duplicates, "No duplicates found!"),
    ("SAMPLE UNIQUE TO FILE 1 (first 5)", unique_to_file1, "No unique records in File 1!"),
    ("SAMPLE UNIQUE TO FILE 2 (first 5)", unique_to_file2, "No unique records in File 2!"),
):
    print("\n" + "=" * 80)
    print(heading)
    print("=" * 80)
    if len(sample_frame) > 0:
        # Only show the display columns this table actually has.
        available_cols = [col for col in sample_cols if col in sample_frame.columns]
        print(sample_frame[available_cols].head(5).to_string(index=False))
    else:
        print(empty_message)

# Write the three result sets to disk.
rule = "=" * 80
print("\n" + rule)
print("SAVING RESULTS TO CSV FILES")
print(rule)

# The '_key' column is an internal helper and is stripped from every export.
duplicates_export = duplicates.drop(labels=['_key'], axis='columns')
unique_to_file1_export = unique_to_file1.drop(labels=['_key'], axis='columns')
unique_to_file2_export = unique_to_file2.drop(labels=['_key'], axis='columns')

for export_frame, target_name in (
    (duplicates_export, 'duplicates.csv'),
    (unique_to_file1_export, 'unique_to_file1.csv'),
    (unique_to_file2_export, 'unique_to_file2.csv'),
):
    export_frame.to_csv(target_name, index=False)

print(f"✓ Saved duplicates.csv ({len(duplicates)} records)")
print(f"✓ Saved unique_to_file1.csv ({len(unique_to_file1)} records)")
print(f"✓ Saved unique_to_file2.csv ({len(unique_to_file2)} records)")
print("\nAnalysis complete!")

CSV FILES COMPARISON ANALYSIS

File 1 (screened_1152.csv): 1152 records
File 2 (screenedd_1180.csv): 1180 records

RESULTS

Total records in File 1 (cleaned): 1152
Total records in File 2 (cleaned): 1180

Duplicates (in both files): 1052
Unique to File 1 only: 100
Unique to File 2 only: 128

Total unique records across both files: 1280

Duplicates as % of File 1: 91.32%
Duplicates as % of File 2: 89.15%

SAMPLE DUPLICATES (first 5)
                                                                                                                           title                                                                                            authors  year                                           doi
the distributed situational centers system as an instrument of state and corporate strategic goal setting in the digital economy Z.K. Avdeeva and S.V. Kovriga and V.E. Lepskiy and A.N. Raikov and B.B. Slavin and A.A. Zatsarinny  2020 https://doi.org/10.1016/j.ifacol.2020.12.2156
    

Modified version of the code above that also records each record's original row index.

In [None]:
import pandas as pd
import numpy as np

# Load the two screening exports.
df1 = pd.read_csv('screened_1152.csv')
df2 = pd.read_csv('screened_1180.csv')

rule = "=" * 80
print(rule)
print("CSV FILES COMPARISON ANALYSIS")
print(rule)

# Row counts as loaded.
n_loaded_file1 = len(df1)
n_loaded_file2 = len(df2)
print(f"\nFile 1 (screened_1152.csv): {n_loaded_file1} records")
print(f"File 2 (screened_1180.csv): {n_loaded_file2} records")

# Remember each row's position in its source file before any filtering,
# so results can be traced back to the original CSV rows.
df2['_original_index_file2'] = df2.index
df1['_original_index_file1'] = df1.index

# Create unique identifiers for comparison
# Primary: DOI, Fallback: Normalized Title + Normalized Authors
def create_key(row):
    """Return the identifier used to compare one record across the two files.

    A non-blank DOI gives ``doi:<doi>``. Without one, the key is
    ``<title>|||<authors>``, preferring the 'Normalized Title' /
    'Normalized Authors' fields over 'title' / 'authors'. Every part is
    stripped and lower-cased.

    Args:
        row: a mapping-like record supporting ``.get`` (e.g. a DataFrame row).

    Returns:
        The key string. It is exactly ``'|||'`` when DOI, title and authors
        are all missing, so the caller's empty-key filter can drop the row.
    """
    def _text(value):
        # NaN/None must become '' rather than the strings "nan"/"None",
        # otherwise fully empty records would look like real keys.
        return '' if pd.isna(value) else str(value).strip().lower()

    doi = _text(row.get('doi'))
    if doi:
        return f"doi:{doi}"

    title = _text(row.get('Normalized Title', row.get('title', '')))
    authors = _text(row.get('Normalized Authors', row.get('authors', '')))
    return f"{title}|||{authors}"

# Tag every row of both tables with its comparison key.
for frame in (df1, df2):
    frame['_key'] = frame.apply(create_key, axis=1)

# Drop rows whose key is only the bare separator; copies keep later column
# assignments from touching the source frames.
df1_clean = df1.loc[df1['_key'].ne('|||')].copy()
df2_clean = df2.loc[df2['_key'].ne('|||')].copy()

# Set algebra on the keys: shared, only-in-1, only-in-2.
keys1 = set(df1_clean['_key'])
keys2 = set(df2_clean['_key'])

duplicate_keys = keys1 & keys2
unique_to_file1_keys = keys1.difference(keys2)
unique_to_file2_keys = keys2.difference(keys1)

# Map the key sets back onto rows of each file.
file1_keys = df1_clean['_key']
file2_keys = df2_clean['_key']
duplicates_df1 = df1_clean[file1_keys.isin(duplicate_keys)].copy()
duplicates_df2 = df2_clean[file2_keys.isin(duplicate_keys)].copy()
unique_to_file1 = df1_clean[file1_keys.isin(unique_to_file1_keys)].copy()
unique_to_file2 = df2_clean[file2_keys.isin(unique_to_file2_keys)].copy()

# Pair every duplicated File 1 record with each File 2 record sharing its key.
# One inner merge on the key replaces filtering both tables once per key, and
# it emits the pairs in File 1 row order instead of set-iteration order.
duplicate_pairs = duplicates_df1.merge(
    duplicates_df2[['_key', '_original_index_file2']],
    on='_key',
    how='inner',
)


def _first_existing_column(frame, *candidates):
    """Return the first column of *frame* named in *candidates*, else a blank column."""
    for name in candidates:
        if name in frame.columns:
            return frame[name]
    return pd.Series('', index=frame.index, dtype=object)


# Descriptive fields are taken from the File 1 side of each pair.
duplicate_mapping_df = pd.DataFrame({
    'key': duplicate_pairs['_key'],
    'file1_index': duplicate_pairs['_original_index_file1'],
    'file2_index': duplicate_pairs['_original_index_file2'],
    'title': _first_existing_column(duplicate_pairs, 'title', 'Normalized Title'),
    'authors': _first_existing_column(duplicate_pairs, 'authors', 'Normalized Authors'),
    'year': _first_existing_column(duplicate_pairs, 'year'),
    'doi': _first_existing_column(duplicate_pairs, 'doi'),
})

# Kept as a list of dicts for any later cell that still reads `duplicate_mapping`.
duplicate_mapping = duplicate_mapping_df.to_dict('records')

# Headline numbers of the comparison (duplicates are counted per distinct key).
rule = "=" * 80
print("\n" + rule)
print("RESULTS")
print(rule)

n_file1 = len(df1_clean)
n_file2 = len(df2_clean)
n_shared_keys = len(duplicate_keys)
total_unique_keys = n_shared_keys + len(unique_to_file1_keys) + len(unique_to_file2_keys)

print(f"\nTotal records in File 1 (cleaned): {n_file1}")
print(f"Total records in File 2 (cleaned): {n_file2}")
print(f"\nDuplicates (in both files): {n_shared_keys}")
print(f"Unique to File 1 only: {len(unique_to_file1)}")
print(f"Unique to File 2 only: {len(unique_to_file2)}")
print(f"\nTotal unique records across both files: {total_unique_keys}")

# Shares are only reported for non-empty tables, which avoids dividing by zero.
if n_file1 > 0:
    share_of_file1 = n_shared_keys / n_file1 * 100
    print(f"\nDuplicates as % of File 1: {share_of_file1:.2f}%")
if n_file2 > 0:
    share_of_file2 = n_shared_keys / n_file2 * 100
    print(f"Duplicates as % of File 2: {share_of_file2:.2f}%")

# Preview of the duplicate pairs, with the row index from each file.
rule = "=" * 80
print("\n" + rule)
print("SAMPLE DUPLICATES WITH INDICES (first 10)")
print(rule)
if len(duplicate_mapping_df) == 0:
    print("No duplicates found!")
else:
    display_cols = ['file1_index', 'file2_index', 'title', 'authors', 'year', 'doi']
    available_cols = [col for col in display_cols if col in duplicate_mapping_df.columns]
    preview = duplicate_mapping_df[available_cols].head(10)
    print(preview.to_string(index=False))

# Preview of records that only File 1 contains.
print("\n" + rule)
print("SAMPLE UNIQUE TO FILE 1 WITH INDICES (first 5)")
print(rule)
if len(unique_to_file1) == 0:
    print("No unique records in File 1!")
else:
    sample_cols = ['_original_index_file1', 'title', 'authors', 'year', 'doi']
    available_cols = [col for col in sample_cols if col in unique_to_file1.columns]
    preview = unique_to_file1[available_cols].head(5)
    print(preview.to_string(index=False))

# Preview of records that only File 2 contains.
print("\n" + rule)
print("SAMPLE UNIQUE TO FILE 2 WITH INDICES (first 5)")
print(rule)
if len(unique_to_file2) == 0:
    print("No unique records in File 2!")
else:
    sample_cols = ['_original_index_file2', 'title', 'authors', 'year', 'doi']
    available_cols = [col for col in sample_cols if col in unique_to_file2.columns]
    preview = unique_to_file2[available_cols].head(5)
    print(preview.to_string(index=False))

# Write the result sets to disk.
rule = "=" * 80
print("\n" + rule)
print("SAVING RESULTS TO CSV FILES")
print(rule)

# The pair table is only written when at least one duplicate pair exists.
n_pairs = len(duplicate_mapping_df)
if n_pairs > 0:
    duplicate_mapping_df.to_csv('duplicates_with_indices.csv', index=False)
    print(f"✓ Saved duplicates_with_indices.csv ({n_pairs} records)")

# Per-file exports: drop the helper key and expose the stored row position
# under the neutral name 'original_index'.
file1_rename = {'_original_index_file1': 'original_index'}
file2_rename = {'_original_index_file2': 'original_index'}
duplicates_df1_export = duplicates_df1.drop(columns=['_key']).rename(columns=file1_rename)
unique_to_file1_export = unique_to_file1.drop(columns=['_key']).rename(columns=file1_rename)
unique_to_file2_export = unique_to_file2.drop(columns=['_key']).rename(columns=file2_rename)

for export_frame, target_name in (
    (duplicates_df1_export, 'duplicates_file1_with_index.csv'),
    (unique_to_file1_export, 'unique_to_file1_with_index.csv'),
    (unique_to_file2_export, 'unique_to_file2_with_index.csv'),
):
    export_frame.to_csv(target_name, index=False)

print(f"✓ Saved duplicates_file1_with_index.csv ({len(duplicates_df1)} records from File 1)")
print(f"✓ Saved unique_to_file1_with_index.csv ({len(unique_to_file1)} records)")
print(f"✓ Saved unique_to_file2_with_index.csv ({len(unique_to_file2)} records)")

# Report the lowest and highest original row index seen in each result set.
rule = "=" * 80
print("\n" + rule)
print("INDEX SUMMARY")
print(rule)

if len(duplicate_mapping_df) > 0:
    file1_positions = duplicate_mapping_df['file1_index']
    file2_positions = duplicate_mapping_df['file2_index']
    print(f"\nDuplicates:")
    print(f"  File 1 index range: {file1_positions.min():.0f} - {file1_positions.max():.0f}")
    print(f"  File 2 index range: {file2_positions.min():.0f} - {file2_positions.max():.0f}")

if len(unique_to_file1) > 0:
    only1_positions = unique_to_file1['_original_index_file1']
    print(f"\nUnique to File 1:")
    print(f"  Index range: {only1_positions.min():.0f} - {only1_positions.max():.0f}")

if len(unique_to_file2) > 0:
    only2_positions = unique_to_file2['_original_index_file2']
    print(f"\nUnique to File 2:")
    print(f"  Index range: {only2_positions.min():.0f} - {only2_positions.max():.0f}")

print("\nAnalysis complete!")

CSV FILES COMPARISON ANALYSIS

File 1 (screened_1152.csv): 1152 records
File 2 (screened_1180.csv): 1180 records

RESULTS

Total records in File 1 (cleaned): 1152
Total records in File 2 (cleaned): 1180

Duplicates (in both files): 1052
Unique to File 1 only: 100
Unique to File 2 only: 128

Total unique records across both files: 1280

Duplicates as % of File 1: 91.32%
Duplicates as % of File 2: 89.15%

SAMPLE DUPLICATES WITH INDICES (first 10)
 file1_index  file2_index                                                                                                                                                                                                  title                                                                                                                                                                                                                                                                                                                                       

# **Comparing SPARK 398 papers with our 480 papers.**

In [None]:
!pip install pandas openpyxl fuzzywuzzy python-Levenshtein



In [None]:
import pandas as pd
import re
from fuzzywuzzy import fuzz
import openpyxl

def preprocess_text(text):
    """
    Reduce a value to a canonical form for fuzzy comparison.

    Missing values and the empty string give "". Anything else is converted
    to a lower-case string in which every character other than an ASCII
    letter, a digit or whitespace is replaced by a space; runs of whitespace
    are then collapsed to one space and the ends trimmed.
    """
    has_content = not (pd.isna(text) or text == '')
    if not has_content:
        return ""

    lowered = str(text).lower()

    # Punctuation and other symbols become separators rather than vanishing,
    # so "a-b" compares as two tokens.
    symbols_blanked = re.sub(r'[^a-zA-Z0-9\s]', ' ', lowered)
    collapsed = re.sub(r'\s+', ' ', symbols_blanked)

    return collapsed.strip()

def compare_records(record1_title, record1_abstract, record2_title, record2_abstract,
                   title_threshold=90, abstract_threshold=80, min_title_similarity=70):
    """
    Decide whether two records describe the same paper.

    Titles and abstracts are normalised with ``preprocess_text`` and scored
    with ``fuzz.ratio`` (0-100). Two records are duplicates when either

    1. the title similarity is at least ``title_threshold``, or
    2. the title similarity is at least ``min_title_similarity`` AND the
       abstract similarity is at least ``abstract_threshold``.

    A rule only applies when both sides actually have text for the field it
    relies on: a missing title or abstract is not evidence that two records
    are the same paper.

    Args:
        record1_title, record1_abstract: title and abstract of the first record.
        record2_title, record2_abstract: title and abstract of the second record.
        title_threshold: title score that is sufficient on its own.
        abstract_threshold: abstract score required by the combined rule.
        min_title_similarity: title score required by the combined rule
            (previously hard-coded as 70).

    Returns:
        A tuple ``(is_duplicate, title_similarity, abstract_similarity)``.
    """
    # Preprocess texts
    title1 = preprocess_text(record1_title)
    abstract1 = preprocess_text(record1_abstract)
    title2 = preprocess_text(record2_title)
    abstract2 = preprocess_text(record2_abstract)

    title_similarity = fuzz.ratio(title1, title2)
    abstract_similarity = fuzz.ratio(abstract1, abstract2)

    # NOTE(review): fuzzywuzzy appears to score two identical strings as 100
    # even when both are empty; the guards below make the outcome independent
    # of how the library treats empty input.
    titles_present = bool(title1) and bool(title2)
    abstracts_present = bool(abstract1) and bool(abstract2)

    strong_title_match = titles_present and title_similarity >= title_threshold
    corroborated_match = (
        titles_present
        and abstracts_present
        and title_similarity >= min_title_similarity
        and abstract_similarity >= abstract_threshold
    )

    return (strong_title_match or corroborated_match), title_similarity, abstract_similarity

def find_duplicates_between_files(file1_path, file2_path, output_path):
    """
    Compare two XLSX files record by record and flag likely duplicates.

    Every row of file 1 is compared with every row of file 2 through
    ``compare_records``. Among the file 2 rows judged duplicates, the one with
    the highest title similarity is kept (the first such row wins ties). File 1
    plus four result columns ('duplicate_status', 'file2_row_number',
    'title_similarity', 'abstract_similarity') is written to ``output_path``.

    Args:
        file1_path: XLSX file whose records are checked; needs 'title' and
            'abstract' columns.
        file2_path: XLSX file searched for matches; same required columns.
        output_path: destination XLSX file for the annotated copy of file 1.

    Returns:
        The annotated DataFrame, or ``None`` when anything fails. The error is
        printed rather than raised, since this is the script's top-level
        boundary and the caller checks for ``None``.
    """
    try:
        # Read the two XLSX files
        print("Loading files...")
        df1 = pd.read_excel(file1_path)
        df2 = pd.read_excel(file2_path)

        print(f"File 1: {len(df1)} records")
        print(f"File 2: {len(df2)} records")

        # Ensure required columns exist
        for col in ('title', 'abstract'):
            if col not in df1.columns:
                raise ValueError(f"Column '{col}' not found in file 1")
            if col not in df2.columns:
                raise ValueError(f"Column '{col}' not found in file 2")

        print("Comparing records...")

        # Extract the file 2 fields once instead of rebuilding a row object
        # for every (file 1 row, file 2 row) pair.
        file2_records = list(zip(df2['title'], df2['abstract']))
        total = len(df1)

        # Result columns are collected in lists and attached in one go.
        statuses = []
        row_numbers = []
        title_scores = []
        abstract_scores = []

        for position, (title1, abstract1) in enumerate(zip(df1['title'], df1['abstract']), start=1):
            print(f"Processing record {position}/{total}")

            best_match_row = None
            best_title_sim = 0
            best_abstract_sim = 0

            for offset, (title2, abstract2) in enumerate(file2_records):
                is_duplicate, title_sim, abstract_sim = compare_records(
                    title1, abstract1, title2, abstract2
                )

                # Keep the first match, then only strictly better title scores.
                if is_duplicate and (best_match_row is None or title_sim > best_title_sim):
                    # +2: spreadsheet rows are 1-based and row 1 is the header.
                    best_match_row = offset + 2
                    best_title_sim = title_sim
                    best_abstract_sim = abstract_sim

            if best_match_row is not None:
                statuses.append('Duplicate Found')
                row_numbers.append(best_match_row)
                title_scores.append(f"{best_title_sim}%")
                abstract_scores.append(f"{best_abstract_sim}%")
            else:
                statuses.append('Unique/New/Not a duplicate')
                row_numbers.append('N/A')
                title_scores.append('N/A')
                abstract_scores.append('N/A')

        # Create result dataframe based on file 1
        result_df = df1.copy()
        result_df['duplicate_status'] = statuses
        result_df['file2_row_number'] = row_numbers
        result_df['title_similarity'] = title_scores
        result_df['abstract_similarity'] = abstract_scores

        # Save results
        print(f"Saving results to {output_path}")
        result_df.to_excel(output_path, index=False, engine='openpyxl')

        # Print summary statistics
        duplicates_found = statuses.count('Duplicate Found')
        unique_records = statuses.count('Unique/New/Not a duplicate')
        # An empty file 1 has no meaningful share; report 0 instead of
        # dividing by zero after the results were already written.
        duplicate_pct = (duplicates_found / total) * 100 if total else 0.0

        print(f"\n=== COMPARISON RESULTS ===")
        print(f"Total records in File 1: {total}")
        print(f"Duplicates found: {duplicates_found}")
        print(f"Unique records: {unique_records}")
        print(f"Duplicate percentage: {duplicate_pct:.2f}%")
        print(f"Results saved to: {output_path}")

        return result_df

    except Exception as e:
        print(f"Error occurred: {e}")
        return None

# Script entry point: compare the two workbooks and preview the outcome.
if __name__ == "__main__":
    # Adjust these three paths to your own files.
    file1_path = "spark_papers.xlsx"
    file2_path = "our_papers.xlsx"
    output_path = "comparison_results.xlsx"

    results = find_duplicates_between_files(file1_path, file2_path, output_path)

    # The comparison signals failure by returning None.
    if results is None:
        print("Comparison failed. Please check your file paths and formats.")
    else:
        print("\nComparison completed successfully!")
        print("\nFirst few results:")
        preview_columns = ['title', 'duplicate_status', 'file2_row_number', 'title_similarity']
        print(results[preview_columns].head())

Loading files...
File 1: 398 records
File 2: 480 records
Comparing records...
Processing record 1/398
Processing record 2/398
Processing record 3/398
Processing record 4/398
Processing record 5/398
Processing record 6/398
Processing record 7/398
Processing record 8/398
Processing record 9/398
Processing record 10/398
Processing record 11/398
Processing record 12/398
Processing record 13/398
Processing record 14/398
Processing record 15/398
Processing record 16/398
Processing record 17/398
Processing record 18/398
Processing record 19/398
Processing record 20/398
Processing record 21/398
Processing record 22/398
Processing record 23/398
Processing record 24/398
Processing record 25/398
Processing record 26/398
Processing record 27/398
Processing record 28/398
Processing record 29/398
Processing record 30/398
Processing record 31/398
Processing record 32/398
Processing record 33/398
Processing record 34/398
Processing record 35/398
Processing record 36/398
Processing record 37/398
Proces

In [None]:
!rm -rf *

# **Converting CSV to XLSX.**

In [None]:
import pandas as pd

# Replace with your CSV file path
csv_file = "Screened_Original_Metadata.csv"
xlsx_file = "Title_Abstract_Screening_128.xlsx"

# Read the CSV with the default parser first; if it cannot tokenize the file,
# retry once with the slower pure-Python engine.
try:
    df = pd.read_csv(csv_file, quotechar='"', doublequote=True)
except pd.errors.ParserError as e:
    print(f"ParserError: {e}")
    print("Trying to read with a different engine...")
    try:
        df = pd.read_csv(csv_file, engine='python', quotechar='"', doublequote=True)
    except Exception as e_python:
        print(f"Error with python engine: {e_python}")
        print("Could not read the CSV file with either engine.")
        # Stop with a failing status. The interactive-only exit() helper would
        # report success (status 0) and is not guaranteed to exist.
        raise SystemExit(1) from e_python


# Write to Excel
df.to_excel(xlsx_file, index=False, engine="openpyxl")

print(f"CSV file '{csv_file}' has been converted to Excel file '{xlsx_file}'")

CSV file 'Screened_Original_Metadata.csv' has been converted to Excel file 'Title_Abstract_Screening_128.xlsx'


# For Phase II, retrieving abstracts from Springer Nature Link data

Simple query-based data extraction does not provide the abstracts of the papers.

For this reason, we use the API to retrieve the data together with the abstracts.

my springer link API key = 7702ae00e440dd5248edaa25791da70c

In [None]:
import requests, pandas as pd

# Replace with your actual API key or use Google Colab secrets
# NOTE(review): this credential is committed in plain text; move it to a
# secret store or environment variable and rotate the key.
api_key = "7702ae00e440dd5248edaa25791da70c" # Make sure this is your valid API key
df = pd.read_csv("Springer.csv")

springer_endpoint = "https://api.springernature.com/meta/v2/json"
request_timeout = 30  # seconds; without a timeout one stalled request hangs the whole run

# One abstract per row of df, in row order ('' when nothing could be fetched).
abstracts = []
for doi in df['Item DOI']:
    abstract = ''
    # Ensure the DOI is not a float or NaN
    if pd.notna(doi) and isinstance(doi, str):
        try:
            # Passing the query through `params` lets requests escape DOI
            # characters that are special in URLs.
            response = requests.get(
                springer_endpoint,
                params={"q": f"doi:{doi}", "api_key": api_key},
                timeout=request_timeout,
            )
            response.raise_for_status()
            records = response.json().get('records')
        except (requests.RequestException, ValueError) as err:
            # A single failed lookup must not discard the abstracts already
            # collected; report it and leave this row blank.
            print(f"Could not fetch abstract for DOI {doi}: {err}")
            records = None
        # Check that 'records' exists and is non-empty before accessing it
        if records:
            abstract = records[0].get('abstract', '')
    abstracts.append(abstract)

df['Abstract'] = abstracts
df.to_csv("springer_results_with_abstract.csv", index=False)

# Converting .bib file into .csv file

In [None]:
!pip install bibtexparser

Collecting bibtexparser
  Downloading bibtexparser-1.4.3.tar.gz (55 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/55.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.6/55.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: bibtexparser
  Building wheel for bibtexparser (setup.py) ... [?25l[?25hdone
  Created wheel for bibtexparser: filename=bibtexparser-1.4.3-py3-none-any.whl size=43549 sha256=3543aebe525cdd9c2f6ec81eb4bf25b94be608995b87ccb058fe222fc5ac2b65
  Stored in directory: /root/.cache/pip/wheels/1f/7d/e9/1ff2509f13767a55df1279744adfb757f4ab94b2cbe761f56a
Successfully built bibtexparser
Installing collected packages: bibtexparser
Successfully installed bibtexparser-1.4.3


In [None]:
# ============================
# Convert .bib → .csv
# ============================

import bibtexparser
import pandas as pd

# Input BibTeX file and the CSV it is converted to.
input_bib = "DBLP.bib"
output_csv = "dblp.csv"

# Parse the BibTeX file.
with open(input_bib, encoding="utf-8") as bibtex_file:
    bib_database = bibtexparser.load(bibtex_file)

# One dictionary per entry (its fields) becomes one DataFrame row.
entries = bib_database.entries
df = pd.DataFrame(entries)

# Put the common bibliographic columns first, in this order; every other
# column follows in its existing order, so no field is dropped.
preferred_order = [
    "ID", "ENTRYTYPE", "title", "author", "year", "journal", "booktitle",
    "volume", "number", "pages", "doi", "url", "keywords", "abstract", "note"
]
leading_cols = [name for name in preferred_order if name in df.columns]
trailing_cols = [name for name in df.columns if name not in preferred_order]
cols = leading_cols + trailing_cols
df = df[cols]

# Write the table out.
df.to_csv(output_csv, index=False, encoding="utf-8")

n_records = len(df)
n_fields = len(df.columns)
print(f"✅ Successfully converted {n_records} records to '{output_csv}'")
print(f"Fields preserved: {n_fields} columns")


✅ Successfully converted 711 records to 'dblp.csv'
Fields preserved: 24 columns


# Scraping abstracts for DBLP metadata

In [None]:
# =====================================
# Enrich DBLP CSV with abstracts via DOI
# =====================================

import requests
import pandas as pd
import time

# === Configuration ===
input_csv = "dblp.csv"      # Your existing metadata file
output_csv = "dblp_with_abstracts.csv"
sleep_time = 0.5                    # delay between API calls (avoid rate-limit)

# === Load data ===
df = pd.read_csv(input_csv)

# Ensure the DOI column exists. The lookup is case-sensitive, so the message
# names the column exactly as it is checked ('doi').
if 'doi' not in df.columns:
    raise ValueError("❌ No 'doi' column found in the input CSV.")

# Prepare list for abstracts (one entry per row, filled by the loop below)
abstracts = []

# === Function to get abstract from Semantic Scholar ===
def get_abstract_from_semanticscholar(doi):
    """Fetch a paper's abstract from the Semantic Scholar Graph API by DOI.

    Args:
        doi: the DOI to look up; it is inserted into the request URL as given.

    Returns:
        The abstract text, or '' when the response status is not 200, the
        record has no abstract, or the request fails (the error is printed).
    """
    try:
        url = f"https://api.semanticscholar.org/graph/v1/paper/DOI:{doi}?fields=title,abstract"
        r = requests.get(url, timeout=15)
        if r.status_code != 200:
            return ''
        data = r.json()
        # The 'abstract' key can be present with a null value; `or ''` keeps
        # the return type a string in that case too.
        return data.get('abstract') or ''
    except Exception as e:
        # Best-effort lookup: report the problem and let the caller fall back.
        print(f"Error fetching from Semantic Scholar for DOI {doi}: {e}")
        return ''

# === (Optional) Function to get abstract from Crossref if missing ===
def get_abstract_from_crossref(doi):
    """Fetch a paper's abstract from the Crossref works endpoint by DOI.

    Args:
        doi: the DOI to look up; it is inserted into the request URL as given.

    Returns:
        The abstract with its ``<jats:...>`` markup removed and surrounding
        whitespace trimmed, or '' when the response status is not 200, the
        record has no abstract, or the request fails (the error is printed).
    """
    import re  # local import: this cell does not import `re` itself

    try:
        url = f"https://api.crossref.org/works/{doi}"
        r = requests.get(url, timeout=15)
        if r.status_code == 200:
            data = r.json()
            abstract = data['message'].get('abstract') or ''
            # Abstracts from Crossref often contain tags like <jats:p>; strip
            # every opening/closing <jats:...> tag, not only the paragraph
            # ones, so no markup leaks into the saved text.
            return re.sub(r'</?jats:[^>]*>', '', abstract).strip()
        return ''
    except Exception as e:
        # Best-effort lookup: report the problem and return no abstract.
        print(f"Error fetching from Crossref for DOI {doi}: {e}")
        return ''

# === Fetch one abstract per row ===
total_rows = len(df)
for idx, raw_doi in df['doi'].items():
    doi = str(raw_doi).strip()

    if doi and doi.lower() != 'nan':
        print(f"[{idx+1}/{total_rows}] Fetching abstract for DOI: {doi}")

        # Semantic Scholar first; Crossref only when that yields nothing.
        abstract = get_abstract_from_semanticscholar(doi) or get_abstract_from_crossref(doi)

        abstracts.append(abstract)
        time.sleep(sleep_time)
    else:
        # No usable DOI: keep the row aligned with an empty abstract and do
        # not spend a request (or a pause) on it.
        abstracts.append('')

# === Attach the results and write them out ===
df['Abstract'] = abstracts
df.to_csv(output_csv, index=False, encoding='utf-8')

filled_count = sum(1 for a in abstracts if a)
print(f"\n✅ Done! Abstracts added for {filled_count} papers.")
print(f"💾 Saved enriched dataset to: {output_csv}")


[1/711] Fetching abstract for DOI: 10.1016/J.FUTURE.2025.108056
[2/711] Fetching abstract for DOI: 10.1016/J.JSS.2025.112641
[3/711] Fetching abstract for DOI: 10.1109/ACCESS.2025.3586203
[4/711] Fetching abstract for DOI: 10.1007/S10515-025-00500-0
[5/711] Fetching abstract for DOI: 10.1111/CGF.70123
[6/711] Fetching abstract for DOI: 10.1007/S10586-025-05383-0
[7/711] Fetching abstract for DOI: 10.1109/MCOM.002.2400276
[8/711] Fetching abstract for DOI: 10.1109/MC.2025.3546204
[9/711] Fetching abstract for DOI: 10.3389/FCOMP.2025.1519437
[10/711] Fetching abstract for DOI: 10.3389/FRAI.2025.1474017
[11/711] Fetching abstract for DOI: 10.1016/J.IJHCS.2025.103602
[12/711] Fetching abstract for DOI: 10.1016/J.INFSOF.2025.107832
[13/711] Fetching abstract for DOI: 10.1016/J.INFSOF.2025.107803
[14/711] Fetching abstract for DOI: 10.1109/JIOT.2024.3521425
[15/711] Fetching abstract for DOI: 10.1109/JIOT.2025.3531512
[16/711] Fetching abstract for DOI: 10.1186/S40537-025-01209-3
[17/711] Fe

# Extracting CSV data values from XLSX file

In [None]:
!pip install pandas openpyxl fuzzywuzzy python-Levenshtein

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.3-py3-none-any.whl.metadata (3.9 kB)
Collecting Levenshtein==0.27.3 (from python-Levenshtein)
  Downloading levenshtein-0.27.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (3.7 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.3->python-Levenshtein)
  Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Downloading python_levenshtein-0.27.3-py3-none-any.whl (9.5 kB)
Downloading levenshtein-0.27.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (153 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.3/153.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 M

In [None]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
import warnings
warnings.filterwarnings('ignore')

# Input files: the CSV of records to look up and the Excel screening sheet.
csv_file = 'unique_in_1152.csv'  # Replace with your CSV filename
xlsx_file = 'Title_Abstract_Screening_1152.xlsx'  # Replace with your Excel filename

print("Reading files...")
df_csv = pd.read_csv(csv_file)
df_xlsx = pd.read_excel(xlsx_file)

# Header cells sometimes carry stray spaces; trim them so column lookups work.
for table in (df_csv, df_xlsx):
    table.columns = table.columns.str.strip()

n_csv_entries = len(df_csv)
n_xlsx_entries = len(df_xlsx)
print(f"CSV file contains {n_csv_entries} entries")
print(f"Excel file contains {n_xlsx_entries} entries")

# Normalize text for better matching
def normalize_text(text):
    """Return *text* as a lower-case string without surrounding whitespace ("" if missing)."""
    return "" if pd.isna(text) else str(text).lower().strip()

# Create normalized versions for matching
df_csv['title_norm'] = df_csv['title'].apply(normalize_text)
df_csv['doi_norm'] = df_csv['doi'].apply(normalize_text)

df_xlsx['Title_norm'] = df_xlsx['Title'].apply(normalize_text)
df_xlsx['DOI_norm'] = df_xlsx['DOI'].apply(normalize_text)

# Initialize results
results = []
matched_count = 0
unmatched_indices = []

print("\nMatching entries...")

# Match each CSV entry with Excel entries, trying the most reliable
# criterion first and stopping at the first one that succeeds.
for idx, csv_row in df_csv.iterrows():
    matched_row = None
    csv_doi = csv_row['doi_norm']
    csv_title = csv_row['title_norm']

    # First try: Match by DOI (most reliable)
    if csv_doi:
        doi_match = df_xlsx[df_xlsx['DOI_norm'] == csv_doi]
        if not doi_match.empty:
            matched_row = doi_match.iloc[0]

    # Second try: Match by title (exact match)
    if matched_row is None and csv_title:
        title_match = df_xlsx[df_xlsx['Title_norm'] == csv_title]
        if not title_match.empty:
            matched_row = title_match.iloc[0]

    # Third try: Fuzzy matching on title (if exact match fails)
    if matched_row is None and csv_title:
        max_ratio = 0
        best_match_idx = None

        # Scan only the normalized-title column instead of rebuilding a full
        # row object for every Excel entry.
        for xlsx_idx, xlsx_title in df_xlsx['Title_norm'].items():
            ratio = fuzz.ratio(csv_title, xlsx_title)
            if ratio > max_ratio:
                max_ratio = ratio
                best_match_idx = xlsx_idx

        # Use fuzzy match if similarity is high enough (>90%)
        if max_ratio > 90:
            # best_match_idx is an index *label*, so it must be resolved with
            # .loc; .iloc only coincides with it for a default 0..n-1 index.
            matched_row = df_xlsx.loc[best_match_idx]

    matched = matched_row is not None
    if matched:
        matched_count += 1
        excluded = matched_row['Excluded']
        reason = matched_row['Reason']
    else:
        unmatched_indices.append(idx)
        excluded = 'NOT_FOUND'
        reason = 'No match found in Excel file'

    results.append({
        'CSV_Index': idx,
        'Title': csv_row['title'],
        'DOI': csv_row['doi'],
        'Excluded': excluded,
        'Reason': reason
    })

# Collect the per-entry results into a table and write it out.
df_results = pd.DataFrame(results)

output_file = 'matched_results.csv'
df_results.to_csv(output_file, index=False)

print(f"\n{'='*60}")
print(f"Matching completed!")
print(f"{'='*60}")
print(f"Total CSV entries: {len(df_csv)}")
print(f"Successfully matched: {matched_count}")
print(f"Unmatched entries: {len(unmatched_indices)}")
print(f"\nResults saved to: {output_file}")

# List at most the first ten unmatched indices, with an ellipsis if there are more.
if unmatched_indices:
    ellipsis_suffix = "..." if len(unmatched_indices) > 10 else ""
    print(f"\nUnmatched CSV indices: {unmatched_indices[:10]}" + ellipsis_suffix)

# Breakdown of the 'Excluded' values among the entries that were matched.
print("\n" + "="*60)
print("Exclusion Summary:")
print("="*60)
if matched_count <= 0:
    print("No matches found!")
else:
    found_mask = df_results['Excluded'] != 'NOT_FOUND'
    exclusion_counts = df_results[found_mask]['Excluded'].value_counts()
    print(exclusion_counts)

    print("\nSample of matched results:")
    print(df_results.head(10).to_string())

Checking how many of the 116 papers that were independently screened on title and abstract are present among the 1052 papers common to both files.

In [None]:
import pandas as pd
import re

def normalize_text(text):
    """Return text lowercased, trimmed, and with whitespace runs collapsed.

    Missing values (None, or anything pd.isna reports as missing) become an
    empty string; every other input is converted with str() first.
    """
    is_missing = text is None or pd.isna(text)
    if is_missing:
        return ""

    lowered = str(text).lower().strip()
    # Any run of whitespace (spaces, tabs, newlines) becomes one space
    return re.sub(r'\s+', ' ', lowered)

def extract_doi(text):
    """Return a bare, lowercased DOI string from text.

    Strips the resolver URL forms ``http(s)://doi.org/``,
    ``http(s)://dx.doi.org/`` and ``http(s)://www.doi.org/`` as well as a
    ``doi:`` label, then trims surrounding whitespace. The ``dx.`` and
    ``www.`` hosts are handled so that the same DOI written with a different
    resolver prefix still normalizes to the same string.

    Parameters:
    text: raw DOI cell value (string, None, or NaN)

    Returns:
    The normalized DOI, or "" when text is missing.
    """
    # pd.isna(None) is True, so this covers both None and NaN
    if pd.isna(text):
        return ""

    doi = str(text).lower()
    # Remove common DOI resolver prefixes (doi.org, dx.doi.org, www.doi.org)
    doi = re.sub(r'https?://(?:dx\.|www\.)?doi\.org/', '', doi)
    # Remove a "doi:" label and any whitespace left around the identifier
    return doi.replace('doi:', '').strip()

def check_matches(csv_file, xlsx_file):
    """
    Check how many XLSX entries are present in CSV file

    Each XLSX row is tested against the CSV with four strategies, in order,
    stopping at the first one that hits:
      1. normalized DOI equality ("DOI")
      2. normalized title equality ("Exact Title")
      3. same first 50 title characters, for titles longer than 50
         characters ("Partial Title")
      4. same first 100 abstract characters, for abstracts longer than 100
         characters ("Abstract")

    Parameters:
    csv_file: path to CSV file; the 'Normalized Title', 'doi' and 'abstract'
              columns are read
    xlsx_file: path to XLSX file; the 'Title', 'DOI' and 'Abstract' columns
               are read

    Returns:
    The XLSX dataframe with the helper columns 'normalized_title',
    'normalized_doi' and 'normalized_abstract' plus the result columns
    'Matched' (bool) and 'Match_Method' (method name or "No Match").
    The same dataframe is also written to 'match_results.xlsx'.
    """
    # Prefix lengths used by the partial-title and abstract fallbacks
    title_prefix_len = 50
    abstract_prefix_len = 100

    # Read files
    print("Reading CSV file...")
    csv_df = pd.read_csv(csv_file)
    print(f"CSV file loaded: {len(csv_df)} rows")

    print("\nReading XLSX file...")
    xlsx_df = pd.read_excel(xlsx_file)
    print(f"XLSX file loaded: {len(xlsx_df)} rows")

    # Display column names
    print("\nCSV columns:", csv_df.columns.tolist())
    print("XLSX columns:", xlsx_df.columns.tolist())

    # Normalize CSV data
    csv_df['normalized_title'] = csv_df['Normalized Title'].apply(normalize_text)
    csv_df['normalized_doi'] = csv_df['doi'].apply(extract_doi)
    csv_df['normalized_abstract'] = csv_df['abstract'].apply(normalize_text)

    # Normalize XLSX data (convert to lowercase)
    xlsx_df['normalized_title'] = xlsx_df['Title'].apply(normalize_text)
    xlsx_df['normalized_doi'] = xlsx_df['DOI'].apply(extract_doi)
    xlsx_df['normalized_abstract'] = xlsx_df['Abstract'].apply(normalize_text)

    # Build the CSV lookup sets once so each XLSX row costs a few hash
    # lookups instead of several scans over the whole CSV frame.
    csv_dois = set(csv_df['normalized_doi'])
    csv_titles = set(csv_df['normalized_title'])
    # A CSV value starts with an N-character prefix exactly when its own
    # first N characters equal that prefix, so sets of truncated values
    # reproduce the startswith() comparison.
    csv_title_prefixes = {
        title[:title_prefix_len] for title in csv_df['normalized_title']
    }
    csv_abstract_prefixes = {
        abstract[:abstract_prefix_len]
        for abstract in csv_df['normalized_abstract']
    }

    # Track matches
    matches = []
    match_methods = []

    print("\n" + "="*80)
    print("Checking for matches...")
    print("="*80)

    total_rows = len(xlsx_df)
    xlsx_keys = zip(
        xlsx_df['normalized_doi'],
        xlsx_df['normalized_title'],
        xlsx_df['normalized_abstract'],
    )
    # Count rows by position so progress reporting does not depend on the
    # dataframe's index labels.
    for position, (doi, title, abstract) in enumerate(xlsx_keys, start=1):
        if doi and doi in csv_dois:
            # Method 1: DOI (most reliable)
            match_method = "DOI"
        elif title and title in csv_titles:
            # Method 2: exact title match
            match_method = "Exact Title"
        elif (len(title) > title_prefix_len
              and title[:title_prefix_len] in csv_title_prefixes):
            # Method 3: partial title match (first 50 characters)
            match_method = "Partial Title"
        elif (len(abstract) > abstract_prefix_len
              and abstract[:abstract_prefix_len] in csv_abstract_prefixes):
            # Method 4: abstract similarity (first 100 characters)
            match_method = "Abstract"
        else:
            match_method = "No Match"

        matches.append(match_method != "No Match")
        match_methods.append(match_method)

        # Print progress every 20 rows
        if position % 20 == 0:
            print(f"Processed {position}/{total_rows} rows...")

    # Add results to XLSX dataframe; the explicit bool dtype keeps the
    # column usable as a mask even when the sheet has no rows.
    xlsx_df['Matched'] = pd.Series(matches, index=xlsx_df.index, dtype=bool)
    xlsx_df['Match_Method'] = match_methods

    # Summary statistics (an empty sheet reports 0% instead of dividing by zero)
    total_matches = sum(matches)
    match_rate = (total_matches / total_rows) * 100 if total_rows else 0.0

    print("\n" + "="*80)
    print("RESULTS SUMMARY")
    print("="*80)
    print(f"Total XLSX entries: {total_rows}")
    print(f"Entries found in CSV: {total_matches}")
    print(f"Entries NOT found in CSV: {total_rows - total_matches}")
    print(f"Match rate: {match_rate:.2f}%")

    print("\nMatches by method:")
    method_counts = xlsx_df['Match_Method'].value_counts()
    for method, count in method_counts.items():
        print(f"  {method}: {count}")

    # Show some examples of matches and non-matches
    print("\n" + "="*80)
    print("SAMPLE MATCHES (first 5)")
    print("="*80)
    matched_samples = xlsx_df[xlsx_df['Matched']].head(5)
    for _, row in matched_samples.iterrows():
        # str() guards against empty Title cells, which pandas reads as NaN floats
        print(f"\n[{row['Match_Method']}] {str(row['Title'])[:80]}...")

    print("\n" + "="*80)
    print("SAMPLE NON-MATCHES (first 5)")
    print("="*80)
    non_matched_samples = xlsx_df[~xlsx_df['Matched']].head(5)
    for _, row in non_matched_samples.iterrows():
        print(f"\n{str(row['Title'])[:80]}...")

    # Save detailed results
    output_file = 'match_results.xlsx'
    xlsx_df.to_excel(output_file, index=False)
    print(f"\n\nDetailed results saved to: {output_file}")

    return xlsx_df

# Script entry point
if __name__ == "__main__":
    # Input locations; point these at your own files
    csv_file = "duplicates1052.csv"
    xlsx_file = "Title_Abstract_Screening_joint.xlsx"

    try:
        results = check_matches(csv_file, xlsx_file)
    except FileNotFoundError as missing_file:
        # A missing input is the expected failure: explain how to fix the paths
        print(
            f"Error: {missing_file}",
            "\nPlease update the file paths in the script:",
            "  csv_file = 'duplicates1052.csv'",
            "  xlsx_file = 'Title_Abstract_Screening_joint.xlsx'",
            sep="\n",
        )
    except Exception as failure:
        # Anything else is unexpected: report it together with the full traceback
        print(f"An error occurred: {failure}")
        import traceback
        traceback.print_exc()

Reading CSV file...
CSV file loaded: 1052 rows

Reading XLSX file...
XLSX file loaded: 116 rows

CSV columns: ['title', 'authors', 'doi', 'year', 'abstract', 'Normalized Title', 'Normalized Authors']
XLSX columns: ['Title', 'Abstract', 'Authors', 'DOI', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', "Saad's choices"]

Checking for matches...
Processed 20/116 rows...
Processed 40/116 rows...
Processed 60/116 rows...
Processed 80/116 rows...
Processed 100/116 rows...

RESULTS SUMMARY
Total XLSX entries: 116
Entries found in CSV: 100
Entries NOT found in CSV: 16
Match rate: 86.21%

Matches by method:
  DOI: 100
  No Match: 16

SAMPLE MATCHES (first 5)

[DOI] The Distributed Situational Centers System as an Instrument of State and Corpora...

[DOI] Ambiguous regulations for dealing with AI in higher education can lead to moral ...

[DOI] Enablers, barriers and strategies for adopting new technology in accounting...

[DOI] Automated progress monitoring of construction projects usin