In [14]:
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('punkt')
nltk.download('vader_lexicon')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

# Verify the downloads
import nltk.data
try:
    nltk.data.find('tokenizers/punkt')
    print("Punkt tokenizer is available.")
except LookupError:
    print("Punkt tokenizer is NOT available.")

try:
    nltk.data.find('sentiment/vader_lexicon.zip')
    print("VADER lexicon is available.")
except LookupError:
    print("VADER lexicon is NOT available.")

Punkt tokenizer is available.
VADER lexicon is available.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shyla\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\shyla\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\shyla\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\shyla\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\shyla\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [44]:
import os
import re
import spacy
from textblob import TextBlob
import PyPDF2

# Load the English language model for SpaCy
nlp = spacy.load("en_core_web_sm")

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            text = ''
            for page in reader.pages:
                text += page.extract_text()
        return text
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {str(e)}")
        return ""

# Function to extract specific sections (Abstract, Conclusion, Discussion) case-insensitive
def extract_sections(text):
    sections = {
        "Abstract": "",
        "Conclusion": "",
        "Discussion": ""
    }
    
    for section in sections:
        pattern = rf"{section}(.*?)(?=\n(?:{'|'.join(sections.keys())})|$)"
        match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)  # Using re.IGNORECASE for case-insensitive matching
        if match:
            sections[section] = match.group(1).strip()
    
    return sections

# Function to highlight matched keywords in the sentence
def highlight_keywords(sentence, keywords):
    for keyword in keywords:
        sentence = re.sub(rf"\b({keyword})\b", r"**\1**", sentence, flags=re.IGNORECASE)
    return sentence

# Function to extract relevant sentences containing AI or disability-related terms
def extract_relevant_sentences(doc):
    technology_terms = ['ai', 'artificial intelligence', 'machine learning', 'robotics', 'biotechnology', 'augmented reality', '6g', 'quantum computing', 'virtual reality', 'sustainability', 'robot', 'drone', 'circular economy']
    disability_terms = ['disability', 'disabled', 'disabilities', 'pwd', 'accessibility', 'people with disabilities']

    relevant_sentences = []
    for sent in doc.sents:
        contains_technology = any(token.text.lower() in technology_terms for token in sent)
        contains_disability = any(token.text.lower() in disability_terms for token in sent)
        
        # Only add sentences containing both technology and disability terms
        if contains_technology and contains_disability:
            matched_technology = [token.text for token in sent if token.text.lower() in technology_terms]
            matched_disability = [token.text for token in sent if token.text.lower() in disability_terms]
            
            highlighted_sentence = highlight_keywords(sent.text, matched_technology + matched_disability)
            
            relevant_sentences.append({
                "sentence": highlighted_sentence,
                "matched_technology": matched_technology,
                "matched_disability": matched_disability
            })
    
    return relevant_sentences

# Function to classify sentiment (positive, neutral, or negative) and provide explanation
def classify_sentiment(sentence):
    polarity = TextBlob(sentence).sentiment.polarity
    explanation = ""
    
    if polarity > 0.1:
        sentiment = 'Positive'
        explanation = "The sentence expresses a positive view or optimistic outcomes, likely due to benefits AI brings for disabled people."
    elif polarity < -0.1:
        sentiment = 'Negative'
        explanation = "The sentence expresses concerns, challenges, or disadvantages, such as risks AI might pose to disabled individuals."
    else:
        sentiment = 'Neutral'
        explanation = "The sentence is factual or balanced without clear positive or negative emotion."
    
    return sentiment, explanation

# Analyze each paper and classify sentiment, only in specific sections
def analyze_paper(pdf_path):
    text = extract_text_from_pdf(pdf_path)
    if not text:
        return None  # Skip if the PDF is problematic
    
    # Extract sections (Abstract, Introduction, Discussion, Conclusion)
    sections = extract_sections(text)
    
    analysis = {
        "filename": os.path.basename(pdf_path),
        "relevant_sentences": [],
        "sentiments": {"Positive": 0, "Neutral": 0, "Negative": 0}
    }
    
    # Only process these sections
    for section, content in sections.items():
        if content:  # If the section has content
            doc = nlp(content)  # Process only this section's content
            relevant_sentences = extract_relevant_sentences(doc)
            
            for sentence_info in relevant_sentences:
                sentence = sentence_info["sentence"]
                sentiment, sentiment_explanation = classify_sentiment(sentence)
                explanation = f"Matched technology terms: {', '.join(sentence_info['matched_technology'])}; " \
                              f"Matched disability terms: {', '.join(sentence_info['matched_disability'])}."
                
                analysis["relevant_sentences"].append({
                    "section": section,
                    "sentence": sentence,
                    "sentiment": sentiment,
                    "sentiment_explanation": sentiment_explanation,
                    "explanation": explanation
                })
                analysis["sentiments"][sentiment] += 1
    
    return analysis

# Function to analyze all PDFs in a folder, focusing on specific sections
def analyze_folder(folder_path):
    all_analyses = []
    
    for filename in os.listdir(folder_path):
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(folder_path, filename)
            analysis = analyze_paper(pdf_path)
            if analysis:  # Only add if analysis was successful
                all_analyses.append(analysis)
                print(f"\nAnalysis for {filename}:")
                print(f"Sentiment distribution: {analysis['sentiments']}")
                for result in analysis["relevant_sentences"]:
                    print(f"Section: {result['section']}")
                    print(f"Sentence: {result['sentence']}")
                    print(f"Sentiment: {result['sentiment']}")
                    print(f"Explanation: {result['explanation']}")
                    print(f"Sentiment Explanation: {result['sentiment_explanation']}")
                    print("\n")
    
    # Summary
    total_sentiments = {"Positive": 0, "Neutral": 0, "Negative": 0}
    for analysis in all_analyses:
        for sentiment in analysis["sentiments"]:
            total_sentiments[sentiment] += analysis["sentiments"][sentiment]
    
    print("\nOverall Sentiment Distribution Across All Papers:")
    print(total_sentiments)

# Usage
folder_path = 'AI'  # Provide the path to your folder of review papers
analyze_folder(folder_path)



Analysis for 1-s2.0-S0363018824001130-main.pdf:
Sentiment distribution: {'Positive': 0, 'Neutral': 0, 'Negative': 0}

Analysis for 3362077.3362086.pdf:
Sentiment distribution: {'Positive': 11, 'Neutral': 22, 'Negative': 8}
Section: Abstract
Sentence: We describe some of the opportunities and risks across four emerg-ing **AI** application areas: employment, educa-tion, public safety, and healthcare, identiﬁed in a workshop with participants experiencing a range of **disabilities**.
Sentiment: Neutral
Explanation: Matched technology terms: AI; Matched disability terms: disabilities.
Sentiment Explanation: The sentence is factual or balanced without clear positive or negative emotion.


Section: Abstract
Sentence: We next discuss strategies for supporting fair-ness in the context of **disability** throughout the **AI** development lifecycle.
Sentiment: Positive
Explanation: Matched technology terms: AI; Matched disability terms: disability.
Sentiment Explanation: The sentence expresses a