In [3]:
import os
import pdfplumber
from docx import Document
import spacy
from spacy import displacy

# Load spaCy model
nlp = spacy.load("en_core_web_sm")


In [4]:

nlp = spacy.load("en_core_web_sm")

def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        text = ""
        for page in pdf.pages:
            text += page.extract_text()
    return text

def extract_text_from_docx(docx_path):
    doc = Document(docx_path)
    text = ""
    for paragraph in doc.paragraphs:
        text += paragraph.text + "\n"
    return text

def save_text_to_file(text, output_file):
    with open(output_file, "w", encoding="utf-8") as file:
        file.write(text)

def main(input_folder, output_folder):
    for filename in os.listdir(input_folder):
        input_file = os.path.join(input_folder, filename)
        output_file = os.path.join(output_folder, os.path.splitext(filename)[0] + ".txt")
        
        if filename.endswith(".pdf"):
            text = extract_text_from_pdf(input_file)
        elif filename.endswith(".docx"):
            text = extract_text_from_docx(input_file)
        else:
            print(f"Ignoring file: {filename}. Unsupported format.")
            continue
        
        # Perform any additional processing using spaCy if needed
        # Example: doc = nlp(text)

        save_text_to_file(text, output_file)
        print(f"Converted {input_file} to {output_file}")

if __name__ == "__main__":
    input_folder = "C:\\Users\\anubh\\Downloads\\Slides_for_topics"
  # Specify the directory containing your input PDFs or DOCX files
    output_folder = "C:\\Users\\anubh\\Downloads\\Output"  # Specify the directory where you want to save the output text files
    main(input_folder, output_folder)

Converted C:\Users\anubh\Downloads\Slides_for_topics\Algorithm analysis.pdf to C:\Users\anubh\Downloads\Output\Algorithm analysis.txt
Converted C:\Users\anubh\Downloads\Slides_for_topics\Exception Handelling.pdf to C:\Users\anubh\Downloads\Output\Exception Handelling.txt
Converted C:\Users\anubh\Downloads\Slides_for_topics\Exception Handelling_Tutorial1.pdf to C:\Users\anubh\Downloads\Output\Exception Handelling_Tutorial1.txt
Converted C:\Users\anubh\Downloads\Slides_for_topics\Graphs.pdf to C:\Users\anubh\Downloads\Output\Graphs.txt
Converted C:\Users\anubh\Downloads\Slides_for_topics\Graphs_Tutorial2.pdf to C:\Users\anubh\Downloads\Output\Graphs_Tutorial2.txt
Converted C:\Users\anubh\Downloads\Slides_for_topics\Hashmap.pdf to C:\Users\anubh\Downloads\Output\Hashmap.txt
Converted C:\Users\anubh\Downloads\Slides_for_topics\Hashmap_Tutorial3.pdf to C:\Users\anubh\Downloads\Output\Hashmap_Tutorial3.txt
Converted C:\Users\anubh\Downloads\Slides_for_topics\Intelligent Agents.pdf to C:\User

In [5]:
import os
import spacy
import requests

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Define the directory containing the processed text files
directory = "C:\\Users\\anubh\\Downloads\\Output"

# Define the API endpoint and headers
url = "https://api.dbpedia-spotlight.org/en/annotate"
headers = {"accept": "application/json"}
ai_ml_keywords = {
    "artificial", "intelligence", "ai", "machine", "learning", "ml", "intelligent", "system",
    "programming", "algorithm", "algorithms", "algorithmic", "analysis", "analyze", "graph", 
    "graphs", "graphical", "network", "networks", "neural", "neuron", "neurons", "deep", 
    "data", "analysis", "analytics", "predict", "prediction", "predictive", "knowledge", 
    "knowledge-based", "text", "mining", "mine", "textual", "rdf", "content-based", 
    "recommendations", "recommendation", "collaborative", "filtering", "k-means", 
    "clustering", "cluster", "clustering-based", "pattern-matching", "match", 
    "matching", "bots", "bot", "natural", "language", "processing", "nlp", "inheritance", 
    "inherit", "tail", "recursion", "recursive", "efficiency", "efficient", "hash", "hashmap", 
    "table", "tables", "run", "runtime", "time", "errors", "error", "binary", "trees", 
    "tree", "binary-search-tree", "depth", "deep", "first", "search", "dfs", "intelligent", 
    "intelligence", "agents", "agent", "intelligent-agent"
}


# Function to filter named entities based on POS tags
def filter_named_entities(doc):
    named_entities = []
    for ent in doc.ents:
        if ent.label_ in ["PERSON", "ORG", "GPE", "LOC", "WORK_OF_ART", "PRODUCT","FAC"]:
            named_entities.append(ent.text)
        elif ent.text.lower() in ai_ml_keywords:  # Check if the named entity is a computer science term
            named_entities.append(ent.text)
    return named_entities

# Output file path
output_file_path = "C:\\Users\\anubh\\Downloads\\output_annotations.txt"

# Open the output file for writing
with open(output_file_path, "w", encoding="utf-8") as output_file:
    # Iterate over each file in the directory
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            # Read the processed document from the file with specified encoding
            filepath = os.path.join(directory, filename)
            with open(filepath, "r", encoding="utf-8") as file:
                document_text = file.read()

            # Process the document with spaCy
            doc = nlp(document_text)

            # Filter named entities
            named_entities = filter_named_entities(doc)

            # Iterate over named entities and make requests to DBpedia Spotlight for each entity
            for entity in named_entities:
                # Make the GET request to DBpedia Spotlight
                params = {"text": entity}
                response = requests.get(url, params=params, headers=headers)

                # Check if the request was successful
                if response.status_code == 200:
                    # Parse the JSON response
                    data = response.json()
                    # Process the annotations
                    if "Resources" in data:
                        for resource in data["Resources"]:
                            # Write annotations to the output file
                            output_file.write("File: {}\n".format(filename))
                            output_file.write("Entity: {}\n".format(entity))
                            output_file.write("URI: {}\n".format(resource["@URI"]))
                            output_file.write("Surface Form: {}\n".format(resource["@surfaceForm"]))
                            output_file.write("Types: {}\n\n".format(resource["@types"]))
                            print("File:", filename)
                            print("Entity:", entity)
                            print("URI:", resource["@URI"])
                            print("Surface Form:", resource["@surfaceForm"])
                            print("Types:", resource["@types"])
                            print()
                else:
                    output_file.write("Error for entity {} in file {}: {}\n".format(entity, filename, response.status_code))
                    print("Error for entity {} in file {}: {}".format(entity, filename, response.status_code))


File: Algorithm analysis.txt
Entity: CPU
URI: http://dbpedia.org/resource/Central_processing_unit
Surface Form: CPU
Types: 

File: Algorithm analysis.txt
Entity: Windows
URI: http://dbpedia.org/resource/Microsoft_Windows
Surface Form: Windows
Types: Wikidata:Q7397,Wikidata:Q386724,Schema:CreativeWork,DBpedia:Work,DBpedia:Software

File: Algorithm analysis.txt
Entity: CPU
URI: http://dbpedia.org/resource/Central_processing_unit
Surface Form: CPU
Types: 

File: Algorithm analysis.txt
Entity: BS Trees
URI: http://dbpedia.org/resource/Bachelor_of_Science
Surface Form: BS
Types: 

File: Algorithm analysis.txt
Entity: Tree Performance
30Which DS
URI: http://dbpedia.org/resource/Nintendo_DS
Surface Form: DS
Types: Wikidata:Q1067263,DBpedia:Device,DBpedia:InformationAppliance

File: Algorithm analysis.txt
Entity: Java
URI: http://dbpedia.org/resource/Java
Surface Form: Java
Types: Wikidata:Q23442,Schema:Place,DBpedia:PopulatedPlace,DBpedia:Place,DBpedia:Location,DBpedia:Island

File: Algorithm

In [30]:
import csv
from rdflib import Graph, Literal, Namespace, RDF, URIRef
from rdflib.namespace import RDFS
import re

topic_graph = Graph()
ex = Namespace("http://example.org/")
topic_graph.bind("ex", ex)
topic_graph.bind("rdfs", RDFS)

# Define a function to format the TTL entry with additional properties
def format_ttl(entity, surface_form, uri, provenance, course, lecture):
    topic_uri = ex[surface_form.replace(" ", "_")]

    # Add triples for the topic
    topic_graph.add((topic_uri, RDF.type, ex.Topic))
    topic_graph.add((topic_uri, ex.TopicName, Literal(surface_form)))
    topic_graph.add((topic_uri, ex.provenance, Literal(provenance)))
    topic_graph.add((topic_uri, ex.dbpediaLink, URIRef(uri)))

    course_uri = ex[course.replace(' ', '_')]
    topic_graph.add((topic_uri, ex.topic_in_course, course_uri))
    lecture_uri = ex[lecture.replace(' ', '_')]
    topic_graph.add((topic_uri, ex.topic_in_lecture, lecture_uri))

    return topic_graph


# Create a dictionary to store lecture-course mappings
lecture_course_map = {}

# Read the course info from the CSV file
with open("Topic_information.csv", newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        lecture_course_map[row['Lecture name']] = row['Course']

# Open the input file and read line by line
with open("C:\\Users\\anubh\\Downloads\\output_annotations.txt", "r", encoding="utf-8") as file:
    lines = file.readlines()

# Initialize variables to store entity, URI, surface form, and lecture
entity = ""
uri = ""
surface_form = ""
lecture = ""

# Iterate through the lines of the input file
for line in lines:
    # Remove leading/trailing whitespaces and check if the line is not empty
    line = line.strip()
    if line:
        # Extract entity, URI, and surface form
        if line.startswith("Entity:"):
            entity = line.split("Entity: ")[1]
        elif line.startswith("URI:"):
            uri = line.split("URI: ")[1]
        elif line.startswith("Surface Form:"):
            surface_form = line.split("Surface Form: ")[1]
        elif line.startswith("File:"):
            lecture = line.split("File: ")[1].strip()[:-4]  # Remove the last 4 characters (".txt")
            # Check if the lecture exists in the mapping
            if lecture in lecture_course_map:
                course = lecture_course_map[lecture]
            else:
                # Handle the case when the lecture is not found in the mapping
                lecture_without_suffix = re.sub(r'_tutorial\d|_worksheet\d|_Tutorial\d|_Worksheet\d', '', lecture)
                if lecture_without_suffix in lecture_course_map:
                      course = lecture_course_map[lecture_without_suffix]
                else:
                   print(f"Error: Lecture '{lecture}' not found in the course mapping.")
                continue
        # Determine provenance based on file name ending and lecture information
        if "tutorial" in lecture.lower():
            tutorial_num = re.search(r'_Tutorial(\d+)', lecture)
            if tutorial_num:
                tutorial_num = tutorial_num.group(1)
                provenance = f"Tutorial{tutorial_num}"
            else:
                provenance = "tutorial"
        elif "worksheet" in lecture.lower():
            worksheet_num = re.search(r'_Worksheet(\d+)', lecture)
            if worksheet_num:
                worksheet_num = worksheet_num.group(1)
                provenance = f"Worksheet{worksheet_num}"
            else:
                provenance = "worksheet"
        else:
            provenance = "lecture"
        # Check if entity, URI, surface form, and lecture are all non-empty
        if entity and uri and surface_form and lecture:
            # Format the TTL entry as RDF graph
            topic_graph = format_ttl(entity, surface_form, uri, provenance, course, lecture)
            # Add the triples to the main graph
            # Reset entity, URI, surface form, and lecture
            entity = ""
            uri = ""
            surface_form = ""
            lecture = ""

# Write the TTL content to an output file
with open("output_new.ttl", "w", encoding="utf-8") as file:
    file.write(topic_graph.serialize(format='turtle'))


File: Algorithm analysis.txt
Entity: BS Trees
URI: http://dbpedia.org/resource/Bachelor_of_Science
Surface Form: BS
Types: 

File: Algorithm analysis.txt
Entity: Tree Performance
30Which DS
URI: http://dbpedia.org/resource/Nintendo_DS
Surface Form: DS
Types: Wikidata:Q1067263,DBpedia:Device,DBpedia:InformationAppliance

File: Graphs.txt
Entity: Adjacency List
Southern Illinois University
URI: http://dbpedia.org/resource/Southern_Illinois_University
Surface Form: Southern Illinois University
Types: Wikidata:Q43229,Wikidata:Q3918,Wikidata:Q24229398,Wikidata:Q2385804,DUL:SocialPerson,DUL:Agent,Schema:Organization,Schema:EducationalOrganization,Schema:CollegeOrUniversity,DBpedia:Organisation,DBpedia:EducationalInstitution,DBpedia:Agent,DBpedia:University

File: Graphs.txt
Entity: NYU Computer Science
70Graphs
URI: http://dbpedia.org/resource/New_York_University
Surface Form: NYU
Types: 

File: Graphs.txt
Entity: NYU Computer Science
70Graphs
URI: http://dbpedia.org/resource/Computer_scienc

KeyboardInterrupt: 