# The main Model

In [1]:
import re
import tensorflow as tf
from transformers import BertTokenizer

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Function to clean text
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

# Tokenize text
text = "Machine Learning is important for AI research!"
cleaned_text = clean_text(text)
tokens = tokenizer.tokenize(cleaned_text)

print(tokens)


  from .autonotebook import tqdm as notebook_tqdm


['machine', 'learning', 'is', 'important', 'for', 'ai', 'research']


In [2]:
import requests
from bs4 import BeautifulSoup

def scrape_wikipedia(topic):
    search_url = f"https://en.wikipedia.org/wiki/{topic.replace(' ', '_')}"
    response = requests.get(search_url)
    
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, "html.parser")
        paragraphs = soup.find_all("p")
        text = " ".join([p.text for p in paragraphs[:5]])  # Get first 5 paragraphs
        return text
    else:
        return "Sorry, I couldn't find relevant study material."
    return response


In [3]:
import gradio as gr
modelUI = gr.Interface(
    fn = scrape_wikipedia,
    inputs = 'text',
    outputs = gr.Textbox(label = 'The answer to the question requested is')
)
modelUI.launch()

--------


Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




In [4]:
topic = input("Enter the topic")
study_material = scrape_wikipedia(topic)
print(study_material)

Enter the topic Machine Learning


Machine learning (ML) is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions.[1] Within a subdiscipline in machine learning, advances in the field of deep learning have allowed neural networks, a class of statistical algorithms, to surpass many previous machine learning approaches in performance.[2]
 ML finds application in many fields, including natural language processing, computer vision, speech recognition, email filtering, agriculture, and medicine.[3][4] The application of ML to business problems is known as predictive analytics.
 Statistics and mathematical optimization (mathematical programming) methods comprise the foundations of machine learning. Data mining is a related field of study, focusing on exploratory data analysis (EDA) via unsupervised learning.[6][7]
 From a theoretical viewpoint, probably approxima

In [5]:
import pyttsx3
engine = pyttsx3.init()
engine.say(study_material)
engine.runAndWait()

In [6]:
from serpapi import GoogleSearch

def google_search(query):
    params = {
        "engine": "google",
        "q": query + " site:edu",
        "api_key": "4c3a39f406bb56f40ea28e7b2b59186206d9eb054c352ef8e19552d9900611f2",
    }
    search = GoogleSearch(params)
    results = search.get_dict()
    
    links = [result["link"] for result in results["organic_results"][:3]]
    return links

# Example usage
query = "Machine Learning course material"
search_results = google_search(query)
print(search_results)


['https://cedar.buffalo.edu/~srihari/CSE574/', 'https://online.stanford.edu/courses/xcs229-machine-learning', 'https://ocw.mit.edu/courses/6-036-introduction-to-machine-learning-fall-2020/']


In [7]:
def scrape_website(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    
    paragraphs = soup.find_all("p")
    text = " ".join([p.text for p in paragraphs[:5]])
    return text

# Example usage
for link in search_results:
    study_material = scrape_website(link)
    print(study_material)




Reference textbooks for different parts of the course are "Pattern Recognition and Machine Learning" by Chris Bishop (Springer 2006) and  "Probabilistic Graphical Models" by Daphne Koller and Nir Friedman (MIT Press 2009) and "Deep Learning" by Goodfellow, Bengio and Courville (MIT Press 2016).




 

 

 
Course topics are listed below with links to lecture slides and lecture videos. 



The course is followed by two other courses, one focusing on Probabilistic Graphical Models

and another on Deep Learning. 


The slides and videos were last updated in Fall 2020. 
Chapters 1-17 (Topic titles in Red) are more recently taught versions. 
 

The course is followed by two other courses, one focusing on Probabilistic Graphical Models

and another on Deep Learning. 


The slides and videos were last updated in Fall 2020. 
Chapters 1-17 (Topic titles in Red) are more recently taught versions. 


This course introduces principles, algorithms, and applications of machine learning from the po

In [10]:
from datasets import load_dataset

# Load Stanford Q&A dataset
squad = load_dataset("squad")


print(squad['train'][0])


{'id': '5733be284776f41900661182', 'title': 'University_of_Notre_Dame', 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}


In [None]:
import fitz

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text("text")
    return text


pdf_text = extract_text_from_pdf("C:/Users/Home/Desktop/LearningManagementAI/LearningManagementAI/Docs/past_paper.pdf")
print(pdf_text)


In [None]:
import pyttsx3
engine = pyttsx3.init()
engine.say(pdf_text)
engine.runAndWait()

In [None]:
import json

def save_to_json(data, filename="study_data.json"):
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)

# Example usage
study_data = {"topic": "Machine Learning", "content": pdf_text}
save_to_json(study_data)


In [2]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def clean_text(text):
    text = text.lower()
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"\W+", " ", text)
    text = re.sub(r"\d+", "", text)
    
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words("english")]
    
    return " ".join(tokens)

# Example usage
raw_text = "Machine learning is a field of AI. Visit https://example.com for more!"
cleaned_text = clean_text(raw_text)
print(cleaned_text)


machine learning field ai visit https example com


In [3]:
import pdfplumber

def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() + "\n"
    return text

course_outline_text = extract_text_from_pdf("C:/Users/Home/Desktop/Comp3.1/Automata Theory303/COMP 303  AUTOMATA THEORY  COURSE OUTLINE.pdf")
print(course_outline_text)


KISII UNIVERSITY
Faculty of Information Science and Technology (SIST)
Department of Computing Sciences
YEAR 3 SEM 1 BSC COMPUTER SCIENCE AND APPLIED COMPUTER
SCIENCE
SEPT-DEC, 2024
COURSE OUTLINE
Course Details
Course Code COMP 303
Course Name AUTOMATA THEORY
Credit Hours 3.5
Day/Time/Location THURSDAYS / 9am – 11 am / TC G2
Lecturer Silas Momanyi Nyabuga
Email Address smnyabuga@gmail.com
Cell 0722-891-892
Contact Hours: Lectures 30 and Practicals/Tutorials 30
Purpose of the course:
The course will provide students with knowledge and skills regarding
fundamental concepts of Finite Automata, Regular Languages, and Pushdown
Automata before moving onto Turing machines and Decidability.
Expected Learning outcomes of the Course:
Upon successful completion of this course, the student will be able to:
i. Prove properties of languages, grammars and automata with rigorously
formal mathematical methods;
ii. Design automata, regular expressions and context-free grammars
accepting or generating a 

In [7]:
from docx import Document

def extract_text_from_docx(docx_path):
    doc = Document(docx_path)
    return "\n".join([para.text for para in doc.paragraphs])

course_outline_text = extract_text_from_docx("C:/Users/Home/Desktop/Course Outline.docx")
print(course_outline_text)



WEB PROGRAMMING II (30/30; CF 3.5) Y3S2
Prerequisite: BIT 202
Contact Hours: Lectures 30 and Practical/Tutorials 30


Purpose of the course:
To provide the student with the knowledge and skills to construct dynamic and interactive websites for various types of business applications.
Expected learning outcome
By the end of the course units the learner should:
Have a practical experience in the use of programming and scripting languages for web. Development
Develop back end applications using JavaScript libraries such as Nodes.js, 
Develop a Data-Driven Node.js Web App and connect with the web database
Develop dynamic and interactive web applications using various types of business information
Each student will come up with a small website with full functionalities of a website
Course Content
Server Side Web application development using scripting languages such as Node.js, PHP, etc Node.js frameworks, Installing Node.js, NodeJS Module System, Node.js as a File Server, Multiple exports,

In [8]:
import pdfplumber
import docx
import pytesseract
import cv2
import re
import numpy as np
from PIL import Image

def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() + "\n"
    return text

def extract_text_from_docx(docx_path):
    doc = docx.Document(docx_path)
    text = "\n".join([para.text for para in doc.paragraphs])
    return text

def extract_text_from_image(image_path):
    img = cv2.imread(image_path)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    text = pytesseract.image_to_string(gray)
    return text

def extract_course_info(text):
    info = {
        "Course Title": None,
        "Course Duration": None,
        "Weekly Breakdown": {},
        "Objectives": [],
        "Recommended Materials": []
    }

    lines = text.split("\n")

    course_title_pattern = re.compile(r"(?i)Course\s*Title:\s*(.+)")
    duration_pattern = re.compile(r"(?i)Duration:\s*(.+)")
    week_pattern = re.compile(r"(?i)Week\s*(\d+):?\s*(.*)")
    objectives_pattern = re.compile(r"(?i)Objectives?")
    materials_pattern = re.compile(r"(?i)Recommended\s*Materials?")

    current_section = None

    for line in lines:
        line = line.strip()
        
        title_match = course_title_pattern.search(line)
        if title_match:
            info["Course Title"] = title_match.group(1)
            continue

        duration_match = duration_pattern.search(line)
        if duration_match:
            info["Course Duration"] = duration_match.group(1)
            continue

        week_match = week_pattern.search(line)
        if week_match:
            week_number = int(week_match.group(1))
            week_content = week_match.group(2).strip()
            info["Weekly Breakdown"][week_number] = week_content
            continue

        if objectives_pattern.search(line):
            current_section = "Objectives"
            continue
        elif materials_pattern.search(line):
            current_section = "Recommended Materials"
            continue

        if current_section == "Objectives":
            info["Objectives"].append(line)
        elif current_section == "Recommended Materials":
            info["Recommended Materials"].append(line)

    return info

def process_course_outline(file_path):
    if file_path.endswith(".pdf"):
        text = extract_text_from_pdf(file_path)
    elif file_path.endswith(".docx"):
        text = extract_text_from_docx(file_path)
    else:
        raise ValueError("Unsupported file format. Use PDF or DOCX.")

    return extract_course_info(text)

# Example Usage
file_path = "C:/Users/Home/Desktop/Comp 3.2/Comp 302/SOEN 302 - COMP 302 course outline.docx" 
course_data = process_course_outline(file_path)

print(course_data)


{'Course Title': None, 'Course Duration': None, 'Weekly Breakdown': {}, 'Objectives': ['At the end of the course, students should be able to: Analyze and Design Algorithms, compare the performance of Algorithms.', 'Course content', 'Developing the skills of analysing the behaviour of algorithms. Detailed study of the basic notions of the design of algorithms and the underlying data structures. Major topics: the analysis with respect to average and worst case bahaviour and correctness of algorithms for internal sorting, pattern matching on strings, graph algorithms and methods such as recursive elimination, dynamic programming and program profiling. It will also cover Complexity problem, Structure, complexity and efficiency of algorithms. Examples are taken from numerical computations.', 'Mode of delivery', 'The course will be taught by using lectures, tutorials and assignments.', 'Instructional resources', 'Resource persons, textbooks, hand-outs, LCD projectors, laptops/computers, disc

In [9]:
import pdfplumber
import docx
import pytesseract
import cv2
import re
import numpy as np
from PIL import Image

def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() + "\n"
    return text

def extract_text_from_docx(docx_path):
    doc = docx.Document(docx_path)
    text = "\n".join([para.text for para in doc.paragraphs])
    return text

def extract_text_from_image(image_path):
    img = cv2.imread(image_path)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    text = pytesseract.image_to_string(gray)
    return text

def extract_course_info(text):
    info = {
        "Course Title": None,
        "Course Code": None,
        "Course Duration": None,
        "Weekly Breakdown": {},
        "Objectives": [],
        "Recommended Materials": []
    }

    lines = text.split("\n")

    course_title_pattern = re.compile(r"(?i)([A-Z]{3,4}\s*\d{3})\s*[-:]?\s*(.+)")
    duration_pattern = re.compile(r"(?i)Duration:\s*(.+)")

    week_pattern = re.compile(r"(?i)(?:Week\s*|^)(\d+)[\.:]?\s*(.*)")
    bullet_pattern = re.compile(r"^[•*-]\s*(.+)")

    objectives_pattern = re.compile(r"(?i)Objectives?")
    materials_pattern = re.compile(r"(?i)Recommended\s*Materials?")

    current_section = None
    week_counter = 0

    for line in lines:
        line = line.strip()

        title_match = course_title_pattern.search(line)
        if title_match:
            info["Course Code"] = title_match.group(1)
            info["Course Title"] = title_match.group(2)
            continue

        duration_match = duration_pattern.search(line)
        if duration_match:
            info["Course Duration"] = duration_match.group(1)
            continue

        week_match = week_pattern.search(line)
        if week_match:
            week_number = int(week_match.group(1))
            week_content = week_match.group(2).strip()
            info["Weekly Breakdown"][week_number] = week_content
            continue

        bullet_match = bullet_pattern.search(line)
        if bullet_match:
            week_counter += 1
            info["Weekly Breakdown"][week_counter] = bullet_match.group(1)
            continue

        if objectives_pattern.search(line):
            current_section = "Objectives"
            continue
        elif materials_pattern.search(line):
            current_section = "Recommended Materials"
            continue

        if current_section == "Objectives":
            info["Objectives"].append(line)
        elif current_section == "Recommended Materials":
            info["Recommended Materials"].append(line)

    return info

def process_course_outline(file_path):
    if file_path.endswith(".pdf"):
        text = extract_text_from_pdf(file_path)
    elif file_path.endswith(".docx"):
        text = extract_text_from_docx(file_path)
    else:
        raise ValueError("Unsupported file format. Use PDF or DOCX.")

    return extract_course_info(text)

file_path = "C:/Users/Home/Desktop/Comp 3.2/Comp 306/COMP 306 CO.pdf" 
course_data = process_course_outline(file_path)

print(course_data)


{'Course Title': 'ADVANCED DATABASE SYSTEMS', 'Course Code': 'COMP 306', 'Course Duration': None, 'Weekly Breakdown': {1: 'MYSQL for relational databases', 2: 'MongoDB for NoSQL databases', 3: 'Date, C.(2006). An Introduction to Database Systems, 8 edition. Pearson publisher.', 4: 'Bradshaw, S., Brazil, E., Chodorow, K.(2019) MongoDB: The Definitive Guide:', 5: 'Distributed database management system', 6: 'NOSQL Databases', 7: 'Cloud Databases and Database as a Service (DBaaS)', 8: 'Emerging Trends and Research Topics', 978: '-0321826626'}, 'Objectives': [], 'Recommended Materials': []}


In [10]:
import re

def extract_weeks_and_topics(text):
    weeks = re.findall(r"Week \d+: (.+)", text)
    return weeks

weekly_topics = extract_weeks_and_topics(course_outline_text)
print(weekly_topics)


[]


In [2]:
import requests
from bs4 import BeautifulSoup
from newspaper import Article
from serpapi import GoogleSearch

def scrape_study_materials(topic):
    api_key = "4c3a39f406bb56f40ea28e7b2b59186206d9eb054c352ef8e19552d9900611f2"
    search = GoogleSearch({
        "q": f"{topic} study materials",
        "hl": "en",
        "gl": "us",
        "api_key": api_key
    })
    
    results = search.get_dict()
    links = []

    for result in results.get("organic_results", []):
        if "link" in result:
            links.append(result["link"])
    
    return links[:5]

def summarize_article(url):
    try:
        article = Article(url)
        article.download()
        article.parse()
        article.nlp()
        return article.summary
    except Exception as e:
        return f"Error processing article: {e}"

study_materials = scrape_study_materials("Machine Learning")

if study_materials:
    summary = summarize_article(study_materials[0])
    print("Summary of the first article:\n", summary)
else:
    print("No study materials found.")


Summary of the first article:
 Welcome to r/learnmachinelearning - a community of learners and educators passionate about machine learning!
This is your space to ask questions, share resources, and grow together in understanding ML concepts - from basic principles to advanced techniques.
Whether you're writing your first neural network or diving into transformers, you'll find supportive peers here.
For ML research, /r/machinelearning For resume review, /r/engineeringresumes For ML engineers, /r/mlengineeringMembers Online
