In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer
import wikipediaapi
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag, FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
import wikipedia
from sklearn.metrics import accuracy_score
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

We initialize the Wikipedia API client and write a function that retrieves the text of a Wikipedia page given its title.

In [10]:
# Shared Wikipedia client; get_wikipedia_text() looks pages up through it.
# NOTE(review): the meaning of the single positional argument depends on the
# installed Wikipedia-API version (language code vs. user agent) - confirm and
# prefer explicit keyword arguments once the target version is known.
wiki_wiki = wikipediaapi.Wikipedia('english')

# Function to retrieve a text from a Wikipedia page
def get_wikipedia_text(page_title):
    """Return the text of the Wikipedia page called ``page_title``.

    Args:
        page_title: Title of the page to look up through ``wiki_wiki``.

    Returns:
        The page's ``text`` attribute, or ``None`` when the page does not exist.
    """
    wiki_page = wiki_wiki.page(page_title)
    return wiki_page.text if wiki_page.exists() else None


From the documents, we identify the most frequently repeated words and treat them as keywords, which then serve as the features.

In [11]:
# Function to extract keywords using NLTK
def extract_keywords(text):
    """Tokenize ``text`` and return its lower-cased keyword tokens.

    A token is kept when it is purely alphanumeric and its lower-cased form
    is not an English stop word.

    Args:
        text: Raw text to tokenize.

    Returns:
        List of lower-cased tokens, in their original order (duplicates kept).
    """
    excluded = set(stopwords.words('english'))
    kept = []
    for token in word_tokenize(text):
        lowered = token.lower()
        if token.isalnum() and lowered not in excluded:
            kept.append(lowered)
    return kept


From the keywords, we extract the nouns using the NLTK library.

In [12]:
# Function to extract nouns from keywords using NLTK
def extract_nouns(text):
    """Return the lower-cased nouns found in ``text``.

    The text is tokenized and part-of-speech tagged; a token is kept when its
    tag starts with ``'N'``, it is purely alphanumeric, and its lower-cased
    form is not an English stop word.

    Args:
        text: Raw text to analyse.

    Returns:
        List of lower-cased noun tokens, in their original order.
    """
    excluded = set(stopwords.words('english'))
    tagged = pos_tag(word_tokenize(text))

    nouns = []
    for token, tag in tagged:
        if not tag.startswith('N'):
            continue
        lowered = token.lower()
        if lowered in excluded or not token.isalnum():
            continue
        nouns.append(lowered)
    return nouns

We use the 'FreqDist' class from NLTK to choose the 10 most common keywords for both medical and non-medical topics.

In [13]:
# Function to extract top nouns for a given set of topics
def extract_top_nouns(topics, num_top_nouns=10):
    """Return the most frequent nouns across the Wikipedia pages of ``topics``.

    Args:
        topics: Iterable of Wikipedia page titles.
        num_top_nouns: How many of the most common nouns to return.

    Returns:
        List of nouns ordered from most to least frequent. Topics whose text
        is missing or empty contribute nothing.
    """
    # Lazy generator: each page is fetched right before its nouns are extracted.
    page_texts = (get_wikipedia_text(title) for title in topics)
    noun_pool = [
        noun
        for body in page_texts
        if body
        for noun in extract_nouns(body)
    ]

    ranked = FreqDist(noun_pool).most_common(num_top_nouns)
    return [noun for noun, _count in ranked]


We use a small sample of annotated keywords, taken from Wikipedia, to collect documents for the medical and non-medical topics.

In [14]:
# Seed Wikipedia page titles for each class; one page is fetched per title.
medical_topics = [
    'diabetes',
    'blood pressure',
    'cancer',
    'heart disease',
    'vaccine',
    'pandemic',
]
non_medical_topics = [
    'programming',
    'technology',
    'history',
    'art',
    'sports',
    'entertainment',
]

For each kind of document (medical or non-medical), some words are repeated more often than others and can be used to separate the documents.

In [15]:
# Rank the ten most frequent nouns for each topic group (medical first).
top_medical_keywords, top_non_medical_keywords = (
    extract_top_nouns(topic_group, num_top_nouns=10)
    for topic_group in (medical_topics, non_medical_topics)
)

In [16]:
# Show the most frequent nouns collected across the medical topic pages
print("Top Medical Nouns:", top_medical_keywords)

Top Medical Nouns: ['cancer', 'disease', 'pressure', 'blood', 'risk', 'diabetes', 'vaccine', 'vaccines', 'people', 'heart']


In [17]:
# Show the most frequent nouns collected across the non-medical topic pages
print("Top Non Medical Nouns:", top_non_medical_keywords)

Top Non Medical Nouns: ['art', 'history', 'entertainment', 'example', 'sports', 'century', 'technology', 'sport', 'forms', 'world']


-Documents related to medical topics are labeled as 1, and documents related to non-medical topics are labeled as 0.

-CountVectorizer is then used with a fixed vocabulary (the combined top nouns) to create a bag-of-words representation of all_docs (the documents) using only that vocabulary. By doing this, we create a feature matrix X where each row corresponds to a document and each column holds the number of occurrences of a specific top noun in that document.


-This approach allows us to focus on a subset of important words (the top nouns), making the feature matrix more manageable and potentially more informative for our classification task. Adjusting the num_top_nouns parameter when extracting the top nouns lets us control the number of features included in the analysis.

In [23]:
all_topics = medical_topics + non_medical_topics

# Retrieve and preprocess text for all topics.
# Each document is the space-joined list of nouns of one Wikipedia page;
# its label is 1 for a medical topic and 0 otherwise.
all_docs = []
all_labels = []

for topic in all_topics:
    text = get_wikipedia_text(topic)
    if text:  # skip topics whose page is missing or empty
        nouns = extract_nouns(text)
        all_docs.append(" ".join(nouns))
        all_labels.append(1 if topic in medical_topics else 0)

# Extract features using top medical and non-medical nouns.
# A noun can rank in the top list of both groups, and CountVectorizer rejects
# a vocabulary list containing duplicate terms, so de-duplicate while keeping
# the first-seen order (dict preserves insertion order).
all_top_keywords = list(dict.fromkeys(top_medical_keywords + top_non_medical_keywords))
vectorizer = CountVectorizer(vocabulary=all_top_keywords)
X = vectorizer.transform(all_docs)
y = all_labels

Now we split the data into training and test sets using scikit-learn's train_test_split.

In [19]:
# Hold out 20% of the documents for testing; random_state=42 makes the split repeatable.
# NOTE(review): this cell is numbered In [19] while X and y are built in the
# cell numbered In [23] - confirm the notebook runs top to bottom on a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

First we are using a naive classifier for classification.

One common type of Naive Classifier is the Majority Class Classifier, which predicts the majority class for all instances

In [24]:
#a majority class naive classifier 
class NaiveClassifier:
    """Baseline classifier that always predicts the most frequent training label.

    Attributes are documented inline; the features passed to ``fit`` and
    ``predict`` are never inspected beyond counting rows.
    """

    def __init__(self):
        # Most frequent label seen by fit(); None until the classifier is fitted.
        self.majority_class = None

    def fit(self, X, y):
        """Record the most frequent label in ``y``.

        Args:
            X: Training features (ignored; accepted for API symmetry).
            y: Sequence of training labels.

        Returns:
            ``self``, so that calls can be chained.

        Raises:
            ValueError: If ``y`` is empty.
        """
        labels = np.asarray(y)
        if labels.size == 0:
            raise ValueError("Cannot fit NaiveClassifier on an empty label set")

        # np.unique returns sorted labels, so ties resolve to the smallest label.
        unique_classes, counts = np.unique(labels, return_counts=True)
        self.majority_class = unique_classes[np.argmax(counts)]
        return self

    def predict(self, X):
        """Predict the majority class for every row of ``X``.

        Args:
            X: Array-like, sparse matrix, or nested list with one row per sample.

        Returns:
            1-D numpy array with one copy of the majority class per row.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.majority_class is None:
            raise RuntimeError("NaiveClassifier must be fitted before calling predict")
        # np.shape reads .shape when present and converts plain sequences otherwise.
        return np.full(np.shape(X)[0], self.majority_class)

# Example usage: evaluate the majority-class baseline on the held-out split
naive_classifier = NaiveClassifier()

# Fit the classifier on the training data (only the label counts are used)
naive_classifier.fit(X_train, y_train)

# Make predictions on the test set (the same label for every test row)
naive_predictions = naive_classifier.predict(X_test)

# Evaluate accuracy: the fraction of test labels equal to the majority class
accuracy = accuracy_score(y_test, naive_predictions)
print("Naive Classifier Accuracy:", accuracy)

Naive Classifier Accuracy: 0.3333333333333333


We can also use a logistic regression model from Sklearn with better accuracy.

In [25]:
# Build the logistic-regression classifier and train it in one step
# (fit returns the estimator itself).
model = LogisticRegression(fit_intercept=True).fit(X_train, y_train)

# Predict the held-out documents and score them against the true labels.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0


Now that our classification model is ready, we can use it to classify a separate document. I downloaded a medical document from Wikipedia (included in the zip) and gave it to the model to classify. We could also fetch it directly from Wikipedia, but I also wanted to include the code for converting PDF format to text format.

In [29]:
import re
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
import PyPDF2


# Function for preprocessing text
def preprocess_text(text):
    """Lower-case ``text`` and replace every punctuation character with a space.

    Punctuation is replaced rather than deleted so that words joined by a
    hyphen, slash or comma (e.g. "blood-pressure") stay separate tokens and
    can still match single-word vocabulary entries.

    Args:
        text: Raw document text.

    Returns:
        The normalised text.
    """
    # Lowercase the text
    text = text.lower()
    # Turn each ASCII punctuation character into a token boundary
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)
    # Add more preprocessing steps as needed
    return text

# Function to vectorize a document using the provided vectorizer
def vectorize_document(document, vectorizer):
    """Preprocess ``document`` and transform it with ``vectorizer``.

    Args:
        document: Raw text of a single document.
        vectorizer: Object exposing ``transform``; it receives a one-element
            list holding the preprocessed text.

    Returns:
        Whatever ``vectorizer.transform`` returns for that single document.
    """
    return vectorizer.transform([preprocess_text(document)])

# Fit the Naive Classifier on the training data.
# It is kept only as a baseline: its prediction never depends on the document.
naive_classifier.fit(X_train, y_train)

# Specify the path to the PDF file on your desktop.
# Raw string so the Windows backslashes are not parsed as escape sequences.
pdf_file_path = r"C:\wiki\Medicine.pdf"

# Read the content of the PDF file
with open(pdf_file_path, 'rb') as file:
    pdf_reader = PyPDF2.PdfReader(file)
    # `or ''` keeps the join safe should a page yield no text.
    pdf_text = ''.join(page.extract_text() or '' for page in pdf_reader.pages)

# Assuming you have a vectorizer with the same vocabulary used during training
vectorizer = CountVectorizer(vocabulary=all_top_keywords)  # Use the same vocabulary

# Vectorize the PDF document
document_vectorized = vectorize_document(pdf_text, vectorizer)

# Baseline prediction (always the majority training class)
naive_prediction = naive_classifier.predict(document_vectorized)

# Prediction from the trained logistic-regression model, which actually
# uses the document's keyword counts
document_prediction = model.predict(document_vectorized)

# A single document was vectorized, so inspect the one and only prediction.
if document_prediction[0] == 1:
    print('the document is medical')
else:
    print('the document is non-medical')
    

the document is medical
