# **Preprocessing**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used below, each exactly once (the original
# requested 'wordnet' twice):
#   stopwords -> stopwords.words('english')
#   wordnet   -> WordNetLemmatizer
#   punkt_tab -> nltk.word_tokenize
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import AutoTokenizer

In [None]:
#Project Data from Kaggle

import kagglehub
import os
# Download the Symptom2Disease dataset via kagglehub, report where it landed,
# and load its CSV into the main DataFrame.
path = kagglehub.dataset_download("niyarrbarman/symptom2disease")
print("Path to dataset files:", path)
symptom2disease_csv = os.path.join(path, 'Symptom2Disease.csv')
df = pd.read_csv(symptom2disease_csv)

In [None]:
# Drop rows where either the disease label or the symptom text is missing.
# inplace=True mutates df itself; the surviving rows keep their original
# index values (the index is not reset here).
df.dropna(subset = ['label','text'], inplace = True)

In [None]:
#Remove stopwords from preprocessed text. Lemmatization is applied to reduce words to their base or root form.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


In [None]:
#Preprocess data

def preprocess_text(text):
    """Normalize free text into a space-joined string of lemmatized tokens.

    Steps: lowercase the text, replace every character that is not a letter
    or whitespace with a space, tokenize with ``nltk.word_tokenize``, drop
    tokens found in the module-level ``stop_words`` set, and lemmatize the
    remaining tokens with the module-level ``lemmatizer``.

    Parameters
    ----------
    text : object
        Raw text. Any value that is not a ``str`` (for example NaN) yields
        an empty string instead of raising.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    # Non-string input (e.g. a missing value) has nothing to clean.
    if not isinstance(text, str):
        return ""

    # Convert to lowercase
    text = text.lower()

    # Replace special characters and digits with a space rather than deleting
    # them. Deleting glues neighbouring words together when punctuation is the
    # only separator ("fever,cough" -> "fevercough") and turns contractions
    # into non-words ("don't" -> "dont") that slip past the stopword filter.
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)

    # Tokenize, then drop stopwords and lemmatize what is left.
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    return ' '.join(tokens)

df['clean_text'] = df['text'].apply(preprocess_text)

In [None]:
#Check df
df.head(5)

In [None]:
# Disease-prediction-from-symptoms dataset (Kaggle: pasindueranga/disease-prediction-based-on-symptoms)
path2 = kagglehub.dataset_download("pasindueranga/disease-prediction-based-on-symptoms")
print("Path to dataset files:", path2)


In [None]:
df_mt = pd.read_csv(os.path.join(path2, 'dataset.csv'))

In [None]:
#Check medical transcription dataset
df_mt.head(5)

In [None]:
#Clean the 'symptoms' column into 'clean_keywords' (swap in another column here if needed)
df_mt['clean_keywords'] = df_mt['symptoms'].apply(preprocess_text)

In [None]:
df_mt.head(5)

In [None]:
# Per-disease row counts for the Symptom2Disease data.
df_unique = df['label'].value_counts()
# value_counts() has one entry per distinct label, so its length is the number
# of unique diseases. The original printed the whole Series after this caption.
print('Number of Unique Disease:', len(df_unique))
print(df_unique)

In [None]:
# Per-disease row counts for the second (df_mt) dataset.
df_mt_unique = df_mt['disease'].value_counts()
# value_counts() has one entry per distinct disease, so its length is the
# number of unique diseases. The original printed the whole Series after this caption.
print('Number of Unique Disease:', len(df_mt_unique))
print(df_mt_unique)

In [None]:
#wordcloud of all the diseases in the df dataset

from wordcloud import WordCloud
# Concatenate every row's disease label into one string and render it as a
# single word cloud.
text = ' '.join(df['label'])
cloud_settings = {'width': 500, 'height': 300, 'background_color': 'black'}
wordcloud = WordCloud(**cloud_settings).generate(text)

plt.figure(figsize=(12, 4))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title('Word Cloud of Diseases')
plt.axis('off')
plt.show()

In [None]:
#wordcloud of symptoms for each particular disease in the df dataset
# sort=False makes groupby yield the diseases in order of first appearance,
# which is the same order df['label'].unique() produces.
for label, disease_rows in df.groupby('label', sort=False):
    text = ' '.join(disease_rows['clean_text'])
    cloud_settings = {'width': 400, 'height': 200, 'background_color': 'black'}
    wordcloud = WordCloud(**cloud_settings).generate(text)

    plt.figure(figsize=(10, 4))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f'Word Cloud for Disease: {label}')
    plt.axis('off')
    plt.show()


In [None]:
#bigrams to understand the frequency of most common symptom patterns
from sklearn.feature_extraction.text import CountVectorizer
# ngram_range=(2,2) restricts the vocabulary to two-word sequences only.
vector= CountVectorizer(ngram_range = (2,2))
# X is the document-by-bigram count matrix built from the cleaned text.
X = vector.fit_transform(df['clean_text'])
# Column sums: total occurrences of each bigram across all documents.
sum_of_words = X.sum(axis = 0)
# vocabulary_ maps each bigram to its column index in X; pair every bigram with its total count.
word_freqency = [(word, sum_of_words[0, idx]) for word, idx in vector.vocabulary_.items()]
# Keep the 25 most frequent bigrams, highest count first.
sort_words = sorted(word_freqency, key = lambda x: x[1], reverse = True)[:25]
df_bigram = pd.DataFrame(sort_words, columns = ['Bigram', 'Frequency'])
# Horizontal bar chart: one bar per bigram, bar length = frequency.
sns.barplot(x = 'Frequency', y = 'Bigram', data = df_bigram)
plt.title('Bigram Frequency')
plt.show()

In [None]:
#Setup text vectorization with custom variables
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(analyzer='word', stop_words='english',ngram_range=(1,3), max_df=0.75,min_df=5, use_idf=True, smooth_idf=True,sublinear_tf=True, max_features=1000)
tfIdfMat  = vectorizer.fit_transform(df['clean_text'].tolist() )
feature_names = sorted(vectorizer.get_feature_names_out())
print(feature_names)

In [None]:
#Setup PCA model
from sklearn.decomposition import PCA

# n_components is given as a float (0.95) rather than an integer component count;
# see the scikit-learn PCA docs for how a fractional value is interpreted.
pca = PCA(n_components=0.95)
# The sparse TF-IDF matrix is converted to a dense array before being passed to PCA.
# NOTE(review): the TF-IDF vectorizer and this PCA are both fitted on the full
# dataset before the train/test split that follows, so held-out rows influence
# the features -- consider fitting on the training rows only.
tfIdfMat_reduced = pca.fit_transform(tfIdfMat.toarray())
# Plain Python list of disease names, aligned row-for-row with tfIdfMat_reduced.
labels = df['label'].tolist()
# Removes the first 35 entries of the sorted feature-name list in place.
# NOTE(review): no later use of feature_names is visible in this file; confirm
# why these entries are dropped.
del feature_names[0:35]
# Distinct disease names in order of first appearance.
category_list = df.label.unique()

In [None]:
#Data is then split into training and validation sets using the train_test_split function
from sklearn.model_selection import train_test_split

# Stratified hold-out split of the PCA-reduced features. No test_size is given,
# so scikit-learn's default applies; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    tfIdfMat_reduced,
    labels,
    stratify=labels,
    random_state=1,
)
print(f'Train_Set_Size:{X_train.shape}')
print(f'Test_Set_Size:{X_test.shape}')

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the disease names as integer ids, stored in df['label_id'].
label_encoder = LabelEncoder()
df['label_id'] = label_encoder.fit_transform(df['label'])

# A class's id is its position in the sorted classes_ array, so enumerating
# classes_ yields the same (label, id) pairs as transform(classes_).
label2id = {label: idx for idx, label in enumerate(label_encoder.classes_)}
id2label = {idx: label for label, idx in label2id.items()}


In [None]:
from datasets import Dataset

# Build a Hugging Face Dataset from the raw text and the integer class ids
# (the id column is renamed to 'label').
# preserve_index=False stops the pandas index -- no longer a plain range once
# dropna has removed rows -- from being added as an extra column.
dataset = Dataset.from_pandas(
    df[['text', 'label_id']].rename(columns={'label_id': 'label'}),
    preserve_index=False,
)
# A fixed seed makes the 80/20 split, and therefore the reported metrics,
# reproducible between runs (the original split was unseeded).
dataset = dataset.train_test_split(test_size=0.2, seed=1)


In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

def tokenize(example):
    """Tokenize the ``"text"`` field of *example* with the module-level ``tokenizer``.

    Sequences are truncated and padded to a fixed length of 128 tokens.
    Returns whatever the tokenizer call returns.
    """
    encoding_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(example["text"], **encoding_options)

tokenized_dataset = dataset.map(tokenize, batched=True)


In [None]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "dmis-lab/biobert-base-cased-v1.1",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

# **Evaluations**

In [None]:
#checking column/row items
df.sample(1)

In [None]:
#checking column/row items
df_mt.sample(1)

In [None]:
#Import Modules
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# KNN Model Training

knn_classifier = KNeighborsClassifier(n_neighbors=5)  # You can adjust the number of neighbors (k) based on your dataset
knn_classifier.fit(X_train, y_train)

In [None]:
# Predictions

predictions = knn_classifier.predict(X_test)

# **Model Evaluation**

In [None]:
# Model Evaluation

accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, predictions))

# **Confusion Matrix**

In [None]:
#Import Modules
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Generate confusion matrix
# Fix the class order explicitly and reuse it for the tick labels.
# confusion_matrix orders rows/columns by sorted label, whereas
# df['label'].unique() is in order of first appearance, so the original axis
# labels could be attached to the wrong rows and columns.
class_names = sorted(set(y_test) | set(predictions))
conf_matrix = confusion_matrix(y_test, predictions, labels=class_names)

# Plotting confusion matrix as a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Greens', xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

## **Example #1**

In [None]:
### Named Entity Recognition (NER) - spaCy, BERT, and Flair
# --- Named Entity Recognition with 3 Models ---

# 1. spaCy NER
import spacy
nlp_spacy = spacy.load("en_core_web_sm")

def extract_ner_spacy(text):
    """Run the module-level spaCy pipeline ``nlp_spacy`` on *text*.

    Returns a list of ``(entity_text, entity_label)`` tuples, one per entity
    in the processed document's ``ents``.
    """
    entities = []
    for ent in nlp_spacy(text).ents:
        entities.append((ent.text, ent.label_))
    return entities

# 2. Hugging Face BERT NER
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Use NER-specific names here. Binding these objects to `tokenizer` / `model`
# overwrote the BioBERT tokenizer and sequence-classification model created
# earlier, so the later train_model(model, ..., tokenizer) call would have
# fine-tuned this token-classification NER model instead of the classifier.
ner_tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
ner_model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
ner_pipeline_bert = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer, aggregation_strategy="simple")

def extract_ner_bert(text):
    """Run the module-level ``ner_pipeline_bert`` on *text*.

    Returns a list of ``(word, entity_group)`` tuples, one per entry in the
    pipeline output.
    """
    pairs = []
    for span in ner_pipeline_bert(text):
        pairs.append((span['word'], span['entity_group']))
    return pairs

# 3. Flair NER
from flair.data import Sentence
from flair.models import SequenceTagger

tagger_flair = SequenceTagger.load("ner")

def extract_ner_flair(text):
    """Tag *text* with the module-level Flair ``tagger_flair``.

    Wraps the text in a ``Sentence``, lets the tagger annotate it in place,
    and returns a list of ``(span_text, span_tag)`` tuples for the ``'ner'``
    spans.
    """
    sentence = Sentence(text)
    tagger_flair.predict(sentence)  # annotates `sentence` in place
    tagged = []
    for span in sentence.get_spans('ner'):
        tagged.append((span.text, span.tag))
    return tagged

# Apply all three models to the cleaned text
# Each call runs one extractor row by row and stores a list of (text, tag) tuples per row.
# NOTE(review): clean_text has been lowercased and stripped of punctuation and
# digits by preprocess_text. The NER models may rely on casing and punctuation
# -- consider running them on df['text'] instead; confirm intent.
df['named_entities_spacy'] = df['clean_text'].apply(extract_ner_spacy)
df['named_entities_bert'] = df['clean_text'].apply(extract_ner_bert)
df['named_entities_flair'] = df['clean_text'].apply(extract_ner_flair)

# Show sample output
# First five rows: the cleaned text next to each model's extracted entities.
df[['clean_text', 'named_entities_spacy', 'named_entities_bert', 'named_entities_flair']].head()



In [None]:
#Import Modules
from sklearn.feature_extraction.text import TfidfVectorizer

# Example Usage
# Take one already-cleaned symptom description from the second dataset.
example_symptom_1 = df_mt['clean_keywords'][16]

# Preprocess the input symptom
preprocessed_symptom = preprocess_text(example_symptom_1)

# Project the query into the feature space the KNN model was trained in:
# the fitted TF-IDF vectorizer first, then the fitted PCA (given a dense array).
symptom_tfidf = vectorizer.transform([preprocessed_symptom])
symptom_reduced = pca.transform(symptom_tfidf.toarray())

# Predict the disease for this single query. The original passed the whole
# training matrix (tfIdfMat_reduced), so predicted_disease[0] was the
# prediction for the first training row and ignored the example symptom.
predicted_disease = knn_classifier.predict(symptom_reduced)
actual_disease = df_mt['disease'][16]

# Print the results
print(f'Symptoms: {example_symptom_1}')
print(f'Predicted Disease: {predicted_disease[0]}')
print(f'Actual Disease: {actual_disease}')

## **Example #2**

In [None]:
#Import Modules
from sklearn.feature_extraction.text import TfidfVectorizer

# Example Usage
# Free-text query typed by hand (not taken from either dataset).
example_symptom_1 = "high fever"

# Preprocess the input symptom
preprocessed_symptom = preprocess_text(example_symptom_1)

# Project the query into the feature space the KNN model was trained in:
# the fitted TF-IDF vectorizer first, then the fitted PCA (given a dense array).
symptom_tfidf = vectorizer.transform([preprocessed_symptom])
symptom_reduced = pca.transform(symptom_tfidf.toarray())

# Predict the disease for this single query. The original predicted on the
# whole training matrix and printed row 105, which has nothing to do with
# the text above.
predicted_disease = knn_classifier.predict(symptom_reduced)

# Print the results
print(f'Symptoms: {example_symptom_1}')
print(f'Predicted Disease: {predicted_disease[0]}')
# No 'Actual Disease' line: a hand-written query has no ground-truth label, and
# the original printed the stale actual_disease left over from Example #1.

## **Example #3**

In [None]:
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding

def train_model(model, train_dataset, eval_dataset, tokenizer, output_dir="./results"):
    """Fine-tune *model* with the Hugging Face ``Trainer`` and return the trainer.

    Parameters
    ----------
    model : the model handed to ``Trainer``.
    train_dataset, eval_dataset : datasets used for training and for the
        per-epoch evaluation.
    tokenizer : tokenizer used to build the ``DataCollatorWithPadding``.
    output_dir : str
        Output directory passed to ``TrainingArguments``; logs go to
        ``output_dir + "/logs"``.

    Returns
    -------
    Trainer
        The trainer, after ``train()`` has completed.
    """
    # Evaluate and save once per epoch; external experiment reporting is off.
    hyperparameters = dict(
        output_dir=output_dir,
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=5,
        weight_decay=0.01,
        logging_dir=output_dir + "/logs",
        logging_steps=10,
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=TrainingArguments(**hyperparameters),
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    )
    trainer.train()
    return trainer

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def evaluate_model(trainer, eval_dataset, label2id):
    """Print and plot classification metrics for *trainer* on *eval_dataset*.

    Prints accuracy, the confusion matrix and the classification report, then
    shows a confusion-matrix heatmap and one bar chart per metric (precision,
    recall, F1) across classes.

    Parameters
    ----------
    trainer : object exposing ``predict(eval_dataset)`` whose result has
        ``predictions`` (per-class scores) and ``label_ids``.
    eval_dataset : dataset passed through to ``trainer.predict``.
    label2id : dict
        Maps class name to integer class id.

    Returns
    -------
    None
    """
    predictions = trainer.predict(eval_dataset)
    preds = np.argmax(predictions.predictions, axis=1)
    true_labels = predictions.label_ids

    # Order the class names by id instead of relying on dict insertion order,
    # and pass the full id list to scikit-learn. The matrix and report then
    # always cover every class, even one that is absent from this eval split;
    # without `labels`, classification_report raises when the number of classes
    # it finds differs from len(target_names).
    ordered_classes = sorted(label2id.items(), key=lambda item: item[1])
    target_names = [name for name, _ in ordered_classes]
    class_ids = [class_id for _, class_id in ordered_classes]

    # Computed once and reused for both the printout and the heatmap.
    conf_matrix = confusion_matrix(true_labels, preds, labels=class_ids)

    print("Accuracy:", accuracy_score(true_labels, preds))
    print("Confusion Matrix:\n", conf_matrix)
    print("Classification Report:\n", classification_report(
        true_labels, preds, labels=class_ids, target_names=target_names))

    # Confusion Matrix Plot
    plt.figure(figsize=(10, 8))
    sns.heatmap(conf_matrix, annot=True, fmt="d",
                xticklabels=target_names, yticklabels=target_names, cmap="Blues")
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.tight_layout()
    plt.show()

    # Bar Plot of Precision, Recall, F1
    report_dict = classification_report(
        true_labels, preds, labels=class_ids, target_names=target_names, output_dict=True)
    metrics = ["precision", "recall", "f1-score"]
    for metric in metrics:
        values = [report_dict[label][metric] for label in target_names]
        plt.figure(figsize=(10, 6))
        plt.bar(target_names, values)
        plt.title(f"{metric.title()} per Class")
        plt.ylim(0, 1)
        plt.ylabel(metric.title())
        plt.xlabel("Class")
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

In [None]:
trainer = train_model(model, tokenized_dataset['train'], tokenized_dataset['test'], tokenizer)
evaluate_model(trainer, tokenized_dataset['test'], label2id)

In [None]:
from transformers import pipeline

classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

def predict_disease_biobert(text, id2label):
    """Classify *text* with the module-level ``classifier`` pipeline.

    Parameters
    ----------
    text : str
        Input passed to the pipeline.
    id2label : dict
        Accepted as a parameter but not used by this function.

    Returns
    -------
    tuple
        ``(label, score)`` taken from the first entry of the pipeline output.
    """
    top_prediction = classifier(text)[0]
    return top_prediction['label'], top_prediction['score']

In [None]:
# Import module (already done earlier)
from sklearn.feature_extraction.text import TfidfVectorizer

# Example symptom input
# Row 88 of the already-cleaned symptom column from the second dataset.
symptom = df_mt['clean_keywords'][88]

# Preprocess the symptom
# NOTE(review): the Hugging Face dataset used for fine-tuning was built from the
# raw df['text'] column, while this query is lowercased, stopword-filtered and
# lemmatized -- confirm whether un-preprocessed text should be fed to the model.
preprocessed_symptom = preprocess_text(symptom)

# Predict the disease using the BioBERT pipeline
# Returns the top label and its score from the text-classification pipeline.
predicted_disease, confidence = predict_disease_biobert(preprocessed_symptom, id2label)

# Print results
print(f'Symptoms: {symptom}')
print(f'Predicted Disease: {predicted_disease} (Confidence: {confidence:.2f})')