Depending on which objective we want to focus on, here are some framework options we can consider:
1. Symptom summarization:
2. Medical Named Entity Recognition (NER): BioBERT for NER; model option: scispaCy's `en_core_sci_md`

    import spacy

    import scispacy

    nlp = spacy.load('en_core_sci_md')

3. Symptom to Diagnosis Classification: knn/regression/random forests via sklearn

**Preprocessing**

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources this notebook relies on: the English stop-word
# list, the WordNet data behind WordNetLemmatizer, and the 'punkt_tab'
# tokenizer data (presumably what nltk.word_tokenize needs on recent NLTK
# releases -- confirm against the installed version).
# Each resource is requested exactly once; 'wordnet' was previously
# downloaded twice.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import AutoTokenizer

In [None]:
# Primary project data: the Symptom2Disease dataset, fetched via kagglehub

import kagglehub
import os
path = kagglehub.dataset_download("niyarrbarman/symptom2disease")
print("Path to dataset files:", path)
# The download location is a directory; the CSV sits inside it.
symptom_csv = os.path.join(path, 'Symptom2Disease.csv')
df = pd.read_csv(symptom_csv)

In [None]:
# Discard rows that lack either the target label or the symptom text
required_columns = ['label', 'text']
df.dropna(subset=required_columns, inplace=True)

In [None]:
# Shared resources for preprocess_text: a lemmatizer that reduces words to
# their base form, and the set of English stop words to filter out.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


In [None]:
#Preprocess data

def preprocess_text(text):
    """Normalise a free-text symptom description into a cleaned token string.

    Steps, in order: lowercase the text, delete every character that is not
    an ASCII letter or whitespace, tokenize with ``nltk.word_tokenize``, drop
    tokens found in the module-level ``stop_words`` set, lemmatize the rest
    with the module-level ``lemmatizer``, and join them with single spaces.

    Args:
        text: The raw value to clean. Anything that is not a ``str``
            (e.g. NaN from a DataFrame) is treated as missing.

    Returns:
        The cleaned, space-joined tokens, or ``""`` for non-string input.
    """
    # Guard clause: non-string cells yield an empty document.
    if not isinstance(text, str):
        return ""

    # Lowercase first, then strip digits/punctuation in a single pass.
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    kept = []
    for token in nltk.word_tokenize(letters_only):
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))

    return ' '.join(kept)

# Run every raw symptom description through preprocess_text
df['clean_text'] = df['text'].map(preprocess_text)

In [None]:
# Preview the first five rows of df (head() defaults to n=5)
df.head()

In [None]:
# Second Kaggle dataset (handle: disease-prediction-based-on-symptoms);
# the result is kept in df_mt further down.
path2 = kagglehub.dataset_download(
    "pasindueranga/disease-prediction-based-on-symptoms"
)
print("Path to dataset files:", path2)


In [None]:
# Load the CSV shipped inside the second dataset's download directory
mt_csv = os.path.join(path2, 'dataset.csv')
df_mt = pd.read_csv(mt_csv)

In [None]:
# Preview the first five rows of df_mt (head() defaults to n=5)
df_mt.head()

In [None]:
# Clean the 'symptoms' column with the same preprocessing as df['text']
# (swap in another source column here if needed)
df_mt['clean_keywords'] = df_mt['symptoms'].map(preprocess_text)

In [None]:
# Preview df_mt again, now including the clean_keywords column
df_mt.head()

In [None]:
# TF-IDF vectorization of the cleaned symptom text
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level uni- to tri-grams; terms must occur in at least 5 documents and
# in at most 75% of them, and the vocabulary is capped at 1000 features.
vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 3),
    max_df=0.75,
    min_df=5,
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    max_features=1000,
)
corpus = df['clean_text'].tolist()
tfIdfMat = vectorizer.fit_transform(corpus)

# Alphabetically ordered vocabulary, printed for inspection
feature_names = list(vectorizer.get_feature_names_out())
feature_names.sort()
print(feature_names)

In [None]:
# Dimensionality reduction of the TF-IDF matrix with PCA
from sklearn.decomposition import PCA

# Targets and the distinct disease categories, straight from df
labels = df['label'].tolist()
category_list = df['label'].unique()

# PCA is configured with n_components=0.95 and fitted on the densified
# TF-IDF matrix of the whole corpus.
# NOTE(review): this fit happens before the train/test split below, so the
# projection also sees the rows that later become the test set -- confirm
# this is acceptable for the reported metrics.
pca = PCA(n_components=0.95)
dense_tfidf = tfIdfMat.toarray()
tfIdfMat_reduced = pca.fit_transform(dense_tfidf)

# NOTE(review): drops the first 35 entries of the sorted feature-name list;
# the reason is not visible in this notebook -- confirm it is still wanted.
del feature_names[:35]

In [None]:
# Stratified train/validation split of the PCA-reduced features
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    tfIdfMat_reduced,
    labels,
    stratify=labels,   # keep class proportions equal in both partitions
    random_state=1,    # fixed seed for a reproducible split
)
print(f'Train_Set_Size:{X_train.shape}')
print(f'Test_Set_Size:{X_test.shape}')

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the disease labels for the transformer classifier
label_encoder = LabelEncoder()
df['label_id'] = label_encoder.fit_transform(df['label'])

# A class's id is its position in label_encoder.classes_, so both lookup
# tables can be derived by enumerating that array.
label2id = {name: idx for idx, name in enumerate(label_encoder.classes_)}
id2label = {idx: name for name, idx in label2id.items()}


In [None]:
from datasets import Dataset

# Build a Hugging Face Dataset from the raw text and the integer label id
# (renamed to 'label' for the downstream sequence-classification model).
dataset = Dataset.from_pandas(df[['text', 'label_id']].rename(columns={'label_id': 'label'}))

# 80/20 train/test split. A fixed seed makes the split reproducible across
# runs, matching random_state=1 used for the scikit-learn split above.
dataset = dataset.train_test_split(test_size=0.2, seed=1)


In [None]:
from transformers import AutoTokenizer

# Tokenizer matching the BioBERT checkpoint that is loaded as the classifier
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")


def tokenize(example):
    """Tokenize the "text" field of a (batched) dataset example.

    Sequences are truncated and padded to a fixed length of 128 tokens.
    Returns whatever the module-level ``tokenizer`` produces for that text.
    """
    return tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )


# batched=True hands tokenize a batch of examples per call
tokenized_dataset = dataset.map(tokenize, batched=True)


In [None]:
from transformers import AutoModelForSequenceClassification

# BioBERT checkpoint with a sequence-classification head sized to the number
# of disease labels; both label maps are passed along to the model.
model = AutoModelForSequenceClassification.from_pretrained(
    "dmis-lab/biobert-base-cased-v1.1",
    label2id=label2id,
    id2label=id2label,
    num_labels=len(label2id),
)

# **Evaluations**

In [None]:
# Spot-check one random row of df (columns and values)
df.sample(n=1)

In [None]:
# Spot-check one random row of df_mt (columns and values)
df_mt.sample(n=1)

In [None]:
# Modules for the KNN baseline and its metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Train a k-nearest-neighbours classifier on the PCA-reduced TF-IDF features.
# k is a tunable hyper-parameter; adjust it to suit the dataset.
k_neighbors = 5
knn_classifier = KNeighborsClassifier(n_neighbors=k_neighbors)
knn_classifier.fit(X_train, y_train)

In [None]:
# Predictions
# Predict a disease label for every row of the held-out split X_test using
# the fitted KNN classifier; the result is compared against y_test below.

predictions = knn_classifier.predict(X_test)

# **Model Evaluation**

In [None]:
# Score the KNN predictions against the held-out labels

accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy:.2f}')

# Per-class precision / recall / F1 summary
report = classification_report(y_test, predictions)
print(report)

# **Confusion Matrix**

In [None]:
#Import Modules
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Fix the class order explicitly and reuse it for both the matrix and the
# axis tick labels. confusion_matrix orders rows/columns by sorted label when
# no order is given, whereas df['label'].unique() returns labels in order of
# first appearance -- using the latter as tick labels attaches the wrong
# disease names to the rows and columns.
class_names = sorted(set(y_test) | set(predictions))

# Generate confusion matrix (rows = actual, columns = predicted)
conf_matrix = confusion_matrix(y_test, predictions, labels=class_names)

# Plotting confusion matrix as a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Greens',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

## **Example #1**

In [None]:
#Import Modules
from sklearn.feature_extraction.text import TfidfVectorizer

# Example Usage: take one symptom description from the second dataset
example_symptom_1 = df_mt['clean_keywords'][16]

# Preprocess the input symptom
preprocessed_symptom = preprocess_text(example_symptom_1)

# Push the single symptom through the *fitted* feature pipeline used for
# training: TF-IDF vectorizer first, then the PCA projection. The classifier
# was trained on PCA-reduced features, so both steps are required.
# (Previously the classifier was run on the whole training matrix and the
# example symptom was never used.)
symptom_tfidf = vectorizer.transform([preprocessed_symptom])
symptom_reduced = pca.transform(symptom_tfidf.toarray())

# Predict the disease for this one sample
predicted_disease = knn_classifier.predict(symptom_reduced)
# NOTE(review): the classifier was trained on df['label'] while the ground
# truth here comes from df_mt['disease']; the two datasets may name diseases
# differently -- confirm before comparing the two values.
actual_disease = df_mt['disease'][16]

# Print the results
print(f'Symptoms: {example_symptom_1}')
print(f'Predicted Disease: {predicted_disease[0]}')
print(f'Actual Disease: {actual_disease}')

## **Example #2**

In [None]:
#Import Modules
from sklearn.feature_extraction.text import TfidfVectorizer

# Example Usage: a free-text symptom typed by hand (no ground-truth label)
example_symptom_1 = "high fever"

# Preprocess the input symptom
preprocessed_symptom = preprocess_text(example_symptom_1)

# Push the single symptom through the *fitted* feature pipeline used for
# training: TF-IDF vectorizer first, then the PCA projection. The classifier
# was trained on PCA-reduced features, so both steps are required.
# (Previously the classifier was run on the whole training matrix and an
# arbitrary row, index 105, was printed instead of a prediction for the
# example symptom.)
symptom_tfidf = vectorizer.transform([preprocessed_symptom])
symptom_reduced = pca.transform(symptom_tfidf.toarray())

# Predict the disease for this one sample
predicted_disease = knn_classifier.predict(symptom_reduced)

# Print the results
print(f'Symptoms: {example_symptom_1}')
print(f'Predicted Disease: {predicted_disease[0]}')