Step 1: Load the Dataset

In [None]:
from google.colab import files

# Bind the result instead of leaving `files.upload()` as the cell's last
# expression: the bare call echoes the returned {filename: bytes} dict, which
# prints the contents of kaggle.json (the API key) into the saved output.
uploaded = files.upload()
print("Uploaded files:", sorted(uploaded))


Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"sanjanagowda76gh","key":"<REDACTED>"}'}

In [None]:
import os
import shutil
from pathlib import Path

# Place the uploaded Kaggle API token in ~/.kaggle/kaggle.json.
kaggle_dir = Path.home() / ".kaggle"
kaggle_dir.mkdir(parents=True, exist_ok=True)

token_src = Path("kaggle.json")
token_dst = kaggle_dir / "kaggle.json"

# Move only when the uploaded file is still in the working directory, so that
# re-running this cell after the token was installed is a no-op, not an error.
if token_src.exists():
    shutil.move(str(token_src), str(token_dst))

if token_dst.exists():
    os.chmod(token_dst, 0o600)  # Secure permissions: owner read/write only


In [None]:
!pip install kaggle
!kaggle datasets download -d raghavdecoded/ayurvedic-formulations-and-their-indications --unzip


Dataset URL: https://www.kaggle.com/datasets/raghavdecoded/ayurvedic-formulations-and-their-indications
License(s): apache-2.0
Downloading ayurvedic-formulations-and-their-indications.zip to /content
  0% 0.00/11.1k [00:00<?, ?B/s]
100% 11.1k/11.1k [00:00<00:00, 17.2MB/s]


In [None]:
import os

# Show the entries of the working directory to confirm the download.
print("Files in current directory:", os.listdir(path="."))


Files in current directory: ['.config', 'ayurvedic_symptoms_desc.csv', 'Formulation-Indications.csv', 'FormulationClass.csv', 'sample_data']


In [None]:
import os

# Print every entry found in the current working directory
print("Files in current directory:", os.listdir(os.curdir))


Files in current directory: ['.config', 'ayurvedic_symptoms_desc.csv', 'Formulation-Indications.csv', 'FormulationClass.csv', 'sample_data']


In [None]:
import pandas as pd

# Path of the symptom-description CSV; it must match the downloaded file name
dataset_path = "ayurvedic_symptoms_desc.csv"
df = pd.read_csv(filepath_or_buffer=dataset_path)

# Preview the first five rows
print(df.head(n=5))


       Symptom                           Description
0   Vatavikara  Disorders related to the Vata dosha.
1    Netraroga                        Eye disorders.
2  Malavarodha                         Constipation.
3  Sutikadosha                 Postpartum disorders.
4        Vrana                   Wounds or injuries.


Step 2: Data Cleaning & Preprocessing:
1️⃣ Convert text to lowercase (for consistency).
2️⃣ Remove special characters & numbers (to keep only meaningful words).
3️⃣ Remove extra spaces (to clean up the text).
4️⃣ Tokenize the text (split it into individual words).
5️⃣ Remove stopwords (to remove common words like "the", "is", etc.).

In [None]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Fetch the NLTK tokenizer ('punkt') and stop-word ('stopwords') data packages
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Read the symptom descriptions again (the filename must match the CSV on disk)
dataset_path = "ayurvedic_symptoms_desc.csv"
df = pd.read_csv(filepath_or_buffer=dataset_path)

# Preview the first five rows
print(df.head(n=5))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


       Symptom                           Description
0   Vatavikara  Disorders related to the Vata dosha.
1    Netraroga                        Eye disorders.
2  Malavarodha                         Constipation.
3  Sutikadosha                 Postpartum disorders.
4        Vrana                   Wounds or injuries.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def clean_text(text):
    """Normalize free text to lowercase letters separated by single spaces.

    Args:
        text: Value to clean. Non-string values (e.g. None, or the float NaN
            that pandas uses for empty CSV cells) are treated as empty text
            instead of raising AttributeError on ``.lower()``.

    Returns:
        str: Lowercased text in which every run of characters other than
        ``a-z`` is collapsed into one space, without leading/trailing spaces.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()  # Convert to lowercase
    # Replace (rather than delete) everything that is not a letter, so words
    # joined only by punctuation stay separate: "wound/injury" -> "wound injury".
    # The same substitution also collapses runs of whitespace.
    text = re.sub(r'[^a-z]+', ' ', text)
    return text.strip()


In [None]:
# Derive a cleaned copy of each raw text column
cleaned_columns = {
    'Symptom': 'Cleaned_Symptom',
    'Description': 'Cleaned_Description',
}
for raw_name, cleaned_name in cleaned_columns.items():
    df[cleaned_name] = df[raw_name].apply(clean_text)

# Preview the cleaned columns
print(df[list(cleaned_columns.values())].head())


  Cleaned_Symptom                  Cleaned_Description
0      vatavikara  disorders related to the vata dosha
1       netraroga                        eye disorders
2     malavarodha                         constipation
3     sutikadosha                 postpartum disorders
4           vrana                   wounds or injuries


In [None]:
import nltk

# Make sure the tokenizer ('punkt') and stop-word list ('stopwords') are present
for package in ('punkt', 'stopwords'):
    nltk.download(package)

print("NLTK resources downloaded successfully!")


NLTK resources downloaded successfully!


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
import nltk

# Reinstall NLTK and download necessary resources
!pip install --upgrade nltk  # Upgrade NLTK in case of version issues
nltk.download('punkt')        # Tokenizer model
nltk.download('stopwords')    # Stopwords
nltk.download('wordnet')      # WordNet for lemmatization (optional)




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
!pip install spacy  # Install SpaCy
import spacy

# Load English NLP model
nlp = spacy.load("en_core_web_sm")

# Tokenizer built on the loaded SpaCy pipeline
def tokenize_spacy(text):
    """Split `text` into SpaCy tokens and return the non-stop-word token texts."""
    kept_tokens = []
    for token in nlp(text):
        if token.is_stop:
            continue  # Drop stopwords
        kept_tokens.append(token.text)
    return kept_tokens

# Tokenize both cleaned text columns
token_columns = {
    'Cleaned_Symptom': 'Tokenized_Symptom',
    'Cleaned_Description': 'Tokenized_Description',
}
for cleaned_name, tokenized_name in token_columns.items():
    df[tokenized_name] = df[cleaned_name].apply(tokenize_spacy)

# Preview the token lists
print(df[list(token_columns.values())].head())


  Tokenized_Symptom              Tokenized_Description
0      [vatavikara]  [disorders, related, vata, dosha]
1       [netraroga]                   [eye, disorders]
2     [malavarodha]                     [constipation]
3     [sutikadosha]            [postpartum, disorders]
4           [vrana]                 [wounds, injuries]


Named Entity Recognition (NER)

In [None]:
import spacy

# Load the English NLP model with pre-trained NER
nlp = spacy.load("en_core_web_sm")

# Function to extract named entities
def extract_entities(text):
    """Return the named entities the SpaCy pipeline finds in `text`.

    Args:
        text: A string, or a list/tuple of tokens that is joined with single
            spaces first. Any other value (e.g. NaN from an empty CSV cell)
            produces an empty result instead of raising.

    Returns:
        list[tuple[str, str]]: (entity text, entity label) pairs.
    """
    if isinstance(text, str):
        joined = text
    elif isinstance(text, (list, tuple)):
        joined = " ".join(text)  # Convert list to string before processing
    else:
        return []
    doc = nlp(joined)
    return [(ent.text, ent.label_) for ent in doc.ents]

# Apply NER to the original descriptions rather than the tokenized column.
# The tokenized text has been lowercased and stripped of punctuation and stop
# words, which removes the casing and sentence context a pretrained NER
# component works from; on that input every previewed row came back empty.
df['Named_Entities'] = df['Description'].apply(extract_entities)

# Display results
print(df[['Description', 'Named_Entities']].head())


               Tokenized_Description Named_Entities
0  [disorders, related, vata, dosha]             []
1                   [eye, disorders]             []
2                     [constipation]             []
3            [postpartum, disorders]             []
4                 [wounds, injuries]             []


Text Classification (Categorizing Symptoms)


In [None]:
# Define symptom categories based on Ayurveda principles
def categorize_symptom(symptom):
    """Assign a dosha category to a symptom description by keyword lookup.

    Args:
        symptom: Either a list of word tokens or a plain string.

    Returns:
        str: 'Vata', 'Pitta' or 'Kapha' for the first category (checked in
        that order) owning a keyword that begins one of the words, otherwise
        'Unknown'.
    """
    vata_keywords = ('vat', 'pain', 'dry', 'spasm', 'nerve')
    pitta_keywords = ('inflammation', 'heat', 'burn', 'anger', 'ulcer')
    kapha_keywords = ('cold', 'mucus', 'swelling', 'obesity', 'congestion')

    # A plain string must not go through " ".join(), which would insert a space
    # between every character and defeat all keyword matching.
    symptom_str = symptom if isinstance(symptom, str) else " ".join(symptom)
    words = symptom_str.lower().split()

    def has_keyword(keywords):
        # Keywords act as stems, so match them at the start of a word
        # ("vat" -> "vata", "burn" -> "burning"). Plain substring search also
        # fired inside unrelated words: "vat" in "elevated", "anger" in "danger".
        return any(word.startswith(keywords) for word in words)

    if has_keyword(vata_keywords):
        return 'Vata'
    if has_keyword(pitta_keywords):
        return 'Pitta'
    if has_keyword(kapha_keywords):
        return 'Kapha'
    return 'Unknown'  # If no match, label as Unknown

# Label every row by running the keyword categorizer on its description tokens
description_tokens = df['Tokenized_Description']
df['Category'] = description_tokens.apply(categorize_symptom)

# Preview the tokens next to their assigned category
preview_columns = ['Tokenized_Description', 'Category']
print(df[preview_columns].head())


               Tokenized_Description Category
0  [disorders, related, vata, dosha]     Vata
1                   [eye, disorders]  Unknown
2                     [constipation]  Unknown
3            [postpartum, disorders]  Unknown
4                 [wounds, injuries]  Unknown


In [None]:
# Create an ontology dictionary for mapping
ontology_dict = {
    "vatavikara": "Vata Disorder",
    "netraroga": "Eye Disorder",
    "malavarodha": "Constipation",
    "sutikadosha": "Postpartum Disorder",
    "vrana": "Wound/Injury"
}


def standardize_symptom(tokens):
    """Map a tokenized symptom to its standardized name.

    Only the first token is looked up in `ontology_dict`; a token without an
    entry is returned unchanged. An empty token list (the tokenizer drops stop
    words, so a symptom can lose all of its tokens) yields '' instead of
    raising IndexError.
    """
    if not tokens:
        return ''
    first_token = tokens[0]
    return ontology_dict.get(first_token, first_token)


# Apply mapping to standardize symptom names
df['Standardized_Symptom'] = df['Tokenized_Symptom'].apply(standardize_symptom)

print(df[['Tokenized_Symptom', 'Standardized_Symptom']].head())


  Tokenized_Symptom Standardized_Symptom
0      [vatavikara]        Vata Disorder
1       [netraroga]         Eye Disorder
2     [malavarodha]         Constipation
3     [sutikadosha]  Postpartum Disorder
4           [vrana]         Wound/Injury


Convert Text to Numerical Features Using TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Make sure every standardized symptom is a string before vectorizing
df['Standardized_Symptom'] = df['Standardized_Symptom'].astype(str)

# Fit the TF-IDF vocabulary on the standardized symptoms and transform them
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['Standardized_Symptom'])

# Dense DataFrame view with one column per vocabulary term
tfidf_df = pd.DataFrame(
    data=X.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

# Preview the TF-IDF matrix
print(tfidf_df.head())


   abhighatajavedanaandvatavikara  abhishyanda  adhamana  agnidagdha  \
0                             0.0          0.0       0.0         0.0   
1                             0.0          0.0       0.0         0.0   
2                             0.0          0.0       0.0         0.0   
3                             0.0          0.0       0.0         0.0   
4                             0.0          0.0       0.0         0.0   

   agnimandhya  agnimandya  agnimandyaudararoga  ajirna  aksepa  ama  ...  \
0          0.0         0.0                  0.0     0.0     0.0  0.0  ...   
1          0.0         0.0                  0.0     0.0     0.0  0.0  ...   
2          0.0         0.0                  0.0     0.0     0.0  0.0  ...   
3          0.0         0.0                  0.0     0.0     0.0  0.0  ...   
4          0.0         0.0                  0.0     0.0     0.0  0.0  ...   

   vranashotha  vriddhiroga  vrishya     wound  yakshma  yonibhransha  \
0          0.0          0.0    

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Example labels (modify based on your dataset)
# NOTE: these placeholders are assigned purely by row position and overwrite
# any 'Category' values computed earlier, so the accuracy printed below says
# nothing about real symptom categories until true labels are supplied.
example_labels = ['Vata', 'Pitta', 'Kapha', 'Vata', 'Kapha']
# Cycle the pattern out to exactly len(df) rows. Multiplying the list by
# len(df) // 5 only gave the right length when the row count was a multiple of
# 5; any other size made the column assignment raise ValueError.
df['Category'] = [example_labels[i % len(example_labels)] for i in range(len(df))]

# Encode categories into numerical labels
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Category'])

# Train-test split (80-20); fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")


Model Accuracy: 40.00%


Ontology Mapping (Standardizing Ayurvedic Terms)