# 🧪 Ayurvedic Formulations from Symptoms

## 💽 Datasets
### Formulation - Indication/Symptoms
### Symptoms - Description
### Class - Name

# The Ayuvedic Formulation and Indication Dataset

In [1]:
import pandas as pd

df1 = pd.read_csv('/kaggle/input/ayurvedic-data/Formulation-Indications.csv')

In [2]:
df1.head()

Unnamed: 0,Name of Medicine,Reference text,Dispensing Pack Size,Main Indications,Dose,Precaution/ Contraindication,Preferred use (OPD/ IPD),Class
0,Abhayarishta,AFI,200 ml,"Arsha, Agnimandya,\nUdararoga, Vibandha",12 - 24 ml,NS,Both,A
1,Amritarishta,AFI,200 ml,"SarvaJvara, Jirna Jvara",12 - 24 ml,NS,Both,A
2,Aragvadharishta,AH,200ml,"Kandu, Tvak Vikara,\nVibandha",12 - 24 ml,NS,Both,A
3,Aravindasava,AFI,200 ml,"Balaroga, Balakshaya,\nAgnimandya, Aruchi",12 - 24 ml,NS,Both,A
4,Arjunarishta/ Parthadyarishta,AFI,200 ml,"Hridroga, Hriddrava, Hrid- daurbalya, Moha,\nM...",12 - 24 ml,NS,Both,A


## Making a list of all formulations

In [3]:
df1['Name of Medicine']

0                       Abhayarishta
1                       Amritarishta
2                    Aragvadharishta
3                       Aravindasava
4      Arjunarishta/ Parthadyarishta
                   ...              
197              Tribhuvankirti Rasa
198              Vatagajankusha Rasa
199             Vatavidhavansan Rasa
200                   Navayasa Lauha
201                 Saptamrita Lauha
Name: Name of Medicine, Length: 202, dtype: object

In [4]:
formulations_lst = list(df1['Name of Medicine'])

## Making a list of Main indications

In [5]:
df1['Main Indications']

0                Arsha, Agnimandya,\nUdararoga, Vibandha
1                                SarvaJvara, Jirna Jvara
2                          Kandu, Tvak Vikara,\nVibandha
3              Balaroga, Balakshaya,\nAgnimandya, Aruchi
4      Hridroga, Hriddrava, Hrid- daurbalya, Moha,\nM...
                             ...                        
197                             Jvara, Pratishyaya, Kasa
198    Vata Roga, Avabahuka, Urustambha, Pakshaghata,...
199                Vatajashula, Sutika Vata, Grahaniroga
200                             Pandu, Kamala,\nHridroga
201                                Timira, Drishtimandya
Name: Main Indications, Length: 202, dtype: object

In [6]:
original_list = list(df1['Main Indications'])
original_list[:5]

['Arsha, Agnimandya,\nUdararoga, Vibandha',
 'SarvaJvara, Jirna Jvara',
 'Kandu, Tvak Vikara,\nVibandha',
 'Balaroga, Balakshaya,\nAgnimandya, Aruchi',
 'Hridroga, Hriddrava, Hrid- daurbalya, Moha,\nMurchha']

In [7]:
processed_list = []

for item in original_list:
    # Remove spaces and newline characters, convert to lowercase
    processed_item = ''.join(item.split()).lower()
    processed_list.append(processed_item)

print(processed_list[:5])

['arsha,agnimandya,udararoga,vibandha', 'sarvajvara,jirnajvara', 'kandu,tvakvikara,vibandha', 'balaroga,balakshaya,agnimandya,aruchi', 'hridroga,hriddrava,hrid-daurbalya,moha,murchha']


## Finding all possible Unique Symptoms(Main Indications)

In [8]:
# List of lists of symptoms
list_of_symptoms = processed_list

# Flatten the list of lists and split the symptoms using commas and spaces
flat_symptoms = [symptom.replace(',', ' ').split() for symptoms in list_of_symptoms for symptom in symptoms.split(',')]

# Get unique symptoms as a list
unique_symptoms = list(set(symptom for sublist in flat_symptoms for symptom in sublist))

# Print the unique symptoms
print(unique_symptoms[:5])


['dushtavrana', 'drishtimandya', 'atisara', 'netravrana', 'aptantrak']


In [9]:
len(unique_symptoms)

300

# Making a Dataset from the Formulations & Main Indication lists

In [10]:
data = {
    "Formulation": formulations_lst,
    "Symptoms": processed_list,
}

# Create a DataFrame
df = pd.DataFrame(data)

In [11]:
df

Unnamed: 0,Formulation,Symptoms
0,Abhayarishta,"arsha,agnimandya,udararoga,vibandha"
1,Amritarishta,"sarvajvara,jirnajvara"
2,Aragvadharishta,"kandu,tvakvikara,vibandha"
3,Aravindasava,"balaroga,balakshaya,agnimandya,aruchi"
4,Arjunarishta/ Parthadyarishta,"hridroga,hriddrava,hrid-daurbalya,moha,murchha"
...,...,...
197,Tribhuvankirti Rasa,"jvara,pratishyaya,kasa"
198,Vatagajankusha Rasa,"vataroga,avabahuka,urustambha,pakshaghata,grid..."
199,Vatavidhavansan Rasa,"vatajashula,sutikavata,grahaniroga"
200,Navayasa Lauha,"pandu,kamala,hridroga"


# The Symptom-Description Dataset

In [12]:
symptoms = pd.read_csv('/kaggle/input/ayurvedic-data/ayurvedic_symptoms_desc.csv')

In [13]:
symptoms.head()

Unnamed: 0,Symptom,Description
0,Vatavikara,Disorders related to the Vata dosha.
1,Netraroga,Eye disorders.
2,Malavarodha,Constipation.
3,Sutikadosha,Postpartum disorders.
4,Vrana,Wounds or injuries.


In [14]:
symptoms['Symptom'] = symptoms['Symptom'].str.lower()

In [15]:
symptoms.head()

Unnamed: 0,Symptom,Description
0,vatavikara,Disorders related to the Vata dosha.
1,netraroga,Eye disorders.
2,malavarodha,Constipation.
3,sutikadosha,Postpartum disorders.
4,vrana,Wounds or injuries.


In [16]:
def symptoms_desc(symptom_name):
    row = symptoms[symptoms['Symptom'] == symptom_name.lower()]
#     print(row)
    if not row.empty:
        description = row.iloc[0]['Description']
        print(f'Description of "{symptom_name}": {description}')
    else:
        print(f'Symptom "{symptom_name}" not found in the DataFrame.')

In [17]:
def symptoms_lst_desc(user_symptoms):
    for item in user_symptoms:
#         print(item)
        symptoms_desc(item)

In [18]:
symptoms

Unnamed: 0,Symptom,Description
0,vatavikara,Disorders related to the Vata dosha.
1,netraroga,Eye disorders.
2,malavarodha,Constipation.
3,sutikadosha,Postpartum disorders.
4,vrana,Wounds or injuries.
...,...,...
295,galaganda,Goiter.
296,asrigdara,Menorrhagia.
297,pittajanetraroga,Urinary disorders due to Pitta dosha.
298,mutraroga,Urinary disorders.


In [19]:
unique_symptoms[:10]

['dushtavrana',
 'drishtimandya',
 'atisara',
 'netravrana',
 'aptantrak',
 'pramehapidika',
 'vishamajvara',
 'sthaulya',
 'hridya',
 'adhamana']

# 🔡 Spelling Correction

In [20]:
import difflib

# Your list of correct words
correct_words = unique_symptoms

def correct_spelling(input_word):
    # Find the closest match using difflib
    closest_match = difflib.get_close_matches(input_word, correct_words, n=1, cutoff=0.6)
    
    if closest_match:
        return closest_match[0]
    else:
        return input_word

user_input = input("Enter a word: ")
corrected_word = correct_spelling(user_input)

print(f"Did you mean: {corrected_word}")

Enter a word:  jvar


Did you mean: jvara


In [21]:
import difflib

# Your list of correct words (assuming you have a list called unique_symptoms)
correct_words = unique_symptoms

def correct_symptoms(symptoms):
    corrected_symptoms = []
    for symptom in symptoms:
        corrected_symptom = difflib.get_close_matches(symptom, correct_words, n=1, cutoff=0.6)
        if corrected_symptom:
            corrected_symptoms.append(corrected_symptom[0])
        else:
            corrected_symptoms.append(symptom)
    return corrected_symptoms


# Multinomial Naive Bayes Code

### There are unique labels. A given formulation has a given set of symptoms. If we split the data then we won't be able to predict for a given formulation. That's why it gives 0 accuracy.

### We can modify the data to Symptoms[binary] Formulation

In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Create a DataFrame
data = {
    "Formulation": formulations_lst,
    "Symptoms": processed_list,
}

df = pd.DataFrame(data)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['Symptoms'], df['Formulation'], test_size=0.2, random_state=42)

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Transform the symptom text data into numerical features
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Create and train a classifier (e.g., Naive Bayes)
clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)

# Make predictions
y_pred = clf.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)

# Generate a classification report using labels found in y_test
# report = classification_report(y_test, y_pred)
report = classification_report(y_test, y_pred, zero_division=1)

print(f"Accuracy: {accuracy}")
# print(report)

# new_symptoms = ["jvara","kasa"]

# Spelling Correction
user_input = input("Enter a list of symptoms separated by spaces: ")
input_symptoms = user_input.split()
new_symptoms = correct_symptoms(input_symptoms)
print(f"Did you mean: {', '.join(new_symptoms)}")

# Find Symptom Description
symptoms_lst_desc(new_symptoms)

# Predict Formulation 
new_symptoms_tfidf = tfidf_vectorizer.transform(new_symptoms)
predicted_label = clf.predict(new_symptoms_tfidf)
print(f"Predicted Formulation: {predicted_label[0]}")


Accuracy: 0.0


Enter a list of symptoms separated by spaces:  jvara kas


Did you mean: jvara, kasa
Description of "jvara": Fever.
Description of "kasa": Cough.
Predicted Formulation: Vyaghryadi Kashayam


### Using Machine Learning without Data Splitting

In [23]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

data = {
    "Formulation": formulations_lst,
    "Symptoms": processed_list,
}

df = pd.DataFrame(data)

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Transform the symptom text data into numerical features
X_tfidf = tfidf_vectorizer.fit_transform(df['Symptoms'])

# Create and train a classifier (e.g., Naive Bayes)
clf = MultinomialNB()
clf.fit(X_tfidf, df['Formulation'])

# Spelling Correction
user_input = input("Enter a list of symptoms separated by spaces: ")
input_symptoms = user_input.split()
new_symptoms = correct_symptoms(input_symptoms)
print(f"Did you mean: {', '.join(new_symptoms)}")

# Find Symptom Description
symptoms_lst_desc(new_symptoms)

# Predict Formulation 
new_symptoms_tfidf = tfidf_vectorizer.transform(new_symptoms)
predicted_label = clf.predict(new_symptoms_tfidf)
print(f"Predicted Formulation: {predicted_label[0]}")


Enter a list of symptoms separated by spaces:  jvara arsh


Did you mean: jvara, arsha
Description of "jvara": Fever.
Description of "arsha": Hemorrhoids.
Predicted Formulation: Punarnavadi Kashayam


## Cosine Similarity

Cosine similarity is used as a metric to determine the similarity between a user's symptoms and a set of formulations. Here's an explanation of how cosine similarity is applied:

### Calculation

1. The user's symptoms and the symptoms associated with each formulation are transformed into TF-IDF (Term Frequency-Inverse Document Frequency) vectors.

2. Cosine similarity is calculated between the user's TF-IDF vector and each formulation's TF-IDF vector.

3. Cosine similarity ranges from -1 (dissimilar) to 1 (similar), with 0 indicating no similarity.

### Finding the Closest Formulation

4. The formulation(s) with the highest cosine similarity score(s) are considered the closest match(es) to the user's symptoms.

5. If multiple formulations have the same highest similarity score, all of them are included in the result.

### Interpretation

Cosine similarity measures the cosine of the angle between two vectors, where a smaller angle represents higher similarity. In this context, it helps identify which formulation(s) most closely match the symptoms provided by the user.

### Applications

Cosine similarity is commonly used in information retrieval, recommendation systems, and text analysis, making it a useful tool for identifying relevant content or matches based on vectorized representations of data.


### Case 1: Finding a similar Formulation 
### Drawback: There is no counter case if an invalid input is provided

In [24]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

data = {
    "Formulation": formulations_lst,
    "Symptoms": processed_list,
}

# User symptoms
user_input = input("Enter a list of symptoms separated by spaces: ")
input_symptoms = user_input.split()
user_symptoms = correct_symptoms(input_symptoms)
print(f"Did you mean: {', '.join(user_symptoms)}")

symptoms_lst_desc(user_symptoms)
user_symptoms_str = " ".join(user_symptoms)  # Convert user symptoms to a single string

# Create a DataFrame
df = pd.DataFrame(data)

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the symptom text data into numerical features
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Symptoms'])

# Transform user symptoms into TF-IDF format
user_symptoms_tfidf = tfidf_vectorizer.transform([user_symptoms_str])

# Calculate cosine similarity between user's symptoms and all formulations
similarities = cosine_similarity(user_symptoms_tfidf, tfidf_matrix)

# Find all formulations with the highest similarity
closest_formulation_indices = [i for i, sim in enumerate(similarities[0]) if sim == max(similarities[0])]
closest_formulations = df.iloc[closest_formulation_indices]["Formulation"]

print("Closest Formulations:")
print(closest_formulations.tolist())


Enter a list of symptoms separated by spaces:  kas arsh


Did you mean: kasa, arsha
Description of "kasa": Cough.
Description of "arsha": Hemorrhoids.
Closest Formulations:
['Kasisadi Taila', 'Arsho Kuthara Rasa']


### Case 2: Finding the maximum similarity scores (inc. equal scores)
### Drawback: There can be very low similarity scores and the Formulation might not be usable in that case.

In [25]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

data = {
    "Formulation": formulations_lst,
    "Symptoms": processed_list,
}

# User symptoms
user_input = input("Enter a list of symptoms separated by spaces: ")
input_symptoms = user_input.split()
user_symptoms = correct_symptoms(input_symptoms)
print(f"Did you mean: {', '.join(user_symptoms)}")

symptoms_lst_desc(user_symptoms)
user_symptoms_str = " ".join(user_symptoms)  # Convert user symptoms to a single string

# Create a DataFrame
df = pd.DataFrame(data)

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the symptom text data into numerical features
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Symptoms'])

# Transform user symptoms into TF-IDF format
user_symptoms_tfidf = tfidf_vectorizer.transform([user_symptoms_str])

# Calculate cosine similarity between user's symptoms and all formulations
similarities = cosine_similarity(user_symptoms_tfidf, tfidf_matrix)

similarity_threshold = 0.5

# Find all formulations with similarity scores above the threshold
matching_indices = [i for i, sim in enumerate(similarities[0]) if sim > similarity_threshold]

if not matching_indices:
    print("No matching formulations found for the provided symptoms.")
else:
    # Sort formulations by similarity score in descending order
    sorted_indices = sorted(matching_indices, key=lambda i: similarities[0][i], reverse=True)
    
    # Retrieve all formulations with the top similarity scores
    top_similarity = similarities[0][sorted_indices[0]]
    top_similarity_indices = [i for i in sorted_indices if similarities[0][i] == top_similarity]
    closest_formulations = df.iloc[top_similarity_indices]["Formulation"]
    
    print("Closest Formulations with the Highest Similarity Scores:")
    print(closest_formulations.tolist())

Enter a list of symptoms separated by spaces:  jvara kasa shotha


Did you mean: jvara, kasa, shotha
Description of "jvara": Fever.
Description of "kasa": Cough.
Description of "shotha": Edema.
Closest Formulations with the Highest Similarity Scores:
['Punarnavadi Kashayam']


### Case 3: Using a threshold of similarity
### Drawback: There can be formulations which have lower similarity scores but still be used for the user symptoms

In [26]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

data = {
    "Formulation": formulations_lst,
    "Symptoms": processed_list,
}

# User symptoms
# user_symptoms = ["jvara","kasa", "shotha"]

user_input = input("Enter a list of symptoms separated by spaces: ")
input_symptoms = user_input.split()
user_symptoms = correct_symptoms(input_symptoms)
print(f"Did you mean: {', '.join(user_symptoms)}")

symptoms_lst_desc(user_symptoms)
user_symptoms_str = " ".join(user_symptoms)  # Convert user symptoms to a single string

# Create a DataFrame
df = pd.DataFrame(data)

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the symptom text data into numerical features
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Symptoms'])

# Transform user symptoms into TF-IDF format
user_symptoms_tfidf = tfidf_vectorizer.transform([user_symptoms_str])

# Calculate cosine similarity between user's symptoms and all formulations
similarities = cosine_similarity(user_symptoms_tfidf, tfidf_matrix)

# Set a threshold for similarity score (adjust as needed)
similarity_threshold = 0.5  # You can adjust this value

# Find all formulations with similarity scores above the threshold
matching_indices = [i for i, sim in enumerate(similarities[0]) if sim > similarity_threshold]

if not matching_indices:
    print("No matching formulations found for the provided symptoms.")
else:
    closest_formulations = df.iloc[matching_indices]["Formulation"]
    print("Closest Formulations:")
    print(closest_formulations.tolist())


Enter a list of symptoms separated by spaces:  jvara kasa pinasa


Did you mean: jvara, kasa, pinasa
Description of "jvara": Fever.
Description of "kasa": Cough.
Description of "pinasa": Sinusitis.
Closest Formulations:
['Chitraka Haritaki', 'Punarnavadi Kashayam', 'Vyaghryadi Kashayam']


# Printing details of the formulation (Suitable/Unsuitable)

In [27]:
df1.head()

Unnamed: 0,Name of Medicine,Reference text,Dispensing Pack Size,Main Indications,Dose,Precaution/ Contraindication,Preferred use (OPD/ IPD),Class
0,Abhayarishta,AFI,200 ml,"Arsha, Agnimandya,\nUdararoga, Vibandha",12 - 24 ml,NS,Both,A
1,Amritarishta,AFI,200 ml,"SarvaJvara, Jirna Jvara",12 - 24 ml,NS,Both,A
2,Aragvadharishta,AH,200ml,"Kandu, Tvak Vikara,\nVibandha",12 - 24 ml,NS,Both,A
3,Aravindasava,AFI,200 ml,"Balaroga, Balakshaya,\nAgnimandya, Aruchi",12 - 24 ml,NS,Both,A
4,Arjunarishta/ Parthadyarishta,AFI,200 ml,"Hridroga, Hriddrava, Hrid- daurbalya, Moha,\nM...",12 - 24 ml,NS,Both,A


In [28]:
r = df1.shape[0]

In [29]:
closest_formulations.tolist()

['Chitraka Haritaki', 'Punarnavadi Kashayam', 'Vyaghryadi Kashayam']

In [30]:
### Create a boolean mask to filter rows where the second column matches any element in closest_formulations
mask = df1.iloc[:, 0].isin(closest_formulations)

# Use the mask to select the rows that match the condition
filtered_df = df1[mask]

# Iterate through the filtered DataFrame and print each row separately
for index, row in filtered_df.iterrows():
    print(row)

Name of Medicine                                        Chitraka Haritaki
Reference text                                                        AFI
Dispensing Pack Size                                              100\ngm
Main Indications                                    Pinasa, Kasa, Shvasa,
Dose                                                           10 - 50 gm
Precaution/ Contraindication    Pregnancy, Pitta Prakriti, Paittika\nRoga
Preferred use (OPD/ IPD)                                             Both
Class                                                                   C
Name: 32, dtype: object
Name of Medicine                Punarnavadi Kashayam
Reference text                                   AFI
Dispensing Pack Size                         100\ngm
Main Indications                 Jvara, Kasa, Shotha
Dose                                         6-12 gm
Precaution/ Contraindication                      NS
Preferred use (OPD/ IPD)                        Both
Class        

In [31]:
for i in range(0,r):
    for ele in closest_formulations.tolist():
        if str(df1.iloc[i][0]) == ele:
            print(df1.iloc[i])

Name of Medicine                                        Chitraka Haritaki
Reference text                                                        AFI
Dispensing Pack Size                                              100\ngm
Main Indications                                    Pinasa, Kasa, Shvasa,
Dose                                                           10 - 50 gm
Precaution/ Contraindication    Pregnancy, Pitta Prakriti, Paittika\nRoga
Preferred use (OPD/ IPD)                                             Both
Class                                                                   C
Name: 32, dtype: object
Name of Medicine                Punarnavadi Kashayam
Reference text                                   AFI
Dispensing Pack Size                         100\ngm
Main Indications                 Jvara, Kasa, Shotha
Dose                                         6-12 gm
Precaution/ Contraindication                      NS
Preferred use (OPD/ IPD)                        Both
Class        

## Future
### Remove the unsuitable 