In [2]:
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer
import warnings
warnings.filterwarnings('ignore')


In [3]:
print("🌿 AyurJanani Ayurvedic ML Models Training")
print("=" * 50)


🌿 AyurJanani Ayurvedic ML Models Training


In [4]:
symptom_training_data = [
    # Digestive issues
    ("I feel bloated and have stomach pain", ["digestive", "abdominal_pain"]),
    ("Nausea and vomiting in the morning", ["digestive", "morning_sickness"]),
    ("I have heartburn and acid reflux", ["digestive", "heartburn"]),
    ("Constipation and difficulty passing stool", ["digestive", "constipation"]),
    ("I feel very hungry all the time", ["digestive", "increased_appetite"]),
    ("Loss of appetite and food aversion", ["digestive", "loss_of_appetite"]),
    
    # Musculoskeletal
    ("Lower back pain and stiffness", ["musculoskeletal", "back_pain"]),
    ("Joint pain in hands and feet", ["musculoskeletal", "joint_pain"]),
    ("Leg cramps during night", ["musculoskeletal", "leg_cramps"]),
    ("Hip pain when walking", ["musculoskeletal", "hip_pain"]),
    ("Neck and shoulder tension", ["musculoskeletal", "neck_pain"]),
    
    # Fatigue and energy
    ("I feel extremely tired all day", ["fatigue", "exhaustion"]),
    ("No energy to do daily activities", ["fatigue", "low_energy"]),
    ("Feeling weak and dizzy", ["fatigue", "weakness", "dizziness"]),
    ("Need to rest frequently", ["fatigue", "need_rest"]),
    
    # Sleep issues
    ("Cannot fall asleep at night", ["sleep", "insomnia"]),
    ("Waking up multiple times", ["sleep", "interrupted_sleep"]),
    ("Very restless sleep", ["sleep", "restless_sleep"]),
    ("Feeling tired after sleep", ["sleep", "unrefreshing_sleep"]),
    
    # Emotional/Mental
    ("Feeling anxious and worried", ["emotional", "anxiety"]),
    ("Mood swings and irritability", ["emotional", "mood_swings"]),
    ("Feeling sad and emotional", ["emotional", "depression"]),
    ("Stress and tension", ["emotional", "stress"]),
    
    # Circulatory
    ("Swelling in hands and feet", ["circulatory", "edema"]),
    ("Feeling dizzy when standing", ["circulatory", "dizziness"]),
    ("Cold hands and feet", ["circulatory", "cold_extremities"]),
    ("Varicose veins in legs", ["circulatory", "varicose_veins"]),
    
    # Respiratory
    ("Shortness of breath", ["respiratory", "breathlessness"]),
    ("Cough and throat irritation", ["respiratory", "cough"]),
    ("Chest tightness", ["respiratory", "chest_tightness"]),
    
    # Urinary
    ("Frequent urination", ["urinary", "frequent_urination"]),
    ("Burning sensation while urinating", ["urinary", "urinary_burning"]),
    ("Urgency to urinate", ["urinary", "urinary_urgency"]),
    
    # Skin issues
    ("Dry and itchy skin", ["skin", "dry_skin"]),
    ("Dark patches on skin", ["skin", "pigmentation"]),
    ("Stretch marks appearing", ["skin", "stretch_marks"]),
    
    # Vata imbalance symptoms
    ("Dry skin and constipation", ["vata_imbalance", "digestive", "skin"]),
    ("Anxiety and irregular sleep", ["vata_imbalance", "emotional", "sleep"]),
    ("Joint pain and gas", ["vata_imbalance", "musculoskeletal", "digestive"]),
    
    # Pitta imbalance symptoms
    ("Heartburn and irritability", ["pitta_imbalance", "digestive", "emotional"]),
    ("Excessive heat and sweating", ["pitta_imbalance", "heat_symptoms"]),
    ("Skin rashes and acidity", ["pitta_imbalance", "skin", "digestive"]),
    
    # Kapha imbalance symptoms
    ("Swelling and weight gain", ["kapha_imbalance", "circulatory", "weight_gain"]),
    ("Sluggishness and mucus", ["kapha_imbalance", "fatigue", "respiratory"]),
    ("Heavy feeling and congestion", ["kapha_imbalance", "respiratory", "fatigue"])
]


In [5]:
texts = [item[0] for item in symptom_training_data]
labels = [item[1] for item in symptom_training_data]


In [6]:
mlb = MultiLabelBinarizer()
y_labels = mlb.fit_transform(labels)


In [7]:
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english', ngram_range=(1, 2))
X_tfidf = tfidf_vectorizer.fit_transform(texts)


In [14]:
symptom_classifier = OneVsRestClassifier(LogisticRegression(random_state=42, max_iter=1000))
symptom_classifier.fit(X_tfidf, y_labels)


In [17]:
import random
import re
from collections import Counter
class SymptomClassifier:
    def __init__(self, classifier, vectorizer, label_binarizer, training_data=None):
        self.classifier = classifier
        self.vectorizer = vectorizer
        self.label_binarizer = label_binarizer
        self.training_data = training_data  # List of (text, labels)

    def _tokenize(self, text):
        # Simple word tokenizer, lowercase
        return set(re.findall(r'\w+', text.lower()))

    def predict(self, texts):
        if isinstance(texts, str):
            texts = [texts]
        X = self.vectorizer.transform(texts)
        predictions = self.classifier.predict(X)
        results = []
        for i, pred in enumerate(predictions):
            labels = self.label_binarizer.inverse_transform(pred.reshape(1, -1))[0]
            if not labels:
                # Fallback: match to training data by token overlap
                input_text = texts[i].lower()
                input_tokens = self._tokenize(input_text)
                best_matches = []
                max_overlap = 0
                for train_text, train_labels in (self.training_data or []):
                    train_tokens = self._tokenize(train_text)
                    overlap = len(input_tokens & train_tokens)
                    if overlap > max_overlap:
                        best_matches = [train_labels]
                        max_overlap = overlap
                    elif overlap == max_overlap and overlap > 0:
                        best_matches.append(train_labels)
                # Flatten and deduplicate
                matched_labels = set()
                for label_list in best_matches:
                    matched_labels.update(label_list)
                # If no match, fallback to substring/phrase matching as before
                if not matched_labels:
                    all_labels = set(self.label_binarizer.classes_)
                    matched = []
                    for label in all_labels:
                        label_clean = label.lower().replace('_', ' ')
                        if label_clean in input_text or label.lower() in input_text:
                            matched.append(label)
                    imbalance_labels = [l for l in matched if 'imbalance' in l]
                    if imbalance_labels:
                        labels = tuple(sorted(set(imbalance_labels + matched)))
                    elif matched:
                        n = min(len(matched), 2)
                        labels = tuple(sorted(random.sample(matched, n)))
                    else:
                        labels = ()
                else:
                    # Always include imbalance categories if present
                    imbalance_labels = [l for l in matched_labels if 'imbalance' in l]
                    if imbalance_labels:
                        labels = tuple(sorted(set(imbalance_labels + matched_labels)))
                    else:
                        labels = tuple(sorted(matched_labels))
            results.append(labels)
        return results

    def predict_proba(self, texts):
        if isinstance(texts, str):
            texts = [texts]
        X = self.vectorizer.transform(texts)
        return self.classifier.predict_proba(X)

# Create and save symptom classifier
symptom_classifier_model = SymptomClassifier(symptom_classifier, tfidf_vectorizer, mlb, training_data=symptom_training_data)
joblib.dump(symptom_classifier_model, 'symptom_classifier_model.pkl')
print("✅ Symptom Classifier Model saved (improved fallback)")


✅ Symptom Classifier Model saved (improved fallback)


In [10]:
print("\n🚨 Training Symptom Risk Model...")

# Sample training data for risk prediction
risk_training_data = [
    # Anemia risk patterns
    ({"symptoms": ["fatigue", "weakness", "dizziness"], "systolic_bp": 110, "diastolic_bp": 70, "heart_rate": 95}, ["anemia"]),
    ({"symptoms": ["fatigue", "cold_extremities", "weakness"], "systolic_bp": 105, "diastolic_bp": 65, "heart_rate": 100}, ["anemia"]),
    ({"symptoms": ["exhaustion", "dizziness", "weakness"], "systolic_bp": 115, "diastolic_bp": 75, "heart_rate": 90}, ["anemia"]),
    
    # Gestational Diabetes risk
    ({"symptoms": ["increased_appetite", "frequent_urination", "fatigue"], "systolic_bp": 130, "diastolic_bp": 85, "blood_glucose": 140}, ["gestational_diabetes"]),
    ({"symptoms": ["excessive_thirst", "frequent_urination"], "systolic_bp": 125, "diastolic_bp": 80, "blood_glucose": 150}, ["gestational_diabetes"]),
    
    # Preeclampsia risk
    ({"symptoms": ["edema", "dizziness", "headache"], "systolic_bp": 145, "diastolic_bp": 95, "heart_rate": 85}, ["preeclampsia"]),
    ({"symptoms": ["swelling", "vision_changes", "headache"], "systolic_bp": 150, "diastolic_bp": 100, "heart_rate": 80}, ["preeclampsia"]),
    
    # UTI risk
    ({"symptoms": ["urinary_burning", "frequent_urination", "urgency"], "systolic_bp": 120, "diastolic_bp": 80, "body_temp": 37.8}, ["uti"]),
    ({"symptoms": ["urinary_burning", "lower_abdominal_pain"], "systolic_bp": 115, "diastolic_bp": 75, "body_temp": 38.0}, ["uti"]),
    
    # Preterm labor risk
    ({"symptoms": ["abdominal_pain", "back_pain", "contractions"], "systolic_bp": 125, "diastolic_bp": 85, "heart_rate": 95}, ["preterm_labor"]),
    ({"symptoms": ["pelvic_pressure", "cramping", "back_pain"], "systolic_bp": 120, "diastolic_bp": 80, "heart_rate": 90}, ["preterm_labor"]),
    
    # Multiple risks
    ({"symptoms": ["fatigue", "edema", "headache"], "systolic_bp": 140, "diastolic_bp": 90, "heart_rate": 85}, ["anemia", "preeclampsia"]),
    ({"symptoms": ["weakness", "frequent_urination", "fatigue"], "systolic_bp": 135, "diastolic_bp": 85, "blood_glucose": 145}, ["anemia", "gestational_diabetes"]),
    
    # Normal/Low risk cases
    ({"symptoms": ["mild_fatigue"], "systolic_bp": 120, "diastolic_bp": 80, "heart_rate": 75}, ["low_risk"]),
    ({"symptoms": ["back_pain"], "systolic_bp": 115, "diastolic_bp": 75, "heart_rate": 70}, ["low_risk"]),
    ({"symptoms": ["morning_sickness"], "systolic_bp": 110, "diastolic_bp": 70, "heart_rate": 80}, ["low_risk"])
]



🚨 Training Symptom Risk Model...


In [11]:
def prepare_risk_features(data_list):
    features = []
    labels = []
    
    for feature_dict, risk_labels in data_list:
        # Create feature vector
        feature_vector = {
            'systolic_bp': feature_dict.get('systolic_bp', 120),
            'diastolic_bp': feature_dict.get('diastolic_bp', 80),
            'blood_glucose': feature_dict.get('blood_glucose', 90),
            'body_temp': feature_dict.get('body_temp', 36.8),
            'heart_rate': feature_dict.get('heart_rate', 75)
        }
        
        # Add symptom features (multi-hot encoding)
        symptoms = feature_dict.get('symptoms', [])
        for symptom in symptoms:
            feature_vector[f'symptom_{symptom}'] = 1
        
        features.append(feature_vector)
        labels.append(risk_labels)
    
    return features, labels

risk_features, risk_labels = prepare_risk_features(risk_training_data)

# Create feature vectorizer for risk model
risk_vectorizer = DictVectorizer(sparse=False)
X_risk = risk_vectorizer.fit_transform(risk_features)

# Create label binarizer for risks
risk_mlb = MultiLabelBinarizer()
y_risk = risk_mlb.fit_transform(risk_labels)

# Train risk model
risk_model = OneVsRestClassifier(RandomForestClassifier(n_estimators=100, random_state=42))
risk_model.fit(X_risk, y_risk)


In [12]:
class SymptomRiskModel:
    def __init__(self, model, vectorizer, label_binarizer):
        self.model = model
        self.vectorizer = vectorizer
        self.label_binarizer = label_binarizer
    
    def predict(self, feature_dicts):
        if isinstance(feature_dicts, dict):
            feature_dicts = [feature_dicts]
        X = self.vectorizer.transform(feature_dicts)
        predictions = self.model.predict(X)
        return [self.label_binarizer.inverse_transform(pred.reshape(1, -1))[0] for pred in predictions]
    
    def predict_proba(self, feature_dicts):
        if isinstance(feature_dicts, dict):
            feature_dicts = [feature_dicts]
        X = self.vectorizer.transform(feature_dicts)
        probabilities = self.model.predict_proba(X)
        return probabilities


In [13]:
symptom_risk_model = SymptomRiskModel(risk_model, risk_vectorizer, risk_mlb)
joblib.dump(symptom_risk_model, 'symptom_risk_model.pkl')
print("✅ Symptom Risk Model saved")



✅ Symptom Risk Model saved


In [24]:
remedy_training_data = [
    # Digestive issues
    ({"symptoms": ["morning_sickness", "nausea"], "prakriti": "vata"}, 
     ["Ginger tea with honey - Take 1 cup in the morning before breakfast",
      "Small frequent meals every 2-3 hours to prevent empty stomach"]),
    
    ({"symptoms": ["constipation", "bloating"], "prakriti": "vata"}, 
     ["Warm water with lemon first thing in the morning",
      "Gentle abdominal massage with warm sesame oil in clockwise direction"]),
    
    ({"symptoms": ["heartburn", "acidity"], "prakriti": "pitta"}, 
     ["Coconut water twice daily to cool the system",
      "Avoid spicy and sour foods, eat cooling foods like cucumber"]),
    
    # Musculoskeletal issues  
    ({"symptoms": ["back_pain", "joint_pain"], "prakriti": "vata"}, 
     ["Warm sesame oil massage on affected areas before bed",
      "Gentle prenatal yoga poses for 15-20 minutes daily"]),
    
    ({"symptoms": ["leg_cramps", "muscle_pain"], "prakriti": "vata"}, 
     ["Magnesium-rich foods like almonds and leafy greens",
      "Warm oil massage on legs before sleep"]),
    
    # Fatigue and weakness
    ({"symptoms": ["fatigue", "weakness"], "prakriti": "kapha"}, 
     ["Ashwagandha powder with warm milk before bed (consult doctor first)",
      "Iron-rich foods like spinach, dates, and pomegranate"]),
    
    ({"symptoms": ["exhaustion", "low_energy"], "prakriti": "vata"}, 
     ["Almonds soaked overnight (5-6 pieces) eaten in the morning",
      "Adequate rest and avoid overexertion"]),
    
    # Sleep issues
    ({"symptoms": ["insomnia", "restless_sleep"], "prakriti": "vata"}, 
     ["Warm milk with a pinch of nutmeg before bed",
      "Gentle foot massage with warm sesame oil"]),
    
    # Emotional issues
    ({"symptoms": ["anxiety", "stress"], "prakriti": "vata"}, 
     ["Deep breathing exercises for 10 minutes twice daily",
      "Brahmi tea in the evening (consult doctor first)"]),
    
    ({"symptoms": ["mood_swings", "irritability"], "prakriti": "pitta"}, 
     ["Rose water or rose tea to cool the mind",
      "Meditation and pranayama for 15 minutes daily"]),
    
    # Circulatory issues
    ({"symptoms": ["edema", "swelling"], "prakriti": "kapha"}, 
     ["Elevate legs while resting, avoid sitting for long periods",
      "Reduce salt intake and drink adequate water"]),
    
    ({"symptoms": ["cold_extremities", "poor_circulation"], "prakriti": "vata"}, 
     ["Warm ginger tea twice daily",
      "Gentle hand and foot exercises to improve circulation"]),
    
    # Skin issues
    ({"symptoms": ["dry_skin", "itching"], "prakriti": "vata"}, 
     ["Coconut oil or sesame oil application after bath",
      "Drink warm water throughout the day"]),
    
    # Respiratory issues
    ({"symptoms": ["breathlessness", "chest_tightness"], "prakriti": "kapha"}, 
     ["Steam inhalation with eucalyptus oil (2-3 drops)",
      "Pranayama breathing exercises for better lung capacity"]),
    
    # Urinary issues
    ({"symptoms": ["frequent_urination", "urgency"], "prakriti": "pitta"}, 
     ["Cranberry juice (unsweetened) once daily",
      "Avoid excessive fluids before bedtime"]),
    
    # Combined dosha imbalances
    ({"symptoms": ["fatigue", "anxiety", "constipation"], "prakriti": "vata"}, 
     ["Warm sesame oil massage full body twice weekly",
      "Regular meal times with warm, cooked foods"]),
    
    ({"symptoms": ["heartburn", "irritability", "excessive_thirst"], "prakriti": "pitta"}, 
     ["Coconut water and cooling foods like melon",
      "Avoid hot, spicy, and sour foods completely"]),
    
    ({"symptoms": ["swelling", "sluggishness", "mucus"], "prakriti": "kapha"}, 
     ["Warm water with honey and lemon in the morning",
      "Light exercises like walking for 30 minutes daily"])
]


In [25]:
def prepare_remedy_features(data_list):
    features = []
    remedies = []
    
    for feature_dict, remedy_list in data_list:
        # Create feature text combining symptoms and prakriti
        symptoms_text = " ".join(feature_dict.get('symptoms', []))
        prakriti_text = feature_dict.get('prakriti', 'balanced')
        combined_text = f"{symptoms_text} {prakriti_text}"
        
        features.append(combined_text)
        remedies.append(remedy_list)
    
    return features, remedies

remedy_features, remedy_labels = prepare_remedy_features(remedy_training_data)

# Create remedy vectorizer and model
remedy_tfidf = TfidfVectorizer(max_features=500, ngram_range=(1, 2))
X_remedy = remedy_tfidf.fit_transform(remedy_features)


In [26]:
class RemedyRecommendationModel:
    def __init__(self, features, remedies, vectorizer):
        self.features = features
        self.remedies = remedies
        self.vectorizer = vectorizer
        self.feature_vectors = vectorizer.fit_transform(features)
        
    def predict(self, input_dict):
        """Predict remedies based on symptoms and prakriti"""
        symptoms_text = " ".join(input_dict.get('symptoms', []))
        prakriti_text = input_dict.get('prakriti', 'balanced')
        query_text = f"{symptoms_text} {prakriti_text}"
        
        # Transform query
        query_vector = self.vectorizer.transform([query_text])
        
        # Calculate similarity with all training examples
        from sklearn.metrics.pairwise import cosine_similarity
        similarities = cosine_similarity(query_vector, self.feature_vectors).flatten()
        
        # Get top matches
        top_indices = similarities.argsort()[-3:][::-1]  # Top 3 matches
        
        # Combine remedies from top matches
        all_remedies = []
        for idx in top_indices:
            if similarities[idx] > 0.1:  # Minimum similarity threshold
                all_remedies.extend(self.remedies[idx])
        
        # Remove duplicates while preserving order
        unique_remedies = []
        seen = set()
        for remedy in all_remedies:
            if remedy not in seen:
                unique_remedies.append(remedy)
                seen.add(remedy)
        
        return unique_remedies[:3]  # Return top 3 unique remedies
    
    def predict_with_confidence(self, input_dict):
        """Predict remedies with confidence scores"""
        remedies = self.predict(input_dict)
        # Assign confidence scores (in real implementation, this would be more sophisticated)
        confidences = [0.9, 0.8, 0.7]
        
        result = []
        for i, remedy in enumerate(remedies):
            conf = confidences[i] if i < len(confidences) else 0.6
            result.append({
                'remedy': remedy,
                'confidence': conf
            })
        
        return result

# Create and save remedy model
remedy_model = RemedyRecommendationModel(remedy_features, remedy_labels, remedy_tfidf)
joblib.dump(remedy_model, 'remedy_model.pkl')
print("✅ Remedy Model saved")


✅ Remedy Model saved


In [28]:
print("\n🧪 Testing Models...")

# Test symptom classifier
test_symptom = "I have severe back pain and feel very tired"
classified_symptoms = symptom_classifier_model.predict([test_symptom])[0]
print(f"Test symptom: '{test_symptom}'")
print(f"Classified as: {list(classified_symptoms)}")

# Test risk model  
test_risk_features = {
    'symptoms': ['fatigue', 'back_pain'],
    'systolic_bp': 140,
    'diastolic_bp': 90,
    'heart_rate': 85
}



🧪 Testing Models...
Test symptom: 'I have severe back pain and feel very tired'
Classified as: []


In [29]:
predicted_risks = symptom_risk_model.predict([test_risk_features])[0]
print(f"Risk features: {test_risk_features}")
print(f"Predicted risks: {list(predicted_risks)}")

# Test remedy model
test_remedy_input = {
    'symptoms': ['back_pain', 'fatigue'],
    'prakriti': 'vata'
}
recommended_remedies = remedy_model.predict_with_confidence(test_remedy_input)
print(f"Remedy input: {test_remedy_input}")
print("Recommended remedies:")
for remedy in recommended_remedies:
    print(f"  - {remedy['remedy']} (confidence: {remedy['confidence']:.2f})")

Risk features: {'symptoms': ['fatigue', 'back_pain'], 'systolic_bp': 140, 'diastolic_bp': 90, 'heart_rate': 85}
Predicted risks: []
Remedy input: {'symptoms': ['back_pain', 'fatigue'], 'prakriti': 'vata'}
Recommended remedies:
  - Warm sesame oil massage on affected areas before bed (confidence: 0.90)
  - Gentle prenatal yoga poses for 15-20 minutes daily (confidence: 0.80)
  - Warm sesame oil massage full body twice weekly (confidence: 0.70)


In [32]:


print("\n🎉 All models trained and saved successfully!")
print("   - symptom_classifier_model.pkl")
print("   - symptom_risk_model.pkl") 
print("   - remedy_model.pkl")

print("\n💡 Usage in Flask API:")
print("symptom_classifier = joblib.load('symptom_classifier_model.pkl')")
print("symptom_risk_model = joblib.load('symptom_risk_model.pkl')")
print("remedy_model = joblib.load('remedy_model.pkl')")


🎉 All models trained and saved successfully!
   - symptom_classifier_model.pkl
   - symptom_risk_model.pkl
   - remedy_model.pkl

💡 Usage in Flask API:
symptom_classifier = joblib.load('symptom_classifier_model.pkl')
symptom_risk_model = joblib.load('symptom_risk_model.pkl')
remedy_model = joblib.load('remedy_model.pkl')
