In [13]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import pickle
import re
import csv
import warnings
warnings.filterwarnings('ignore')

class MedicalPredictor:
    """Train, persist and query a symptom-based condition classifier.

    Features are the TF-IDF vector of the cleaned ``Symptoms`` text plus one
    label-encoded ``Age Group Affected`` column; the target is
    ``Disorder/Condition Name``.  The loaded table is kept in ``self.data`` so
    that predictions and questions can be answered with the full record of a
    condition.
    """

    # Column layout every CSV row is normalised to (the file's own header row
    # is skipped and not validated against this list).
    EXPECTED_COLUMNS = [
        'Disorder/Condition Name', 'Category', 'Symptoms', 'Diagnosis Tests',
        'Causes/Risk Factors', 'Associated Risks/Complications', 'Age Group Affected',
        'Prevalence', 'Medications', 'Treatment Recommendations', 'Therapies/Procedures',
        'Recovery Outlook/Prognosis', 'Food Intake Recommendations', 'Foods to Avoid',
        'Lifestyle Recommendations', 'Psychological Effects', 'Support Groups or Resources',
        'Duration of Symptoms', 'Recovery Time', 'Geographical Prevalence',
        'Common Misdiagnoses', 'Clinical Research Notes', 'Specialist Recommendations',
    ]

    # Placeholder stored in place of empty cells.
    MISSING_VALUE = "Information not available"

    def __init__(self):
        self.tfidf = TfidfVectorizer(max_features=1000)
        self.age_encoder = LabelEncoder()
        self.classifier = RandomForestClassifier(n_estimators=100, random_state=42)
        self.data = None  # DataFrame of conditions; set by load_training_data/load_model

    def preprocess_text(self, text):
        """Return *text* lower-cased, stripped and reduced to letters, digits,
        whitespace and commas; non-string input yields ``''``."""
        if not isinstance(text, str):
            return ''
        return re.sub(r'[^a-zA-Z0-9\s,]', '', text.lower().strip())

    def _read_training_rows(self, filepath):
        """Read the CSV at *filepath* into a DataFrame with EXPECTED_COLUMNS.

        Rows that are too long have their surplus cells merged into the last
        column, rows that are too short are padded, and rows without a
        condition name (including blank lines) are dropped.

        Raises:
            ValueError: if the file has no header row at all.
            OSError: if the file cannot be opened.
        """
        width = len(self.EXPECTED_COLUMNS)
        rows = []
        # newline='' is what the csv module requires so that quoted fields
        # containing line breaks are parsed as a single record.
        with open(filepath, 'r', encoding='utf-8', newline='') as f:
            csv_reader = csv.reader(f)
            if next(csv_reader, None) is None:  # header row is skipped
                raise ValueError("training file is empty")

            for row in csv_reader:
                # A row without a condition name cannot be used as a training
                # label, and an empty name would match every question later.
                if not row or not row[0].strip():
                    continue
                if len(row) > width:
                    # Combine extra columns into the last expected column
                    row = row[:width - 1] + [', '.join(row[width - 1:])]
                else:
                    # Pad with empty strings if row is too short
                    row = row + [''] * (width - len(row))
                rows.append(row)

        frame = pd.DataFrame(rows, columns=self.EXPECTED_COLUMNS)
        # csv.reader yields '' (never NaN) for empty cells, so fillna() would
        # not touch them; replace the empty strings explicitly.
        return frame.replace('', self.MISSING_VALUE)

    def load_training_data(self, filepath):
        """Load the CSV at *filepath* and fit vectorizer, encoder and classifier.

        Returns:
            True on success.  On any failure the error is printed and False is
            returned (the model must then be considered untrained).
        """
        try:
            self.data = self._read_training_rows(filepath)

            # Symptom text -> TF-IDF features
            X_symptoms = self.data['Symptoms'].apply(self.preprocess_text)
            X_symptoms_tfidf = self.tfidf.fit_transform(X_symptoms)

            # Age group -> single integer-coded feature column
            self.data['Age Group Processed'] = self.data['Age Group Affected']
            X_age = self.age_encoder.fit_transform(self.data['Age Group Processed'])

            # Combine features
            X = np.hstack((X_symptoms_tfidf.toarray(), X_age.reshape(-1, 1)))
            y = self.data['Disorder/Condition Name']

            # Train the model
            self.classifier.fit(X, y)

            return True

        except Exception as e:
            # Deliberate catch-all: callers rely on the boolean result.
            print(f"Error loading training data: {str(e)}")
            return False

    def save_model(self, filename='medical_model.pkl'):
        """Pickle the classifier, vectorizer, age encoder and data table to *filename*."""
        model_data = {
            'classifier': self.classifier,
            'tfidf': self.tfidf,
            'age_encoder': self.age_encoder,
            'data': self.data
        }
        with open(filename, 'wb') as f:
            pickle.dump(model_data, f)

    def load_model(self, filename='medical_model.pkl'):
        """Restore the state written by :meth:`save_model` from *filename*.

        Raises:
            OSError: if the file cannot be opened.
            KeyError: if the pickle lacks one of the expected entries.
        """
        # SECURITY: pickle.load executes arbitrary code embedded in the file.
        # Only load model files that come from a trusted source.
        with open(filename, 'rb') as f:
            model_data = pickle.load(f)
        self.classifier = model_data['classifier']
        self.tfidf = model_data['tfidf']
        self.age_encoder = model_data['age_encoder']
        self.data = model_data['data']

    @staticmethod
    def _condition_aliases(name):
        """Return ``(alias, whole_word)`` pairs under which *name* may be asked for.

        The lower-cased full name always comes first.  For names of the form
        ``"Long Name (Alias)"`` the part before the parenthesis and the
        parenthesised part are added as well; the latter must match as a whole
        word because it is often a short abbreviation.
        """
        if not isinstance(name, str):
            return []
        full = name.lower().strip()
        if not full:
            return []
        aliases = [(full, False)]
        match = re.fullmatch(r'(.*?)\s*\(([^()]+)\)', full)
        if match:
            base = match.group(1).strip()
            parenthetical = match.group(2).strip()
            if base:
                aliases.append((base, False))
            if parenthetical:
                aliases.append((parenthetical, True))
        return aliases

    @staticmethod
    def _alias_pattern(alias, whole_word):
        """Compile a literal pattern for *alias*, optionally bounded by non-word characters."""
        body = re.escape(alias)
        if whole_word:
            body = r'(?<!\w)' + body + r'(?!\w)'
        return re.compile(body)

    def answer_question(self, question):
        """Answer a free-text question about one of the loaded conditions.

        The condition is the one whose name (or parenthetical alias) is
        mentioned in the question; when several match, the longest mention
        wins, ties going to the earlier row.  The kind of answer (symptoms,
        tests, treatment, or a general summary) is chosen from keywords in the
        rest of the question.

        Returns:
            The answer text, or an explanatory message when no data is loaded
            or no condition is recognised.
        """
        if self.data is None:
            return ("No condition data is loaded; call load_training_data() "
                    "or load_model() first.")

        question = question.lower()

        # Find condition name in question
        condition = None
        matched_pattern = None
        matched_length = 0
        for _, row in self.data.iterrows():
            for alias, whole_word in self._condition_aliases(row['Disorder/Condition Name']):
                if len(alias) <= matched_length:
                    continue
                pattern = self._alias_pattern(alias, whole_word)
                if pattern.search(question):
                    condition = row
                    matched_pattern = pattern
                    matched_length = len(alias)

        if condition is None:
            return "I couldn't identify a specific condition in your question."

        # Look for intent keywords outside the condition mention, so a name
        # that itself contains e.g. "test" or "treat" does not pick the branch.
        intent_text = matched_pattern.sub(' ', question)

        # Pattern match different question types
        if 'symptoms' in intent_text:
            return f"The common symptoms of {condition['Disorder/Condition Name']} include: {condition['Symptoms']}"
        elif 'test' in intent_text or 'diagnos' in intent_text:
            return f"Diagnostic tests for {condition['Disorder/Condition Name']} include: {condition['Diagnosis Tests']}"
        elif 'treat' in intent_text or 'therapy' in intent_text:
            return (f"Treatment for {condition['Disorder/Condition Name']} includes:\n"
                   f"- Treatment Recommendations: {condition['Treatment Recommendations']}\n"
                   f"- Medications: {condition['Medications']}\n"
                   f"- Therapies/Procedures: {condition['Therapies/Procedures']}")
        else:
            return (f"Information about {condition['Disorder/Condition Name']}:\n"
                   f"Category: {condition['Category']}\n"
                   f"Symptoms: {condition['Symptoms']}\n"
                   f"Diagnosis: {condition['Diagnosis Tests']}\n"
                   f"Treatment: {condition['Treatment Recommendations']}\n"
                   f"Medications: {condition['Medications']}\n"
                   f"Causes/Risk Factors: {condition['Causes/Risk Factors']}")

    def predict_condition(self, symptoms, age_group):
        """Predict a condition from free-text *symptoms* and an *age_group* label.

        An age group the encoder has not seen is mapped to the closest known
        one via :meth:`find_closest_age_group`.

        Returns:
            A dict describing the predicted condition, or ``{'error': ...}``
            if anything goes wrong (e.g. the model is not trained).
        """
        try:
            # Preprocess symptoms
            processed_symptoms = self.preprocess_text(symptoms)
            symptoms_vector = self.tfidf.transform([processed_symptoms])

            # Process age group
            try:
                age_encoded = self.age_encoder.transform([age_group])
            except ValueError:
                # Handle unknown age groups
                closest_match = self.find_closest_age_group(age_group)
                age_encoded = self.age_encoder.transform([closest_match])

            # Combine features and predict
            X_pred = np.hstack((symptoms_vector.toarray(), age_encoded.reshape(-1, 1)))
            predicted_condition = self.classifier.predict(X_pred)[0]

            # Get full condition information (first row carrying that name)
            condition_info = self.data[self.data['Disorder/Condition Name'] == predicted_condition].iloc[0]

            return {
                'Predicted Condition': condition_info['Disorder/Condition Name'],
                'Category': condition_info['Category'],
                'Typical Symptoms': condition_info['Symptoms'],
                'Diagnosis Tests': condition_info['Diagnosis Tests'],
                'Treatment Recommendations': condition_info['Treatment Recommendations'],
                'Medications': condition_info['Medications'],
                'Therapies/Procedures': condition_info['Therapies/Procedures'],
                'Food Recommendations': condition_info['Food Intake Recommendations'],
                'Foods to Avoid': condition_info['Foods to Avoid'],
                'Lifestyle Recommendations': condition_info['Lifestyle Recommendations'],
                'Prognosis': condition_info['Recovery Outlook/Prognosis'],
                'Causes/Risk Factors': condition_info['Causes/Risk Factors'],
                'Duration': condition_info['Duration of Symptoms'],
                'Recovery Time': condition_info['Recovery Time']
            }

        except Exception as e:
            # Deliberate catch-all: callers receive an error dict, not a raise.
            return {'error': f"Prediction error: {str(e)}"}

    @staticmethod
    def _age_bounds(label):
        """Return ``(lowest, highest)`` number found in *label*, or None if it has none."""
        numbers = [float(n) for n in re.findall(r'\d+(?:\.\d+)?', str(label))]
        if not numbers:
            return None
        return min(numbers), max(numbers)

    def find_closest_age_group(self, age_group):
        """Return the known age group whose numeric range is nearest to *age_group*.

        Closeness is the sum of the absolute differences of the lower and the
        upper bounds; units written in the labels are not interpreted.  When
        *age_group* or every known group carries no number, the first known
        group is returned as a default.
        """
        known_age_groups = self.data['Age Group Processed'].unique()
        target = self._age_bounds(age_group)

        if target is not None:
            best_group = None
            best_distance = None
            for group in known_age_groups:
                bounds = self._age_bounds(group)
                if bounds is None:
                    continue
                distance = abs(bounds[0] - target[0]) + abs(bounds[1] - target[1])
                if best_distance is None or distance < best_distance:
                    best_group, best_distance = group, distance
            if best_group is not None:
                return best_group

        return known_age_groups[0]  # Return first age group as default

def main(csv_path='C:/Users/Badari/Downloads/final_student - Copy/final_student/data_set/own data.csv',
         model_path='medical_model.pkl'):
    """Train a MedicalPredictor, save it, and print a few demo answers and predictions.

    Args:
        csv_path: training CSV to load; the default is the original author's
            local path, so pass your own file location.
        model_path: where the trained model is pickled.

    Returns nothing; stops early (after the loader has printed the error) when
    the training data cannot be loaded.
    """
    predictor = MedicalPredictor()

    # Load and train the model
    print("Loading and training model...")
    success = predictor.load_training_data(csv_path)
    if not success:
        return

    # Save the model for future use
    predictor.save_model(model_path)

    # Test question answering
    print("\nTesting question answering:")
    questions = [
        "What are the symptoms of Dysmenorrhea?",
        "What tests are used to diagnose PCOS?",
        "How is Amenorrhea treated?",
    ]

    for question in questions:
        print(f"\nQuestion: {question}")
        answer = predictor.answer_question(question)
        print(f"Answer: {answer}")

    # Test condition prediction
    print("\nTesting condition prediction:")
    test_cases = [
        {
            "symptoms": "Cramping pain in lower abdomen, Nausea, Vomiting, Fatigue, Headaches",
            "age_group": "13-45 years",
        },
        {
            "symptoms": "Irregular periods, Weight gain, Acne, Hair growth",
            "age_group": "15-35 years",
        },
    ]

    for case in test_cases:
        print(f"\nSymptoms: {case['symptoms']}")
        print(f"Age Group: {case['age_group']}")
        result = predictor.predict_condition(case['symptoms'], case['age_group'])
        if 'error' in result:
            # predict_condition reports failures as {'error': ...}; do not
            # present that under the "Predicted Condition" heading.
            print(f"\n{result['error']}")
            continue
        print("\nPredicted Condition and Details:")
        for key, value in result.items():
            print(f"{key}: {value}")

# Run the demo only when this file is executed directly (script or notebook
# cell), not when it is imported for its MedicalPredictor class.
if __name__ == "__main__":
    main()

Loading and training model...

Testing question answering:

Question: What are the symptoms of Dysmenorrhea?
Answer: The common symptoms of Dysmenorrhea include: Cramping pain in lower abdomen, Radiating pain to lower back and thighs, Nausea, Vomiting, Diarrhea, Fatigue, Headaches

Question: What tests are used to diagnose PCOS?
Answer: I couldn't identify a specific condition in your question.

Question: How is Amenorrhea treated?
Answer: Treatment for Amenorrhea includes:
- Treatment Recommendations: Address underlying cause (e.g., weight management, exercise reduction, thyroid treatment), Hormonal treatments, Lifestyle modifications (e.g., stress management, weight normalization)
- Medications: Hormonal therapy (estrogen/progesterone replacement), Oral contraceptives, GnRH agonists, Dopamine agonists for hyperprolactinemia
- Therapies/Procedures: Nutritional therapy (balanced diet, adequate calorie intake), Cognitive behavioral therapy (CBT) for stress management

Testing condition 

In [17]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import pickle

class MedicalPredictor:
    """Evaluation-only wrapper around a pickled model.

    Loads the ``tfidf``, ``age_encoder`` and ``classifier`` entries of a saved
    model file and scores the classifier against a labelled CSV.
    """

    def __init__(self):
        # All three stay None until load_model() succeeds.
        self.tfidf = None
        self.age_encoder = None
        self.classifier = None

    def preprocess_text(self, text):
        """Return *text* lower-cased, stripped and reduced to letters, digits,
        whitespace and commas; non-string input yields ``""``.

        This applies the same cleaning as the training-time
        ``preprocess_text`` earlier in this file, so evaluation text is
        tokenised the same way as the text the vectorizer was fitted on.
        """
        import re  # local import: this cell's own import list does not include re

        if not isinstance(text, str):
            return ""
        return re.sub(r'[^a-zA-Z0-9\s,]', '', text.lower().strip())

    def load_model(self, model_filepath):
        """Load the pre-trained model and related objects from *model_filepath*.

        Returns:
            True when all three objects were loaded.  On any failure the error
            is printed, the instance is left unchanged and False is returned.
        """
        try:
            # SECURITY: pickle.load executes arbitrary code embedded in the
            # file.  Only load model files that come from a trusted source.
            with open(model_filepath, 'rb') as f:
                model_data = pickle.load(f)
            tfidf = model_data['tfidf']
            age_encoder = model_data['age_encoder']
            classifier = model_data['classifier']
        except Exception as e:
            print(f"Error loading model: {str(e)}")
            return False

        # Assign only after every entry was read, so a failed load never
        # leaves a half-initialised predictor behind.
        self.tfidf = tfidf
        self.age_encoder = age_encoder
        self.classifier = classifier
        print("Model and encoders loaded successfully.")
        return True

    def calculate_accuracy(self, test_filepath):
        """Print and return the model's accuracy on the CSV at *test_filepath*.

        The CSV needs ``Symptoms``, ``Age Group Affected`` and
        ``Disorder/Condition Name`` columns.  Malformed lines are skipped.  An
        age group the encoder did not see during training makes the encoding
        step fail, which is reported like any other error.

        Returns:
            The accuracy as a float in [0, 1], or None if the model is not
            loaded or the evaluation failed (the reason is printed).
        """
        if self.tfidf is None or self.age_encoder is None or self.classifier is None:
            print("Error calculating accuracy: model is not loaded; call load_model() first.")
            return None

        try:
            # Load test data with error handling
            test_data = pd.read_csv(test_filepath, on_bad_lines='skip')  # Skip problematic rows
            test_data = test_data.fillna("Information not available")

            # Preprocess symptoms
            X_symptoms = test_data['Symptoms'].apply(self.preprocess_text)
            X_symptoms_tfidf = self.tfidf.transform(X_symptoms)

            # Encode age groups
            test_data['Age Group Processed'] = test_data['Age Group Affected']
            X_age = self.age_encoder.transform(test_data['Age Group Processed'])

            # Combine features
            X_test = np.hstack((X_symptoms_tfidf.toarray(), X_age.reshape(-1, 1)))
            y_test = test_data['Disorder/Condition Name']

            # Predict using the model
            y_pred = self.classifier.predict(X_test)

            # Calculate accuracy
            accuracy = float(accuracy_score(y_test, y_pred))
        except Exception as e:
            # Deliberate catch-all: evaluation problems are reported, not raised.
            print(f"Error calculating accuracy: {str(e)}")
            return None

        print(f"Model Accuracy: {accuracy:.2%}")
        return accuracy

# Usage
if __name__ == "__main__":
    # Initialize the MedicalPredictor
    predictor = MedicalPredictor()

    # Load the saved model
    model_filepath = 'medical_model.pkl'  # Replace with the actual path to your saved model
    predictor.load_model(model_filepath)

    # Specify the path to the test file
    # NOTE: this is the same path the training demo (main) loads its training
    # data from, so the score printed for it is accuracy on the training data,
    # not an estimate of performance on unseen cases. Point this at a held-out
    # file for a meaningful evaluation.
    test_file_path = 'C:/Users/Badari/Downloads/final_student - Copy/final_student/data_set/own data.csv'  # Replace with the correct test file path

    # Calculate accuracy only if the model actually loaded; load_model has
    # already printed the reason when it did not.
    if predictor.classifier is not None:
        predictor.calculate_accuracy(test_file_path)


Model and encoders loaded successfully.
Model Accuracy: 100.00%
