In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pickle
import fitz  # PyMuPDF for PDF text extraction

# Load dataset
data = pd.read_csv('gynecological_conditions.csv', on_bad_lines='skip')


# Define input features and targets
X = data['Symptoms']  # Input features (Symptoms)

# Targets for prediction
targets = [
    'Disorder',
    'Treatment Recommendation',
    'Precautions',
    'Food Intake Recommendation',
    'Foods to Avoid',
    'Duration of Symptoms',
    'Lifestyle Recommendations'
]

# Encode the target columns
target_data = data[targets]
le_targets = {}
for target in targets:
    le_targets[target] = LabelEncoder()
    target_data[target] = le_targets[target].fit_transform(target_data[target])

# Split the dataset
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)
y_train = target_data.loc[X_train.index]
y_test = target_data.loc[X_test.index]

# Vectorize the symptoms data
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Ensemble model setup
logreg = LogisticRegression()
svm = SVC(probability=True)  # Set probability=True for soft voting
rf = RandomForestClassifier()
voting_model = VotingClassifier(estimators=[
    ('logreg', logreg),
    ('svm', svm),
    ('rf', rf)
], voting='soft')

# Dictionary to store final ensemble models for each target
models = {}
ensemble_metrics = {}

# Train and evaluate ensemble models with metrics
for target in targets:
    voting_model.fit(X_train_vectorized, y_train[target])  # Train ensemble model for each target
    y_pred = voting_model.predict(X_test_vectorized)

    # Calculate metrics
    accuracy = accuracy_score(y_test[target], y_pred)
    precision = precision_score(y_test[target], y_pred, average='weighted', zero_division=1)
    recall = recall_score(y_test[target], y_pred, average='weighted', zero_division=1)
    f1 = f1_score(y_test[target], y_pred, average='weighted', zero_division=1)

    # Store metrics
    ensemble_metrics[target] = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1
    }
    models[target] = voting_model  # Store trained ensemble model

# Print ensemble model metrics
print("Ensemble Model Metrics:")
for target, metrics in ensemble_metrics.items():
    print(f"{target}:")
    print(f"  Accuracy: {metrics['accuracy']:.4f}")
    print(f"  Precision: {metrics['precision']:.4f}")
    print(f"  Recall: {metrics['recall']:.4f}")
    print(f"  F1 Score: {metrics['f1_score']:.4f}")

# Save the trained ensemble models and vectorizer to files
with open('naive_bayes_treatment_model.pkl', 'wb') as model_file:
    pickle.dump(models, model_file)

with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

# Prediction function using the ensemble model
def predict_outputs(symptoms):
    # Vectorize the input symptoms
    input_vectorized = vectorizer.transform([symptoms])
    predictions = {}
    for target in targets:
        predicted_encoded = models[target].predict(input_vectorized)
        predictions[target] = le_targets[target].inverse_transform(predicted_encoded)[0]
    return predictions

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as pdf_document:
        for page in pdf_document:
            text += page.get_text()
    return text

# User input for PDF file
pdf_file_path = input("Please enter the path to the PDF file: ")

# Extract symptoms from the PDF
extracted_symptoms = extract_text_from_pdf(pdf_file_path)

# Predict outputs based on the extracted symptoms
predicted_outputs = predict_outputs(extracted_symptoms)

# Print the predicted outputs in the specified format
print(f"Based on extracted symptoms from '{pdf_file_path}':")
print(f"Disorder: {predicted_outputs['Disorder']}")
print(f"Treatment Recommendation: {predicted_outputs['Treatment Recommendation']}")
print(f"Precautions: {predicted_outputs['Precautions']}")
print(f"Recommended Food Intake: {predicted_outputs['Food Intake Recommendation']}")
print(f"Foods to Avoid: {predicted_outputs['Foods to Avoid']}")
print(f"Duration of Symptoms: {predicted_outputs['Duration of Symptoms']}")
print(f"Lifestyle Recommendations: {predicted_outputs['Lifestyle Recommendations']}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  target_data[target] = le_targets[target].fit_transform(target_data[target])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  target_data[target] = le_targets[target].fit_transform(target_data[target])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  target_data[target] = le_targets[target].fit_transfor

Ensemble Model Metrics:
Disorder:
  Accuracy: 0.9016
  Precision: 0.9891
  Recall: 0.9016
  F1 Score: 0.8934
Treatment Recommendation:
  Accuracy: 0.9016
  Precision: 1.0000
  Recall: 0.9016
  F1 Score: 0.9016
Precautions:
  Accuracy: 0.9180
  Precision: 1.0000
  Recall: 0.9180
  F1 Score: 0.9180
Food Intake Recommendation:
  Accuracy: 0.9180
  Precision: 0.9836
  Recall: 0.9180
  F1 Score: 0.9180
Foods to Avoid:
  Accuracy: 0.9344
  Precision: 0.9680
  Recall: 0.9344
  F1 Score: 0.9409
Duration of Symptoms:
  Accuracy: 0.9016
  Precision: 1.0000
  Recall: 0.9016
  F1 Score: 0.9016
Lifestyle Recommendations:
  Accuracy: 0.9016
  Precision: 1.0000
  Recall: 0.9016
  F1 Score: 0.9016
Based on extracted symptoms from '':
Disorder: Pelvic Pain
Treatment Recommendation: NSAIDs, hormone therapy
Precautions: Avoid touching the warts, wear breathable clothing
Recommended Food Intake: Iron-rich foods, high-fiber foods for bowel health
Foods to Avoid: Sugary foods, processed carbs
Duration of Sy