In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import pickle
import fitz  # PyMuPDF for PDF text extraction

# Load dataset
data = pd.read_csv('gynecological_conditions.csv')

# Define input features and targets
X = data['Symptoms']  # Input features (Symptoms)

# Targets for prediction
targets = [
    'Disorder',
    'Treatment Recommendation',
    'Precautions',
    'Food Intake Recommendation',
    'Foods to Avoid',
    'Duration of Symptoms',
    'Lifestyle Recommendations'
]

# Encode the target columns
target_data = data[targets]
le_targets = {}
for target in targets:
    le_targets[target] = LabelEncoder()
    target_data[target] = le_targets[target].fit_transform(target_data[target])

# Split the dataset into training and testing sets
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)
y_train = target_data.loc[X_train.index]
y_test = target_data.loc[X_test.index]

# Vectorize the symptoms data
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Initialize classifiers
logreg = LogisticRegression(max_iter=1000)
svm = SVC(probability=True)
rf = RandomForestClassifier(n_estimators=100)

# Define hyperparameter grids
param_grid_svm = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto'],
}
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
}

# Dictionary to store final ensemble models for each target
models = {}
ensemble_metrics = {}

# Train and evaluate ensemble models with metrics
for target in targets:
    # Tune each model separately for the current target
    grid_search_svm = GridSearchCV(svm, param_grid_svm, cv=3, n_jobs=-1, verbose=1)
    grid_search_rf = GridSearchCV(rf, param_grid_rf, cv=3, n_jobs=-1, verbose=1)

    # Fit models using GridSearchCV to find best parameters
    grid_search_svm.fit(X_train_vectorized, y_train[target])
    grid_search_rf.fit(X_train_vectorized, y_train[target])

    # Retrieve the best models
    best_svm = grid_search_svm.best_estimator_
    best_rf = grid_search_rf.best_estimator_

    # Print best hyperparameters
    print(f"Best parameters for SVM for target {target}:", grid_search_svm.best_params_)
    print(f"Best parameters for Random Forest for target {target}:", grid_search_rf.best_params_)

    # Voting classifier for ensemble model
    voting_model = VotingClassifier(estimators=[
        ('logreg', logreg),
        ('svm', best_svm),
        ('rf', best_rf)
    ], voting='soft')
    
    # Train ensemble model for each target
    voting_model.fit(X_train_vectorized, y_train[target])

    # Predict on the test set
    y_pred = voting_model.predict(X_test_vectorized)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test[target], y_pred)
    precision = precision_score(y_test[target], y_pred, average='weighted', zero_division=1)
    recall = recall_score(y_test[target], y_pred, average='weighted', zero_division=1)
    f1 = f1_score(y_test[target], y_pred, average='weighted', zero_division=1)

    # Store metrics
    ensemble_metrics[target] = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1
    }
    models[target] = voting_model  # Store trained ensemble model

    # Display confusion matrix
    cm = confusion_matrix(y_test[target], y_pred)
    print(f"Confusion Matrix for {target}:\n{cm}")

# Print ensemble model metrics
print("\nEnsemble Model Metrics:")
for target, metrics in ensemble_metrics.items():
    print(f"{target}:")
    print(f"  Accuracy: {metrics['accuracy']:.4f}")
    print(f"  Precision: {metrics['precision']:.4f}")
    print(f"  Recall: {metrics['recall']:.4f}")
    print(f"  F1 Score: {metrics['f1_score']:.4f}")

# Save the trained ensemble models and vectorizer to files
with open('ensemble_model.pkl', 'wb') as model_file:
    pickle.dump(models, model_file)

with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

# Prediction function using the ensemble model
def predict_outputs(symptoms):
    # Vectorize the input symptoms
    input_vectorized = vectorizer.transform([symptoms])
    predictions = {}
    for target in targets:
        predicted_encoded = models[target].predict(input_vectorized)
        predictions[target] = le_targets[target].inverse_transform(predicted_encoded)[0]
    return predictions

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as pdf_document:
        for page in pdf_document:
            text += page.get_text()
    return text

# User input for PDF file
pdf_file_path = input("Please enter the path to the PDF file: ")

# Extract symptoms from the PDF
extracted_symptoms = extract_text_from_pdf(pdf_file_path)

# Predict outputs based on the extracted symptoms
predicted_outputs = predict_outputs(extracted_symptoms)

# Print the predicted outputs in the specified format
print(f"Based on extracted symptoms from '{pdf_file_path}':")
print(f"Disorder: {predicted_outputs['Disorder']}")
print(f"Treatment Recommendation: {predicted_outputs['Treatment Recommendation']}")
print(f"Precautions: {predicted_outputs['Precautions']}")
print(f"Recommended Food Intake: {predicted_outputs['Food Intake Recommendation']}")
print(f"Foods to Avoid: {predicted_outputs['Foods to Avoid']}")
print(f"Duration of Symptoms: {predicted_outputs['Duration of Symptoms']}")
print(f"Lifestyle Recommendations: {predicted_outputs['Lifestyle Recommendations']}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  target_data[target] = le_targets[target].fit_transform(target_data[target])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  target_data[target] = le_targets[target].fit_transform(target_data[target])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  target_data[target] = le_targets[target].fit_transfor

Fitting 3 folds for each of 6 candidates, totalling 18 fits
Fitting 3 folds for each of 9 candidates, totalling 27 fits
Best parameters for SVM for target Disorder: {'C': 0.1, 'gamma': 'scale'}
Best parameters for Random Forest for target Disorder: {'max_depth': None, 'n_estimators': 50}
Confusion Matrix for Disorder:
[[105   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0]
 [  0 104   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0]
 [  0   0 102   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0]
 [  0   0   0 110   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0]
 [  0   0   0   0  87   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0]
 [  0   0   0   0   0 114   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0]
 [  0   0   0   0   0   0 104   0   0   0   0   0   0   0   0   0   0   0


FileDataError: '.' is no file