In [29]:
import os
from pathlib import Path
from warnings import filterwarnings

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.svm import SVC

# Suppress warnings
# NOTE(review): this silences every warning for the rest of the session,
# including scikit-learn diagnostics; consider narrowing it to specific categories.
filterwarnings("ignore")

# Folder that holds Training.csv and Testing.csv. Set the DISEASE_DATA_DIR
# environment variable to run the notebook on another machine; the original
# location is kept as the default so existing setups keep working unchanged.
DATA_DIR = Path(os.environ.get("DISEASE_DATA_DIR", "C:/Users/KIIT/Desktop/major_project 2.0"))

# Load the training and testing data
train = pd.read_csv(DATA_DIR / "Training.csv")
test = pd.read_csv(DATA_DIR / "Testing.csv")

# Separate features and target variable from training data
X = train.drop(columns=["prognosis"])
Y = train["prognosis"]

# Separate test features
P = test.drop(columns=["prognosis"])

# All randomness in this section comes from one seeded generator, so the noise,
# the dropped columns and the corrupted labels (and therefore every metric
# computed from them) are identical on each Restart & Run All.
rng = np.random.default_rng(42)

# Add zero-mean Gaussian noise (std = noise_factor) to every feature value
# to reduce predictability
noise_factor = 0.05
X_noisy = X + rng.normal(0, noise_factor, X.shape)

# Randomly drop 20% of features to further reduce accuracy
drop_features = rng.choice(X_noisy.columns, size=int(0.2 * X_noisy.shape[1]), replace=False)
X_noisy = X_noisy.drop(columns=drop_features)

# Also drop the same features from the test dataset
P = P.drop(columns=drop_features)

# Introduce label noise by re-drawing 5% of the labels.
# The replacement is sampled from all classes, so a re-drawn label can
# coincide with the original one; the effective flip rate is at most 5%.
label_noise_factor = 0.05
num_noisy_labels = int(label_noise_factor * len(Y))
noisy_indices = rng.choice(Y.index, size=num_noisy_labels, replace=False)
Y_noisy = Y.copy()
# .loc makes the label-based assignment explicit (noisy_indices are index labels).
Y_noisy.loc[noisy_indices] = rng.choice(Y.unique(), size=num_noisy_labels)

# Hold out 20% of the noisy samples for evaluation; the fixed random_state
# keeps the partition stable between runs.
xtrain, xtest, ytrain, ytest = train_test_split(
    X_noisy,
    Y_noisy,
    test_size=0.2,
    random_state=42,
)

# Deliberately small search space: weak regularisation strengths (low C)
# and a linear kernel only, to keep the decision boundary simple.
param_grid = dict(C=[0.01, 0.1], kernel=['linear'])

# Base estimator whose hyperparameters are tuned below
svm = SVC(random_state=42)

# 5-fold grid search over param_grid, ranked by accuracy
grid_search = GridSearchCV(
    estimator=svm,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(xtrain, ytrain)

# Estimator fitted with the best-scoring parameter combination
best_svm = grid_search.best_estimator_

# Evaluate the best model on the split it was fitted on and on the held-out split
tr_pred_svm = best_svm.predict(xtrain)
ts_pred_svm = best_svm.predict(xtest)

# Calculate accuracy and precision (as percentages).
# NOTE(review): ytest is a slice of Y_noisy, so these held-out scores are
# measured against labels that include the injected label noise.
training_accuracy = accuracy_score(ytrain, tr_pred_svm) * 100
testing_accuracy = accuracy_score(ytest, ts_pred_svm) * 100
precision = precision_score(ytest, ts_pred_svm, average='weighted') * 100

print(f"Training accuracy: {training_accuracy:.2f}%")
print(f"Testing accuracy: {testing_accuracy:.2f}%")
print(f"Testing precision: {precision:.2f}%")

# Perform cross-validation to ensure model generalization
cv_scores = cross_val_score(best_svm, X_noisy, Y_noisy, cv=5)
print(f"Cross-validated accuracy: {np.mean(cv_scores) * 100:.2f}%")

# Append predictions to test data for evaluation.
# assign() attaches the prediction array positionally, row for row, so the
# result does not depend on `test` having a default 0..n-1 index (an index
# join against a fresh DataFrame would misalign or yield NaN otherwise).
test_with_predictions = test.assign(predicted=best_svm.predict(P))
print(test_with_predictions[["prognosis", "predicted"]].head())

# List of all symptoms in the dataset (every feature column, including the
# ones that were dropped from X_noisy before training)
all_symptoms = X.columns.tolist()

# User input section
def get_user_symptoms(symptoms=None, input_fn=None):
    """Interactively collect symptoms and encode them as a one-hot row vector.

    The user is prompted repeatedly until they type 'done' (or the input
    stream ends). Each entry is stripped and lower-cased, then matched first
    against the exact symptom names and, failing that, against a normalised
    form in which case, underscores and whitespace runs are treated alike
    (so "Spotting urination" matches the column "spotting_ urination").

    Parameters
    ----------
    symptoms : sequence of str, optional
        Ordered symptom names that define the vector layout. Defaults to the
        module-level ``all_symptoms``.
    input_fn : callable, optional
        Function called with the prompt string to read one line. Defaults to
        the built-in ``input``; supplying another callable allows scripted use.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, len(symptoms))`` holding 1.0 at the position of
        every recognised symptom and 0.0 elsewhere.
    """
    if symptoms is None:
        symptoms = all_symptoms
    symptoms = list(symptoms)
    if input_fn is None:
        # Resolved at call time so an environment that swaps `input` is honoured.
        input_fn = input

    def canonical(text):
        # Lower-case and collapse any mix of underscores/whitespace to one "_".
        return "_".join(str(text).lower().replace("_", " ").split())

    # Name -> position lookups; setdefault keeps the first occurrence,
    # matching what list.index() would return for duplicated names.
    exact_position = {}
    loose_position = {}
    for idx, name in enumerate(symptoms):
        exact_position.setdefault(name, idx)
        loose_position.setdefault(canonical(name), idx)

    print("\nPlease enter symptoms from the list below (type 'done' when finished):")
    print(", ".join(symptoms))

    # Create a feature vector based on user input
    user_symptom_vector = np.zeros(len(symptoms))

    while True:
        try:
            symptom = input_fn("Enter a symptom (or type 'done' to finish): ").strip().lower()
        except EOFError:
            # Closed input stream: treat it like 'done' instead of crashing.
            break
        if symptom == 'done':
            break

        position = exact_position.get(symptom)
        if position is None:
            position = loose_position.get(canonical(symptom))

        if position is None:
            print(f"'{symptom}' is not a recognized symptom. Please try again.")
        else:
            user_symptom_vector[position] = 1

    return user_symptom_vector.reshape(1, -1)

# Get symptoms from user and make a prediction
user_symptom_vector = get_user_symptoms()

# Names of the symptoms the user actually switched on
entered_symptoms = [name for name, flag in zip(all_symptoms, user_symptom_vector[0]) if flag]

# The model only knows the columns that survived the random feature drop.
# Label the one-hot row with symptom names and select those columns by name,
# so the input carries the same feature names and order used for fitting.
model_features = list(X_noisy.columns)
model_feature_set = set(model_features)
user_features = pd.DataFrame(user_symptom_vector, columns=all_symptoms)[model_features]

# Tell the user about symptoms that cannot influence the prediction
ignored_symptoms = [name for name in entered_symptoms if name not in model_feature_set]
if ignored_symptoms:
    print("\nNote: these symptoms are not used by the model and were ignored:",
          ", ".join(ignored_symptoms))

predicted_disease = best_svm.predict(user_features)

if user_features.to_numpy().any():
    print("\nBased on the symptoms provided, the predicted disease is:", predicted_disease[0])
else:
    # An all-zero input carries no information; do not present the model's
    # output for it as if it were a diagnosis.
    print("\nNo symptoms used by the model were entered, so no reliable prediction can be made.")


Training accuracy: 95.10%
Testing accuracy: 94.92%
Testing precision: 95.05%
Cross-validated accuracy: 94.80%
             prognosis            predicted
0     Fungal infection     Fungal infection
1              Allergy              Allergy
2                 GERD                 GERD
3  Chronic cholestasis  Chronic cholestasis
4        Drug Reaction        Drug Reaction

Please enter symptoms from the list below (type 'done' when finished):
itching, skin_rash, nodal_skin_eruptions, continuous_sneezing, shivering, chills, joint_pain, stomach_pain, acidity, ulcers_on_tongue, muscle_wasting, vomiting, burning_micturition, spotting_ urination, fatigue, weight_gain, anxiety, cold_hands_and_feets, mood_swings, weight_loss, restlessness, lethargy, patches_in_throat, irregular_sugar_level, cough, high_fever, sunken_eyes, breathlessness, sweating, dehydration, indigestion, headache, yellowish_skin, dark_urine, nausea, loss_of_appetite, pain_behind_the_eyes, back_pain, constipation, abdominal_p

Enter a symptom (or type 'done' to finish):  altered_sensorium
Enter a symptom (or type 'done' to finish):  weakness_of_one_body_side
Enter a symptom (or type 'done' to finish):  headache
Enter a symptom (or type 'done' to finish):  vomiting
Enter a symptom (or type 'done' to finish):  done 



Based on the symptoms provided, the predicted disease is: Paralysis (brain hemorrhage)
