In [None]:
# Mount Google Drive at /content/drive so later cells can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import joblib

# Load the raw symptom table from the mounted Drive folder.
data = pd.read_csv("/content/drive/MyDrive/dataset.csv")

# Treat the literal string '0' as a missing value. `replace` returns a new frame,
# so `data` keeps the file contents exactly as they were read.
df = data.replace('0', pd.NA)

# Every column other than the target is a symptom slot. Deriving the list from the
# frame (instead of assuming Symptom_1..Symptom_17) keeps the encoder dict and the
# encoding loop below in agreement for any number of symptom columns.
symptom_columns = [column for column in df.columns if column != 'Disease']

# One encoder for the target and one per symptom column.
disease_encoder = LabelEncoder()
symptom_encoders = {column: LabelEncoder() for column in symptom_columns}

# Encode disease labels
df['Disease'] = disease_encoder.fit_transform(df['Disease'])

# Encode symptom labels
for column in symptom_columns:
    raw_values = df[column]
    # Put the 'None' token exactly where the original value was missing. Casting to
    # str *before* filling does not work reliably: on pandas versions where the cast
    # stringifies missing values (e.g. to 'nan'), a later fillna has nothing to fill.
    labels = raw_values.astype(str).where(raw_values.notna(), 'None')
    df[column] = symptom_encoders[column].fit_transform(labels)

# Split data into features (X) and target (y)
X = df.drop('Disease', axis=1)
y = df['Disease']

# Train-test split (70/30, fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a logistic regression model. `multi_class` is deprecated in recent
# scikit-learn releases and 'auto' was already the default, so it is not passed.
# NOTE(review): the recorded run of this cell hit the max_iter limit; the warning
# itself suggests scaling the features or trying another solver.
model = LogisticRegression(solver='lbfgs', max_iter=1000)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred, target_names=disease_encoder.classes_)

# Print performance metrics
print(f'Accuracy: {accuracy*100:.2f}%')
print('Classification Report:\n', classification_rep)

# Persist the model together with the encoders needed to prepare inputs and
# decode predictions.
joblib.dump(model, 'model.pkl')
joblib.dump(disease_encoder, 'disease_encoder.pkl')
joblib.dump(symptom_encoders, 'symptom_encoder.pkl')

Accuracy: 93.16%
Classification Report:
                                          precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       0.97      0.94      0.95        32
                                   AIDS       0.62      0.62      0.62        39
                                   Acne       0.70      0.78      0.74        41
                    Alcoholic hepatitis       1.00      1.00      1.00        36
                                Allergy       0.69      1.00      0.81        35
                              Arthritis       0.94      0.83      0.88        36
                       Bronchial Asthma       1.00      1.00      1.00        44
                   Cervical spondylosis       1.00      0.88      0.93        32
                            Chicken pox       1.00      1.00      1.00        35
                    Chronic cholestasis       1.00      1.00      1.00        30
                            Common Cold       1.00      1.00      1

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


['symptom_encoder.pkl']

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of `model` using only the training split.
scores = cross_val_score(estimator=model, X=X_train, y=y_train, cv=5)
print("Cross-validation scores:", scores)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Cross-validation scores: [0.92888244 0.94049347 0.9245283  0.94629898 0.92877907]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier

# Hard-voting ensemble of a logistic regression and a random forest.
# max_iter is raised because the default iteration budget produced a convergence
# warning in the recorded run; the forest is seeded so reruns give the same result.
clf1 = LogisticRegression(max_iter=1000)
clf2 = RandomForestClassifier(random_state=42)
eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2)], voting='hard')
eclf.fit(X_train, y_train)

# Score the ensemble's own predictions. Reusing `y_pred` here would re-score the
# predictions made earlier by the standalone logistic regression model.
y_pred_ensemble = eclf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_ensemble)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Show the value currently bound to `accuracy` (assigned by an earlier cell).
print(accuracy)

0.9315718157181572


In [None]:
from sklearn.model_selection import GridSearchCV

# Search regularisation strength for both L2 and L1 penalties.
# The lbfgs solver cannot fit an L1 penalty, so each penalty is paired with a solver
# that supports it; otherwise every L1 candidate fails to fit and is scored as NaN.
c_values = [0.1, 1, 10, 100]
param_grid = [
    {'penalty': ['l2'], 'solver': ['lbfgs'], 'C': c_values},
    {'penalty': ['l1'], 'solver': ['saga'], 'C': c_values},
]

# max_iter is raised because the default budget produced convergence warnings in
# the recorded run of this cell.
grid_search = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5)
grid_search.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [None]:
# Persist the feature column index of X so the same column order can be rebuilt
# when the pickled model is used elsewhere.
joblib.dump(value=X.columns, filename='X_column.pkl')

['X_column.pkl']

In [None]:
# Example of predicting a new instance from a list of symptoms.

# Placeholders accepted in `new_symptoms` for "no symptom in this slot".
EMPTY_SLOT_VALUES = (0, '0', '')
# Labels a fitted encoder may hold for a missing symptom, in order of preference.
MISSING_LABELS = ('None', 'nan', '<NA>')


def build_symptom_lookup(encoder):
    """Return ``(codes, missing_code)`` for one fitted symptom encoder.

    ``codes`` maps every label in ``encoder.classes_`` to its position in that array,
    which is the integer the encoder assigns to the label. Whitespace-stripped
    spellings are added as aliases (an exact label always wins) so that padding in
    the stored labels does not prevent a match.
    ``missing_code`` is the code of the first entry of ``MISSING_LABELS`` the encoder
    knows, or -1 when it knows none of them.
    """
    codes = {label: code for code, label in enumerate(encoder.classes_)}
    for code, label in enumerate(encoder.classes_):
        codes.setdefault(str(label).strip(), code)
    missing_code = next((codes[label] for label in MISSING_LABELS if label in codes), -1)
    return codes, missing_code


def encode_symptom(value, codes, missing_code, column):
    """Translate one raw symptom value into the integer code used for ``column``.

    Empty slots (NaN/None or any of ``EMPTY_SLOT_VALUES``) get ``missing_code``, i.e.
    the same code a missing symptom received when the encoder was fitted. A label
    the encoder has never seen is reported and then treated as an empty slot.
    """
    # pd.isna must run first: comparing pd.NA with `in` would raise a TypeError.
    if pd.isna(value) or value in EMPTY_SLOT_VALUES:
        return missing_code
    if value in codes:
        return codes[value]
    stripped = str(value).strip()
    if stripped in codes:
        return codes[stripped]
    print(f'Warning: {value!r} was not seen in {column} during training; treating the slot as empty')
    return missing_code


# Observed symptoms first; remaining slots are padded with the 0 placeholder so the
# row has one value per feature column.
observed_symptoms = ['skin_rash', 'itching', 'nodal_skin_eruptions', 'dischromic_patches']
new_symptoms = observed_symptoms + [0] * (len(X.columns) - len(observed_symptoms))
new_data = pd.DataFrame([new_symptoms], columns=X.columns)

# Encode new instance using pre-fitted encoders. `symptom_encoders` is looked up per
# column, so a symptom is only recognised in a column whose encoder learned it.
for column in new_data.columns:
    if column in symptom_encoders:
        codes, missing_code = build_symptom_lookup(symptom_encoders[column])
        new_data[column] = [
            encode_symptom(value, codes, missing_code, column) for value in new_data[column]
        ]

predicted_label = model.predict(new_data)
predicted_disease = disease_encoder.inverse_transform(predicted_label)[0]

print(f'Predicted Disease: {predicted_disease}')

Predicted Disease: Tuberculosis
