<a href="https://colab.research.google.com/github/shukla-muskan/Srijan-Kumbh-Sahyogi/blob/ML-model/final_disease_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
from google.colab import files
uploaded = files.upload()


Saving augmented_medicine_prediction_dataset_google_verified.csv to augmented_medicine_prediction_dataset_google_verified.csv


In [12]:
import pandas as pd

df = pd.read_csv("augmented_medicine_prediction_dataset_google_verified.csv")
df.head()


Unnamed: 0,Disease,Medicine Name,Composition,Side Effects,Manufacturer,Excellent %,Average %,Poor %,Month,Normalized Disease
0,Common Cold,Consult physician,Paracetamol (112mg),Vomiting,Manufacturer 9,55%,17%,4%,January,common cold
1,Flu,Consult physician,Paracetamol (446mg),Vomiting,Manufacturer 29,74%,12%,18%,January,flu
2,Bronchitis,Amoxicillin,Paracetamol (115mg),Nausea,Manufacturer 7,54%,26%,19%,January,bronchitis
3,Sinusitis,Amoxicillin-Clavulanate,Ibuprofen (466mg),Vomiting,Manufacturer 23,74%,23%,7%,January,sinusitis
4,Tonsillitis,Amoxicillin,Azithromycin (103mg),Headache,Manufacturer 23,67%,20%,8%,January,tonsillitis


In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pickle

# Load the dataset
df = pd.read_csv('augmented_medicine_prediction_dataset_google_verified.csv')

# Strip whitespace in Month column
df['Month'] = df['Month'].str.strip()

# Handle missing values (drop for simplicity)
df = df.dropna()

# Create fever-related label
fever_related = ['Fever', 'Viral Fever', 'Typhoid Fever', 'Dehydration Fever', 'Dengue', 'Malaria',
                'Common Cold', 'Flu', 'Influenza', 'Typhoid']
df['is_fever_related'] = df['Disease'].apply(lambda x: 1 if x in fever_related else 0)

# Convert Month to numerical and seasonal features
month_order = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']
df['month_num'] = df['Month'].apply(lambda x: month_order.index(x) + 1)

def get_season(month):
    if month in ['December', 'January', 'February']:
        return 'Winter'
    elif month in ['March', 'April', 'May']:
        return 'Spring'
    elif month in ['June', 'July', 'August']:
        return 'Summer'
    else:
        return 'Fall'

df['Season'] = df['Month'].apply(get_season)

# Convert percentages to float
for col in ['Excellent %', 'Average %', 'Poor %']:
    df[col] = df[col].str.rstrip('%').astype(float)

# Extract medicine strength
df['medicine_strength'] = df['Medicine Name'].str.extract(r'(\d+)mg').astype(float)

# Optional: Filter out medicines that appear rarely
min_samples = 3
medicine_counts = df['Medicine Name'].value_counts()
common_medicines = medicine_counts[medicine_counts >= min_samples].index

if len(common_medicines) > 0:
    df = df[df['Medicine Name'].isin(common_medicines)]
else:
    print("⚠️ Warning: No medicines meet the minimum threshold. Skipping filtering.")

# Create medicine-disease mapping
medicine_disease_map = df.groupby(['Medicine Name', 'Disease']).size().reset_index(name='count')
medicine_disease_map = medicine_disease_map.sort_values(['Medicine Name', 'count'], ascending=[True, False])

# Create a lookup dictionary for quick reference
medicine_to_diseases = {}
for _, row in medicine_disease_map.iterrows():
    medicine = row['Medicine Name']
    disease = row['Disease']
    count = row['count']

    if medicine not in medicine_to_diseases:
        medicine_to_diseases[medicine] = []

    medicine_to_diseases[medicine].append((disease, count))

# -------------------
# Fever Prediction Model
# -------------------
X = df[['month_num', 'Excellent %', 'Average %', 'Poor %', 'medicine_strength']]
y = df['is_fever_related']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

fever_model = RandomForestClassifier(n_estimators=100, random_state=42)
fever_model.fit(X_train, y_train)

y_pred = fever_model.predict(X_test)

print("\nFever Model Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_pred, y_test))

# -------------------
# Medicine Name Prediction Model
# -------------------
X_med = df[['month_num', 'Season', 'is_fever_related', 'Excellent %', 'Average %', 'Poor %']]
y_med = df['Medicine Name']

# Encoding
categorical_cols = ['Season']
preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(), categorical_cols)],
    remainder='passthrough'
)

medicine_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

X_med_train, X_med_test, y_med_train, y_med_test = train_test_split(X_med, y_med, test_size=0.25, random_state=42)
medicine_pipeline.fit(X_med_train, y_med_train)

y_med_pred = medicine_pipeline.predict(X_med_test)
print("\nMedicine Prediction Model Accuracy:", accuracy_score(y_med_test, y_med_pred))

# -------------------
# Prediction Functions
# -------------------
def predict_medicines_needed(month, season=None):
    if season is None:
        season = get_season(month)
    month_num = month_order.index(month) + 1

    all_medicines = df['Medicine Name'].unique()
    predictions = []

    for med in all_medicines:
        med_data = df[df['Medicine Name'] == med]
        if len(med_data) == 0:
            continue

        avg_excellent = med_data['Excellent %'].mean()
        avg_average = med_data['Average %'].mean()
        avg_poor = med_data['Poor %'].mean()

        test_case = pd.DataFrame({
            'month_num': [month_num],
            'Season': [season],
            'is_fever_related': [1],
            'Excellent %': [avg_excellent],
            'Average %': [avg_average],
            'Poor %': [avg_poor]
        })

        prob = medicine_pipeline.predict_proba(test_case)

        top_classes = medicine_pipeline.named_steps['classifier'].classes_
        for idx, med_class in enumerate(top_classes):
            if med_class == med:
                predictions.append((med, prob[0][idx]))

    predictions.sort(key=lambda x: x[1], reverse=True)
    return predictions

def generate_inventory_recommendations():
    recommendations = {}
    for month in month_order:
        top_meds = predict_medicines_needed(month)
        recommendations[month] = [med for med, prob in top_meds[:5]]
    return recommendations

# -------------------
# Show Predictions with Associated Diseases
# -------------------
print("\nPredicted Medicines Needed by Month with Associated Diseases:")
for month in ['January', 'April', 'July', 'October']:
    print(f"\n{month} Predictions:")
    predictions = predict_medicines_needed(month)

    for med, prob in predictions[:5]:
        print(f"{med}: {prob:.2f} probability")
        # Print associated diseases for this medicine
        if med in medicine_to_diseases:
            diseases = medicine_to_diseases[med]
            print(f"   Associated diseases:")
            for disease, count in diseases[:3]:  # Show top 3 diseases
                print(f"   - {disease} ({count} occurrences)")
        else:
            print("   No disease association data available")

# -------------------
# Inventory Recommendations with Diseases
# -------------------
inventory_recommendations = generate_inventory_recommendations()
print("\nMonthly Inventory Recommendations with Associated Diseases:")
for month, medicines in inventory_recommendations.items():
    print(f"\n{month}:")
    for med in medicines:
        print(f"- {med}")
        # Print associated diseases for this medicine
        if med in medicine_to_diseases:
            diseases = medicine_to_diseases[med]
            print(f"  For treating:")
            for disease, count in diseases[:2]:  # Show top 2 diseases
                print(f"  - {disease}")

# -------------------
# Save Models
# -------------------
with open('fever_medicine_predictor.pkl', 'wb') as f:
    pickle.dump(fever_model, f)

with open('medicine_name_predictor.pkl', 'wb') as f:
    pickle.dump(medicine_pipeline, f)

print("\n✅ Models saved successfully!")



Fever Model Accuracy: 0.637883008356546
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.71      0.77       303
           1       0.14      0.27      0.19        56

    accuracy                           0.64       359
   macro avg       0.49      0.49      0.48       359
weighted avg       0.73      0.64      0.68       359


Medicine Prediction Model Accuracy: 0.6629526462395543

Predicted Medicines Needed by Month with Associated Diseases:

January Predictions:
Consult physician: 0.99 probability
   Associated diseases:
   - Pneumonia (124 occurrences)
   - Common Cold (122 occurrences)
   - Asthma (119 occurrences)
Amoxicillin: 0.00 probability
   Associated diseases:
   - Tonsillitis (134 occurrences)
   - Bronchitis (131 occurrences)
   - Otitis Media (1 occurrences)
Amoxicillin-Clavulanate: 0.00 probability
   Associated diseases:
   - Sinusitis (131 occurrences)
Depends on cause (e.g., supportive care, antivirals, 