# Disease Prediction Analysis

This notebook demonstrates the analysis and model development for general disease prediction based on symptoms.

In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer

# Set plot style
# NOTE(review): sns.set() also updates matplotlib rcParams, so it may override
# parts of the 'ggplot' style applied on the line before it — confirm which
# look is actually intended.
plt.style.use('ggplot')
sns.set(style="darkgrid")

# Display settings: show every column of a frame, but cap printed rows at 20
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 20)

## Data Loading and Exploration

In [2]:
# Read the disease/symptom table from disk
df = pd.read_csv('data/disease_symptoms.csv')

# Report the dimensions, then preview the first five records
print("Dataset Shape:", df.shape)
df.head(5)

In [3]:
# Count the missing entries in every column
print("Missing values in each column:")
df.isna().sum()

In [4]:
# Frequency of each disease label, most common first
print("Disease distribution:")
disease_counts = df.loc[:, 'Disease'].value_counts()
print(f"Total number of unique diseases: {len(disease_counts)}")
disease_counts.iloc[:10]

In [5]:
# Flatten each row's comma-separated symptom string into one long list,
# then tally how often every symptom occurs across the dataset
print("Symptom distribution:")
all_symptoms = [
    symptom
    for row_symptoms in df['Symptoms'].str.split(',')
    for symptom in row_symptoms
]

symptom_counts = pd.Series(all_symptoms).value_counts()
print(f"Total number of unique symptoms: {len(symptom_counts)}")
symptom_counts.iloc[:10]

## Data Visualization

In [6]:
# Bar chart of the 15 most frequent diseases (explicit figure/axes interface)
fig, ax = plt.subplots(figsize=(12, 8))
disease_counts.head(15).plot(kind='bar', color='purple', ax=ax)
ax.set_title('Top 15 Most Common Diseases', fontsize=16)
ax.set_xlabel('Disease', fontsize=14)
ax.set_ylabel('Count', fontsize=14)
# Slant the category labels so long disease names stay readable
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
fig.savefig('../public/visualizations/disease_prevalence.png')
plt.show()

In [7]:
# Bar chart of the 20 most frequent symptoms (explicit figure/axes interface)
fig, ax = plt.subplots(figsize=(12, 8))
symptom_counts.head(20).plot(kind='bar', color='blue', ax=ax)
ax.set_title('Top 20 Most Common Symptoms', fontsize=16)
ax.set_xlabel('Symptom', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)
# Slant the category labels so long symptom names stay readable
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
fig.savefig('../public/visualizations/disease_symptom_frequency.png')
plt.show()

In [8]:
# Create a symptom-disease matrix for correlation analysis:
# rows are the 10 most common diseases, columns the 15 most common symptoms,
# and each cell holds the number of records of that disease listing that symptom.
top_diseases = disease_counts.head(10).index.tolist()
top_symptoms = symptom_counts.head(15).index.tolist()

# Create a matrix of symptoms vs diseases
disease_symptom_matrix = np.zeros((len(top_diseases), len(top_symptoms)))

for i, disease in enumerate(top_diseases):
    # Tokenise the symptom strings of this disease once; rows with a missing
    # symptom string are skipped instead of turning the count into NaN.
    disease_symptom_lists = (
        df.loc[df['Disease'] == disease, 'Symptoms'].dropna().str.split(',')
    )
    for j, symptom in enumerate(top_symptoms):
        # Exact token membership. The previous str.contains() call treated the
        # symptom as a regular expression and as a substring, so e.g. "pain"
        # was also counted for "chest pain" and names containing regex
        # metacharacters could raise or mis-match.
        disease_symptom_matrix[i, j] = sum(
            symptom in tokens for tokens in disease_symptom_lists
        )

# Create a heatmap
plt.figure(figsize=(14, 10))
sns.heatmap(disease_symptom_matrix, annot=True, fmt='g', cmap='viridis',
            xticklabels=top_symptoms, yticklabels=top_diseases)
plt.title('Disease-Symptom Correlation Heatmap', fontsize=16)
plt.xlabel('Symptoms', fontsize=14)
plt.ylabel('Diseases', fontsize=14)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('../public/visualizations/disease_correlation.png')
plt.show()

In [9]:
# Perform symptom clustering analysis
from sklearn.manifold import TSNE

# Create a symptom co-occurrence matrix: cell (i, j) counts the records in
# which symptoms i and j are both listed. The diagonal stays at zero.
top_50_symptoms = symptom_counts.head(50).index.tolist()
n_top_symptoms = len(top_50_symptoms)  # may be < 50 on small datasets
symptom_cooccurrence = np.zeros((n_top_symptoms, n_top_symptoms))

# Constant-time lookup from symptom name to its row/column in the matrix,
# instead of scanning the record's list 50 x 50 times per record.
symptom_position = {symptom: pos for pos, symptom in enumerate(top_50_symptoms)}

# Records with a missing symptom string contribute nothing and are skipped.
for symptoms_list in df['Symptoms'].dropna().str.split(','):
    # De-duplicated positions of the tracked symptoms present in this record
    present = sorted(
        {symptom_position[s] for s in symptoms_list if s in symptom_position}
    )
    if len(present) > 1:
        symptom_cooccurrence[np.ix_(present, present)] += 1

# The block update above also touches (i, i); self co-occurrence is not counted.
np.fill_diagonal(symptom_cooccurrence, 0)

# Apply t-SNE for dimensionality reduction.
# t-SNE requires perplexity < number of samples; keep the library default of
# 30 when possible and shrink it when fewer symptoms are available.
tsne_perplexity = min(30.0, float(max(1, n_top_symptoms - 1)))
tsne = TSNE(n_components=2, perplexity=tsne_perplexity, random_state=42)
symptom_tsne = tsne.fit_transform(symptom_cooccurrence)

# Plot the symptom clusters
plt.figure(figsize=(14, 10))
plt.scatter(symptom_tsne[:, 0], symptom_tsne[:, 1], alpha=0.7, s=100)

# Add labels for the top 20 symptoms only, to keep the plot legible
for i, symptom in enumerate(top_50_symptoms[:20]):
    plt.annotate(symptom, (symptom_tsne[i, 0], symptom_tsne[i, 1]), fontsize=9)

plt.title('Symptom Clusters based on Co-occurrence', fontsize=16)
plt.xlabel('t-SNE Dimension 1', fontsize=14)
plt.ylabel('t-SNE Dimension 2', fontsize=14)
plt.tight_layout()
plt.savefig('../public/visualizations/symptom_clusters.png')
plt.show()

## Model Development

In [10]:
# Prepare the data for modeling
# Convert symptoms to a binary feature matrix.
# The vocabulary is sorted so the feature-column order is identical on every
# run; iterating a set of strings directly yields an order that changes
# between Python sessions (hash randomisation), which made the trained model
# and the saved vocabulary non-reproducible despite the fixed random_state.
all_unique_symptoms = sorted(set(all_symptoms))
print(f"Total unique symptoms for modeling: {len(all_unique_symptoms)}")

# Constant-time lookup from symptom name to feature column
symptom_to_column = {
    symptom: column for column, symptom in enumerate(all_unique_symptoms)
}

# Create a binary feature matrix: X[i, j] == 1 when record i lists symptom j
X = np.zeros((len(df), len(all_unique_symptoms)))

for i, symptom_list in enumerate(df['Symptoms'].str.split(',')):
    for symptom in symptom_list:
        j = symptom_to_column.get(symptom)
        if j is not None:
            X[i, j] = 1

# Encode the target variable (disease name -> integer label)
le = LabelEncoder()
y = le.fit_transform(df['Disease'])

# Split the data: 80% train / 20% test, fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

In [11]:
# Train a Random Forest model (100 trees, fixed seed)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the held-out split
y_pred = rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

# Print classification report.
# `labels` pins one report row per encoded class and `target_names` supplies
# the matching disease name for each. Passing only the first 10 class names
# raised a ValueError whenever the number of labels differed from 10, and
# attached names to the wrong rows otherwise.
# zero_division=0 reports 0 (without warnings) for classes that never occur
# in the test split or are never predicted.
print("\nClassification Report:")
print(classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_,
    zero_division=0,
))

In [12]:
# Rank symptoms by the importance the trained forest assigns to them
feature_importance = rf_model.feature_importances_
feature_names = np.array(all_unique_symptoms)

# Indices ordered from most to least important; keep the leading 20
sorted_idx = np.argsort(feature_importance)[::-1]
top_idx = sorted_idx[:20]
top_features = feature_names[top_idx]
top_importance = feature_importance[top_idx]

# Horizontal bar chart of the selected symptoms
fig, ax = plt.subplots(figsize=(12, 8))
bar_positions = list(range(len(top_importance)))
ax.barh(bar_positions, top_importance, align='center', color='green')
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_features)
ax.set_xlabel('Feature Importance')
ax.set_ylabel('Symptom')
ax.set_title('Top 20 Most Important Symptoms for Disease Prediction')
fig.tight_layout()
fig.savefig('../public/visualizations/disease_feature_importance.png')
plt.show()

## Model Deployment

In [13]:
# Function to predict disease based on symptoms
def predict_disease(symptoms_list):
    """Predict the most likely disease for a collection of symptom names.

    Relies on the notebook-level objects ``rf_model`` (fitted classifier),
    ``le`` (fitted label encoder), ``all_unique_symptoms`` (feature
    vocabulary, in column order) and ``top_features``.

    Parameters
    ----------
    symptoms_list : iterable of str
        Symptom names. Names that are not in ``all_unique_symptoms`` are
        ignored; they must match the vocabulary entries exactly.

    Returns
    -------
    dict
        ``'disease'``: predicted disease name,
        ``'probability'``: the model's probability for that disease,
        ``'top_symptoms'``: ``top_features`` converted to a list.
    """
    # Map each known symptom to its feature column once per call
    symptom_to_column = {
        symptom: column for column, symptom in enumerate(all_unique_symptoms)
    }

    # Single-row binary feature matrix, already shaped (1, n_features)
    X_input = np.zeros((1, len(all_unique_symptoms)))
    for symptom in symptoms_list:
        column = symptom_to_column.get(symptom)
        if column is not None:
            X_input[0, column] = 1

    # predict_proba columns follow rf_model.classes_, which can be a subset of
    # the encoder's labels when some diseases never reached the training
    # split. Select the winning column by position and translate it through
    # classes_, rather than indexing the probabilities with the encoded label
    # (which returned the wrong entry or raised IndexError in that situation).
    probabilities = rf_model.predict_proba(X_input)[0]
    best_column = int(np.argmax(probabilities))
    disease_idx = rf_model.classes_[best_column]

    # Get the disease name and probability
    disease_name = le.inverse_transform([disease_idx])[0]
    probability = probabilities[best_column]

    return {
        'disease': disease_name,
        'probability': probability,
        'top_symptoms': top_features.tolist()
    }

# Test the prediction function
# NOTE(review): predict_disease skips any name that is not found verbatim in
# all_unique_symptoms, so this check is only meaningful if 'fever', 'cough'
# and 'fatigue' exist in the dataset's vocabulary — confirm against the data.
test_symptoms = ['fever', 'cough', 'fatigue']
prediction_result = predict_disease(test_symptoms)
print(f"Predicted Disease: {prediction_result['disease']}")
print(f"Probability: {prediction_result['probability']:.4f}")

In [14]:
# Persist everything the serving side needs in a single pickle file
import pickle

# Bundle: fitted model, label encoder, feature vocabulary (column order)
# and the most important symptoms
model_components = {
    'model': rf_model,
    'label_encoder': le,
    'all_symptoms': all_unique_symptoms,
    'top_symptoms': top_features.tolist()
}

# Serialise first, then write the bytes out in one go
serialized_components = pickle.dumps(model_components)
with open('disease_prediction_model.pkl', 'wb') as model_file:
    model_file.write(serialized_components)

print("Model saved successfully!")

## Conclusion

In this notebook, we developed a Random Forest model that predicts diseases from reported symptoms. Its accuracy on the held-out 20% test split is reported in the Model Development section above, and the model can be used to provide preliminary disease predictions based on user-reported symptoms.

Key findings:
1. Certain symptoms are highly indicative of specific diseases
2. Symptoms often cluster together, suggesting common underlying conditions
3. The Random Forest model provides good accuracy for disease prediction

Next steps:
- Collect more data to improve model accuracy
- Implement a confidence threshold for predictions
- Add severity assessment for symptoms
- Integrate with a user-friendly interface for symptom input