In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings('ignore')

class DiseasePredictionModel:
    def __init__(self):
        self.scaler = StandardScaler()
        self.label_encoder = LabelEncoder()
        self.model = RandomForestClassifier(
            n_estimators=200,
            max_depth=10,
            min_samples_split=5,
            random_state=42
        )
        
    def train(self):
        try:
            df = pd.read_csv("C:\\Users\\Larren Pinto\\Downloads\\general_disease_diagnosis.csv")
            print("Dataset loaded successfully!")
            print(f"Shape of dataset: {df.shape}")
            print("\nColumns in your dataset:")
            print(df.columns.tolist())
            print("\nFirst few rows of your dataset:")
            print(df.head())
            
            numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
            print("\nNumeric columns in your dataset:")
            print(numeric_columns.tolist())
            
            features = numeric_columns.tolist()
            target_column = None
            
            for col in df.columns:
                if 'condition' in col.lower() or 'disease' in col.lower() or 'diagnosis' in col.lower():
                    target_column = col
                    if col in features:
                        features.remove(col)
            
            if not target_column:
                print("\nWarning: Could not automatically identify target column.")
                print("Available columns:", df.columns.tolist())
                target_column = input("Please enter the name of your target column: ")
            
            print("\nUsing these features:", features)
            print("Target column:", target_column)
            
        except FileNotFoundError:
            print("Error: The file 'general_disease_diagnosis.csv' was not found in the current directory.")
            return
        except Exception as e:
            print(f"Error loading the dataset: {str(e)}")
            return
        
        if df.isnull().sum().any():
            print("\nHandling missing values...")
            for column in df.columns:
                if df[column].dtype in ['float64', 'int64']:
                    df[column].fillna(df[column].mean(), inplace=True)
                else:
                    df[column].fillna(df[column].mode()[0], inplace=True)
        
        X = df[features]
        y = df[target_column]
        
        y_encoded = self.label_encoder.fit_transform(y)
        
        X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)
        
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)
        
        self.feature_names = features
        
        print("\nTraining the model...")
        self.model.fit(X_train_scaled, y_train)
        
        y_pred = self.model.predict(X_test_scaled)
        
        accuracy = np.mean(y_pred == y_test)
        print(f"\nModel Accuracy: {accuracy:.4f}")
        
        feature_importance = pd.DataFrame({
            'feature': features,
            'importance': self.model.feature_importances_
        })
        print("\nFeature Importance:")
        print(feature_importance.sort_values('importance', ascending=False))
    
    def predict(self, *feature_values):
        if not hasattr(self, 'feature_names'):
            raise ValueError("Model hasn't been trained yet!")
            
        input_data = np.array([feature_values])
        
        input_scaled = self.scaler.transform(input_data)
        
        prediction_encoded = self.model.predict(input_scaled)
        prediction = self.label_encoder.inverse_transform(prediction_encoded)
        
        probabilities = self.model.predict_proba(input_scaled)[0]
        conditions_with_probs = list(zip(
            self.label_encoder.classes_,
            probabilities
        ))
        sorted_predictions = sorted(
            conditions_with_probs,
            key=lambda x: x[1],
            reverse=True
        )
        
        return prediction[0], sorted_predictions
    
    def process_and_save_predictions(self):
        df = pd.read_csv("C:\\Users\\Larren Pinto\\Downloads\\general_disease_diagnosis.csv")
        
        df.to_csv('original_dataset.csv', index=False)
        
        df_completed = df.copy()
        
        for column in self.feature_names:
            if df_completed[column].isnull().any():
                df_completed[column].fillna(df_completed[column].mean(), inplace=True)
        
        predictions = []
        probabilities_list = []
        
        print("\nMaking predictions for all rows...")
        for _, row in df_completed[self.feature_names].iterrows():
            pred, probs = self.predict(*row)
            predictions.append(pred)
            probabilities_list.append(dict(probs))
        
        df_completed['predicted_condition'] = predictions
        
        for condition in self.label_encoder.classes_:
            df_completed[f'probability_{condition.replace(" ", "_")}'] = [
                probs.get(condition, 0) for probs in probabilities_list
            ]

        df_completed.to_csv('completed_dataset.csv', index=False)
        print("Completed dataset saved as 'completed_dataset.csv'")
        
        return df_completed

if __name__ == "__main__":
    print("Initializing and training the model...")
    model = DiseasePredictionModel()
    model.train()
    
    print("\nProcessing dataset and generating predictions...")
    completed_df = model.process_and_save_predictions()
    print("\nProcess completed successfully!")


Initializing and training the model...
Dataset loaded successfully!
Shape of dataset: (1000, 6)

Columns in your dataset:
['Patient_Name', 'Age', 'Weight_kg', 'Height_cm', 'Blood_Pressure_mmHg', 'Disease']

First few rows of your dataset:
       Patient_Name  Age  Weight_kg  Height_cm  Blood_Pressure_mmHg  \
0      Ramesh Patel   10         29         93                  102   
1     Sunita Pandey   12         21        103                  152   
2  Santosh Kulkarni   11         19        112                  154   
3       Swati Verma   32         80        152                   95   
4      Sudha Pandey   30         57        177                   95   

            Disease  
0    Kidney Disease  
1      Hypertension  
2  Thyroid Disorder  
3      Tuberculosis  
4      Hypertension  

Numeric columns in your dataset:
['Age', 'Weight_kg', 'Height_cm', 'Blood_Pressure_mmHg']

Using these features: ['Age', 'Weight_kg', 'Height_cm', 'Blood_Pressure_mmHg']
Target column: Disease

Handlin

In [24]:
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline

class EnhancedDiseasePredictionModel:
    def __init__(self):
        self.pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('smote', SMOTE(random_state=42)),
            ('model', RandomForestClassifier(random_state=42))
        ])
        self.grid_params = {
            'model__n_estimators': [100, 200, 300],
            'model__max_depth': [5, 10, 15],
            'model__min_samples_split': [2, 5, 10]
        }
        
    def train(self):
        try:
            df = pd.read_csv("C:\\Users\\Larren Pinto\\Downloads\\general_disease_diagnosis.csv")
            # [Processing steps here...]
            X, y = df[features], df[target_column]
            
            # Apply Grid Search with Cross-Validation
            grid_search = GridSearchCV(self.pipeline, self.grid_params, cv=5, scoring='accuracy')
            grid_search.fit(X, y)
            best_model = grid_search.best_estimator_
            
            print(f"Best parameters: {grid_search.best_params_}")
            print("\nCross-validated model accuracy:", grid_search.best_score_)
            
            return best_model
        except FileNotFoundError:
            print("Error: The file 'general_disease_diagnosis.csv' was not found.")
            return None

EnhancedDiseasePredictionModel()



<__main__.EnhancedDiseasePredictionModel at 0x256f0d2fcb0>

In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline

# Load dataset
file_path = "C:\\Users\\Larren Pinto\\Downloads\\general_disease_diagnosis.csv"
df = pd.read_csv(file_path)
df.dropna(inplace=True)
# Data preprocessing
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
target_column = next((col for col in df.columns if 'condition' in col.lower() or 
                      'disease' in col.lower() or 'diagnosis' in col.lower()), None)

# Fill missing values
df.fillna(df.mean(numeric_only=True), inplace=True)
df.fillna(df.mode().iloc[0], inplace=True)

# Prepare features and target
features = [col for col in numeric_columns if col != target_column]
X = df[features]
y = df[target_column]

# Label encode the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Address class imbalance
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y_encoded)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define models with optimized hyperparameters
models = {
    "Random Forest": RandomForestClassifier(n_estimators=500, max_depth=20, min_samples_split=3, class_weight='balanced', random_state=42),
    "SVM": SVC(kernel='rbf', C=10, gamma=0.1, class_weight='balanced', probability=True, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=7, weights='distance', metric='minkowski')
}

# Use GridSearchCV for hyperparameter tuning
param_grids = {
    "Random Forest": {
        'n_estimators': [100, 300, 500],
        'max_depth': [10, 20, 30],
        'min_samples_split': [2, 3, 5]
    },
    "SVM": {
        'C': [0.1, 1, 10],
        'gamma': [0.01, 0.1, 1]
    },
    "KNN": {
        'n_neighbors': [5, 7, 9],
        'metric': ['euclidean', 'manhattan'],
        'weights': ['uniform', 'distance']
    }
}

# Train and evaluate each model with hyperparameter tuning
for model_name, model in models.items():
    print(f"\nTuning hyperparameters for {model_name}...")
    grid_search = GridSearchCV(model, param_grids[model_name], cv=5, scoring='f1_weighted', n_jobs=-1)
    grid_search.fit(X_train_scaled, y_train)
    
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test_scaled)
    f1 = f1_score(y_test, y_pred, average="weighted")
    
    print(f"{model_name} Best F1 Score: {f1:.4f} with parameters: {grid_search.best_params_}")


Tuning hyperparameters for Random Forest...
Random Forest Best F1 Score: 0.3216 with parameters: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 500}

Tuning hyperparameters for SVM...
SVM Best F1 Score: 0.2330 with parameters: {'C': 10, 'gamma': 1}

Tuning hyperparameters for KNN...
KNN Best F1 Score: 0.2839 with parameters: {'metric': 'manhattan', 'n_neighbors': 7, 'weights': 'distance'}
