In [None]:
!pip install boto3

In [5]:
import pandas as pd
import boto3
import io

class DataSet:
    """Load a fixed set of gzip-compressed CSV tables from an S3 bucket into DataFrames.

    The tables to fetch are listed in ``self.data_files`` (DataFrame name -> S3
    object key); loaded frames are stored in ``self.dataframes`` under the same
    names.

    NOTE(review): the original comment called this "MIMIC-III data" while the
    default bucket is named 'mimic-iv-dataset' -- confirm which release the
    bucket actually holds.
    """

    def __init__(self, access_key=None, secret_key=None, bucket_name='mimic-iv-dataset'):
        """Create the S3 client and register the tables to load.

        Args:
            access_key: AWS access key id. When None, boto3 resolves credentials
                itself (environment variables, shared credentials file, ...), so
                no secret has to be written into the notebook.
            secret_key: AWS secret access key; same fallback as ``access_key``.
            bucket_name: Name of the bucket that holds the CSV files.
        """
        self.s3 = boto3.client(
            's3',
            aws_access_key_id=access_key,
            aws_secret_access_key=secret_key,
        )
        self.bucket_name = bucket_name
        # DataFrame name -> object key of the gzip-compressed CSV inside the bucket.
        self.data_files = {
            "admissions_df": "ADMISSIONS.csv.gz",
            "diagnosis_codes_df": "D_ICD_DIAGNOSES.csv.gz",
            "diagnosis_df": "DIAGNOSES_ICD.csv.gz",
            "notes_df": "NOTEEVENTS.csv.gz",
            "patients_df": "PATIENTS.csv.gz",
            "prescription_df": "PRESCRIPTIONS.csv.gz"
        }
        # Dictionary to hold DataFrames (None marks a table that failed to load).
        self.dataframes = {}

    def load_data(self, nrows=1000):
        """Fetch every registered table from S3 and parse it into a DataFrame.

        Loading is best-effort: a failure for one table is reported on stdout and
        recorded as ``None`` in ``self.dataframes`` so the remaining tables are
        still attempted.

        Args:
            nrows: Maximum number of rows to parse per table (forwarded to
                ``pd.read_csv``).
        """
        for df_name, file_key in self.data_files.items():
            try:
                obj = self.s3.get_object(Bucket=self.bucket_name, Key=file_key)
                # The whole object body is read into memory before parsing;
                # ``nrows`` only limits how many rows are parsed from it.
                self.dataframes[df_name] = pd.read_csv(
                    io.BytesIO(obj['Body'].read()),
                    compression='gzip',
                    nrows=nrows
                )
                print(f"{df_name} loaded successfully.")
            except Exception as e:
                # Record the failure explicitly so print_head() can report it
                # instead of silently omitting the table.
                self.dataframes[df_name] = None
                print(f"Error loading {file_key}: {e}")

    def get_dataframe(self, df_name):
        """Return the DataFrame stored under ``df_name``.

        Returns None when the name is unknown or the table failed to load.
        """
        return self.dataframes.get(df_name)

    def print_head(self):
        """Print the first 5 rows of each loaded table, or a notice for failed loads."""
        for df_name, df in self.dataframes.items():
            if df is not None:
                print(f"\nDataFrame: {df_name}")
                print(df.head())
            else:
                print(f"\nDataFrame: {df_name} could not be loaded.")

# Smoke test for the loader: parse at most 100 rows of every configured table
# and print the first rows of each one.
data_set = DataSet()
data_set.load_data(nrows=100)
data_set.print_head()

admissions_df loaded successfully.
diagnosis_codes_df loaded successfully.
diagnosis_df loaded successfully.
notes_df loaded successfully.
patients_df loaded successfully.
prescription_df loaded successfully.

DataFrame: admissions_df
   row_id  subject_id  hadm_id            admittime            dischtime  \
0       1           2   163353  2138-07-17 19:04:00  2138-07-21 15:48:00   
1       2           3   145834  2101-10-20 19:08:00  2101-10-31 13:58:00   
2       4           5   178980  2103-02-02 04:31:00  2103-02-04 12:15:00   
3       6           7   118037  2121-05-23 15:05:00  2121-05-27 11:57:00   
4       7           8   159514  2117-11-20 10:22:00  2117-11-24 14:20:00   

  deathtime admission_type         admission_location discharge_location  \
0       NaN        NEWBORN  PHYS REFERRAL/NORMAL DELI               HOME   
1       NaN      EMERGENCY       EMERGENCY ROOM ADMIT                SNF   
2       NaN        NEWBORN  PHYS REFERRAL/NORMAL DELI               HOME   
3   

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
class NaiveBayes:
    """Multi-label Naive Bayes model mapping free-text notes to diagnosis codes.

    Notes are turned into token counts with ``CountVectorizer`` and one
    ``MultinomialNB`` is fitted per diagnosis code through
    ``OneVsRestClassifier``. Labels are encoded with ``MultiLabelBinarizer``.
    """

    def __init__(self):
        """Create the vectorizer, model and label encoder and load the source tables."""
        self.counter = CountVectorizer()

        # Use OneVsRestClassifier to handle multi-label classification
        self.model = OneVsRestClassifier(MultinomialNB())

        self.data = DataSet()

        self.data.load_data(nrows=5000)

        self.mlb = MultiLabelBinarizer()

        # Per-instance working frame. A class-level DataFrame would be shared
        # (and mutated) by every instance of the class.
        self.df = pd.DataFrame()

    def process_data(self):
        """Build ``self.df`` with one row per subject id.

        Columns:
            id: subject id taken from the admissions table (duplicates removed).
            notes: all note texts of that subject joined with a single space,
                or NaN when the subject has no note in the loaded rows.
            diagnosis_codes: list of the subject's ICD-9 codes, or NaN when the
                subject has no diagnosis in the loaded rows.

        The first rows of the resulting frame are printed.
        """
        admissions = self.data.dataframes["admissions_df"]
        notes = self.data.dataframes["notes_df"]
        diagnosis = self.data.dataframes["diagnosis_df"]

        # A subject can appear in several admission rows; keeping the duplicates
        # would put identical samples on both sides of the train/test split.
        subject_ids = admissions['subject_id'].drop_duplicates().reset_index(drop=True)

        # Concatenate every note of a subject in file order. Rows without text
        # are skipped because a NaN cannot be joined with strings.
        id_notes = {
            subject_id: ' '.join(texts)
            for subject_id, texts in notes.dropna(subset=['text']).groupby('subject_id')['text']
        }

        # Collect every diagnosis code of a subject. Missing codes are skipped so
        # the label lists never mix NaN with code strings.
        id_diagnosis = {
            subject_id: codes.tolist()
            for subject_id, codes in diagnosis.dropna(subset=['icd9_code']).groupby('subject_id')['icd9_code']
        }

        # Build a fresh frame so repeated calls and other instances never see
        # leftovers from an earlier run.
        self.df = pd.DataFrame({
            'id': subject_ids,
            'notes': subject_ids.map(id_notes),
            'diagnosis_codes': subject_ids.map(id_diagnosis),
        })

        print(self.df.head())

    def train(self):
        """Fit the vectorizer, label encoder and classifier, then print test metrics.

        Rows lacking notes or diagnosis codes are dropped from ``self.df`` first.
        75% of the remaining rows are used for fitting, 25% for the printed
        accuracy and classification report.

        Raises:
            ValueError: if fewer than two usable rows remain, since a train/test
                split is impossible then.
        """
        # Keep only rows that have both notes and diagnosis codes.
        cleaned = self.df.dropna(subset=['notes', 'diagnosis_codes'])

        if len(cleaned) < 2:
            raise ValueError(
                "Need at least 2 rows with both notes and diagnosis codes to train; "
                f"got {len(cleaned)}."
            )

        # Replace non-list entries with an empty list
        cleaned = cleaned.assign(
            diagnosis_codes=cleaned['diagnosis_codes'].apply(
                lambda codes: codes if isinstance(codes, list) else []
            )
        )
        self.df = cleaned

        X = cleaned['notes']
        y = self.mlb.fit_transform(cleaned['diagnosis_codes'])

        # Plain random split with a fixed seed. It is not stratified, so a label
        # may end up only in the training part or only in the test part.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

        X_train_vectorized = self.counter.fit_transform(X_train)
        X_test_vectorized = self.counter.transform(X_test)

        print(X_train_vectorized.shape)
        print(X_test_vectorized.shape)
        print(y_train.shape)
        print(y_test.shape)

        self.model.fit(X_train_vectorized, y_train)

        y_pred = self.model.predict(X_test_vectorized)

        print("Accuracy:", accuracy_score(y_test, y_pred))
        # NOTE(review): zero_division=1 makes undefined metrics show up as 1.00
        # (e.g. labels with support 0) -- confirm this is the intended reporting.
        print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=1))

    def predict(self, X):
        """Predict diagnosis codes for one or more notes.

        Args:
            X: an iterable of note texts, or a single note given as a string
                (treated as a one-element list).

        Returns:
            One collection of predicted codes per note, as produced by
            ``self.mlb.inverse_transform``.
        """
        # A bare string would otherwise be rejected by the vectorizer, which
        # expects an iterable of documents.
        if isinstance(X, str):
            X = [X]

        X_vectorized = self.counter.transform(X)

        predictions = self.model.predict(X_vectorized)

        diagnosis_codes = self.mlb.inverse_transform(predictions)

        return diagnosis_codes

if __name__ == "__main__":
    # Run the full pipeline: load the tables, assemble the per-subject frame, fit the model.
    nb_model = NaiveBayes()
    nb_model.process_data()
    nb_model.train()

    # Test nurse notes
    nurse_notes = [
        "Patient is a 55-year-old male presenting with a history of hypertension and diabetes. Complains of sharp, intermittent chest pain radiating to the left arm. Denies nausea or shortness of breath. EKG shows normal sinus rhythm.",
        "A 68-year-old female with a history of asthma presents with a 3-day history of fever and dry cough. No shortness of breath, but reports fatigue. Chest X-ray suggests mild consolidation.",
        "Post-op patient after knee replacement surgery. Reports mild pain in the surgical site, 4/10 on the pain scale. No signs of infection, incision is clean and dry. Patient is ambulating with assistance.",
        "Patient is a 40-year-old male presenting with acute lower abdominal pain and bloating. Reports nausea but no vomiting. Last bowel movement was 2 days ago. CT scan is pending for suspected appendicitis.",
        "Patient is a 60-year-old female with Type 2 diabetes, presenting with a blood glucose level of 220 mg/dL. Complaints of increased thirst and urination. Current medications: metformin and insulin.",
        "Patient is a 33-year-old male presenting with a 1-week history of lower back pain after lifting heavy objects. Pain is dull and constant, worsened with movement. Denies leg weakness or numbness.",
        "Patient is a 45-year-old female with a history of hypertension. Complains of daily headaches and blurred vision, especially in the morning. No known history of migraines. BP is elevated at 160/100."
    ]

    # Predict the diagnosis codes of all sample notes in one call.
    predictions = nb_model.predict(nurse_notes)

    # Report the results with 1-based note numbers.
    for note_number, predicted_codes in enumerate(predictions, start=1):
        print(f"Prediction for Nurse Note {note_number}: {predicted_codes}")
    

admissions_df loaded successfully.
diagnosis_codes_df loaded successfully.
diagnosis_df loaded successfully.
notes_df loaded successfully.
patients_df loaded successfully.
prescription_df loaded successfully.
   id notes                                    diagnosis_codes
0   2   NaN                                [V3001, V053, V290]
1   3   NaN  [0389, 78559, 5849, 4275, 41071, 4280, 6826, 4...
2   5   NaN                                [V3000, V053, V290]
3   7   NaN                                [V3001, V053, V290]
4   8   NaN              [V3001, 7706, 7746, V290, V502, V053]
(13, 2011)
(5, 2011)
(13, 93)
(5, 93)
Accuracy: 0.6
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00         1
           2       1.00      1.00      1.00         2
           3       1.00      1.00      1.00         2
           4       1.00      1.00      1.00         0
           5

