In [7]:
# ============================
# Milestone 1: CSV Dataset Preprocessing
# AI/ML-Based Personalized Diet Plan Generator
# ============================

# ----------------------------
# Cell 1: Install Required Libraries
# ----------------------------
!pip install pandas matplotlib seaborn --quiet


In [8]:
# ----------------------------
# Cell 2: Import Libraries
# ----------------------------
import pandas as pd
import os
from zipfile import ZipFile
import matplotlib.pyplot as plt
import seaborn as sns


In [9]:
# ----------------------------
# Cell 3: Unzip and Verify CSV Files
# ----------------------------
zip_files = ["/content/diabetes.zip", "/content/healthcare.zip"]
for z in zip_files:
    with ZipFile(z, 'r') as zip_ref:
        extract_folder = "/content/" + z.split("/")[-1].replace(".zip","")
        zip_ref.extractall(extract_folder)
        print(f"Extracted {z} to {extract_folder}")

# List contents of extracted folders
for folder in ["/content/diabetes","/content/healthcare"]:
    print(f"\nFiles in {folder}:")
    print(os.listdir(folder))


Extracted /content/diabetes.zip to /content/diabetes
Extracted /content/healthcare.zip to /content/healthcare

Files in /content/diabetes:
['diabetes.csv']

Files in /content/healthcare:
['healthcare_dataset.csv']


In [10]:
# ----------------------------
# Cell 4: Load and Understand Pima Diabetes Dataset
# ----------------------------
diabetes_csv_path = "/content/diabetes/diabetes.csv"
df_diabetes = pd.read_csv(diabetes_csv_path)

print("Pima Diabetes Dataset Preview:")
display(df_diabetes.head())

print("\nDataset Info:")
df_diabetes.info()

# Check for missing or zero values
print("\nMissing or Zero Values Count:")
print((df_diabetes==0).sum())


Pima Diabetes Dataset Preview:


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB

Missing or Zero Values Count:
Pregnancies                 111
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
Di

In [11]:
# ----------------------------
# Cell 5: Clean Pima Diabetes Dataset
# ----------------------------
# Replace 0 in invalid columns with NaN
cols_to_clean = ['Glucose','BloodPressure','SkinThickness','Insulin','BMI']
df_diabetes[cols_to_clean] = df_diabetes[cols_to_clean].replace(0, pd.NA)

# Fill missing values with median
df_diabetes.fillna(df_diabetes.median(), inplace=True)

print("Cleaned Pima Diabetes Dataset Preview:")
display(df_diabetes.head())

# Save cleaned CSV
df_diabetes.to_csv("/content/cleaned_diabetes.csv", index=False)
print("Cleaned Pima Diabetes CSV saved as 'cleaned_diabetes.csv'")


Cleaned Pima Diabetes Dataset Preview:


  df_diabetes.fillna(df_diabetes.median(), inplace=True)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,125.0,33.6,0.627,50,1
1,1,85.0,66.0,29.0,125.0,26.6,0.351,31,0
2,8,183.0,64.0,29.0,125.0,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


Cleaned Pima Diabetes CSV saved as 'cleaned_diabetes.csv'


In [12]:
# ----------------------------
# Cell 6: Load and Understand Healthcare Dataset
# ----------------------------
healthcare_folder = "/content/healthcare"
for file in os.listdir(healthcare_folder):
    if file.endswith(".csv"):
        healthcare_csv_path = os.path.join(healthcare_folder, file)
        df_healthcare = pd.read_csv(healthcare_csv_path)
        print(f"\nHealthcare Dataset ({file}) Preview:")
        display(df_healthcare.head())
        print("\nDataset Info:")
        df_healthcare.info()

        # Check missing values
        print("\nMissing Values Count:")
        print(df_healthcare.isnull().sum())



Healthcare Dataset (healthcare_dataset.csv) Preview:


Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,Bobby JacksOn,30,Male,B-,Cancer,2024-01-31,Matthew Smith,Sons and Miller,Blue Cross,18856.281306,328,Urgent,2024-02-02,Paracetamol,Normal
1,LesLie TErRy,62,Male,A+,Obesity,2019-08-20,Samantha Davies,Kim Inc,Medicare,33643.327287,265,Emergency,2019-08-26,Ibuprofen,Inconclusive
2,DaNnY sMitH,76,Female,A-,Obesity,2022-09-22,Tiffany Mitchell,Cook PLC,Aetna,27955.096079,205,Emergency,2022-10-07,Aspirin,Normal
3,andrEw waTtS,28,Female,O+,Diabetes,2020-11-18,Kevin Wells,"Hernandez Rogers and Vang,",Medicare,37909.78241,450,Elective,2020-12-18,Ibuprofen,Abnormal
4,adrIENNE bEll,43,Female,AB+,Cancer,2022-09-19,Kathleen Hanna,White-White,Aetna,14238.317814,458,Urgent,2022-10-09,Penicillin,Abnormal



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55500 entries, 0 to 55499
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Name                55500 non-null  object 
 1   Age                 55500 non-null  int64  
 2   Gender              55500 non-null  object 
 3   Blood Type          55500 non-null  object 
 4   Medical Condition   55500 non-null  object 
 5   Date of Admission   55500 non-null  object 
 6   Doctor              55500 non-null  object 
 7   Hospital            55500 non-null  object 
 8   Insurance Provider  55500 non-null  object 
 9   Billing Amount      55500 non-null  float64
 10  Room Number         55500 non-null  int64  
 11  Admission Type      55500 non-null  object 
 12  Discharge Date      55500 non-null  object 
 13  Medication          55500 non-null  object 
 14  Test Results        55500 non-null  object 
dtypes: float64(1), int64(2), object(12)
me

In [13]:
# ----------------------------
# Cell 7: Clean Healthcare Dataset
# ----------------------------
# Fill missing values: median for numeric, mode for categorical
for col in df_healthcare.columns:
    if df_healthcare[col].dtype in ['int64','float64']:
        df_healthcare[col].fillna(df_healthcare[col].median(), inplace=True)
    else:
        df_healthcare[col].fillna(df_healthcare[col].mode()[0], inplace=True)

print("Cleaned Healthcare Dataset Preview:")
display(df_healthcare.head())

# Save cleaned CSV
cleaned_path = os.path.join("/content", "cleaned_" + file)
df_healthcare.to_csv(cleaned_path, index=False)
print(f"Cleaned Healthcare CSV saved as '{cleaned_path}'")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_healthcare[col].fillna(df_healthcare[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_healthcare[col].fillna(df_healthcare[col].median(), inplace=True)


Cleaned Healthcare Dataset Preview:


Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,Bobby JacksOn,30,Male,B-,Cancer,2024-01-31,Matthew Smith,Sons and Miller,Blue Cross,18856.281306,328,Urgent,2024-02-02,Paracetamol,Normal
1,LesLie TErRy,62,Male,A+,Obesity,2019-08-20,Samantha Davies,Kim Inc,Medicare,33643.327287,265,Emergency,2019-08-26,Ibuprofen,Inconclusive
2,DaNnY sMitH,76,Female,A-,Obesity,2022-09-22,Tiffany Mitchell,Cook PLC,Aetna,27955.096079,205,Emergency,2022-10-07,Aspirin,Normal
3,andrEw waTtS,28,Female,O+,Diabetes,2020-11-18,Kevin Wells,"Hernandez Rogers and Vang,",Medicare,37909.78241,450,Elective,2020-12-18,Ibuprofen,Abnormal
4,adrIENNE bEll,43,Female,AB+,Cancer,2022-09-19,Kathleen Hanna,White-White,Aetna,14238.317814,458,Urgent,2022-10-09,Penicillin,Abnormal


Cleaned Healthcare CSV saved as '/content/cleaned_healthcare_dataset.csv'


In [14]:
# ----------------------------
# Cell 8: Conclusion / Next Steps
# ----------------------------
print("✅ CSV datasets have been cleaned and saved.")
print("Next, we can perform ML analysis on the numeric data and generate insights for diet planning.")


✅ CSV datasets have been cleaned and saved.
Next, we can perform ML analysis on the numeric data and generate insights for diet planning.
