In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

DATA_PATH = "../data/raw/synthetic_coffee_health_10000.csv"
df = pd.read_csv(DATA_PATH)

print("Initial shape:", df.shape)
print(df.info())
print(df.head())

Initial shape: (10010, 16)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10010 entries, 0 to 10009
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ID                       10010 non-null  int64  
 1   Age                      10009 non-null  float64
 2   Gender                   10008 non-null  object 
 3   Country                  10008 non-null  object 
 4   Coffee_Intake            10009 non-null  float64
 5   Caffeine_mg              10008 non-null  float64
 6   Sleep_Hours              10008 non-null  float64
 7   Sleep_Quality            10009 non-null  object 
 8   BMI                      10009 non-null  float64
 9   Heart_Rate               10009 non-null  float64
 10  Stress_Level             10008 non-null  object 
 11  Physical_Activity_Hours  10009 non-null  float64
 12  Health_Issues            4066 non-null   object 
 13  Occupation               10009 non-null  object 


In [2]:
print("\nMissing values per column:")
print(df.isnull().sum())

df = df.dropna(how="all")  
df = df.fillna(df.median(numeric_only=True))  
df = df.fillna(df.mode().iloc[0])             



Missing values per column:
ID                            0
Age                           1
Gender                        2
Country                       2
Coffee_Intake                 1
Caffeine_mg                   2
Sleep_Hours                   2
Sleep_Quality                 1
BMI                           1
Heart_Rate                    1
Stress_Level                  2
Physical_Activity_Hours       1
Health_Issues              5944
Occupation                    1
Smoking                       1
Alcohol_Consumption           1
dtype: int64


In [3]:
print("\nCategorical Columns Unique Values:")
print("="*40)
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    print(f"{col}: {df[col].nunique()} unique values")
    if df[col].nunique() <= 10:
        print(f"  Values: {df[col].unique()}")


Categorical Columns Unique Values:
Gender: 3 unique values
  Values: ['Male' 'Female' 'Other']
Country: 20 unique values
Sleep_Quality: 4 unique values
  Values: ['Good' 'Fair' 'Excellent' 'Poor']
Stress_Level: 3 unique values
  Values: ['Low' 'Medium' 'High']
Health_Issues: 3 unique values
  Values: ['Mild' 'Moderate' 'Severe']
Occupation: 5 unique values
  Values: ['Other' 'Service' 'Office' 'Student' 'Healthcare']


In [4]:
print("Duplicates before:", df.duplicated().sum())
df = df.drop_duplicates()
print("Duplicates after:", df.duplicated().sum())

Duplicates before: 2
Duplicates after: 0


In [5]:
if "health_risk" in df.columns:
    df["health_risk"] = df["health_risk"].astype("category")

In [6]:
numeric_cols = df.select_dtypes(include=np.number).columns

for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower, upper = Q1 - 1.5*IQR, Q3 + 1.5*IQR
    df[col] = np.where(df[col] < lower, lower,
              np.where(df[col] > upper, upper, df[col]))

In [7]:
label_encoders = {}
for col in df.select_dtypes(include="category").columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [8]:
scaler = MinMaxScaler()
scaled_cols = df.select_dtypes(include=np.number).columns.drop("health_risk") \
               if "health_risk" in df.columns else df.select_dtypes(include=np.number).columns
df[scaled_cols] = scaler.fit_transform(df[scaled_cols])

df_clean = df.copy()

In [9]:
if "Health_Issues" in df_clean.columns:
    le_target = LabelEncoder()
    df_clean['Health_Issues_encoded'] = le_target.fit_transform(df_clean['Health_Issues'])
    print("Health_Issues encoding:")
    for i, val in enumerate(le_target.classes_):
        print(f"  {val}: {i}")

Health_Issues encoding:
  Mild: 0
  Moderate: 1
  Severe: 2


In [10]:


# # Check for duplicates
# print(f"Duplicate rows: {df_clean.duplicated().sum()}")

# # Remove duplicates if any
# df_clean = df_clean.drop_duplicates()
# print(f"Dataset shape after removing duplicates: {df_clean.shape}")

# # Check for outliers in numerical columns
# numerical_cols = df_clean.select_dtypes(include=[np.number]).columns
# print("\nOutlier Analysis:")
# print("="*30)

# for col in numerical_cols:
#     Q1 = df_clean[col].quantile(0.25)
#     Q3 = df_clean[col].quantile(0.75)
#     IQR = Q3 - Q1
#     lower_bound = Q1 - 1.5 * IQR
#     upper_bound = Q3 + 1.5 * IQR
#     outliers = df_clean[(df_clean[col] < lower_bound) | (df_clean[col] > upper_bound)]
#     print(f"{col}: {len(outliers)} outliers ({len(outliers)/len(df_clean)*100:.2f}%)")


In [11]:
# # Handle categorical variables - encode them
# le_dict = {}

# for col in categorical_cols:
#     if col != 'Health_Issues':  # Don't encode target variable yet
#         le = LabelEncoder()
#         df_clean[col + '_encoded'] = le.fit_transform(df_clean[col])
#         le_dict[col] = le
#         print(f"{col} encoding:")
#         for i, val in enumerate(le.classes_):
#             print(f"  {val}: {i}")
#         print()

# # Encode target variable
# le_target = LabelEncoder()
# df_clean['Health_Issues_encoded'] = le_target.fit_transform(df_clean['Health_Issues'])
# print("Health_Issues encoding:")
# for i, val in enumerate(le_target.classes_):
#     print(f"  {val}: {i}")


In [12]:
OUTPUT_PATH = "../data/processed/coffee_health_cleaned.csv"
df.to_csv(OUTPUT_PATH, index=False)

print(f"\nCleaned dataset saved to {OUTPUT_PATH}")
print("Final shape:", df.shape)


Cleaned dataset saved to ../data/processed/coffee_health_cleaned.csv
Final shape: (10008, 16)
