# Data Cleaning & Preprocessing

## Objective
Clean and preprocess the raw lifestyle dataset to make it suitable for EDA and machine learning modeling.

**Output:** `cleaned_data.csv`

In [None]:
import pandas as pd
import numpy as np

# Show every column when a DataFrame is displayed (None removes the column cap).
pd.options.display.max_columns = None

## Load Dataset

In [None]:
# Update path if needed
raw_path = 'raw_data.csv'
df = pd.read_csv(raw_path)

# Preview the first rows to confirm the file parsed as expected.
df.head()

## Basic Dataset Inspection

In [None]:
# Print the column names, non-null counts, dtypes and memory usage of df.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

## Handling Data Types

In [None]:
# Parse the Date column into pandas datetime values.
parsed_dates = pd.to_datetime(df['Date'])
df['Date'] = parsed_dates

# Wake_Up_Time is parsed as HH:MM, then only the time-of-day part is kept
# (the column ends up holding datetime.time objects).
wake_stamps = pd.to_datetime(df['Wake_Up_Time'], format='%H:%M')
df['Wake_Up_Time'] = wake_stamps.dt.time

# Display the resulting dtypes to verify both conversions.
df.dtypes

## Missing Values & Duplicates

In [None]:
# Count missing (null/NaN) values in each column.
# NOTE(review): this only reports the counts; no visible cell fills or drops
# missing values, so confirm the result is zero before relying on the export.
df.isnull().sum()

In [None]:
# Count rows that are exact repeats of an earlier row (all columns equal).
# NOTE(review): this only reports the count; no visible cell removes
# duplicates, so confirm the result is zero before relying on the export.
df.duplicated().sum()

## Drop Non-Predictive Column

In [None]:
# User_ID is an identifier, not useful for prediction, so it is removed from
# df in place before the modeling-oriented steps below.
# errors='ignore' keeps this cell safe to re-run: without it, a second
# execution raises KeyError because the column has already been dropped.
df.drop(columns=['User_ID'], inplace=True, errors='ignore')

## Outlier Treatment (Capping using IQR)

In [None]:
numerical_cols = ['Sleep_Hours', 'Steps', 'Calories_Burned', 'Water_Intake_ml', 'Study_Hours']

# Cap each listed column at 1.5 * IQR beyond its quartiles: values outside
# that range are pulled to the nearest bound rather than being dropped, so
# the row count is unchanged.
for col in numerical_cols:
    q1, q3 = df[col].quantile([0.25, 0.75])
    spread = q3 - q1
    df[col] = df[col].clip(lower=q1 - 1.5 * spread, upper=q3 + 1.5 * spread)

# Re-display the summary statistics to see the effect of the capping.
df.describe()

## Final Cleaned Dataset

In [None]:
# Preview the first rows of df after the type conversions, column drop and capping.
df.head()

## Export Cleaned Data

In [None]:
# Write the cleaned frame to disk; index=False keeps the row index out of the
# file so it does not come back as an extra column when the CSV is reloaded.
output_path = 'cleaned_data.csv'
df.to_csv(output_path, index=False)
print('cleaned_data.csv exported successfully')