In [None]:
#1. Data Overview
# This dataset contains information about daily water intake and its related factors in individuals.

In [1]:
#2. Load dataset and import necessary libraries
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
#Data source
df = pd.read_csv('../data/raw_Daily_Water_Intake.csv')


In [6]:
#3. Database Shape - Checking the dataset to understand its size and structure
df.shape 

# Analysis
# With 30000 rows and 7 columns, this data set offers a sufficiently large sample size that is suitable for exploratory data analysis (EDA) and pattern identification

(30000, 7)

In [None]:
#4. Data Types & Schema - Get data type information and potential data quality issues
df.info()
df.dtypes

# Analysis
# This dataset contains no missing value, and all columns have appropriate datatypes. This indicate a good data quality and ensuring the dataset is suitable for further analysis without immidiate preprocessing


In [None]:
#5. Table Overview - Get 5 sample rows from the dataframe
df.head()

# Analysis
# The first 5 sample shows a quick overviews of the dataset structure and contents. Every row appear to have a readable value and no obvious issues

Unnamed: 0,Age,Gender,Weight (kg),Daily Water Intake (liters),Physical Activity Level,Weather,Hydration Level
0,56,Male,96,4.23,Moderate,Hot,Good
1,60,Male,105,3.95,High,Normal,Good
2,36,Male,68,2.39,Moderate,Cold,Good
3,19,Female,74,3.13,Moderate,Hot,Good
4,38,Male,77,2.11,Low,Normal,Poor


In [None]:
#6. Column Descriptions - Based on df.head(), df.info() and df.types
df_columnDescription = pd.DataFrame({ "Column Name": [
        "Age",
        "Gender",
        "Weight",
        "Daily Water Intake",
        "Physical Activity Level",
        "Weather",
        "Hydration Level"
    ],
    "Description": [
        "Age of each person",
        "Gender of each person",
        "Weight of each person in kilograms (kg)",
        "Daily water consumption in liters",
        "Physical activity level (Low, Moderate, High)",
        "Weather condition (Hot, Normal, Cold)",
        "Hydration status (Good, Poor)"
    ]}
    )
print(df_columnDescription.to_markdown(tablefmt="grid"))

# Analysis
# This dataset contains demographic, behavorial and environmental features which allow for exploratory and relationship analysis

+----+-------------------------+-----------------------------------------------+
|    | Column Name             | Description                                   |
|  0 | Age                     | Age of each person                            |
+----+-------------------------+-----------------------------------------------+
|  1 | Gender                  | Gender of each person                         |
+----+-------------------------+-----------------------------------------------+
|  2 | Weight                  | Weight of each person in kilograms (kg)       |
+----+-------------------------+-----------------------------------------------+
|  3 | Daily Water Intake      | Daily water consumption in liters             |
+----+-------------------------+-----------------------------------------------+
|  4 | Physical Activity Level | Physical activity level (Low, Moderate, High) |
+----+-------------------------+-----------------------------------------------+
|  5 | Weather              

In [None]:
#5. Data distribution - Summaries of numerical and categorical column
df.describe()
df.describe(include='object')

# Analysis
# Both numerical and categorical variables contain valid and well-distributed values. No obvious anomalies or extreme outliers were identified during the data understanding stage indicating the data is suitable for further analysis.

Unnamed: 0,Gender,Physical Activity Level,Weather,Hydration Level
count,30000,30000,30000,30000
unique,2,3,3,2
top,Male,High,Hot,Good
freq,15032,10069,10081,23915


In [None]:
#6. Nulls - Check for missing values in each column to ensures data completeness before analysis
df.isna().sum()
df[df.isna().any(axis=1)]

# Summary of missing values
missing_summary = pd.DataFrame({
    "Number of missing values": df.isna().sum(),
    "Percentage of missing values": df.isna().mean() * 100
}).sort_values(by="Number of missing values", ascending=False)

missing_summary

# Analysis
# There are no nulls found in this dataset which reduces the risk of bias
# caused by missing data and simplifies further analysis.

Unnamed: 0,Number of missing values,Percentage of missing values
Age,0,0.0
Gender,0,0.0
Weight (kg),0,0.0
Daily Water Intake (liters),0,0.0
Physical Activity Level,0,0.0
Weather,0,0.0
Hydration Level,0,0.0


In [None]:
#7. Duplicate - Check for duplicate rows in the dataframe
df.duplicated().sum() #Assign dataframe to duplicate and sum method
# There are 338 duplicated rows

# Show duplicated rows
df[df.duplicated()] #By assigning the duplicated rows to a new dataframe
# The cleaning step will not be performed now, it will be done in the data cleaning notebook.

duplicate_summary = pd.DataFrame({
    "Number of duplicated rows" :[df.duplicated().sum()],
    "Percentage of duplicated rows": [df.duplicated().mean() * 100]
})

duplicate_summary

# Analysis
# This dataset contains 338 duplicated rows, about 1,12% of the whole data. This finding enables decision on data cleaning to prevent biased count and misleading analysis result

Unnamed: 0,Number of duplicated rows,Percentage of duplicated rows
0,338,1.126667


In [None]:
#8. Data Summary - Summary of findings from the data understanding step