<a href="https://colab.research.google.com/github/springboardmentor5432x/ScreenSense-Kids-Screentime-Visualization/blob/Haripriya-Mahajan-Work/Milestone1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os

import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
# Read the raw screen-time CSV into the working DataFrame.
# The path points at the Colab runtime's /content directory.
file_path = r"/content/Indian_Kids_Screen_Time.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Report the dataset dimensions as a (row count, column count) tuple
print("Dataset Shape:", (len(df.index), len(df.columns)))

Dataset Shape: (9712, 8)


In [None]:
# Column Names
# sep="" stops print from inserting a space between the label's trailing
# newline and the list, which otherwise indents the list by one character.
print("Column Names:\n", df.columns.tolist(), sep="")

Column Names:
 ['Age', 'Gender', 'Avg_Daily_Screen_Time_hr', 'Primary_Device', 'Exceeded_Recommended_Limit', 'Educational_to_Recreational_Ratio', 'Health_Impacts', 'Urban_or_Rural']


In [None]:
# Data types of each column
# sep="" stops print from inserting a space after the label's newlines,
# which otherwise pushes the first row of the dtype table out of alignment.
print("Data Types:\n\n", df.dtypes, sep="")

Data Types:

 Age                                    int64
Gender                                object
Avg_Daily_Screen_Time_hr             float64
Primary_Device                        object
Exceeded_Recommended_Limit              bool
Educational_to_Recreational_Ratio    float64
Health_Impacts                        object
Urban_or_Rural                        object
dtype: object


In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns
print("Numeric Summary:\n")
display(df.describe(percentiles=[0.25, 0.5, 0.75]))

Numeric Summary:



Unnamed: 0,Age,Avg_Daily_Screen_Time_hr,Educational_to_Recreational_Ratio
count,9712.0,9712.0,9712.0
mean,12.979201,4.352837,0.427226
std,3.162437,1.718232,0.073221
min,8.0,0.0,0.3
25%,10.0,3.41,0.37
50%,13.0,4.44,0.43
75%,16.0,5.38,0.48
max,18.0,13.89,0.6


In [None]:
# Count / unique / top / freq for the object-typed (text) columns only;
# the boolean Exceeded_Recommended_Limit column is not part of this selection.
print("Categorical Summary:\n\n")
display(df.describe(include=['object']))

Categorical Summary:




Unnamed: 0,Gender,Primary_Device,Health_Impacts,Urban_or_Rural
count,9712,9712,6494,9712
unique,2,4,15,2
top,Male,Smartphone,Poor Sleep,Urban
freq,4942,4568,2268,6851


In [None]:
# Check for null/missing values
# sep="" stops print from inserting a space after the label's newlines,
# which otherwise pushes the first row of the counts out of alignment.
print("Missing Values per Column:\n\n", df.isnull().sum(), sep="")

Missing Values per Column:

 Age                                     0
Gender                                  0
Avg_Daily_Screen_Time_hr                0
Primary_Device                          0
Exceeded_Recommended_Limit              0
Educational_to_Recreational_Ratio       0
Health_Impacts                       3218
Urban_or_Rural                          0
dtype: int64


In [None]:
# Fill missing Health_Impacts from the Exceeded_Recommended_Limit flag:
#   limit exceeded     -> 'Potential Risk'
#   limit not exceeded -> 'Healthy'
# Rows that already carry a Health_Impacts value are left untouched.
df['Health_Impacts'] = df['Health_Impacts'].fillna(
    df['Exceeded_Recommended_Limit'].map({True: 'Potential Risk', False: 'Healthy'})
)

# Number of Health_Impacts values still missing after the fill (if any)
df['Health_Impacts'].isna().sum()

np.int64(0)

In [None]:
# Normalise the free-text category columns: trim surrounding whitespace and
# convert to Title Case so differently-cased spellings collapse into one value.
# NOTE(review): str.title() also lowercases acronyms (e.g. 'TV' -> 'Tv') - confirm this is acceptable.
for col in ('Gender', 'Primary_Device', 'Urban_or_Rural'):
    df[col] = df[col].str.strip().str.title()

In [None]:
# Creating age groups
def categorize_age(age):
    """Map an age in years to an age-group label.

    Bands:
        8 up to (not including) 12   -> 'Kids'
        12 up to (not including) 15  -> 'Pre-Teens'
        15 and above                 -> 'Teens'

    Half-open thresholds are used so that fractional ages (e.g. 11.5) land in
    the band they belong to instead of falling through to 'Teens'.

    Parameters
    ----------
    age : int or float
        Age in years.

    Returns
    -------
    str or None
        The group label, or None when the age is missing (NaN/None) or below 8,
        so that out-of-range rows show up as missing rather than being
        silently labelled 'Teens'.
    """
    # NaN compares False against every threshold, so it must be caught explicitly.
    if pd.isna(age) or age < 8:
        return None
    if age < 12:
        return 'Kids'
    if age < 15:
        return 'Pre-Teens'
    return 'Teens'

# Label every row with its age band
df['Age_Group'] = df['Age'].apply(categorize_age)

# Checking Age Group distribution
# sep="" stops print from inserting spaces around the newlines, which
# otherwise indents the first output row and leaves a trailing space after the last.
print("Age Group Distribution:\n", df['Age_Group'].value_counts(), "\n", sep="")

Age Group Distribution:
 Age_Group
Kids         3540
Teens        3499
Pre-Teens    2673
Name: count, dtype: int64 



In [None]:
# Bucket average daily screen time (hours) into four usage bands.
# With right=False the intervals are left-closed: [0, 2), [2, 5), [5, 8), [8, 24).
# Values outside 0 <= x < 24 would receive no category (NaN).
bins = [0, 2, 5, 8, 24]  # 24 is only an upper cap so every possible value is covered
labels = ["Low Usage", "Moderate Usage", "High Usage", "Very High Usage"]

df["ScreenTime_Category"] = pd.cut(
    x=df["Avg_Daily_Screen_Time_hr"],
    bins=bins,
    right=False,
    labels=labels,
)

# How many rows fall into each usage band
print(df["ScreenTime_Category"].value_counts())

ScreenTime_Category
Moderate Usage     5588
High Usage         3123
Low Usage           826
Very High Usage     175
Name: count, dtype: int64


In [None]:
# Bucket the educational-to-recreational ratio into three bands.
# With right=False the intervals are left-closed: [0.3, 0.4), [0.4, 0.5), [0.5, 0.61).
# The last edge is 0.61 rather than 0.6 so that a ratio of exactly 0.6 is still included.
# Ratios outside 0.3 <= x < 0.61 would receive no category (NaN).
bins = [0.3, 0.4, 0.5, 0.61]
labels = ["Mostly Recreational", "Balanced Usage", "Mostly Educational"]

df["EduRec_Category"] = pd.cut(
    x=df["Educational_to_Recreational_Ratio"],
    bins=bins,
    right=False,
    labels=labels,
)

# How many rows fall into each ratio band
print(df["EduRec_Category"].value_counts())

EduRec_Category
Balanced Usage         4726
Mostly Recreational    3389
Mostly Educational     1597
Name: count, dtype: int64


In [None]:
# One-Hot Encoding for Health_Impacts column (a row may list several impacts)

# Split each comma-separated string into a list of trimmed labels.
# Held in a standalone Series so no temporary helper column has to be added
# to df and dropped again afterwards.
# Assumes Health_Impacts has no missing values left (they are filled in the
# imputation cell earlier in the notebook); a NaN here would fail on .split.
health_impact_lists = df["Health_Impacts"].apply(
    lambda x: [i.strip() for i in x.split(",")]
)

# Use MultiLabelBinarizer to create one indicator column per distinct label
mlb = MultiLabelBinarizer()
health_encoded = pd.DataFrame(
    mlb.fit_transform(health_impact_lists),
    columns=[f"HealthImpact_{c}" for c in mlb.classes_],
    # Reuse df's index: concat(axis=1) aligns on index labels, so a fresh
    # 0..n-1 index would misalign rows whenever df's index is not the default.
    index=df.index,
)

# Combine the new one-hot columns with the original dataframe.
# Any HealthImpact_* columns left over from a previous run of this cell are
# dropped first, so re-executing the cell does not create duplicate columns.
df = pd.concat(
    [df.drop(columns=health_encoded.columns, errors="ignore"), health_encoded],
    axis=1,
)

In [None]:
# Per-column count of values that are still missing after cleaning and feature creation
print(df.isna().sum(), "\n")

Age                                  0
Gender                               0
Avg_Daily_Screen_Time_hr             0
Primary_Device                       0
Exceeded_Recommended_Limit           0
Educational_to_Recreational_Ratio    0
Health_Impacts                       0
Urban_or_Rural                       0
Age_Group                            0
ScreenTime_Category                  0
EduRec_Category                      0
HealthImpact_Anxiety                 0
HealthImpact_Eye Strain              0
HealthImpact_Healthy                 0
HealthImpact_Obesity Risk            0
HealthImpact_Poor Sleep              0
HealthImpact_Potential Risk          0
dtype: int64 



In [None]:
# Ensure the output folder exists (no error if it is already there)
os.makedirs(name="data/processed", exist_ok=True)

# Write the cleaned dataset to the processed folder, omitting the index column
df.to_csv(path_or_buf="data/processed/indian_kids_screentime_cleaned.csv", index=False)