In [1]:
import pandas as pd
import numpy as np


In [2]:
# Load the raw screen-time survey into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact location exists — consider a project-relative path.
RAW_CSV_PATH = "C:\\Users\\devar\\OneDrive\\Desktop\\ScreenSense-kid's screentime visualization\\Indian_Kids_Screen_Time.csv"
df = pd.read_csv(RAW_CSV_PATH)


In [3]:
# Per-column count of missing entries in the raw data.
print(df.isna().sum())


Age                                     0
Gender                                  0
Avg_Daily_Screen_Time_hr                0
Primary_Device                          0
Exceeded_Recommended_Limit              0
Educational_to_Recreational_Ratio       0
Health_Impacts                       3218
Urban_or_Rural                          0
dtype: int64


In [5]:
# Handle missing values safely for future pandas versions.
# Columns are reassigned (rather than filled in place) so no chained-assignment
# behaviour is relied upon.

# Numeric columns: replace gaps with the column median.
for col in df.select_dtypes(include=['int64', 'float64']).columns:
    df[col] = df[col].fillna(df[col].median())

# Text columns: replace gaps with the most frequent value.
# 'string' is listed next to 'object' so columns stored with pandas' dedicated
# string dtype are not silently skipped.
# NOTE(review): Health_Impacts is the only column reporting gaps (3218 rows).
# If those blanks come from a literal "None" label that read_csv parsed as
# missing, mode imputation assigns an impact to children who have none —
# confirm against the raw file.
for col in df.select_dtypes(include=['object', 'string']).columns:
    modes = df[col].mode()
    # mode() is empty when every value is missing; indexing it would raise.
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])


In [18]:
# Normalise Gender labels: trim whitespace, lowercase, then expand
# abbreviations/synonyms to 'male' / 'female'. Other values pass through as-is.
gender_aliases = {'m': 'male', 'f': 'female', 'man': 'male', 'woman': 'female'}
df['Gender'] = (
    df['Gender']
    .str.strip()
    .str.lower()
    .replace(gender_aliases)
)


In [7]:
# Feature engineering
# 1. Age bands: each label covers the interval (previous edge, upper edge].
# NOTE(review): the source file is named for a kids' dataset; if ages stop at
# 18, every row lands in 'Teen' — confirm these bands are the intended ones.
age_band_upper_edges = {
    'Teen': 18,
    'Young Adult': 30,
    'Adult': 45,
    'Middle Age': 60,
    'Senior': 100,
}
bins = [0, *age_band_upper_edges.values()]
labels = list(age_band_upper_edges)
df['Age_Band'] = pd.cut(df['Age'], bins=bins, labels=labels)


In [10]:
# (b) Device flags — mobile / desktop indicators from Primary_Device
if 'Primary_Device' in df.columns:
    # Trim and title-case so differently typed spellings of a device match.
    df['Primary_Device'] = df['Primary_Device'].str.strip().str.title()
    # na=False keeps both flags strictly boolean: a missing device is reported
    # as False instead of propagating a missing value into the flag column.
    # NOTE(review): the flags match the substrings 'Mobile' / 'Desktop' only;
    # verify the device labels in the data actually contain those words.
    df['Is_Mobile_User'] = df['Primary_Device'].str.contains('Mobile', case=False, na=False)
    df['Is_Desktop_User'] = df['Primary_Device'].str.contains('Desktop', case=False, na=False)

In [11]:
# (c) Create a binary flag for exceeding screen limit
if 'Exceeded_Recommended_Limit' in df.columns:
    flag_codes = {'Yes': 1, 'No': 0, 'Y': 1, 'N': 0}

    def _encode_limit_flag(value):
        """Return 1/0 for yes/no labels and booleans; pass anything else through."""
        # Booleans are encoded too, so the column is 1/0 whether the raw file
        # stored text labels or True/False.
        if isinstance(value, (bool, np.bool_)):
            return int(value)
        return flag_codes.get(value, value)

    # map() is used instead of replace(): the result dtype is inferred from the
    # mapped values, so it does not depend on replace()'s deprecated implicit
    # downcasting of object columns.
    df['Exceeded_Recommended_Limit'] = df['Exceeded_Recommended_Limit'].map(_encode_limit_flag)

In [12]:
# (d) Ratio-based derived feature — Educational Screen Share
# A ratio r of educational to recreational time becomes a percentage share
# of the total: r / (1 + r) * 100.
if 'Educational_to_Recreational_Ratio' in df.columns:
    edu_ratio = df['Educational_to_Recreational_Ratio']
    df['Educational_Share_%'] = edu_ratio.div(1 + edu_ratio).mul(100)

In [15]:
# 4. Save the cleaned dataset (row index omitted from the file)
cleaned_csv_name = "cleaned_data.csv"
df.to_csv(cleaned_csv_name, index=False)
print(" Cleaned dataset saved as cleaned_data.csv")


 Cleaned dataset saved as cleaned_data.csv


In [17]:
# 5. Preprocessing summary
# Describes only the steps this notebook actually performs: the cleaned data
# contains Age_Band (there is no ScreenTime_Band column), and text
# standardisation is applied to Gender and Primary_Device.
summary = {
    "Missing Value Strategy": "Filled numeric with median, categorical with mode",
    "Category Cleaning": "Standardized Gender and Primary_Device text cases",
    "Derived Features": [
        "Age_Band from Age",
        "Is_Mobile_User, Is_Desktop_User from Primary_Device",
        "Educational_Share_% from Educational_to_Recreational_Ratio"
    ],
    "Date Formatting": "Not applicable (no date columns)"
}

# Join list-valued entries into one readable string so the CSV cell does not
# contain a Python list repr.
summary_rows = [
    (step, "; ".join(detail) if isinstance(detail, list) else detail)
    for step, detail in summary.items()
]

pd.DataFrame(summary_rows, columns=["Step", "Description"]).to_csv("preprocessing_summary.csv", index=False)
print(" Preprocessing summary saved as preprocessing_summary.csv")

 Preprocessing summary saved as preprocessing_summary.csv


In [19]:
# 6. Feature dictionary
# One entry per engineered column present in the cleaned data. Age_Band is
# documented here because it is the band feature the notebook creates; no
# ScreenTime_Band column is produced.
feature_dict = {
    "Age_Band": "Categorical age band derived from Age (Teen, Young Adult, Adult, Middle Age, Senior)",
    "Is_Mobile_User": "True if the user's primary device is mobile",
    "Is_Desktop_User": "True if the user's primary device is desktop",
    "Exceeded_Recommended_Limit": "1 if screen time exceeds limit, else 0",
    "Educational_Share_%": "Percentage of screen time spent on educational activities"
}

# list(...) gives DataFrame a concrete sequence of (feature, description) rows.
pd.DataFrame(list(feature_dict.items()), columns=["Feature", "Description"]).to_csv("feature_dictionary.csv", index=False)
print("Feature dictionary saved as feature_dictionary.csv")

Feature dictionary saved as feature_dictionary.csv


In [20]:
# Total number of missing cells left in the working frame (all columns).
df.isna().sum().sum()


np.int64(0)

In [22]:
# Reload the saved file and count missing cells to confirm the export is clean.
# The file name uses the exact case it was written with ("cleaned_data.csv"),
# so the read also succeeds on case-sensitive file systems.
df_cleaned = pd.read_csv('cleaned_data.csv')
df_cleaned.isnull().sum().sum()


np.int64(0)

In [24]:
# Re-read the exported file and report missing values overall and per column.
# The file name uses the exact case it was written with ("cleaned_data.csv"),
# so the read also succeeds on case-sensitive file systems.
df_clean = pd.read_csv("cleaned_data.csv")

# Count total missing values
total_missing = df_clean.isnull().sum().sum()
print(f"Total Missing Values: {total_missing}\n")

# Missing values by column
missing_by_column = df_clean.isnull().sum()
print("Missing Values by Column:")
print(missing_by_column)


Total Missing Values: 0

Missing Values by Column:
Age                                  0
Gender                               0
Avg_Daily_Screen_Time_hr             0
Primary_Device                       0
Exceeded_Recommended_Limit           0
Educational_to_Recreational_Ratio    0
Health_Impacts                       0
Urban_or_Rural                       0
Age_Band                             0
Is_Mobile_User                       0
Is_Desktop_User                      0
Educational_Share_%                  0
dtype: int64
