WEEK 2

In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
import matplotlib.pyplot as plt

In [4]:
# Load the raw screen-time dataset into the DataFrame every later cell works on.
# NOTE(review): absolute path; "/content" looks like a hosted-notebook workspace —
# adjust it when running elsewhere.
data=pd.read_csv("/content/Indian_Kids_Screen_Time.csv")

In [5]:
# Preview the first five rows to confirm the file loaded with the expected columns
data.head()

Unnamed: 0,Age,Gender,Avg_Daily_Screen_Time_hr,Primary_Device,Exceeded_Recommended_Limit,Educational_to_Recreational_Ratio,Health_Impacts,Urban_or_Rural
0,14,Male,3.99,Smartphone,True,0.42,"Poor Sleep, Eye Strain",Urban
1,11,Female,4.61,Laptop,True,0.3,Poor Sleep,Urban
2,18,Female,3.73,TV,True,0.32,Poor Sleep,Urban
3,15,Female,1.21,Laptop,False,0.39,,Urban
4,12,Female,5.89,Smartphone,True,0.49,"Poor Sleep, Anxiety",Urban


In [6]:
# Create Age Bands
# pd.cut builds right-closed intervals by default, so these edges produce
# (7, 10], (10, 14] and (14, 18]; ages outside that range are left as NaN.

age_bins = [7, 10, 14, 18]
age_labels = ['Child (8-10)', 'Pre-Teen (11-14)', 'Teenager (15-18)']
data['Age_Band'] = pd.cut(data['Age'], bins=age_bins, labels=age_labels)
print(data['Age_Band'])


0       Pre-Teen (11-14)
1       Pre-Teen (11-14)
2       Teenager (15-18)
3       Teenager (15-18)
4       Pre-Teen (11-14)
              ...       
9707    Teenager (15-18)
9708    Teenager (15-18)
9709    Teenager (15-18)
9710    Teenager (15-18)
9711    Teenager (15-18)
Name: Age_Band, Length: 9712, dtype: category
Categories (3, object): ['Child (8-10)' < 'Pre-Teen (11-14)' < 'Teenager (15-18)']


In [7]:
# Identifying the columns with missing values
# data.isnull().any() flags every column holding at least one null; the resulting
# Index of column names is reused by the imputation loop in the next cell.
missing_cols = data.columns[data.isnull().any()]
print("Columns with missing values:", list(missing_cols))

Columns with missing values: ['Health_Impacts']


In [8]:
# Group-wise mode imputation: every null in a column listed in `missing_cols`
# is replaced by the most frequent value found among rows of the same Age_Band.
# Rows whose Age_Band is itself null are left untouched.
# NOTE(review): the Week 2 summary at the end of this notebook says "None" in
# Health_Impacts was replaced with "No Health Impacts", and health_status() has a
# branch for that label, but this cell fills the gaps with the band mode instead —
# confirm which treatment is intended.
for col in missing_cols:
    for band in data['Age_Band'].unique():
        # unique() also yields the null marker when present; no group to fill from
        if pd.isnull(band):
            continue

        in_band = data['Age_Band'] == band
        band_modes = data.loc[in_band, col].mode()

        # A band whose values are all null has no mode; leave its gaps as they are
        if band_modes.empty:
            continue

        # mode() may return several ties; the first one is used
        fill_value = band_modes[0]
        needs_fill = in_band & data[col].isnull()
        data.loc[needs_fill, col] = fill_value


In [9]:
# Verifying if the missing values are handled
# Prints the per-column count of remaining nulls; every count should be 0 if the
# imputation above covered all gaps.
print("\nMissing values after replacement:\n", data.isnull().sum())


Missing values after replacement:
 Age                                  0
Gender                               0
Avg_Daily_Screen_Time_hr             0
Primary_Device                       0
Exceeded_Recommended_Limit           0
Educational_to_Recreational_Ratio    0
Health_Impacts                       0
Urban_or_Rural                       0
Age_Band                             0
dtype: int64


In [10]:
# Age groups: 8-12 Pre-Teens, 13-16 Teenagers, 17-18 Late Teens
def Age_group(age):
    """Map an age in years to a coarse age-group label.

    Parameters
    ----------
    age : int or float
        Age in years.

    Returns
    -------
    str
        "Pre-Teens" for 8 <= age < 13, "Teenagers" for 13 <= age < 17,
        "Late Teens" for 17 <= age <= 18, and the string "None" (not the
        ``None`` object) for anything else, including NaN.
    """
    # Anything outside 8..18 keeps the "None" label. NaN lands here as well,
    # because every comparison against NaN is False.
    if not (8 <= age <= 18):
        return "None"
    # Contiguous thresholds: fractional ages such as 12.5 or 16.5 no longer fall
    # into the gaps between the integer ranges. Integer ages map exactly as before.
    if age < 13:
        return "Pre-Teens"
    if age < 17:
        return "Teenagers"
    return "Late Teens"

# Label every row with Age_group(age) and preview the result next to Age.
# This is a second banding alongside Age_Band, with different cut-offs
# (8-12 / 13-16 / 17-18 here versus 8-10 / 11-14 / 15-18 for Age_Band).
data['Age_group'] = data['Age'].apply(Age_group)
data[['Age', 'Age_group']].head(5)

Unnamed: 0,Age,Age_group
0,14,Teenagers
1,11,Pre-Teens
2,18,Late Teens
3,15,Teenagers
4,12,Pre-Teens


In [11]:
# Saving the updated dataset in a CSV file
# index=False keeps the DataFrame index out of the file. At this point the frame
# holds the original columns plus Age_Band and Age_group.
data.to_csv("AgeBand_Mode_Filled_Dataset.csv", index=False)
print("\n The Cleaned Dataset is saved as 'AgeBand_Mode_Filled_Dataset.csv'")


 The Cleaned Dataset is saved as 'AgeBand_Mode_Filled_Dataset.csv'


In [12]:
# Preview the frame after imputation and the two age-banding columns were added
data.head()

Unnamed: 0,Age,Gender,Avg_Daily_Screen_Time_hr,Primary_Device,Exceeded_Recommended_Limit,Educational_to_Recreational_Ratio,Health_Impacts,Urban_or_Rural,Age_Band,Age_group
0,14,Male,3.99,Smartphone,True,0.42,"Poor Sleep, Eye Strain",Urban,Pre-Teen (11-14),Teenagers
1,11,Female,4.61,Laptop,True,0.3,Poor Sleep,Urban,Pre-Teen (11-14),Pre-Teens
2,18,Female,3.73,TV,True,0.32,Poor Sleep,Urban,Teenager (15-18),Late Teens
3,15,Female,1.21,Laptop,False,0.39,Poor Sleep,Urban,Teenager (15-18),Teenagers
4,12,Female,5.89,Smartphone,True,0.49,"Poor Sleep, Anxiety",Urban,Pre-Teen (11-14),Pre-Teens


Missing values were treated using a **group-wise mode imputation technique**: the data was segmented by **Age\_Band**, and missing entries were replaced with the most common value within each group to maintain contextual accuracy.

The resulting cleaned dataset was stored as **'AgeBand\_Mode\_Filled\_Dataset.csv'**, making it ready for further analysis, modeling, or reporting tasks.

---



In [13]:
# Device mobility class: handheld/laptop devices are Portable, a TV is Fixed
def device_type(device):
    """Classify a primary-device name as "Portable", "Fixed" or "Other".

    The comparison is exact and case-sensitive: "Smartphone", "Tablet" and
    "Laptop" give "Portable", "TV" gives "Fixed", and every other value
    (including missing values) gives "Other".
    """
    portable_devices = ("Smartphone", "Tablet", "Laptop")
    if device in portable_devices:
        return "Portable"
    return "Fixed" if device == "TV" else "Other"

# Derive Device_Type from Primary_Device row by row and preview both columns
data['Device_Type'] = data['Primary_Device'].apply(device_type)
data[['Primary_Device', 'Device_Type']].head(5)

Unnamed: 0,Primary_Device,Device_Type
0,Smartphone,Portable
1,Laptop,Portable
2,TV,Fixed
3,Laptop,Portable
4,Smartphone,Portable


In [14]:
# Screen-size bucket per device: a TV counts as '>=30', phones, laptops and
# tablets count as '<30'
def screen_size(device):
    """Return the screen-size bucket for a primary-device name.

    "TV" maps to '>=30'; "Smartphone", "Laptop" and "Tablet" map to '<30'.
    Matching is exact and case-sensitive; any other value returns None.
    """
    size_by_device = (
        ('TV', '>=30'),
        ('Smartphone', '<30'),
        ('Laptop', '<30'),
        ('Tablet', '<30'),
    )
    for known_device, bucket in size_by_device:
        if device == known_device:
            return bucket
    # Unrecognised device: no bucket assigned
    return None
# Derive Screen_Size from Primary_Device row by row, then preview it together
# with the Device_Type column created in the previous cell
data['Screen_Size'] = data['Primary_Device'].apply(screen_size)
data[['Primary_Device', 'Device_Type', 'Screen_Size']].head(5)

Unnamed: 0,Primary_Device,Device_Type,Screen_Size
0,Smartphone,Portable,<30
1,Laptop,Portable,<30
2,TV,Fixed,>=30
3,Laptop,Portable,<30
4,Smartphone,Portable,<30


In [15]:
# Collapse the free-text health impacts into: Mental, Physical, both, none, or Unknown
def health_status(impact):
    """Summarise a Health_Impacts entry as a coarse health category.

    The value is converted to a lower-case string and searched for keywords:
    "anxiety" / "poor sleep" count as mental, "eye strain" / "obesity risk"
    count as physical.

    Returns
    -------
    str
        "No Health Impacts" when that phrase appears (checked first, so it
        wins over any keyword), "Mental, Physical" when both kinds appear,
        "Mental" or "Physical" when only one kind appears, and "Unknown"
        when nothing matches (this includes missing values, which become
        the text "nan" or "none").
    """
    text = str(impact).lower()

    if "no health impacts" in text:
        return "No Health Impacts"

    mental_keywords = ("anxiety", "poor sleep")
    physical_keywords = ("eye strain", "obesity risk")

    mental_hit = any(keyword in text for keyword in mental_keywords)
    physical_hit = any(keyword in text for keyword in physical_keywords)

    # No recognised keyword at all
    if not (mental_hit or physical_hit):
        return "Unknown"
    if mental_hit and physical_hit:
        return "Mental, Physical"
    return "Mental" if mental_hit else "Physical"

# Derive Health_Status from Health_Impacts row by row and preview the first ten pairs
data['Health_Status'] = data['Health_Impacts'].apply(health_status)
data[['Health_Impacts','Health_Status']].head(10)

Unnamed: 0,Health_Impacts,Health_Status
0,"Poor Sleep, Eye Strain","Mental, Physical"
1,Poor Sleep,Mental
2,Poor Sleep,Mental
3,Poor Sleep,Mental
4,"Poor Sleep, Anxiety",Mental
5,Poor Sleep,Mental
6,Poor Sleep,Mental
7,Poor Sleep,Mental
8,"Poor Sleep, Anxiety",Mental
9,"Poor Sleep, Obesity Risk","Mental, Physical"


In [16]:
# Save as CSV
# Writes the frame, including the derived Age_Band, Age_group, Device_Type,
# Screen_Size and Health_Status columns, without the index column.
data.to_csv("preprocessed_kids_screen_time.csv", index=False)
print("Dataset saved successfully!")

Dataset saved successfully!


Week 2 Summary – Data Preprocessing

Resolved Inconsistent Entries
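Replaced the value "None" in Health_Impacts with "No Health Impacts" to maintain clarity and consistency. (Reviewer note: the cells in this notebook instead fill missing Health_Impacts entries with the most frequent value of each Age_Band, and the previewed rows show no "No Health Impacts" label — confirm which treatment is intended.)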

Created Derived Features

Age Groups:
8–12 → Pre-Teens
13–16 → Teenagers
17–18 → Late Teens

Device Classification:
Portable → Smartphone, Tablet, Laptop
Fixed → Television

Health Categories:
Mental → anxiety, poor sleep
Physical → eye strain, obesity risk
Mental + Physical → both types of effects present

Introduced New Column
Added Screen_Size field:

Portable devices → less than 30 inches

Fixed devices (TV) → 30 inches or larger

Rearranged Column Order for Readability

Age_group placed next to Age

Device_Type and Screen_Size positioned beside Primary_Device

Health_Status aligned next to Health_Impacts

Saved Processed Dataset for reuse in later stages.