In [13]:
import numpy as np

In [14]:
import pandas as pd

In [15]:
import matplotlib.pyplot as plt

In [16]:
# Load the raw screen-time survey into a DataFrame.
# NOTE(review): absolute "/content/..." path looks like a Colab upload location — TODO make configurable.
# NOTE(review): read_csv's default NA parsing converts the literal string "None" to NaN.
# If the raw file uses "None" in Health_Impacts to mean "no impact", those rows will
# show up as missing values downstream — confirm against the raw CSV.
csv_path = "/content/Indian_Kids_Screen_Time.csv"
data = pd.read_csv(csv_path)

In [17]:
# Preview the first five rows to check column names and value formats.
data.head(n=5)

Unnamed: 0,Age,Gender,Avg_Daily_Screen_Time_hr,Primary_Device,Exceeded_Recommended_Limit,Educational_to_Recreational_Ratio,Health_Impacts,Urban_or_Rural
0,14,Male,3.99,Smartphone,True,0.42,"Poor Sleep, Eye Strain",Urban
1,11,Female,4.61,Laptop,True,0.3,Poor Sleep,Urban
2,18,Female,3.73,TV,True,0.32,Poor Sleep,Urban
3,15,Female,1.21,Laptop,False,0.39,,Urban
4,12,Female,5.89,Smartphone,True,0.49,"Poor Sleep, Anxiety",Urban


In [18]:
# Element-wise missing-value mask: True wherever a cell is NaN (isna is the same method as isnull).
data.isna()

Unnamed: 0,Age,Gender,Avg_Daily_Screen_Time_hr,Primary_Device,Exceeded_Recommended_Limit,Educational_to_Recreational_Ratio,Health_Impacts,Urban_or_Rural
0,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,True,False
4,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...
9707,False,False,False,False,False,False,False,False
9708,False,False,False,False,False,False,False,False
9709,False,False,False,False,False,False,False,False
9710,False,False,False,False,False,False,False,False


In [19]:
# Flag every row that exactly repeats an earlier row; the first occurrence stays False.
data.duplicated(keep="first")

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,False
...,...
9707,False
9708,False
9709,False
9710,False


In [20]:
# Number of rows that exactly repeat an earlier row.
# NOTE(review): these duplicates are never dropped in the cells shown here —
# confirm whether keeping them is intentional before further analysis.
data.duplicated().sum()

np.int64(44)

In [21]:
# Pairwise Pearson correlation; numeric_only=True leaves out the object (text) columns.
data.corr(method="pearson", numeric_only=True)

Unnamed: 0,Age,Avg_Daily_Screen_Time_hr,Exceeded_Recommended_Limit,Educational_to_Recreational_Ratio
Age,1.0,0.118328,0.159173,-0.488617
Avg_Daily_Screen_Time_hr,0.118328,1.0,0.66495,-0.087552
Exceeded_Recommended_Limit,0.159173,0.66495,1.0,-0.126643
Educational_to_Recreational_Ratio,-0.488617,-0.087552,-0.126643,1.0


In [22]:
# Structural summary: row count, per-column non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9712 entries, 0 to 9711
Data columns (total 8 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Age                                9712 non-null   int64  
 1   Gender                             9712 non-null   object 
 2   Avg_Daily_Screen_Time_hr           9712 non-null   float64
 3   Primary_Device                     9712 non-null   object 
 4   Exceeded_Recommended_Limit         9712 non-null   bool   
 5   Educational_to_Recreational_Ratio  9712 non-null   float64
 6   Health_Impacts                     6494 non-null   object 
 7   Urban_or_Rural                     9712 non-null   object 
dtypes: bool(1), float64(2), int64(1), object(4)
memory usage: 540.7+ KB


In [23]:
# Confirm the storage type of the Age column before binning it.
data["Age"].dtype


dtype('int64')

In [24]:
# Interquartile range (Q3 - Q1) of every numeric column; boolean and text columns are not selected.
numerical_data = data.select_dtypes(include=np.number)

# Lower and upper quartiles, one value per numeric column.
Q1, Q3 = (numerical_data.quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

print("IQR values:\n", IQR)

IQR values:
 Age                                  6.00
Avg_Daily_Screen_Time_hr             1.97
Educational_to_Recreational_Ratio    0.11
dtype: float64




---



WEEK 2

In [25]:
# Create Age Bands
# pd.cut uses right-closed intervals, so these edges produce (7, 10], (10, 14] and (14, 18];
# any Age outside that range receives NaN instead of a label.
age_labels = ['Child (8-10)', 'Pre-Teen (11-14)', 'Teenager (15-18)']
age_bins = [7, 10, 14, 18]

data['Age_Band'] = pd.cut(
    x=data['Age'],
    bins=age_bins,
    labels=age_labels,
    right=True,
)
print(data['Age_Band'])


0       Pre-Teen (11-14)
1       Pre-Teen (11-14)
2       Teenager (15-18)
3       Teenager (15-18)
4       Pre-Teen (11-14)
              ...       
9707    Teenager (15-18)
9708    Teenager (15-18)
9709    Teenager (15-18)
9710    Teenager (15-18)
9711    Teenager (15-18)
Name: Age_Band, Length: 9712, dtype: category
Categories (3, object): ['Child (8-10)' < 'Pre-Teen (11-14)' < 'Teenager (15-18)']


In [26]:
# Collect the names of the columns that contain at least one missing entry.
has_missing = data.isna().any(axis=0)
missing_cols = data.columns[has_missing]
print("Columns with missing values:", missing_cols.tolist())

Columns with missing values: ['Health_Impacts']


In [28]:
# Fill each missing entry with the most frequent value observed in the same Age_Band.
for col in missing_cols:
    # dropna() skips the NaN band: rows without an Age_Band have no group to impute from.
    for band in data['Age_Band'].dropna().unique():
        in_band = data['Age_Band'] == band
        band_modes = data.loc[in_band, col].mode()

        if band_modes.empty:
            # Every value of this column is missing within the band, so nothing can be imputed.
            continue

        # mode() returns tied values in sorted order; the first one is used as the fill value.
        needs_fill = in_band & data[col].isnull()
        data.loc[needs_fill, col] = band_modes.iloc[0]


In [29]:
# Verify the imputation: per-column count of values that are still missing.
remaining_missing = data.isna().sum()
print("\nMissing values after replacement:\n", remaining_missing)


Missing values after replacement:
 Age                                  0
Gender                               0
Avg_Daily_Screen_Time_hr             0
Primary_Device                       0
Exceeded_Recommended_Limit           0
Educational_to_Recreational_Ratio    0
Health_Impacts                       0
Urban_or_Rural                       0
Age_Band                             0
dtype: int64


In [31]:
# Persist the imputed frame (including the Age_Band column) to CSV in the working directory;
# index=False keeps the row index out of the file.
data.to_csv(
    path_or_buf="AgeBand_Mode_Filled_Dataset.csv",
    index=False,
)
print("\n The Cleaned Dataset is saved as 'AgeBand_Mode_Filled_Dataset.csv'")


 The Cleaned Dataset is saved as 'AgeBand_Mode_Filled_Dataset.csv'


In [32]:
# Preview the first five rows again, now with Age_Band added and Health_Impacts filled in.
data.head(n=5)

Unnamed: 0,Age,Gender,Avg_Daily_Screen_Time_hr,Primary_Device,Exceeded_Recommended_Limit,Educational_to_Recreational_Ratio,Health_Impacts,Urban_or_Rural,Age_Band
0,14,Male,3.99,Smartphone,True,0.42,"Poor Sleep, Eye Strain",Urban,Pre-Teen (11-14)
1,11,Female,4.61,Laptop,True,0.3,Poor Sleep,Urban,Pre-Teen (11-14)
2,18,Female,3.73,TV,True,0.32,Poor Sleep,Urban,Teenager (15-18)
3,15,Female,1.21,Laptop,False,0.39,Poor Sleep,Urban,Teenager (15-18)
4,12,Female,5.89,Smartphone,True,0.49,"Poor Sleep, Anxiety",Urban,Pre-Teen (11-14)


Missing values were treated using a **group-wise mode imputation technique**: the data was segmented by **Age\_Band**, and missing entries were replaced with the most common value within each group to maintain contextual accuracy.

The resulting cleaned dataset was stored as **'AgeBand\_Mode\_Filled\_Dataset.csv'**, making it ready for further analysis, modeling, or reporting tasks.

---

