In [1]:
# Imports
import pandas as pd
import numpy as np
from pathlib import Path


# Location of the raw dataset; the cleaned copy is later written next to it.
file_path = Path(r"D:\Infosys SpringBoard\data kaggle\Indian_Kids_Screen_Time.csv")
df = pd.read_csv(file_path)

# Quick sanity check: dimensions, first rows, and the dtypes pandas inferred.
print("Shape:", df.shape)
display(df.head(), df.dtypes)


Shape: (9712, 8)


Unnamed: 0,Age,Gender,Avg_Daily_Screen_Time_hr,Primary_Device,Exceeded_Recommended_Limit,Educational_to_Recreational_Ratio,Health_Impacts,Urban_or_Rural
0,14,Male,3.99,Smartphone,True,0.42,"Poor Sleep, Eye Strain",Urban
1,11,Female,4.61,Laptop,True,0.3,Poor Sleep,Urban
2,18,Female,3.73,TV,True,0.32,Poor Sleep,Urban
3,15,Female,1.21,Laptop,False,0.39,,Urban
4,12,Female,5.89,Smartphone,True,0.49,"Poor Sleep, Anxiety",Urban


Age                                    int64
Gender                                object
Avg_Daily_Screen_Time_hr             float64
Primary_Device                        object
Exceeded_Recommended_Limit              bool
Educational_to_Recreational_Ratio    float64
Health_Impacts                        object
Urban_or_Rural                        object
dtype: object

In [2]:
# Discard rows that are exact copies of an earlier row; the first occurrence
# of each is kept (the default behaviour of drop_duplicates).
df = df.drop_duplicates()
print("After dropping exact duplicates, shape:", df.shape)
df.head()

After dropping exact duplicates, shape: (9668, 8)


Unnamed: 0,Age,Gender,Avg_Daily_Screen_Time_hr,Primary_Device,Exceeded_Recommended_Limit,Educational_to_Recreational_Ratio,Health_Impacts,Urban_or_Rural
0,14,Male,3.99,Smartphone,True,0.42,"Poor Sleep, Eye Strain",Urban
1,11,Female,4.61,Laptop,True,0.3,Poor Sleep,Urban
2,18,Female,3.73,TV,True,0.32,Poor Sleep,Urban
3,15,Female,1.21,Laptop,False,0.39,,Urban
4,12,Female,5.89,Smartphone,True,0.49,"Poor Sleep, Anxiety",Urban


In [3]:
# Trim whitespace and lower-case the values of every string (object-dtype) column.
str_cols = df.select_dtypes(include=['object']).columns.tolist()
for c in str_cols:
    # astype(str) turns missing values into the literal 'nan'; map those back to NaN.
    df[c] = df[c].astype(str).str.strip().replace({'nan': np.nan})
    # Lower-case only the non-missing entries.
    df[c] = df[c].where(df[c].isna(), df[c].str.lower())


# Only the *values* were lower-cased above; column names keep their original
# capitalisation, so the column to look for is 'Gender' (not 'gender').
if 'Gender' in df.columns:
    # Collapse common spellings onto the two canonical labels.
    gender_map = {
        'boy': 'male', 'm': 'male', 'male': 'male',
        'girl': 'female', 'f': 'female', 'female': 'female'
    }
    df['Gender'] = df['Gender'].map(gender_map).fillna(df['Gender'])  # keep others unchanged
    print("Gender value counts after mapping:")
    display(df['Gender'].value_counts(dropna=False))
df.head(20)

Unnamed: 0,Age,Gender,Avg_Daily_Screen_Time_hr,Primary_Device,Exceeded_Recommended_Limit,Educational_to_Recreational_Ratio,Health_Impacts,Urban_or_Rural
0,14,male,3.99,smartphone,True,0.42,"poor sleep, eye strain",urban
1,11,female,4.61,laptop,True,0.3,poor sleep,urban
2,18,female,3.73,tv,True,0.32,poor sleep,urban
3,15,female,1.21,laptop,False,0.39,,urban
4,12,female,5.89,smartphone,True,0.49,"poor sleep, anxiety",urban
5,14,female,4.88,smartphone,True,0.44,poor sleep,urban
6,17,male,2.97,tv,False,0.48,,rural
7,10,male,2.74,tv,True,0.54,,urban
8,14,male,4.61,laptop,True,0.36,"poor sleep, anxiety",rural
9,18,male,3.24,tablet,True,0.48,"poor sleep, obesity risk",urban


# Drop rows where Avg_Daily_Screen_Time_hr is 0

In [4]:
# Keep only the rows whose average daily screen time is not zero.
nonzero_screen_time = df["Avg_Daily_Screen_Time_hr"].ne(0)
df = df.loc[nonzero_screen_time]

print("New shape:", df.shape)
display(df.head(20))

New shape: (9474, 8)


Unnamed: 0,Age,Gender,Avg_Daily_Screen_Time_hr,Primary_Device,Exceeded_Recommended_Limit,Educational_to_Recreational_Ratio,Health_Impacts,Urban_or_Rural
0,14,male,3.99,smartphone,True,0.42,"poor sleep, eye strain",urban
1,11,female,4.61,laptop,True,0.3,poor sleep,urban
2,18,female,3.73,tv,True,0.32,poor sleep,urban
3,15,female,1.21,laptop,False,0.39,,urban
4,12,female,5.89,smartphone,True,0.49,"poor sleep, anxiety",urban
5,14,female,4.88,smartphone,True,0.44,poor sleep,urban
6,17,male,2.97,tv,False,0.48,,rural
7,10,male,2.74,tv,True,0.54,,urban
8,14,male,4.61,laptop,True,0.36,"poor sleep, anxiety",rural
9,18,male,3.24,tablet,True,0.48,"poor sleep, obesity risk",urban


# Deriving new fields

In [5]:
# Split total daily screen time into recreational and educational hours.
# With ratio = educational / recreational and total = educational + recreational,
# recreational = total / (ratio + 1) and educational is the remainder.
total_hr = df["Avg_Daily_Screen_Time_hr"]
recreational_hr = total_hr / (df["Educational_to_Recreational_Ratio"] + 1)
educational_hr = total_hr - recreational_hr

# Store both derived columns rounded to two decimals.
df["Recreational_Time_hr"] = recreational_hr.round(2)
df["Educational_Time_hr"] = educational_hr.round(2)

df.head()


Unnamed: 0,Age,Gender,Avg_Daily_Screen_Time_hr,Primary_Device,Exceeded_Recommended_Limit,Educational_to_Recreational_Ratio,Health_Impacts,Urban_or_Rural,Recreational_Time_hr,Educational_Time_hr
0,14,male,3.99,smartphone,True,0.42,"poor sleep, eye strain",urban,2.81,1.18
1,11,female,4.61,laptop,True,0.3,poor sleep,urban,3.55,1.06
2,18,female,3.73,tv,True,0.32,poor sleep,urban,2.83,0.9
3,15,female,1.21,laptop,False,0.39,,urban,0.87,0.34
4,12,female,5.89,smartphone,True,0.49,"poor sleep, anxiety",urban,3.95,1.94


# Converting screen time from decimal hours to hh:mm format

In [6]:

def decimal_hours_to_hhmm(decimal_hours):
    """Convert a duration in decimal hours to an ``h:mm`` string.

    Parameters
    ----------
    decimal_hours : float or int
        Duration in hours, e.g. ``3.99``.

    Returns
    -------
    str or float
        The duration rounded to the nearest minute and formatted as ``h:mm``
        (``3.99`` -> ``"3:59"``). Negative durations carry a leading ``-``.
        ``np.nan`` is returned for missing, non-numeric or infinite input.
    """
    if pd.isna(decimal_hours):
        return np.nan
    try:
        total_minutes = int(round(decimal_hours * 60))
    except (TypeError, ValueError, OverflowError):
        # Non-numeric values (e.g. strings) and infinities cannot be
        # expressed as a whole number of minutes.
        return np.nan
    # Split the magnitude rather than the signed value: floor division on a
    # negative number would turn -90 minutes into "-2:30" instead of "-1:30".
    sign = "-" if total_minutes < 0 else ""
    hours, minutes = divmod(abs(total_minutes), 60)
    return f"{sign}{hours:d}:{minutes:02d}"

if 'Avg_Daily_Screen_Time_hr' in df.columns:
    # Convert only while the column is still numeric. Re-running this cell on
    # the already-converted "h:mm" strings would make the converter return NaN
    # for every row and silently wipe the column.
    if pd.api.types.is_numeric_dtype(df['Avg_Daily_Screen_Time_hr']):
        df['Avg_Daily_Screen_Time_hr'] = df['Avg_Daily_Screen_Time_hr'].apply(decimal_hours_to_hhmm)
        print("Screen time converted from decimal hours to hh:mm format:")
    else:
        print("Avg_Daily_Screen_Time_hr is not numeric (already converted?); skipping hh:mm conversion.")
    display(df[['Avg_Daily_Screen_Time_hr']].head(10))
    
 

Screen time converted from decimal hours to hh:mm format:


Unnamed: 0,Avg_Daily_Screen_Time_hr
0,3:59
1,4:37
2,3:44
3,1:13
4,5:53
5,4:53
6,2:58
7,2:44
8,4:37
9,3:14


In [7]:
# Bucket ages into labelled bands. Bin edges are right-inclusive:
# (5, 8] -> '6–8', (8, 11] -> '9–11', and so on.
age_bins = [0, 5, 8, 11, 14, 18]
age_labels = ['0–5', '6–8', '9–11', '12–14', '15–18']

# include_lowest=True closes the first interval on the left, so Age == 0 lands
# in the '0–5' band as its label promises instead of becoming NaN.
df['Age_Band'] = pd.cut(
    df['Age'],
    bins=age_bins,
    labels=age_labels,
    right=True,
    include_lowest=True,
)
df.head()

Unnamed: 0,Age,Gender,Avg_Daily_Screen_Time_hr,Primary_Device,Exceeded_Recommended_Limit,Educational_to_Recreational_Ratio,Health_Impacts,Urban_or_Rural,Recreational_Time_hr,Educational_Time_hr,Age_Band
0,14,male,3:59,smartphone,True,0.42,"poor sleep, eye strain",urban,2.81,1.18,12–14
1,11,female,4:37,laptop,True,0.3,poor sleep,urban,3.55,1.06,9–11
2,18,female,3:44,tv,True,0.32,poor sleep,urban,2.83,0.9,15–18
3,15,female,1:13,laptop,False,0.39,,urban,0.87,0.34,15–18
4,12,female,5:53,smartphone,True,0.49,"poor sleep, anxiety",urban,3.95,1.94,12–14


In [8]:

# Summarise the cleaned frame: dimensions and per-column missing-value counts,
# largest first.
print("Final shape:", df.shape)
print("Final missing counts:")
missing_counts = df.isna().sum().sort_values(ascending=False)
display(missing_counts.head(30))

# Write the cleaned data next to the source file, appending "_week2" to its stem.
clean_path = file_path.with_name(file_path.stem + "_week2.csv")
df.to_csv(clean_path, index=False)
print("Cleaned file saved to:", clean_path)


Final shape: (9474, 11)
Final missing counts:


Health_Impacts                       2986
Gender                                  0
Age                                     0
Avg_Daily_Screen_Time_hr                0
Primary_Device                          0
Exceeded_Recommended_Limit              0
Educational_to_Recreational_Ratio       0
Urban_or_Rural                          0
Recreational_Time_hr                    0
Educational_Time_hr                     0
Age_Band                                0
dtype: int64

Cleaned file saved to: D:\Infosys SpringBoard\data kaggle\Indian_Kids_Screen_Time_week2.csv
