In [108]:
import pandas as pd
import numpy as np

In [109]:
# Read the raw screen-time survey into a DataFrame.
# NOTE(review): by default read_csv parses literal strings such as "NA", "null"
# and (in recent pandas versions) "None" as missing values. If the file uses such
# a string to mean "no health impact", those rows will surface as NaN — confirm
# against the raw CSV.
RAW_CSV_PATH = "Indian_Kids_Screen_Time.csv"
df = pd.read_csv(RAW_CSV_PATH)

In [110]:
# Count the NaN entries in every column before any cleaning is applied.
missing_before = df.isna().sum()
print("Missing values before cleaning:\n", missing_before)

Missing values before cleaning:
 Age                                     0
Gender                                  0
Avg_Daily_Screen_Time_hr                0
Primary_Device                          0
Exceeded_Recommended_Limit              0
Educational_to_Recreational_Ratio       0
Health_Impacts                       3218
Urban_or_Rural                          0
dtype: int64


In [111]:
# Impute missing entries in the categorical (object-dtype) columns with each
# column's most frequent value.
# NOTE(review): if the gaps in Health_Impacts actually stand for "no health
# impact" (e.g. a literal "None" in the CSV that read_csv parsed as NaN), mode
# imputation assigns a real impact to children who reported none — confirm
# against the raw file before relying on this column.
cat_cols = df.select_dtypes(include='object').columns
for col in cat_cols:
    modes = df[col].mode()
    # mode() drops NaN by default, so an entirely-missing column has no mode;
    # leave such a column untouched instead of failing on the first-element lookup.
    if modes.empty:
        continue
    # Plain column assignment keeps the fill explicit (no inplace mutation).
    df[col] = df[col].fillna(modes.iloc[0])

In [112]:
# Label every row whose average daily screen time is exactly 0 as 'No Impact',
# overriding whatever value Health_Impacts held for that row; all other rows
# keep their existing value.
# A boolean mask with .loc does this in one vectorized step (no per-row Python
# call) and is a no-op on an empty frame.
no_screen_time = df['Avg_Daily_Screen_Time_hr'] == 0
df.loc[no_screen_time, 'Health_Impacts'] = 'No Impact'

In [113]:
# Re-count NaN entries per column to verify the cleaning steps left no gaps.
missing_after = df.isna().sum()
print("Missing Values after cleaning:\n", missing_after)

Missing Values after cleaning:
 Age                                  0
Gender                               0
Avg_Daily_Screen_Time_hr             0
Primary_Device                       0
Exceeded_Recommended_Limit           0
Educational_to_Recreational_Ratio    0
Health_Impacts                       0
Urban_or_Rural                       0
dtype: int64


In [114]:
# Normalise the text columns: cast to str, trim surrounding whitespace, then Title Case.
# NOTE(review): str.title() lowercases everything after the first letter of each
# word (an all-caps value such as "TV" would become "Tv"), and astype(str) turns
# any remaining NaN into the text "nan" — confirm both are acceptable here.
text_cols = ['Primary_Device','Exceeded_Recommended_Limit','Health_Impacts','Urban_or_Rural']
df[text_cols] = df[text_cols].apply(
    lambda column: column.astype(str).str.strip().str.title()
)

In [115]:
# Derived features
# 1) Age band: right-closed intervals (7,10], (10,13], (13,15], (15,18].
#    Ages outside (7, 18] match no interval and become NaN.
bins = [7, 10, 13, 15, 18]
labels = ['8_to_10', '11_to_13', '14_to_15', '16_to_18']
age_band = pd.cut(df['Age'], bins, right=True, labels=labels)
df['Age_Band'] = age_band

# 2) Screen-time level: bucket average daily hours into five ordered bands.
#    Intervals are right-closed, so the lowest edge is -0.1 to let exactly
#    0 hours land in 'Low'; values outside (-0.1, 15] become NaN.
usage_edges = [-0.1, 2, 5, 8, 12, 15]
usage_names = ['Low', 'Moderate', 'High', 'Very High', 'Extreme']
df['Usage_Level'] = pd.cut(df['Avg_Daily_Screen_Time_hr'], bins=usage_edges, labels=usage_names)

# 3) Urban/Rural flag: 1 when the label equals 'urban' ignoring case, otherwise 0.
df['Is_Urban'] = df['Urban_or_Rural'].map(lambda area: int(area.lower() == 'urban'))

# 4) Activity shares
# Split the daily total into recreational and educational hours. The formula
# treats the ratio as educational / recreational, i.e.
# total = recreational * (1 + ratio).
total_hours = df['Avg_Daily_Screen_Time_hr']
edu_to_rec = df['Educational_to_Recreational_Ratio']

# Recreational screen time
df['Recreational_Screen_Time'] = total_hours / (1 + edu_to_rec)

# Educational screen time
df['Educational_Screen_Time'] = total_hours - df['Recreational_Screen_Time']

# Percentage shares of the daily total. The only undefined case is a total of
# 0 hours (0/0 -> NaN), so that is the sole condition under which a share falls
# back to 0. Guarding on either component being 0 would wrongly report a 0%
# recreational share for a child whose ratio is 0 (all recreational => 100%).
has_screen_time = total_hours != 0
df['Educational_Share'] = (
    df['Educational_Screen_Time'] / total_hours * 100
).where(has_screen_time, 0.0)
df['Recreational_Share'] = (
    df['Recreational_Screen_Time'] / total_hours * 100
).where(has_screen_time, 0.0)



In [116]:
# Preview: first 7 rows of the source columns alongside the features derived from them.
preview_cols = ['Age','Age_Band','Avg_Daily_Screen_Time_hr','Usage_Level','Urban_or_Rural','Is_Urban']
print("Preview of derived columns:")
print(df[preview_cols].head(7))

Preview of derived columns:
   Age  Age_Band  Avg_Daily_Screen_Time_hr Usage_Level Urban_or_Rural  \
0   14  14_to_15                      3.99    Moderate          Urban   
1   11  11_to_13                      4.61    Moderate          Urban   
2   18  16_to_18                      3.73    Moderate          Urban   
3   15  14_to_15                      1.21         Low          Urban   
4   12  11_to_13                      5.89        High          Urban   
5   14  14_to_15                      4.88    Moderate          Urban   
6   17  16_to_18                      2.97    Moderate          Rural   

   Is_Urban  
0         1  
1         1  
2         1  
3         1  
4         1  
5         1  
6         0  


In [117]:
# Persist the cleaned, feature-enriched frame; index=False keeps the row index out of the file.
CLEANED_CSV_PATH = 'Cleaned_Kids_ScreenTime.csv'
df.to_csv(CLEANED_CSV_PATH, index=False)
print("cleaned dataset saved succesfully")

cleaned dataset saved succesfully
