In [30]:
df = pd.read_csv("Indian_Kids_Screen_Time.csv")

In [31]:
# 1. Handle Missing Values
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Avg_Daily_Screen_Time_hr'] = df['Avg_Daily_Screen_Time_hr'].fillna(df['Avg_Daily_Screen_Time_hr'].median())
df['Educational_to_Recreational_Ratio'] = df['Educational_to_Recreational_Ratio'].fillna(df['Educational_to_Recreational_Ratio'].median())

In [13]:
# Categorical columns: fill with 'Unknown'
for col in ['Gender', 'Primary_Device', 'Urban_or_Rural', 'Health_Impacts', 'Exceeded_Recommended_Limit']:
    df[col] = df[col].fillna('Unknown')

In [14]:
# 2. Create Derived Fields
#  Age Bands
def age_band(age):
    if age <= 5:
        return '0-5'
    elif age <= 12:
        return '6-12'
    elif age <= 17:
        return '13-17'
    else:
        return '18+'

df['Age_Band'] = df['Age'].apply(age_band)

In [27]:
# 2 Day Type (Weekday/Weekend) - simulated, since dataset has no actual dates
np.random.seed(42)
df['Day_Type'] = np.random.choice(['Weekday', 'Weekend'], size=len(df))


In [16]:
# 2.3 Activity Type (Educational vs Recreational Dominant)
def classify_activity(ratio):
    if ratio >= 1:
        return 'Educational Dominant'
    else:
        return 'Recreational Dominant'

df['Activity_Type'] = df['Educational_to_Recreational_Ratio'].apply(classify_activity)


In [17]:
# 3. Format / Clean Columns
df.columns = df.columns.str.strip()

# Make categorical columns lowercase (optional)
cat_cols = ['Gender', 'Primary_Device', 'Urban_or_Rural', 'Health_Impacts', 'Exceeded_Recommended_Limit', 'Age_Band', 'Day_Type', 'Activity_Type']
for col in cat_cols:
    df[col] = df[col].astype(str).str.title()

In [18]:
# 4. Save Preprocessed Data
df.to_csv("indian_kids_screentime_cleaned.csv", index=False)

In [21]:
# 5. Preprocessing Summary
df.shape

(9665, 11)

In [22]:
df.dtypes

Age                                    int64
Gender                                object
Avg_Daily_Screen_Time_hr             float64
Primary_Device                        object
Exceeded_Recommended_Limit            object
Educational_to_Recreational_Ratio    float64
Health_Impacts                        object
Urban_or_Rural                        object
Age_Band                              object
Day_Type                              object
Activity_Type                         object
dtype: object

In [23]:
df.isnull().sum()

Age                                  0
Gender                               0
Avg_Daily_Screen_Time_hr             0
Primary_Device                       0
Exceeded_Recommended_Limit           0
Educational_to_Recreational_Ratio    0
Health_Impacts                       0
Urban_or_Rural                       0
Age_Band                             0
Day_Type                             0
Activity_Type                        0
dtype: int64

In [24]:
df.head()

Unnamed: 0,Age,Gender,Avg_Daily_Screen_Time_hr,Primary_Device,Exceeded_Recommended_Limit,Educational_to_Recreational_Ratio,Health_Impacts,Urban_or_Rural,Age_Band,Day_Type,Activity_Type
0,14,Male,3.99,Smartphone,True,0.42,"Poor Sleep, Eye Strain",Urban,13-17,Weekday,Recreational Dominant
1,11,Female,4.61,Laptop,True,0.3,Poor Sleep,Urban,6-12,Weekend,Recreational Dominant
2,18,Female,3.73,Tv,True,0.32,Poor Sleep,Urban,18+,Weekday,Recreational Dominant
3,15,Female,1.21,Laptop,False,0.39,Poor Sleep,Urban,13-17,Weekday,Recreational Dominant
4,12,Female,5.89,Smartphone,True,0.49,"Poor Sleep, Anxiety",Urban,6-12,Weekday,Recreational Dominant


In [25]:
# 6. Feature Dictionary
feature_dict = {
    'Age': 'Child age in years',
    'Gender': 'Child gender',
    'Avg_Daily_Screen_Time_hr': 'Average daily screentime in hours',
    'Primary_Device': 'Main device used for screentime',
    'Exceeded_Recommended_Limit': 'Yes/No flag if daily limit exceeded',
    'Educational_to_Recreational_Ratio': 'Ratio of educational vs recreational screentime',
    'Health_Impacts': 'Reported health impacts (Yes/No/Unknown)',
    'Urban_or_Rural': 'Urban or Rural location',
    'Age_Band': 'Derived age group (0-5, 6-12, 13-17, 18+)',
    'Day_Type': 'Derived: Weekday or Weekend',
    'Activity_Type': 'Derived: Educational Dominant or Recreational Dominant'
}

feature_dict

{'Age': 'Child age in years',
 'Gender': 'Child gender',
 'Avg_Daily_Screen_Time_hr': 'Average daily screentime in hours',
 'Primary_Device': 'Main device used for screentime',
 'Exceeded_Recommended_Limit': 'Yes/No flag if daily limit exceeded',
 'Educational_to_Recreational_Ratio': 'Ratio of educational vs recreational screentime',
 'Health_Impacts': 'Reported health impacts (Yes/No/Unknown)',
 'Urban_or_Rural': 'Urban or Rural location',
 'Age_Band': 'Derived age group (0-5, 6-12, 13-17, 18+)',
 'Day_Type': 'Derived: Weekday or Weekend',
 'Activity_Type': 'Derived: Educational Dominant or Recreational Dominant'}