Week 2 : Preprocessing and Feature Engineering

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the Dataset

file_path = "D:\Indian_Kids_Screen_Time.csv"  
df = pd.read_csv(file_path)
print(" Dataset loaded successfully for preprocessing.")
print("Shape:", df.shape)

 Dataset loaded successfully for preprocessing.
Shape: (9712, 8)


In [3]:
# Handle Missing / Inconsistent Values

# Trim whitespace and standardize text casing
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].str.strip().str.title()

In [4]:
# Check for missing values

print("\n Missing values before handling:\n", df.isnull().sum())



 Missing values before handling:
 Age                                     0
Gender                                  0
Avg_Daily_Screen_Time_hr                0
Primary_Device                          0
Exceeded_Recommended_Limit              0
Educational_to_Recreational_Ratio       0
Health_Impacts                       3218
Urban_or_Rural                          0
dtype: int64


In [5]:
# (If any missing values exist)

df.fillna({
    'Primary_Device': 'Unknown',
    'Health_Impacts': 'None'
}, inplace=True)


In [6]:
# Handle Inconsistent Categories

# Standardize gender categories 
df['Gender'] = df['Gender'].replace({
    'M': 'Male', 'F': 'Female', 'male': 'Male', 'female': 'Female'
})


In [7]:
# Standardize Urban/Rural field
df['Urban_or_Rural'] = df['Urban_or_Rural'].replace({
    'urban': 'Urban', 'rural': 'Rural'
})

In [8]:
# Feature Engineering (Derived Columns)

# Age Band
bins = [7, 10, 13, 16, 18]
labels = ['Child (8-10)', 'Pre-Teen (11-13)', 'Teen (14-16)', 'Young Adult (17-18)']
df['Age_Band'] = pd.cut(df['Age'], bins=bins, labels=labels, right=True)

In [9]:
# ScreenTime Level
def categorize_screen_time(x):
    if x < 3:
        return 'Low (<3 hr)'
    elif 3 <= x <= 6:
        return 'Moderate (3–6 hr)'
    else:
        return 'High (>6 hr)'

df['ScreenTime_Level'] = df['Avg_Daily_Screen_Time_hr'].apply(categorize_screen_time)

In [10]:
# Education vs Recreation Score (Percentage)
df['Edu_Recreational_%'] = (df['Educational_to_Recreational_Ratio'] * 100).round(1)


In [11]:
# Health Impact Count
df['Health_Impact_Count'] = df['Health_Impacts'].apply(lambda x: len(x.split(',')) if x != 'None' else 0)


In [12]:
# Urban/Rural Binary Flag
df['Urban_Rural_Flag'] = df['Urban_or_Rural'].map({'Urban': 1, 'Rural': 0})

In [13]:
# Overuse Index (custom metric)
df['Overuse_Index'] = (df['Avg_Daily_Screen_Time_hr'] * (1 - df['Educational_to_Recreational_Ratio'])).round(2)

In [14]:
# Risk Category (based on Overuse and Health Issues)
def risk_level(row):
    if row['Overuse_Index'] > 4.5 and row['Health_Impact_Count'] >= 2:
        return 'High Risk'
    elif row['Overuse_Index'] > 3 and row['Health_Impact_Count'] >= 1:
        return 'Medium Risk'
    else:
        return 'Low Risk'

df['Risk_Category'] = df.apply(risk_level, axis=1)

In [15]:
# Primary Use Category
def primary_use(row):
    device = row['Primary_Device']
    ratio = row['Educational_to_Recreational_Ratio']
    if ratio >= 0.55:
        return 'Study / Educational'
    elif 'Laptop' in device and ratio >= 0.45:
        return 'Study / Educational'
    elif 'Smartphone' in device and ratio < 0.45:
        return 'Entertainment (YouTube/OTT)'
    elif 'TV' in device:
        return 'Entertainment (YouTube/OTT)'
    elif 'Tablet' in device:
        return 'Gaming / Casual'
    else:
        return 'Mixed Use'

df['Primary_Use_Category'] = df.apply(primary_use, axis=1)

In [16]:
# Device Usage Context (device + location)
df['Device_Usage_Context'] = df['Urban_or_Rural'] + " " + df['Primary_Device']

In [17]:
# Weekday vs Weekend Usage (synthetic behavioral text)
df['Weekday_Weekend_Usage'] = np.where(
    df['Avg_Daily_Screen_Time_hr'] > 4,
    'Higher Usage on Weekends',
    'Lower Usage on Weekdays'
)

In [18]:
# usage Behavior Type (mix of screen time & education ratio)
def usage_behavior(row):
    st_level = row['ScreenTime_Level']
    ratio = row['Educational_to_Recreational_Ratio']
    if st_level == 'High (>6 hr)' and ratio < 0.4:
        return 'Overuse - Recreational Heavy'
    elif st_level == 'High (>6 hr)' and ratio >= 0.4:
        return 'Heavy but Balanced User'
    elif st_level == 'Moderate (3–6 hr)' and ratio >= 0.5:
        return 'Balanced User'
    elif st_level == 'Low (<3 hr)' and ratio >= 0.5:
        return 'Study Focused'
    else:
        return 'Recreational Focused'

df['Usage_Behavior_Type'] = df.apply(usage_behavior, axis=1)

In [19]:
# Health Risk 
risk_map = {
    'High Risk': 'Needs Attention',
    'Medium Risk': 'Moderate Risk',
    'Low Risk': 'Healthy Usage'
}
df['Health_Risk_Text'] = df['Risk_Category'].map(risk_map)

In [20]:
# Show samples
display(df[['Age', 'Gender', 'Primary_Device', 'Primary_Use_Category',
            'Weekday_Weekend_Usage', 'Usage_Behavior_Type', 'Health_Risk_Text']].head(10))


Unnamed: 0,Age,Gender,Primary_Device,Primary_Use_Category,Weekday_Weekend_Usage,Usage_Behavior_Type,Health_Risk_Text
0,14,Male,Smartphone,Entertainment (YouTube/OTT),Lower Usage on Weekdays,Recreational Focused,Healthy Usage
1,11,Female,Laptop,Mixed Use,Higher Usage on Weekends,Recreational Focused,Moderate Risk
2,18,Female,Tv,Mixed Use,Lower Usage on Weekdays,Recreational Focused,Healthy Usage
3,15,Female,Laptop,Mixed Use,Lower Usage on Weekdays,Recreational Focused,Healthy Usage
4,12,Female,Smartphone,Mixed Use,Higher Usage on Weekends,Recreational Focused,Healthy Usage
5,14,Female,Smartphone,Entertainment (YouTube/OTT),Higher Usage on Weekends,Recreational Focused,Healthy Usage
6,17,Male,Tv,Mixed Use,Lower Usage on Weekdays,Recreational Focused,Healthy Usage
7,10,Male,Tv,Mixed Use,Lower Usage on Weekdays,Study Focused,Healthy Usage
8,14,Male,Laptop,Mixed Use,Higher Usage on Weekends,Recreational Focused,Healthy Usage
9,18,Male,Tablet,Gaming / Casual,Lower Usage on Weekdays,Recreational Focused,Healthy Usage


In [21]:
# Save the Cleaned Dataset

df.to_csv("screensense_cleaned_textbased.csv", index=False)
print("\n Saved as 'screensense_cleaned_textbased.csv' ")


 Saved as 'screensense_cleaned_textbased.csv' 


In [22]:
# Quick Check

print("\n Sample of Processed Data:\n")
display(df.head(10))


 Sample of Processed Data:



Unnamed: 0,Age,Gender,Avg_Daily_Screen_Time_hr,Primary_Device,Exceeded_Recommended_Limit,Educational_to_Recreational_Ratio,Health_Impacts,Urban_or_Rural,Age_Band,ScreenTime_Level,Edu_Recreational_%,Health_Impact_Count,Urban_Rural_Flag,Overuse_Index,Risk_Category,Primary_Use_Category,Device_Usage_Context,Weekday_Weekend_Usage,Usage_Behavior_Type,Health_Risk_Text
0,14,Male,3.99,Smartphone,True,0.42,"Poor Sleep, Eye Strain",Urban,Teen (14-16),Moderate (3–6 hr),42.0,2,1,2.31,Low Risk,Entertainment (YouTube/OTT),Urban Smartphone,Lower Usage on Weekdays,Recreational Focused,Healthy Usage
1,11,Female,4.61,Laptop,True,0.3,Poor Sleep,Urban,Pre-Teen (11-13),Moderate (3–6 hr),30.0,1,1,3.23,Medium Risk,Mixed Use,Urban Laptop,Higher Usage on Weekends,Recreational Focused,Moderate Risk
2,18,Female,3.73,Tv,True,0.32,Poor Sleep,Urban,Young Adult (17-18),Moderate (3–6 hr),32.0,1,1,2.54,Low Risk,Mixed Use,Urban Tv,Lower Usage on Weekdays,Recreational Focused,Healthy Usage
3,15,Female,1.21,Laptop,False,0.39,,Urban,Teen (14-16),Low (<3 hr),39.0,0,1,0.74,Low Risk,Mixed Use,Urban Laptop,Lower Usage on Weekdays,Recreational Focused,Healthy Usage
4,12,Female,5.89,Smartphone,True,0.49,"Poor Sleep, Anxiety",Urban,Pre-Teen (11-13),Moderate (3–6 hr),49.0,2,1,3.0,Low Risk,Mixed Use,Urban Smartphone,Higher Usage on Weekends,Recreational Focused,Healthy Usage
5,14,Female,4.88,Smartphone,True,0.44,Poor Sleep,Urban,Teen (14-16),Moderate (3–6 hr),44.0,1,1,2.73,Low Risk,Entertainment (YouTube/OTT),Urban Smartphone,Higher Usage on Weekends,Recreational Focused,Healthy Usage
6,17,Male,2.97,Tv,False,0.48,,Rural,Young Adult (17-18),Low (<3 hr),48.0,0,0,1.54,Low Risk,Mixed Use,Rural Tv,Lower Usage on Weekdays,Recreational Focused,Healthy Usage
7,10,Male,2.74,Tv,True,0.54,,Urban,Child (8-10),Low (<3 hr),54.0,0,1,1.26,Low Risk,Mixed Use,Urban Tv,Lower Usage on Weekdays,Study Focused,Healthy Usage
8,14,Male,4.61,Laptop,True,0.36,"Poor Sleep, Anxiety",Rural,Teen (14-16),Moderate (3–6 hr),36.0,2,0,2.95,Low Risk,Mixed Use,Rural Laptop,Higher Usage on Weekends,Recreational Focused,Healthy Usage
9,18,Male,3.24,Tablet,True,0.48,"Poor Sleep, Obesity Risk",Urban,Young Adult (17-18),Moderate (3–6 hr),48.0,2,1,1.68,Low Risk,Gaming / Casual,Urban Tablet,Lower Usage on Weekdays,Recreational Focused,Healthy Usage


In [23]:
df.to_csv("screensense_cleaned.csv", index=False)


In [24]:
import os
print(os.getcwd())

C:\Users\geeky


In [25]:
df.to_csv(r"C:\Users\geeky\Downloads\screensense_cleaned.csv", index=False)
