In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [62]:
# Load the raw survey export into `df`; later cells treat this frame as read-only.
# NOTE(review): with pandas >= 2.0 the default NA markers of read_csv include the
# literal string "None", so such cells arrive as NaN -- confirm against the raw
# file whether the missing Health_Impacts entries are really absent values.
df = pd.read_csv("Indian_Kids_Screen_Time.csv")

In [63]:
# Missing-value audit: per-column null counts and percentages for the raw
# frame, followed by a closer look at the Health_Impacts column.
print("=" * 50, "MISSING VALUES ANALYSIS", "=" * 50, sep="\n")

# Count nulls once; the percentage is derived from the same Series.
missing_values = df.isnull().sum()
missing_percentage = (missing_values / len(df) * 100).round(2)

missing_summary = pd.DataFrame({
    'Column': df.columns,
    'Missing_Count': missing_values.values,
    'Missing_Percentage': missing_percentage.values,
})

print("Missing Values Summary:", missing_summary, sep="\n")

# Detail view, guarded so the cell still runs if the column is absent.
if 'Health_Impacts' in df.columns:
    health_raw = df['Health_Impacts']
    n_missing = health_raw.isnull().sum()

    print("\nHealth_Impacts Column Analysis:")
    print(f"Total missing: {n_missing}")
    print(f"Percentage missing: {n_missing/len(df)*100:.2f}%")
    print(f"Unique non-null values: {health_raw.nunique()}")
    print("\nSample of non-null values:")
    # Ten most frequent non-null labels
    print(health_raw.dropna().value_counts().head(10))


MISSING VALUES ANALYSIS
Missing Values Summary:
                              Column  Missing_Count  Missing_Percentage
0                                Age              0                0.00
1                             Gender              0                0.00
2           Avg_Daily_Screen_Time_hr              0                0.00
3                     Primary_Device              0                0.00
4         Exceeded_Recommended_Limit              0                0.00
5  Educational_to_Recreational_Ratio              0                0.00
6                     Health_Impacts           3218               33.13
7                     Urban_or_Rural              0                0.00

Health_Impacts Column Analysis:
Total missing: 3218
Percentage missing: 33.13%
Unique non-null values: 15

Sample of non-null values:
Health_Impacts
Poor Sleep                              2268
Poor Sleep, Eye Strain                   979
Eye Strain                               644
Poor Sleep, Anxiety

In [64]:
# Clean on a copy so the raw frame `df` stays available for comparison.
df_clean = df.copy()

# Missing Health_Impacts entries are labelled with the string 'None'
# (i.e. "no reported impact") instead of being left as NaN.
df_clean = df_clean.assign(
    Health_Impacts=df_clean['Health_Impacts'].fillna('None')
)

print("Missing values after fixing:", df_clean.isnull().sum(), sep="\n")


Missing values after fixing:
Age                                  0
Gender                               0
Avg_Daily_Screen_Time_hr             0
Primary_Device                       0
Exceeded_Recommended_Limit           0
Educational_to_Recreational_Ratio    0
Health_Impacts                       0
Urban_or_Rural                       0
dtype: int64


In [65]:
# Standardize Gender: show the distinct labels, trim surrounding whitespace,
# then show the labels and their frequencies again.
print("Original Gender values:", df_clean['Gender'].unique())

# Only whitespace is trimmed here; the labels are already clean in this dataset.
df_clean['Gender'] = df_clean['Gender'].str.strip()

gender_clean = df_clean['Gender']
print("Cleaned Gender values:", gender_clean.unique())
print(gender_clean.value_counts())


Original Gender values: ['Male' 'Female']
Cleaned Gender values: ['Male' 'Female']
Gender
Male      4942
Female    4770
Name: count, dtype: int64


In [66]:
# Clean Health_Impacts column -- start with the ten most frequent raw labels
print(
    "Sample Health_Impacts:",
    df_clean['Health_Impacts'].value_counts().head(10),
    sep="\n",
)

# Simple cleaning function
def clean_health_impacts(text):
    """Normalise a comma-separated health-impact string.

    Each comma-separated item is trimmed and mapped to a canonical label when
    it contains a known keyword (case-insensitive; the first matching keyword
    in the table below wins). Items with no known keyword are kept as-is
    unless they are empty or contain 'none'. Duplicates are removed and the
    labels are joined in alphabetical order.

    Parameters
    ----------
    text : str or missing value
        Raw cell value, e.g. ``'Poor Sleep, Eye Strain'``.

    Returns
    -------
    str
        ``', '``-joined canonical labels, or ``'None'`` when the input is
        missing, equals ``'None'``, or yields no labels.
    """
    if pd.isna(text) or text == 'None':
        return 'None'

    # Order matters: it mirrors the priority in which keywords are checked.
    keyword_labels = (
        ('sleep', 'Poor Sleep'),
        ('eye', 'Eye Strain'),
        ('anxiety', 'Anxiety'),
        ('obesity', 'Obesity Risk'),
    )

    found = set()
    for piece in str(text).split(','):
        impact = piece.strip()
        lowered = impact.lower()
        label = next((name for key, name in keyword_labels if key in lowered), None)
        if label is not None:
            found.add(label)
        elif impact and 'none' not in lowered:
            # Unknown but non-empty impact: keep the trimmed original text.
            found.add(impact)

    if not found:
        return 'None'
    return ', '.join(sorted(found))

# Store the normalised labels in a new column; the original column is kept.
df_clean['Health_Impacts_Clean'] = df_clean['Health_Impacts'].map(clean_health_impacts)

print(
    "Cleaned Health_Impacts:",
    df_clean['Health_Impacts_Clean'].value_counts().head(10),
    sep="\n",
)


Sample Health_Impacts:
Health_Impacts
None                                    3218
Poor Sleep                              2268
Poor Sleep, Eye Strain                   979
Eye Strain                               644
Poor Sleep, Anxiety                      608
Poor Sleep, Obesity Risk                 452
Anxiety                                  385
Poor Sleep, Eye Strain, Anxiety          258
Obesity Risk                             252
Poor Sleep, Eye Strain, Obesity Risk     188
Name: count, dtype: int64
Cleaned Health_Impacts:
Health_Impacts_Clean
None                                    3218
Poor Sleep                              2268
Eye Strain, Poor Sleep                   979
Eye Strain                               644
Anxiety, Poor Sleep                      608
Obesity Risk, Poor Sleep                 452
Anxiety                                  385
Anxiety, Eye Strain, Poor Sleep          258
Obesity Risk                             252
Eye Strain, Obesity Risk, Poor Sleep

In [67]:
# Standardize Primary_Device
print("Original Device values:", df_clean['Primary_Device'].unique())

# Canonical spelling for each known device, keyed by its lower-cased form
device_map = {
    'smartphone': 'Smartphone',
    'laptop': 'Laptop',
    'tv': 'TV',
    'tablet': 'Tablet'
}

# Match on the whole trimmed, lower-cased value in a single pass. A substring
# replace would leave surrounding whitespace in place (so 'tv ' would stay a
# separate category) and would also rewrite keys embedded in longer values.
device_trimmed = df_clean['Primary_Device'].str.strip()
df_clean['Primary_Device'] = (
    device_trimmed.str.lower()
    .map(device_map)            # unknown devices become NaN here...
    .fillna(device_trimmed)     # ...and fall back to their trimmed original
)

print("Cleaned Device values:", df_clean['Primary_Device'].unique())
print(df_clean['Primary_Device'].value_counts())


Original Device values: ['Smartphone' 'Laptop' 'TV' 'Tablet']
Cleaned Device values: ['Smartphone' 'Laptop' 'TV' 'Tablet']
Primary_Device
Smartphone    4568
TV            2487
Laptop        1433
Tablet        1224
Name: count, dtype: int64


In [68]:
# Create age groups
def make_age_groups(age):
    """Map an age to a coarse age-group label.

    Parameters
    ----------
    age : int or float
        Age value; the notebook passes the entries of ``df_clean['Age']``.

    Returns
    -------
    str or float
        ``'Children (≤10)'`` for ``age <= 10``, ``'Pre-teens (11-14)'`` for
        ``age <= 14``, ``'Teenagers (15+)'`` otherwise. A missing age yields
        ``np.nan`` so it is not silently assigned to a group.
    """
    # Comparisons against NaN are always False, so without this guard a
    # missing age would fall through to the 'Teenagers (15+)' branch.
    if pd.isna(age):
        return np.nan
    if age <= 10:
        # The label uses a real U+2264 "less-than or equal to" sign.
        return 'Children (≤10)'
    if age <= 14:
        return 'Pre-teens (11-14)'
    return 'Teenagers (15+)'

# Label every row with its age group and show how many rows land in each.
df_clean['Age_Group'] = df_clean['Age'].map(make_age_groups)

print("Age groups created:", df_clean['Age_Group'].value_counts(), sep="\n")


Age groups created:
Age_Group
Pre-teens (11-14)    3539
Teenagers (15+)      3499
Children (≤10)       2674
Name: count, dtype: int64


In [69]:


# Screen time intensity
def screen_intensity(hours):
    """Bucket a daily screen-time value into an intensity label.

    Parameters
    ----------
    hours : int or float
        Screen time; the notebook passes the entries of
        ``df_clean['Avg_Daily_Screen_Time_hr']``.

    Returns
    -------
    str or float
        ``'Light'`` for ``hours <= 2``, ``'Moderate'`` for ``hours <= 4``,
        ``'Heavy'`` for ``hours <= 6``, ``'Extreme'`` otherwise. A missing
        value yields ``np.nan`` so it is not reported as extreme usage.
    """
    # Comparisons against NaN are always False, so without this guard a
    # missing value would fall through every threshold and become 'Extreme'.
    if pd.isna(hours):
        return np.nan
    if hours <= 2:
        return 'Light'
    if hours <= 4:
        return 'Moderate'
    if hours <= 6:
        return 'Heavy'
    return 'Extreme'

# Label every row with its screen-time intensity bucket and show the split.
df_clean['Screen_Intensity'] = df_clean['Avg_Daily_Screen_Time_hr'].map(screen_intensity)

print("Screen intensity:", df_clean['Screen_Intensity'].value_counts(), sep="\n")


Screen intensity:
Screen_Intensity
Heavy       4596
Moderate    2930
Extreme     1354
Light        832
Name: count, dtype: int64


In [70]:
# One boolean Has_<impact> column per known health impact, plus a per-row
# count of how many impacts are listed in Health_Impacts_Clean.
health_impacts = ['Poor Sleep', 'Eye Strain', 'Anxiety', 'Obesity Risk']

for impact in health_impacts:
    col_name = 'Has_' + impact.replace(' ', '_')
    # True where the cleaned label string mentions this impact
    df_clean[col_name] = df_clean['Health_Impacts_Clean'].str.contains(impact, na=False)

    count = df_clean[col_name].sum()
    share = count / len(df_clean) * 100
    print(f"{col_name}: {count} cases ({share:.1f}%)")

# 'None' means no impact; otherwise count the non-empty comma-separated items
df_clean['Total_Health_Issues'] = df_clean['Health_Impacts_Clean'].map(
    lambda label: 0 if label == 'None'
    else sum(1 for part in str(label).split(',') if part.strip())
)

print("\nTotal health issues distribution:")
print(df_clean['Total_Health_Issues'].value_counts().sort_index())


Has_Poor_Sleep: 4868 cases (50.1%)
Has_Eye_Strain: 2382 cases (24.5%)
Has_Anxiety: 1605 cases (16.5%)
Has_Obesity_Risk: 1217 cases (12.5%)

Total health issues distribution:
Total_Health_Issues
0    3218
1    3549
2    2349
3     559
4      37
Name: count, dtype: int64


In [73]:
# Dataset summary: size of the cleaned frame compared with the raw one, the
# remaining null count, and the names of the columns added during cleaning.
n_total_cols = len(df_clean.columns)
n_original_cols = len(df.columns)

print("FINAL DATASET SUMMARY")
print("=" * 40)
print(f"Total rows: {len(df_clean):,}")
print(f"Total columns: {n_total_cols}")
print(f"Original columns: {n_original_cols}")
print(f"New columns added: {n_total_cols - n_original_cols}")

print(f"\nMissing values: {df_clean.isnull().sum().sum()}")

print("\nNew columns created:")
# Columns present in the cleaned frame but not in the raw one
new_cols = set(df_clean.columns).difference(df.columns)
for position, name in enumerate(sorted(new_cols), start=1):
    print(f"  {position}. {name}")


FINAL DATASET SUMMARY
Total rows: 9,712
Total columns: 16
Original columns: 8
New columns added: 8

Missing values: 0

New columns created:
  1. Age_Group
  2. Has_Anxiety
  3. Has_Eye_Strain
  4. Has_Obesity_Risk
  5. Has_Poor_Sleep
  6. Health_Impacts_Clean
  7. Screen_Intensity
  8. Total_Health_Issues


In [72]:
# Save the cleaned dataset
#
# NOTE: the Health_Impacts columns hold the literal string 'None'. With its
# default settings pandas.read_csv (pandas >= 2.0) parses 'None' as NaN, so
# reload this file with keep_default_na=False (or an explicit na_values list)
# if those labels must survive the round trip.
output_filename = 'Clean_Screen_Time_Data_.csv'

# index=False: the row index is not written as an extra column
df_clean.to_csv(output_filename, index=False)
print(f"✅ Cleaned data saved as: {output_filename}")


print(f"\n📊 Final dataset shape: {df_clean.shape}")
print("🎉 Data cleaning complete!")


✅ Cleaned data saved as: Clean_Screen_Time_Data_.csv

📊 Final dataset shape: (9712, 16)
🎉 Data cleaning complete!
