# Indian Kids Screen Time Analysis
**Author:** Subhash Kadiyam

**Deliverables:**
- Compare device mix and activity categories across demographics
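- Visualize weekday vs. weekend differences and time patterns (see the note in the "Weekday vs Weekend differences & time patterns" section: the dataset lacks the timestamps this requires)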
- Minimum 8 visuals + observations on peak usage cohorts

---

In [None]:
# Cell: Imports and settings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Fixed seed constant for reproducibility; no cell shown in this section references it yet.
RANDOM_STATE = 42


In [None]:
# Cell: Load dataset
# Read the CSV at DATA_PATH into the notebook-wide frame `df`, report its
# dimensions, and show the first rows as the cell output.
DATA_PATH = r"/mnt/data/Indian_Kids_Screen_Time.csv"
df = pd.read_csv(DATA_PATH)
n_rows, n_cols = df.shape
print("Shape:", (n_rows, n_cols))
df.head()

In [None]:
# Cell: Basic info, dtypes and missing values
# Print the frame summary first, then the number of missing entries per column.
df.info()
missing_by_column = df.isnull().sum()
print('\nMissing values per column:')
print(missing_by_column)

In [None]:
# Cell: Unique values for key categorical columns
# For each key column that exists, print its cardinality and its ten most
# frequent values; columns absent from the frame are skipped silently.
key_categoricals = ['Primary_Device', 'Gender', 'Urban_or_Rural', 'Health_Impacts']
for name in key_categoricals:
    if name not in df.columns:
        continue
    values = df[name]
    print(name, '->', values.nunique(), 'unique values')
    print(values.value_counts().head(10))
    print('---')

In [None]:
# Cell: Derived columns: Age groups and Activity Category (from Educational_to_Recreational_Ratio)
# Adds two columns to `df`:
#   Age_Group         - ordered age bands 0-4, 5-7, 8-10, 11-13, 14+ (adapt if needed)
#   Activity_Category - Mostly Recreational / Mixed / Mostly Educational
# When a source column is missing, the derived column is filled with 'Unknown'.
if 'Age' in df.columns:
    # Right-closed bins. include_lowest=True keeps Age == 0 inside '0-4';
    # without it the first interval is (0, 4] and age 0 becomes NaN.
    # The open-ended top edge puts every age above 13 in '14+'.
    bins = [0, 4, 7, 10, 13, float('inf')]
    labels = ['0-4', '5-7', '8-10', '11-13', '14+']
    df['Age_Group'] = pd.cut(df['Age'], bins=bins, labels=labels,
                             right=True, include_lowest=True)
else:
    df['Age_Group'] = 'Unknown'

# Activity category from Educational_to_Recreational_Ratio:
# Higher ratio -> more educational. Ratios up to 0.8 are Mostly Recreational,
# (0.8, 1.2] is Mixed, and anything above 1.2 is Mostly Educational.
# NOTE(review): these thresholds assume the ratio is centred on 1.0; confirm
# against the column's actual value range.
if 'Educational_to_Recreational_Ratio' in df.columns:
    df['Activity_Category'] = pd.cut(df['Educational_to_Recreational_Ratio'],
                                     bins=[-1, 0.8, 1.2, float('inf')],
                                     labels=['Mostly Recreational', 'Mixed', 'Mostly Educational'])
else:
    df['Activity_Category'] = 'Unknown'

# Preview only the columns that exist, so the cell does not raise KeyError in
# the same missing-column cases the guards above already handle.
preview_cols = [c for c in ('Age', 'Age_Group',
                            'Educational_to_Recreational_Ratio', 'Activity_Category')
                if c in df.columns]
df[preview_cols].head()

In [None]:
# Cell: Plot 1 - Device mix (overall)
# Bar chart of how many rows report each primary device.
plt.figure(figsize=(6,4))
if 'Primary_Device' not in df.columns:
    print('Primary_Device column not found')
else:
    device_counts = df['Primary_Device'].value_counts()
    device_counts.plot(kind='bar')
    plt.xlabel('Primary Device')
    plt.ylabel('Count')
    plt.title('Device mix (overall)')
    plt.tight_layout()


In [None]:
# Cell: Plot 2 - Device mix by Gender
# Grouped bars: one cluster per primary device, one bar per gender.
if ('Primary_Device' in df.columns) and ('Gender' in df.columns):
    pivot = df.pivot_table(index='Primary_Device', columns='Gender', aggfunc='size', fill_value=0)
    # DataFrame.plot opens its own figure when no Axes is passed, so no
    # separate plt.figure() call is made here (it would only render as a
    # blank extra figure). The pyplot calls below act on the figure it creates.
    pivot.plot(kind='bar', stacked=False, figsize=(8,4))
    plt.title('Device mix by Gender')
    plt.ylabel('Count')
    plt.xlabel('Primary Device')
    plt.tight_layout()
else:
    print('Required columns not found')


In [None]:
# Cell: Plot 3 - Device mix by Age Group
# Stacked bars: one bar per age group, segmented by primary device.
if ('Primary_Device' in df.columns) and ('Age_Group' in df.columns):
    # Age_Group may be categorical; observed=False is passed explicitly so the
    # result does not depend on the pandas version's default.
    pivot = df.pivot_table(index='Age_Group', columns='Primary_Device', aggfunc='size',
                           fill_value=0, observed=False)
    # DataFrame.plot opens its own figure when no Axes is passed, so no
    # separate plt.figure() call is made here (it would only render as a
    # blank extra figure).
    pivot.plot(kind='bar', stacked=True, figsize=(8,4))
    plt.title('Device mix by Age Group')
    plt.ylabel('Count')
    plt.xlabel('Age Group')
    plt.tight_layout()
else:
    print('Required columns not found')


In [None]:
# Cell: Plot 4 - Device mix by Urban_or_Rural
# Stacked bars: one bar per Urban_or_Rural value, segmented by primary device.
if ('Primary_Device' in df.columns) and ('Urban_or_Rural' in df.columns):
    pivot = df.pivot_table(index='Urban_or_Rural', columns='Primary_Device', aggfunc='size', fill_value=0)
    # DataFrame.plot opens its own figure when no Axes is passed, so no
    # separate plt.figure() call is made here (it would only render as a
    # blank extra figure).
    pivot.plot(kind='bar', stacked=True, figsize=(8,4))
    plt.title('Device mix by Urban/Rural')
    plt.ylabel('Count')
    plt.xlabel('Urban or Rural')
    plt.tight_layout()
else:
    print('Required columns not found')


In [None]:
# Cell: Plot 5 - Activity category (overall)
# Bar chart of row counts per derived activity category.
plt.figure(figsize=(6,4))
if 'Activity_Category' not in df.columns:
    print('Activity_Category not available')
else:
    category_counts = df['Activity_Category'].value_counts()
    category_counts.plot(kind='bar')
    plt.xlabel('Activity Category')
    plt.ylabel('Count')
    plt.title('Activity category (derived from Educational_to_Recreational_Ratio)')
    plt.tight_layout()


In [None]:
# Cell: Plot 6 - Activity category by Gender
# Grouped bars: one cluster per activity category, one bar per gender.
if ('Activity_Category' in df.columns) and ('Gender' in df.columns):
    # Activity_Category may be categorical; observed=False is passed explicitly
    # so the result does not depend on the pandas version's default.
    pivot = df.pivot_table(index='Activity_Category', columns='Gender', aggfunc='size',
                           fill_value=0, observed=False)
    # DataFrame.plot opens its own figure when no Axes is passed, so no
    # separate plt.figure() call is made here (it would only render as a
    # blank extra figure).
    pivot.plot(kind='bar', figsize=(8,4))
    plt.title('Activity category by Gender')
    plt.ylabel('Count')
    plt.xlabel('Activity Category')
    plt.tight_layout()
else:
    print('Required columns not found')


In [None]:
# Cell: Plot 7 - Avg Daily Screen Time by Age Group
# Bar chart of mean daily screen time (hours) for each age group.
plt.figure(figsize=(8,4))
if ('Avg_Daily_Screen_Time_hr' in df.columns) and ('Age_Group' in df.columns):
    # Age_Group may be categorical; observed=False is passed explicitly so
    # every age band keeps its slot on the axis (empty bands show no bar) and
    # the result does not depend on the pandas version's default.
    agg = df.groupby('Age_Group', observed=False)['Avg_Daily_Screen_Time_hr'].mean()
    agg.plot(kind='bar')
    plt.title('Average Daily Screen Time (hours) by Age Group')
    plt.ylabel('Avg Daily Screen Time (hr)')
    plt.xlabel('Age Group')
    plt.tight_layout()
else:
    print('Required columns not found')


In [None]:
# Cell: Plot 8 - Avg Daily Screen Time by Primary Device
# Bar chart of mean daily screen time (hours) per device, highest first.
plt.figure(figsize=(8,4))
if not (('Avg_Daily_Screen_Time_hr' in df.columns) and ('Primary_Device' in df.columns)):
    print('Required columns not found')
else:
    mean_by_device = df.groupby('Primary_Device')['Avg_Daily_Screen_Time_hr'].mean()
    ranked_devices = mean_by_device.sort_values(ascending=False)
    ranked_devices.plot(kind='bar')
    plt.xlabel('Primary Device')
    plt.ylabel('Avg Daily Screen Time (hr)')
    plt.title('Average Daily Screen Time (hours) by Primary Device')
    plt.tight_layout()


In [None]:
# Cell: Peak usage cohorts (summary)
# For each cohort dimension present in `df`, print the mean daily screen time
# per group, sorted highest first. A section (header included) is skipped when
# its column is missing, so no header is printed without data under it.
if 'Avg_Daily_Screen_Time_hr' in df.columns:
    cohort_sections = [
        ('Top Age Groups by avg screen time:', 'Age_Group'),
        ('\nTop Devices by avg screen time:', 'Primary_Device'),
        ('\nTop by Gender:', 'Gender'),
        ('\nTop by Urban/Rural:', 'Urban_or_Rural'),
    ]
    for header, cohort_col in cohort_sections:
        if cohort_col not in df.columns:
            continue
        print(header)
        # observed=False is explicit so categorical keys (e.g. Age_Group)
        # behave the same regardless of the pandas version's default.
        ranked = (df.groupby(cohort_col, observed=False)['Avg_Daily_Screen_Time_hr']
                    .mean()
                    .sort_values(ascending=False))
        print(ranked)
else:
    print('Avg_Daily_Screen_Time_hr not found')


## Weekday vs Weekend differences & time patterns

**Important:** The current dataset does *not* contain session-level timestamps or a column indicating day-of-week / weekday vs weekend. That means we cannot directly compute weekday vs weekend patterns or hourly time-of-day usage from this dataset.

If you have a session-level file with a timestamp (for example columns like `session_start`, `session_end`, or `day_of_week`), load it into a DataFrame named `sessions`, then uncomment and run the code in the template cell below. The template shows how to create `day_of_week` and `day_type` and aggregate by hour.

In [None]:
# Cell: Template for weekday vs weekend analysis (run if you have session-level timestamps)
# As written, this cell only prints a reminder; the analysis code below is
# commented out because no `sessions` frame is defined in this section.
#
# The example assumes a DataFrame `sessions` with columns:
#   'user_id', 'session_start' (datetime), 'duration_min' (numeric)
# and optionally 'Primary_Device' or 'Activity_Category'.
# It derives 'day_of_week', 'hour' and 'day_type', then averages session
# duration per (hour, day_type).
# `.dt.weekday` numbers Monday as 0 and Sunday as 6, so `x >= 5` selects
# Saturday and Sunday.

# sessions['session_start'] = pd.to_datetime(sessions['session_start'])
# sessions['day_of_week'] = sessions['session_start'].dt.day_name()
# sessions['hour'] = sessions['session_start'].dt.hour
# sessions['day_type'] = sessions['session_start'].dt.weekday.apply(lambda x: 'Weekend' if x>=5 else 'Weekday')
# agg = sessions.groupby(['hour','day_type'])['duration_min'].mean().unstack()
# agg.plot(kind='bar')
# plt.title('Average session duration: Weekday vs Weekend by hour')
# plt.show()

print('This is a template. Replace `sessions` with your session-level DataFrame and uncomment code to run.')

In [None]:
# Cell: Auto-detect day columns and run weekday/weekend analysis if available
# Looks for a column whose name suggests a day / day-type field. If the first
# match holds explicit 'Weekday' / 'Weekend' labels (any casing, surrounding
# whitespace ignored), prints the mean daily screen time for each label.
day_col_names = {'daytype', 'day_type', 'day_of_week', 'dayofweek', 'day', 'weekday'}
# str(c) keeps detection working when a column label is not a string.
possible_cols = [c for c in df.columns if str(c).lower() in day_col_names]
print('Detected potential day columns:', possible_cols)
if not possible_cols:
    print('No day-of-week / weekday columns found in dataset. Cannot compute weekday vs weekend differences.')
elif 'Avg_Daily_Screen_Time_hr' not in df.columns:
    print('Avg_Daily_Screen_Time_hr not found')
else:
    col = possible_cols[0]
    print('Using column:', col)
    s = df[col].astype(str)
    # Normalise the same way for detection and grouping, so 'weekday',
    # 'Weekday' and ' WEEKDAY ' all land in a single group.
    day_type = s.str.strip().str.lower()
    is_explicit = day_type.isin(['weekday', 'weekend'])
    if is_explicit.any():
        try:
            # Only rows with an explicit label are averaged; other values
            # (e.g. missing entries rendered as 'nan') are left out.
            agg = (df.loc[is_explicit, 'Avg_Daily_Screen_Time_hr']
                     .groupby(day_type[is_explicit].str.capitalize())
                     .mean())
        except (TypeError, ValueError) as e:
            # Raised when the screen-time column cannot be averaged
            # (e.g. it holds non-numeric data).
            print('Could not analyze day column:', e)
        else:
            print('Average screen time by', col)
            print(agg)
    else:
        print('Column found but does not contain explicit Weekday/Weekend values. Values sample:')
        print(s.unique()[:20])

## Observations & Peak Usage Cohorts

- (Auto-computed summaries are printed by the "Peak usage cohorts (summary)" cell above.)

- **Sample observations you can include:**
  - Which age groups have the highest average screen time
  - Which device(s) correspond to highest average screen time
  - Differences by gender or urban/rural status
  - Recommendations or next steps (e.g., collect session-level timestamps for weekday/weekend analysis)

