In [1]:
import pandas as pd
import numpy as np

## Load Dataset 

In [2]:
# Load the Indian_Kids_Screen_Time dataset; the relative path is resolved
# against the notebook's current working directory.
data = pd.read_csv("Indian_Kids_Screen_Time.csv")

In [3]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,Age,Gender,Avg_Daily_Screen_Time_hr,Primary_Device,Exceeded_Recommended_Limit,Educational_to_Recreational_Ratio,Health_Impacts,Urban_or_Rural
0,14,Male,3.99,Smartphone,True,0.42,"Poor Sleep, Eye Strain",Urban
1,11,Female,4.61,Laptop,True,0.3,Poor Sleep,Urban
2,18,Female,3.73,TV,True,0.32,Poor Sleep,Urban
3,15,Female,1.21,Laptop,False,0.39,,Urban
4,12,Female,5.89,Smartphone,True,0.49,"Poor Sleep, Anxiety",Urban


## Check Null Values

In [5]:
# Count missing values in every column.
data.isna().sum()

Age                                     0
Gender                                  0
Avg_Daily_Screen_Time_hr                0
Primary_Device                          0
Exceeded_Recommended_Limit              0
Educational_to_Recreational_Ratio       0
Health_Impacts                       3218
Urban_or_Rural                          0
dtype: int64

## Check Duplicate Rows

In [6]:
# Count rows that exactly repeat an earlier row; the first occurrence of
# each repeated row is not counted.
data.duplicated(keep="first").sum()

44

In [11]:
# Work on an independent copy of the loaded frame. Re-wrapping with
# pd.DataFrame(data) does not copy the underlying data, so later edits to
# `df` could leak back into `data`.
df = data.copy()
print(df)


      Age  Gender  Avg_Daily_Screen_Time_hr Primary_Device  \
0      14    Male                      3.99     Smartphone   
1      11  Female                      4.61         Laptop   
2      18  Female                      3.73             TV   
3      15  Female                      1.21         Laptop   
4      12  Female                      5.89     Smartphone   
...   ...     ...                       ...            ...   
9707   17    Male                      3.26     Smartphone   
9708   17  Female                      4.43     Smartphone   
9709   16    Male                      5.62     Smartphone   
9710   17    Male                      5.60             TV   
9711   15  Female                      6.12             TV   

      Exceeded_Recommended_Limit  Educational_to_Recreational_Ratio  \
0                           True                               0.42   
1                           True                               0.30   
2                           True          

## Check Correlation

In [14]:
# Pairwise Pearson correlation restricted to numeric columns; the boolean
# Exceeded_Recommended_Limit column is included in the result.
df.corr(method="pearson", numeric_only=True)


Unnamed: 0,Age,Avg_Daily_Screen_Time_hr,Exceeded_Recommended_Limit,Educational_to_Recreational_Ratio
Age,1.0,0.118328,0.159173,-0.488617
Avg_Daily_Screen_Time_hr,0.118328,1.0,0.66495,-0.087552
Exceeded_Recommended_Limit,0.159173,0.66495,1.0,-0.126643
Educational_to_Recreational_Ratio,-0.488617,-0.087552,-0.126643,1.0


## Check Info of Columns

In [15]:
# Print a summary of the frame: index range, per-column non-null counts,
# dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9712 entries, 0 to 9711
Data columns (total 8 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Age                                9712 non-null   int64  
 1   Gender                             9712 non-null   object 
 2   Avg_Daily_Screen_Time_hr           9712 non-null   float64
 3   Primary_Device                     9712 non-null   object 
 4   Exceeded_Recommended_Limit         9712 non-null   bool   
 5   Educational_to_Recreational_Ratio  9712 non-null   float64
 6   Health_Impacts                     6494 non-null   object 
 7   Urban_or_Rural                     9712 non-null   object 
dtypes: bool(1), float64(2), int64(1), object(4)
memory usage: 540.7+ KB


## Check Data Types of Columns

In [18]:
# Show the dtype of each column of the originally loaded frame `data`.
data.dtypes

Age                                    int64
Gender                                object
Avg_Daily_Screen_Time_hr             float64
Primary_Device                        object
Exceeded_Recommended_Limit              bool
Educational_to_Recreational_Ratio    float64
Health_Impacts                        object
Urban_or_Rural                        object
dtype: object

## IQR Outlier Detection (for ML Models)

In [19]:
# Flag outliers in each numeric column with the 1.5 * IQR rule: a value is an
# outlier when it lies outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
numeric_cols = ['Age', 'Avg_Daily_Screen_Time_hr', 'Educational_to_Recreational_Ratio']

# One boolean "<column>_Outlier" flag per numeric column, aligned to df's index.
outlier_flags = pd.DataFrame(index=df.index)
for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # This assignment must stay inside the loop. If it is dedented, only the
    # last column in numeric_cols gets a flag, computed from its own bounds.
    # `between` is inclusive on both ends, so values exactly on a bound are
    # not flagged.
    outlier_flags[col + '_Outlier'] = ~df[col].between(lower_bound, upper_bound)

# A row is an outlier overall if any per-column flag is set. This is computed
# before 'Any_Outlier' exists, so it only looks at the per-column flags.
outlier_flags['Any_Outlier'] = outlier_flags.any(axis=1)
df_outliers = df.join(outlier_flags)
print("Outlier Counts:\n", outlier_flags.sum())
print("\nSample Outliers:\n", df_outliers[df_outliers['Any_Outlier']].head(10))


Outlier Counts:
 Age_Outlier                                    0
Avg_Daily_Screen_Time_hr_Outlier             448
Educational_to_Recreational_Ratio_Outlier      0
Any_Outlier                                  448
dtype: int64

Sample Outliers:
      Age  Gender  Avg_Daily_Screen_Time_hr Primary_Device  \
19     9  Female                      0.00             TV   
28     8    Male                      0.31     Smartphone   
36    10    Male                     11.68             TV   
46     9  Female                      0.00     Smartphone   
58     9    Male                      0.00     Smartphone   
92    10  Female                      0.00         Tablet   
96    10    Male                      0.00     Smartphone   
107    9  Female                      0.45     Smartphone   
145    9  Female                      0.00         Tablet   
180    8    Male                      0.30             TV   

     Exceeded_Recommended_Limit  Educational_to_Recreational_Ratio  \
19           