# Importing Libraries

In [460]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import LabelEncoder,StandardScaler,OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report

# Project 
## Screentime ML Predictive Analysis Of Indian Kids (2025)

In [461]:
df=pd.read_csv(r"c:\Users\user\Desktop\Kaggle Datasets\Indian_Kids_Screen_Time.csv")

# Data Info

In [462]:
df.shape

(9712, 8)

In [463]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9712 entries, 0 to 9711
Data columns (total 8 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Age                                9712 non-null   int64  
 1   Gender                             9712 non-null   object 
 2   Avg_Daily_Screen_Time_hr           9712 non-null   float64
 3   Primary_Device                     9712 non-null   object 
 4   Exceeded_Recommended_Limit         9712 non-null   bool   
 5   Educational_to_Recreational_Ratio  9712 non-null   float64
 6   Health_Impacts                     6494 non-null   object 
 7   Urban_or_Rural                     9712 non-null   object 
dtypes: bool(1), float64(2), int64(1), object(4)
memory usage: 540.7+ KB


In [464]:
df.head() 

Unnamed: 0,Age,Gender,Avg_Daily_Screen_Time_hr,Primary_Device,Exceeded_Recommended_Limit,Educational_to_Recreational_Ratio,Health_Impacts,Urban_or_Rural
0,14,Male,3.99,Smartphone,True,0.42,"Poor Sleep, Eye Strain",Urban
1,11,Female,4.61,Laptop,True,0.3,Poor Sleep,Urban
2,18,Female,3.73,TV,True,0.32,Poor Sleep,Urban
3,15,Female,1.21,Laptop,False,0.39,,Urban
4,12,Female,5.89,Smartphone,True,0.49,"Poor Sleep, Anxiety",Urban


# Data  Handling & Cleaning 

In [465]:
df.isna().sum()
df["Health_Impacts"]=df["Health_Impacts"].fillna(df["Health_Impacts"].mode()[0])
df.isna().sum()
df.duplicated().sum()

np.int64(47)

In [466]:
df["Health_Impacts"]=df["Health_Impacts"].str.split(",")
df = df.explode("Health_Impacts").reset_index(drop=True)
df["Health_Impacts"] = df["Health_Impacts"].str.strip()

In [467]:
df.head()

Unnamed: 0,Age,Gender,Avg_Daily_Screen_Time_hr,Primary_Device,Exceeded_Recommended_Limit,Educational_to_Recreational_Ratio,Health_Impacts,Urban_or_Rural
0,14,Male,3.99,Smartphone,True,0.42,Poor Sleep,Urban
1,14,Male,3.99,Smartphone,True,0.42,Eye Strain,Urban
2,11,Female,4.61,Laptop,True,0.3,Poor Sleep,Urban
3,18,Female,3.73,TV,True,0.32,Poor Sleep,Urban
4,15,Female,1.21,Laptop,False,0.39,Poor Sleep,Urban


# Machine Learning

In [468]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13290 entries, 0 to 13289
Data columns (total 8 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Age                                13290 non-null  int64  
 1   Gender                             13290 non-null  object 
 2   Avg_Daily_Screen_Time_hr           13290 non-null  float64
 3   Primary_Device                     13290 non-null  object 
 4   Exceeded_Recommended_Limit         13290 non-null  bool   
 5   Educational_to_Recreational_Ratio  13290 non-null  float64
 6   Health_Impacts                     13290 non-null  object 
 7   Urban_or_Rural                     13290 non-null  object 
dtypes: bool(1), float64(2), int64(1), object(4)
memory usage: 739.9+ KB


In [469]:
df_encoded=df.drop(columns=["Gender"],inplace=True)
df_encoded = pd.get_dummies(df.select_dtypes(include=["object", "bool"]))
df_encoded=df_encoded.astype("int")  #Converting bool columns into numeric for machine learning proccess 
numeric_df = df.select_dtypes(include=["int64", "float64"])
final_df = pd.concat([df_encoded, numeric_df], axis=1)

In [470]:
final_df

Unnamed: 0,Exceeded_Recommended_Limit,Primary_Device_Laptop,Primary_Device_Smartphone,Primary_Device_TV,Primary_Device_Tablet,Health_Impacts_Anxiety,Health_Impacts_Eye Strain,Health_Impacts_Obesity Risk,Health_Impacts_Poor Sleep,Urban_or_Rural_Rural,Urban_or_Rural_Urban,Age,Avg_Daily_Screen_Time_hr,Educational_to_Recreational_Ratio
0,1,0,1,0,0,0,0,0,1,0,1,14,3.99,0.42
1,1,0,1,0,0,0,1,0,0,0,1,14,3.99,0.42
2,1,1,0,0,0,0,0,0,1,0,1,11,4.61,0.30
3,1,0,0,1,0,0,0,0,1,0,1,18,3.73,0.32
4,0,1,0,0,0,0,0,0,1,0,1,15,1.21,0.39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13285,1,0,1,0,0,0,0,0,1,1,0,16,5.62,0.39
13286,1,0,1,0,0,0,1,0,0,1,0,16,5.62,0.39
13287,1,0,1,0,0,1,0,0,0,1,0,16,5.62,0.39
13288,1,0,0,1,0,0,0,0,1,0,1,17,5.60,0.43


# Logistic Regression Model 

In [471]:
x=final_df.drop(columns=["Exceeded_Recommended_Limit"])
y=final_df["Exceeded_Recommended_Limit"]
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)
scaler=StandardScaler()
x_train_scaled=scaler.fit_transform(x_train)
x_test_scaled=scaler.transform(x_test)

In [None]:
model_lg=LogisticRegression()
model_lg.fit(x_train_scaled,y_train)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [None]:
y_pred=model_lg.predict(x_test_scaled)
acc=accuracy_score(y_test,y_pred)
acc

0.9811888638073739

In [474]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# If you have predicted probabilities
y_prob = model_lg.predict_proba(x_test_scaled)[:, 1]
print("ROC-AUC Score:", roc_auc_score(y_test, y_prob))


Confusion Matrix:
 [[ 267   30]
 [  20 2341]]

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.90      0.91       297
           1       0.99      0.99      0.99      2361

    accuracy                           0.98      2658
   macro avg       0.96      0.95      0.95      2658
weighted avg       0.98      0.98      0.98      2658

ROC-AUC Score: 0.9970508416082324


In [476]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

acc = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {acc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


Accuracy: 0.9812
Precision: 0.9873
Recall: 0.9915
F1 Score: 0.9894
