In [12]:
import pandas as pd
import numpy as np
import sklearn
import pickle

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
df = pd.read_csv('digital-lifestyle.csv')

In [3]:
df.head()

Unnamed: 0,id,age,gender,region,income_level,education_level,daily_role,device_hours_per_day,phone_unlocks,notifications_per_day,...,sleep_quality,anxiety_score,depression_score,stress_level,happiness_score,focus_score,high_risk_flag,device_type,productivity_score,digital_dependence_score
0,1,40,Female,Asia,High,High School,Part-time/Shift,3.54,45,561,...,3.353627,9.926651,5.0,6.593289,8.0,23.0,0,Android,70.0,25.7
1,2,27,Male,Africa,Lower-Mid,Master,Full-time Employee,5.65,100,393,...,2.908147,4.0,4.0,4.126926,8.1,35.0,0,Laptop,64.0,30.1
2,3,31,Male,North America,Lower-Mid,Bachelor,Full-time Employee,8.87,181,231,...,2.889213,4.0,8.0,1.429139,7.6,15.0,0,Android,65.299301,40.6
3,4,41,Female,Middle East,Low,Master,Caregiver/Home,4.05,94,268,...,3.097488,7.093357,9.0,4.995512,7.8,28.0,1,Tablet,80.0,36.684152
4,5,26,Female,Europe,Lower-Mid,Bachelor,Full-time Employee,13.07,199,91,...,2.786098,7.028125,15.0,9.448757,4.2,70.0,1,Android,65.299301,48.4


In [4]:
df.dtypes

id                            int64
age                           int64
gender                       object
region                       object
income_level                 object
education_level              object
daily_role                   object
device_hours_per_day        float64
phone_unlocks                 int64
notifications_per_day         int64
social_media_mins             int64
study_mins                    int64
physical_activity_days      float64
sleep_hours                 float64
sleep_quality               float64
anxiety_score               float64
depression_score            float64
stress_level                float64
happiness_score             float64
focus_score                 float64
high_risk_flag                int64
device_type                  object
productivity_score          float64
digital_dependence_score    float64
dtype: object

In [5]:
categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)

for cat_col in categorical_columns:
    df[cat_col] = df[cat_col].str.lower().str.replace(' ', '_')

In [6]:
df.head()

Unnamed: 0,id,age,gender,region,income_level,education_level,daily_role,device_hours_per_day,phone_unlocks,notifications_per_day,...,sleep_quality,anxiety_score,depression_score,stress_level,happiness_score,focus_score,high_risk_flag,device_type,productivity_score,digital_dependence_score
0,1,40,female,asia,high,high_school,part-time/shift,3.54,45,561,...,3.353627,9.926651,5.0,6.593289,8.0,23.0,0,android,70.0,25.7
1,2,27,male,africa,lower-mid,master,full-time_employee,5.65,100,393,...,2.908147,4.0,4.0,4.126926,8.1,35.0,0,laptop,64.0,30.1
2,3,31,male,north_america,lower-mid,bachelor,full-time_employee,8.87,181,231,...,2.889213,4.0,8.0,1.429139,7.6,15.0,0,android,65.299301,40.6
3,4,41,female,middle_east,low,master,caregiver/home,4.05,94,268,...,3.097488,7.093357,9.0,4.995512,7.8,28.0,1,tablet,80.0,36.684152
4,5,26,female,europe,lower-mid,bachelor,full-time_employee,13.07,199,91,...,2.786098,7.028125,15.0,9.448757,4.2,70.0,1,android,65.299301,48.4


In [7]:
df.isnull().sum()

id                          0
age                         0
gender                      0
region                      0
income_level                0
education_level             0
daily_role                  0
device_hours_per_day        0
phone_unlocks               0
notifications_per_day       0
social_media_mins           0
study_mins                  0
physical_activity_days      0
sleep_hours                 0
sleep_quality               0
anxiety_score               0
depression_score            0
stress_level                0
happiness_score             0
focus_score                 0
high_risk_flag              0
device_type                 0
productivity_score          0
digital_dependence_score    0
dtype: int64

In [8]:
categorical_features = list(df.dtypes[df.dtypes == 'object'].index)
categorical_features

['gender',
 'region',
 'income_level',
 'education_level',
 'daily_role',
 'device_type']

In [9]:
numerical = list(df.select_dtypes(include=['int64', 'float64']).columns)
numerical

['id',
 'age',
 'device_hours_per_day',
 'phone_unlocks',
 'notifications_per_day',
 'social_media_mins',
 'study_mins',
 'physical_activity_days',
 'sleep_hours',
 'sleep_quality',
 'anxiety_score',
 'depression_score',
 'stress_level',
 'happiness_score',
 'focus_score',
 'high_risk_flag',
 'productivity_score',
 'digital_dependence_score']

In [10]:
numerical_features = [col for col in numerical if col != 'high_risk_flag']
numerical_features

['id',
 'age',
 'device_hours_per_day',
 'phone_unlocks',
 'notifications_per_day',
 'social_media_mins',
 'study_mins',
 'physical_activity_days',
 'sleep_hours',
 'sleep_quality',
 'anxiety_score',
 'depression_score',
 'stress_level',
 'happiness_score',
 'focus_score',
 'productivity_score',
 'digital_dependence_score']

In [13]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [14]:
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [15]:
len(df_train), len(df_val), len(df_test)

(2100, 700, 700)

In [16]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [17]:
y_train = df_train.high_risk_flag.values
y_val = df_val.high_risk_flag.values
y_test = df_test.high_risk_flag.values

In [18]:
del df_train['high_risk_flag']
del df_val['high_risk_flag']
del df_test['high_risk_flag']

In [19]:
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical_features + numerical_features].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical_features + numerical_features].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [20]:
C_values = [0.01, 0.1, 1, 10, 100]
accuracies = {}

for C in C_values:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict_proba(X_val)[:, 1]
    convert_decision = (y_pred >= 0.5)
    accuracy = (y_val == convert_decision).mean().round(3)
    accuracies[C] = accuracy

for c, a in accuracies.items():
    print(f"C={c}: accuracy={a}")

best_C = min([c for c in C_values if accuracies[c] == max(accuracies.values())])
print(f"\n Best C: {best_C} with accuracy {accuracies[best_C]}")

C=0.01: accuracy=0.811
C=0.1: accuracy=0.817
C=1: accuracy=0.823
C=10: accuracy=0.823
C=100: accuracy=0.819

 Best C: 1 with accuracy 0.823
