In [None]:
import numpy as np
import pandas as pd

np.random.seed(42)
rows = 20000

df = pd.DataFrame({

    # üõå Sleep & Routine
    "sleep_hours": np.round(np.random.uniform(3, 10, rows), 1),
    "sleep_quality": np.random.randint(0, 11, rows),
    "bed_time_period": np.random.choice(
        ["Early", "Normal", "Late"], rows, p=[0.3, 0.5, 0.2]
    ),
    "sleep_consistency": np.random.choice(
        ["Regular", "Irregular"], rows, p=[0.65, 0.35]
    ),
    "nap_minutes": np.random.randint(0, 121, rows),
    "sleep_disturbances": np.random.randint(0, 6, rows),

    # üì± Digital Habits (CLEANED)
    "screen_time_hours": np.round(np.random.uniform(1, 12, rows), 1),
    "late_night_screen": np.random.choice(["Yes", "No"], rows, p=[0.4, 0.6]),
    "app_usage_type": np.random.choice(
        ["Work", "Social", "Entertainment"], rows, p=[0.35, 0.4, 0.25]
    ),

    # üçΩ Food & Health
    "daily_calories": np.random.randint(1200, 4000, rows),
    "junk_food_servings": np.random.randint(0, 6, rows),
    "healthy_meals_count": np.random.randint(0, 4, rows),
    "water_intake_liters": np.round(np.random.uniform(1, 5, rows), 1),
    "diet_type": np.random.choice(
        ["Vegetarian", "Non-Vegetarian", "Vegan"], rows, p=[0.4, 0.45, 0.15]
    ),
    "meal_timings_regular": np.random.choice(["Yes", "No"], rows, p=[0.6, 0.4]),

    # üèÉ Physical Activity
    "physical_activity_minutes": np.random.randint(0, 180, rows),
    "exercise_type": np.random.choice(
        ["None", "Cardio", "Strength", "Yoga"], rows, p=[0.3, 0.3, 0.25, 0.15]
    ),
    "activity_intensity": np.random.choice(
        ["Low", "Moderate", "High"], rows, p=[0.4, 0.4, 0.2]
    ),

    # üß† Mental & Productivity
    "stress_level": np.random.randint(0, 11, rows),
    "focus_level": np.random.randint(0, 11, rows),

    # üåç Lifestyle Context
    "occupation_type": np.random.choice(
        ["Student", "Employee", "Freelancer"], rows, p=[0.35, 0.45, 0.2]
    ),
    "outdoor_time_minutes": np.random.randint(0, 240, rows),
    "work_environment": np.random.choice(
        ["Home", "Office", "Hybrid"], rows, p=[0.4, 0.4, 0.2]
    ),
    "habit_streak_days": np.random.randint(0, 120, rows),
    "reward_indulgence": np.random.choice(["Yes", "No"], rows, p=[0.4, 0.6])
})

# üéØ Habit Quality Score (Regression Target)
habit_score = (
    df.sleep_hours * 6 +
    df.sleep_quality * 4 +
    df.physical_activity_minutes * 0.12 +
    df.healthy_meals_count * 5 +
    df.water_intake_liters * 4 +
    df.focus_level * 3 +
    df.habit_streak_days * 0.2 -
    df.screen_time_hours * 2 -
    df.junk_food_servings * 4 -
    df.stress_level * 3
)

df["habit_quality_score"] = np.clip(
    (habit_score - habit_score.min()) /
    (habit_score.max() - habit_score.min()) * 100,
    0, 100
).round(2)

# Save dataset
df.to_csv("habit_dataset", index=False)

print(df.shape)
print(df.head())
print(df.dtypes)


(20000, 26)
   sleep_hours  sleep_quality bed_time_period sleep_consistency  nap_minutes  \
0          5.6              5          Normal           Regular           82   
1          9.7              1            Late         Irregular           93   
2          8.1              3           Early           Regular           83   
3          7.2              6          Normal           Regular           51   
4          4.1             10          Normal           Regular           19   

   sleep_disturbances  screen_time_hours late_night_screen app_usage_type  \
0                   3                6.6               Yes         Social   
1                   4                1.4               Yes         Social   
2                   2               11.6               Yes  Entertainment   
3                   4               11.7               Yes  Entertainment   
4                   4                9.2               Yes           Work   

   daily_calories  ...  exercise_type  activ

In [171]:
df

Unnamed: 0,sleep_hours,sleep_quality,bed_time_period,sleep_consistency,nap_minutes,sleep_disturbances,screen_time_hours,late_night_screen,app_usage_type,daily_calories,...,exercise_type,activity_intensity,stress_level,focus_level,occupation_type,outdoor_time_minutes,work_environment,habit_streak_days,reward_indulgence,habit_quality_score
0,5.6,5,Normal,Regular,82,3,6.6,Yes,Social,3701,...,Yoga,Moderate,1,8,Employee,192,Home,38,No,55.81
1,9.7,1,Late,Irregular,93,4,1.4,Yes,Social,3242,...,Cardio,Moderate,10,9,Employee,181,Hybrid,111,Yes,50.35
2,8.1,3,Early,Regular,83,2,11.6,Yes,Entertainment,2374,...,Cardio,Low,10,8,Employee,40,Office,26,No,39.35
3,7.2,6,Normal,Regular,51,4,11.7,Yes,Entertainment,2068,...,Yoga,Moderate,1,9,Freelancer,221,Home,26,Yes,57.97
4,4.1,10,Normal,Regular,19,4,9.2,Yes,Work,1439,...,,Moderate,6,2,Student,103,Hybrid,29,Yes,44.42
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,9.1,8,Normal,Regular,77,0,7.5,Yes,Social,3429,...,,High,0,4,Freelancer,223,Home,17,No,71.58
19996,3.3,4,Early,Regular,10,5,3.0,No,Social,2080,...,Strength,Moderate,6,6,Student,92,Home,96,No,39.06
19997,5.1,10,Normal,Regular,11,5,2.5,Yes,Social,1557,...,Cardio,Moderate,10,0,Employee,75,Office,96,Yes,51.16
19998,6.1,8,Normal,Irregular,82,3,3.6,No,Social,1547,...,,Moderate,1,9,Employee,195,Home,96,No,77.86


In [172]:
df.shape

(20000, 26)

In [173]:
df.columns

Index(['sleep_hours', 'sleep_quality', 'bed_time_period', 'sleep_consistency',
       'nap_minutes', 'sleep_disturbances', 'screen_time_hours',
       'late_night_screen', 'app_usage_type', 'daily_calories',
       'junk_food_servings', 'healthy_meals_count', 'water_intake_liters',
       'diet_type', 'meal_timings_regular', 'physical_activity_minutes',
       'exercise_type', 'activity_intensity', 'stress_level', 'focus_level',
       'occupation_type', 'outdoor_time_minutes', 'work_environment',
       'habit_streak_days', 'reward_indulgence', 'habit_quality_score'],
      dtype='object')

In [174]:
df.head()

Unnamed: 0,sleep_hours,sleep_quality,bed_time_period,sleep_consistency,nap_minutes,sleep_disturbances,screen_time_hours,late_night_screen,app_usage_type,daily_calories,...,exercise_type,activity_intensity,stress_level,focus_level,occupation_type,outdoor_time_minutes,work_environment,habit_streak_days,reward_indulgence,habit_quality_score
0,5.6,5,Normal,Regular,82,3,6.6,Yes,Social,3701,...,Yoga,Moderate,1,8,Employee,192,Home,38,No,55.81
1,9.7,1,Late,Irregular,93,4,1.4,Yes,Social,3242,...,Cardio,Moderate,10,9,Employee,181,Hybrid,111,Yes,50.35
2,8.1,3,Early,Regular,83,2,11.6,Yes,Entertainment,2374,...,Cardio,Low,10,8,Employee,40,Office,26,No,39.35
3,7.2,6,Normal,Regular,51,4,11.7,Yes,Entertainment,2068,...,Yoga,Moderate,1,9,Freelancer,221,Home,26,Yes,57.97
4,4.1,10,Normal,Regular,19,4,9.2,Yes,Work,1439,...,,Moderate,6,2,Student,103,Hybrid,29,Yes,44.42


In [175]:
df.tail()

Unnamed: 0,sleep_hours,sleep_quality,bed_time_period,sleep_consistency,nap_minutes,sleep_disturbances,screen_time_hours,late_night_screen,app_usage_type,daily_calories,...,exercise_type,activity_intensity,stress_level,focus_level,occupation_type,outdoor_time_minutes,work_environment,habit_streak_days,reward_indulgence,habit_quality_score
19995,9.1,8,Normal,Regular,77,0,7.5,Yes,Social,3429,...,,High,0,4,Freelancer,223,Home,17,No,71.58
19996,3.3,4,Early,Regular,10,5,3.0,No,Social,2080,...,Strength,Moderate,6,6,Student,92,Home,96,No,39.06
19997,5.1,10,Normal,Regular,11,5,2.5,Yes,Social,1557,...,Cardio,Moderate,10,0,Employee,75,Office,96,Yes,51.16
19998,6.1,8,Normal,Irregular,82,3,3.6,No,Social,1547,...,,Moderate,1,9,Employee,195,Home,96,No,77.86
19999,4.2,0,Normal,Regular,41,3,3.5,Yes,Work,3865,...,Strength,Low,3,1,Freelancer,150,Office,62,No,47.85


In [176]:
df.isna().sum()

sleep_hours                  0
sleep_quality                0
bed_time_period              0
sleep_consistency            0
nap_minutes                  0
sleep_disturbances           0
screen_time_hours            0
late_night_screen            0
app_usage_type               0
daily_calories               0
junk_food_servings           0
healthy_meals_count          0
water_intake_liters          0
diet_type                    0
meal_timings_regular         0
physical_activity_minutes    0
exercise_type                0
activity_intensity           0
stress_level                 0
focus_level                  0
occupation_type              0
outdoor_time_minutes         0
work_environment             0
habit_streak_days            0
reward_indulgence            0
habit_quality_score          0
dtype: int64

In [177]:
# first of all we need to encode the object types
df.dtypes

sleep_hours                  float64
sleep_quality                  int64
bed_time_period               object
sleep_consistency             object
nap_minutes                    int64
sleep_disturbances             int64
screen_time_hours            float64
late_night_screen             object
app_usage_type                object
daily_calories                 int64
junk_food_servings             int64
healthy_meals_count            int64
water_intake_liters          float64
diet_type                     object
meal_timings_regular          object
physical_activity_minutes      int64
exercise_type                 object
activity_intensity            object
stress_level                   int64
focus_level                    int64
occupation_type               object
outdoor_time_minutes           int64
work_environment              object
habit_streak_days              int64
reward_indulgence             object
habit_quality_score          float64
dtype: object

In [178]:
from sklearn.preprocessing import LabelEncoder
lab=LabelEncoder()
df['bed_time_period']=lab.fit_transform(df['bed_time_period'])
df['sleep_consistency']=lab.fit_transform(df['sleep_consistency'])
df['late_night_screen']=lab.fit_transform(df['late_night_screen'])
df['app_usage_type']=lab.fit_transform(df['app_usage_type'])
df['diet_type']=lab.fit_transform(df['diet_type'])
df['meal_timings_regular']=lab.fit_transform(df['meal_timings_regular'])
df['exercise_type']=lab.fit_transform(df['exercise_type'])
df['activity_intensity']=lab.fit_transform(df['activity_intensity'])
df['occupation_type']=lab.fit_transform(df['occupation_type'])
df['work_environment']=lab.fit_transform(df['work_environment'])
df['reward_indulgence']=lab.fit_transform(df['reward_indulgence'])

In [179]:
df

Unnamed: 0,sleep_hours,sleep_quality,bed_time_period,sleep_consistency,nap_minutes,sleep_disturbances,screen_time_hours,late_night_screen,app_usage_type,daily_calories,...,exercise_type,activity_intensity,stress_level,focus_level,occupation_type,outdoor_time_minutes,work_environment,habit_streak_days,reward_indulgence,habit_quality_score
0,5.6,5,2,1,82,3,6.6,1,1,3701,...,3,2,1,8,0,192,0,38,0,55.81
1,9.7,1,1,0,93,4,1.4,1,1,3242,...,0,2,10,9,0,181,1,111,1,50.35
2,8.1,3,0,1,83,2,11.6,1,0,2374,...,0,1,10,8,0,40,2,26,0,39.35
3,7.2,6,2,1,51,4,11.7,1,0,2068,...,3,2,1,9,1,221,0,26,1,57.97
4,4.1,10,2,1,19,4,9.2,1,2,1439,...,1,2,6,2,2,103,1,29,1,44.42
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,9.1,8,2,1,77,0,7.5,1,1,3429,...,1,0,0,4,1,223,0,17,0,71.58
19996,3.3,4,0,1,10,5,3.0,0,1,2080,...,2,2,6,6,2,92,0,96,0,39.06
19997,5.1,10,2,1,11,5,2.5,1,1,1557,...,0,2,10,0,0,75,2,96,1,51.16
19998,6.1,8,2,0,82,3,3.6,0,1,1547,...,1,2,1,9,0,195,0,96,0,77.86


In [180]:
x=df.iloc[:,:-1]
x

Unnamed: 0,sleep_hours,sleep_quality,bed_time_period,sleep_consistency,nap_minutes,sleep_disturbances,screen_time_hours,late_night_screen,app_usage_type,daily_calories,...,physical_activity_minutes,exercise_type,activity_intensity,stress_level,focus_level,occupation_type,outdoor_time_minutes,work_environment,habit_streak_days,reward_indulgence
0,5.6,5,2,1,82,3,6.6,1,1,3701,...,170,3,2,1,8,0,192,0,38,0
1,9.7,1,1,0,93,4,1.4,1,1,3242,...,40,0,2,10,9,0,181,1,111,1
2,8.1,3,0,1,83,2,11.6,1,0,2374,...,75,0,1,10,8,0,40,2,26,0
3,7.2,6,2,1,51,4,11.7,1,0,2068,...,14,3,2,1,9,1,221,0,26,1
4,4.1,10,2,1,19,4,9.2,1,2,1439,...,77,1,2,6,2,2,103,1,29,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,9.1,8,2,1,77,0,7.5,1,1,3429,...,92,1,0,0,4,1,223,0,17,0
19996,3.3,4,0,1,10,5,3.0,0,1,2080,...,37,2,2,6,6,2,92,0,96,0
19997,5.1,10,2,1,11,5,2.5,1,1,1557,...,160,0,2,10,0,0,75,2,96,1
19998,6.1,8,2,0,82,3,3.6,0,1,1547,...,84,1,2,1,9,0,195,0,96,0


In [181]:
y=df.iloc[:,-1]
y

0        55.81
1        50.35
2        39.35
3        57.97
4        44.42
         ...  
19995    71.58
19996    39.06
19997    51.16
19998    77.86
19999    47.85
Name: habit_quality_score, Length: 20000, dtype: float64

In [182]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.30,random_state=42)

In [183]:
x_train

Unnamed: 0,sleep_hours,sleep_quality,bed_time_period,sleep_consistency,nap_minutes,sleep_disturbances,screen_time_hours,late_night_screen,app_usage_type,daily_calories,...,physical_activity_minutes,exercise_type,activity_intensity,stress_level,focus_level,occupation_type,outdoor_time_minutes,work_environment,habit_streak_days,reward_indulgence
17218,7.3,5,0,0,103,5,2.0,0,2,2134,...,99,2,1,2,1,2,30,2,52,0
15188,4.5,9,2,1,106,3,5.9,1,2,3923,...,157,2,0,9,8,1,192,1,104,0
11295,3.1,10,0,1,73,1,12.0,0,1,2784,...,93,3,1,0,5,0,173,2,116,0
19772,3.2,4,1,0,18,2,2.7,0,0,3722,...,55,0,2,7,6,0,229,1,55,0
13072,4.3,4,0,1,85,2,6.6,0,1,3696,...,102,1,0,5,4,2,107,1,7,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11284,9.2,9,2,1,36,4,11.1,1,1,2632,...,20,2,1,4,7,1,153,0,49,0
11964,6.8,6,0,1,85,3,6.2,1,1,3233,...,105,0,2,3,10,2,47,2,37,1
5390,4.0,9,0,1,55,2,2.3,0,1,3953,...,156,3,2,3,7,2,133,2,21,1
860,8.4,8,2,1,40,1,10.2,1,2,1685,...,6,1,2,0,8,2,9,0,89,0


In [184]:
y_train

17218    52.01
15188    53.20
11295    61.14
19772    39.04
13072    47.14
         ...  
11284    55.30
11964    66.61
5390     54.76
860      59.81
15795    57.38
Name: habit_quality_score, Length: 14000, dtype: float64

In [185]:
x_test

Unnamed: 0,sleep_hours,sleep_quality,bed_time_period,sleep_consistency,nap_minutes,sleep_disturbances,screen_time_hours,late_night_screen,app_usage_type,daily_calories,...,physical_activity_minutes,exercise_type,activity_intensity,stress_level,focus_level,occupation_type,outdoor_time_minutes,work_environment,habit_streak_days,reward_indulgence
10650,9.6,4,1,1,15,4,10.2,0,2,1444,...,124,0,1,9,10,2,109,0,62,0
2041,6.9,5,0,1,47,1,7.2,1,1,2445,...,81,2,1,5,8,0,94,0,69,1
8668,4.8,10,0,1,46,1,10.7,0,0,2779,...,140,3,1,1,4,0,194,1,42,0
1114,7.2,9,2,0,16,2,7.4,0,0,1704,...,24,0,2,7,0,1,130,2,93,0
13902,7.3,3,0,1,96,1,3.1,1,2,2920,...,21,2,1,4,5,0,82,0,44,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4464,7.7,2,2,1,75,3,6.8,0,2,2435,...,156,1,1,10,0,0,40,2,114,0
15656,5.8,9,1,1,96,5,3.2,1,0,1285,...,11,0,1,2,9,0,237,2,93,1
19146,5.0,4,2,1,32,4,7.5,0,2,3698,...,73,2,0,2,7,0,65,2,55,0
10567,6.0,10,1,0,46,4,11.8,1,2,2060,...,143,1,0,2,4,0,131,0,21,1


In [186]:
y_test

10650    60.80
2041     55.87
8668     62.67
1114     56.05
13902    50.01
         ...  
4464     33.75
15656    74.25
19146    42.05
10567    62.35
9165     79.64
Name: habit_quality_score, Length: 6000, dtype: float64

In [187]:
# Linear Regression
from sklearn.linear_model import LinearRegression
model=LinearRegression()
model.fit(x_train,y_train)
y_pred=model.predict(x_test)
y_pred

array([60.7959326 , 55.86762539, 62.67428782, ..., 42.05222084,
       62.35139692, 79.64066712], shape=(6000,))

In [188]:
from sklearn.metrics import r2_score

r2 = r2_score(y_test, y_pred)
print("R2 Score:", r2)


R2 Score: 0.99999995342867


In [189]:
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(y_test, y_pred)
print("MAE:", mae)


MAE: 0.0024880509075574475


In [190]:
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test, y_pred)
print("MSE:", mse)


MSE: 8.305321888425161e-06


In [191]:
import numpy as np

rmse = np.sqrt(mse)
print("RMSE:", rmse)


RMSE: 0.0028818955373894387


In [192]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

model = LinearRegression()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print("R2 Score:", r2_score(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))


R2 Score: 0.99999995342867
MAE: 0.0024880509075574475
MSE: 8.305321888425161e-06
RMSE: 0.0028818955373894387


In [194]:
#model creation
from sklearn.tree import DecisionTreeRegressor
model1=DecisionTreeRegressor()
model1.fit(x_train,y_train)
y_pred1=model1.predict(x_test)
y_pred1

array([61.28, 58.15, 59.42, ..., 45.61, 67.62, 83.58], shape=(6000,))

In [195]:
decision_r2= r2_score(y_test, y_pred1)
print("R2 Score:",decision_r2)

R2 Score: 0.6510015147838526


In [196]:
decision_mae = mean_absolute_error(y_test, y_pred1)
print("MAE:",decision_mae)

MAE: 6.293276666666667


In [197]:
from sklearn.metrics import mean_squared_error

decision_mse = mean_squared_error(y_test, y_pred1)
print("MSE:",decision_mse)

MSE: 62.23882283333334


In [198]:
import numpy as np

decision_rmse = np.sqrt(mse)
print("RMSE:",decision_rmse)

RMSE: 0.0028818955373894387


In [199]:
# Random Forest
from sklearn.ensemble import RandomForestRegressor
random_forest=RandomForestRegressor(n_estimators=100)
random_forest.fit(x_train,y_train)
y_pred2=random_forest.predict(x_test)
y_pred2

array([58.157 , 56.1114, 54.48  , ..., 45.6368, 55.7756, 76.1976],
      shape=(6000,))

In [200]:
random_r2= r2_score(y_test, y_pred2)
print("R2 Score:",random_r2)

R2 Score: 0.892793305163231


In [201]:
random_mae = mean_absolute_error(y_test, y_pred2)
print("MAE:",random_mae)

MAE: 3.5293051999999996


In [202]:
from sklearn.metrics import mean_squared_error

random_mse = mean_squared_error(y_test, y_pred2)
print("MSE:",random_mse)

MSE: 19.11876059393333


In [203]:
import numpy as np

random_rmse = np.sqrt(random_mse)
print("RMSE:",random_rmse)

RMSE: 4.372500496733343
