In [19]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Generate an anonymous user ID: "user_" followed by eight random hex digits.
# The ID is drawn from its own seeded generator. It used to be drawn from the
# global NumPy state *before* np.random.seed(42) ran, so it changed on every
# fresh run even though the rest of the cell was seeded.
id_rng = np.random.RandomState(42)
user_id = "user_" + ''.join(id_rng.choice(list('0123456789ABCDEF'), 8))

# Seed the global generator for the record draws below. The seed is still set
# right before the first record draw, so the generated records are unchanged.
np.random.seed(42)  # For reproducibility

# Generate data for "home" chores (45 rows; half of them on time)
home_records = {
    "user_id": [user_id] * 45,
    "total_days_from_input": np.random.randint(1, 10, 45),
    "amount_days_late": np.random.choice([0, 1, 2, 3, 4, 5, 6], 45, p=[0.5, 0.1, 0.1, 0.1, 0.1, 0.05, 0.05]),
    "avg_mood_int": np.random.normal(4, 1, 45).round().astype(int),
    "avg_sleep_hours_int": np.random.normal(6.5, 1, 45).round().astype(int),
    "medication_taken": np.random.choice([0, 1], size=45, p=[0.8, 0.2]),
    "chore_type": [0] * 45  # 0 for 'home'
}

# Generate data for "sports" chores (20 rows; always 1-6 days late)
sports_records = {
    "user_id": [user_id] * 20,
    "total_days_from_input": np.random.randint(1, 10, 20),
    "amount_days_late": np.random.randint(1, 7, 20),
    "avg_mood_int": np.random.normal(4, 1, 20).round().astype(int),
    "avg_sleep_hours_int": np.random.normal(6.5, 1, 20).round().astype(int),
    "medication_taken": np.random.choice([0, 1], size=20, p=[0.7, 0.3]),
    "chore_type": [1] * 20  # 1 for 'sports'
}

# Generate data for "fun" activities that the user tends to postpone
# (20 rows; always 1-12 days late)
fun_records = {
    "user_id": [user_id] * 20,
    "total_days_from_input": np.random.randint(1, 10, 20),
    "amount_days_late": np.random.randint(1, 13, 20),  # Likely to be more late
    "avg_mood_int": np.random.normal(5, 0.5, 20).round().astype(int),
    "avg_sleep_hours_int": np.random.normal(7, 1, 20).round().astype(int),
    "medication_taken": np.random.choice([0, 1], size=20, p=[0.5, 0.5]),
    "chore_type": [2] * 20  # 2 for 'fun'
}

# Combine into a single DataFrame (home rows first, then sports, then fun)
home_df = pd.DataFrame(home_records)
sports_df = pd.DataFrame(sports_records)
fun_df = pd.DataFrame(fun_records)
df = pd.concat([home_df, sports_df, fun_df], ignore_index=True)

# Engineered feature: for every row, the number of late records (amount_days_late > 0)
# found among all rows that share its chore_type.
# NOTE(review): this value is constant within each chore_type, and chore_type is the
# target (y) below, so the feature may leak the label - confirm this is intended.
late_days_by_chore = df.groupby('chore_type')['amount_days_late']
df['count_times_late_in_this_chore'] = late_days_by_chore.transform(
    lambda days_late: (days_late > 0).sum()
)

# Model inputs (X) and target (y)
feature_columns = [
    'avg_mood_int',
    'avg_sleep_hours_int',
    'medication_taken',
    'amount_days_late',
    'count_times_late_in_this_chore',
]
X = df[feature_columns]
y = df['chore_type']
display(X.head())
# Hold out 30% of the rows for evaluation; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit a 100-tree random forest on the training portion
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out rows and score them against the true labels
predictions = clf.predict(X_test)
accuracy = accuracy_score(y_test, predictions)

# Per-feature importances, in the same order as the columns of X
feature_importances = clf.feature_importances_

print(f'Accuracy: {accuracy * 100:.2f}%')
print(f'Feature Importances: {feature_importances}')

Unnamed: 0,avg_mood_int,avg_sleep_hours_int,medication_taken,amount_days_late,count_times_late_in_this_chore
0,5,5,1,0,22
1,4,5,1,5,22
2,3,6,0,0,22
3,4,7,0,2,22
4,5,6,0,0,22


Accuracy: 92.31%
Feature Importances: [0.14631238 0.06836239 0.09314577 0.18894827 0.5032312 ]


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


# Hypothetical future task built from the per-column means of the dataset.
# Every mean is rounded to the nearest integer (np.round, ties to even).
# Previously only 'medication_taken' was rounded while the other four means were
# truncated by int(), which biased them downward (a mean of 6.9 became 6) and was
# inconsistent with how the integer feature columns are produced
# (.round().astype(int)).
feature_columns = [
    'avg_mood_int',
    'avg_sleep_hours_int',
    'medication_taken',
    'amount_days_late',
    'count_times_late_in_this_chore',
]
future_task = {
    column: [int(np.round(np.mean(df[column])))]
    for column in feature_columns
}
# Column order matches the order the classifier was fitted with.
future_task_df = pd.DataFrame(future_task)

# Predicting the next task type the user is likely to postpone
future_task_prediction = clf.predict(future_task_df)
chore_type_names = {0: 'home', 1: 'sports', 2: 'fun'}
predicted_chore_type = chore_type_names[future_task_prediction[0]]

print(f"The next task the user is likely to postpone is: {predicted_chore_type}")


The next task the user is likely to postpone is: sports



**Random Forest** is a highly compatible model for this task due to several of its characteristics that align well with the nature of the prediction problem at hand. Here's why Random Forest is a suitable choice:

**Handling of Categorical and Numerical Features**: Random Forest can handle a mix of feature types without extensive preprocessing such as scaling or normalization (categorical values only need a numeric encoding). In the given dataset, the features (mood, sleep hours, medication intake, and counts of late instances) differ in nature: integer ratings, numerical counts, and binary flags. This makes Random Forest an apt choice.

**Robustness to Overfitting**: Thanks to the ensemble learning approach, where multiple decision trees vote on the outcome, Random Forest is generally more robust to overfitting than an individual decision tree. This is especially beneficial when the dataset is not very large or when the data patterns are complex, as is likely the case when predicting behavior from mood, sleep, and other personal factors.

**Feature Importance**: Random Forest provides a straightforward way to understand which features are most influential in predicting the outcome. This insight is valuable for interpreting the model in the context of predicting chore postponement, as it can reveal how different factors (like mood or sleep hours) weigh in the decision-making process.

**Flexibility and Non-linearity:** Random Forest can capture non-linear relationships between features without the need for transforming the data. This capability is crucial for modeling human behavior, which is often non-linear and influenced by a complex interplay of factors.

**Versatility:** Random Forest can be used for both classification and regression tasks. In this case, predicting which type of chore (home, sports, or fun) the user is likely to postpone is a multi-class classification task that the model handles directly.

In summary, Random Forest's ability to handle a variety of data types, its robustness to overfitting, and its ease of use make it a compatible and often preferred choice for tasks involving the prediction of categorical outcomes based on a set of features that might exhibit complex relationships.

**Draft below (למטה טיוטה)**

**Data Structure Explanation**

**Tasks Input by User:** Each task input by the user must include:

  Tag (Category): The category or type of the task (e.g., home, sports, fun).

  Time Estimation: Estimated time required to complete the task.

  Deadline: The date by which the task should be completed.

  About: A description or additional details about the task.

**In-Progress Tasks:** Once entered, tasks are saved in an "in-progress" column as JSON objects. This column captures tasks that are currently active or not yet started.

**Task Completion:** When the user starts working on a task, they mark the "start time" and, upon completion, the "end time" (both timestamps are recorded automatically). Whether the task's deadline was met is also recorded.

**Done Tasks:** Tasks that have been completed are moved to a "done" column, which includes all the initial information plus:

  Deadline Met: Indicates whether the task was completed by the deadline.

  Start Time: When the user started the task.

  End Time: When the user finished the task.



In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Two example task lists, kept as JSON text so that each list fits in a single CSV field.
# Adjacent string literals are concatenated by Python, so the values are one string each.
in_progress_str = (
    '[{"tag": "home", "time_estimation": "2 hours", "deadline": "2024-03-25", "about": "Clean the house"}, '
    '{"tag": "fun", "time_estimation": "3 hours", "deadline": "2024-03-27", "about": "Watch a movie"}]'
)
done_str = (
    '[{"tag": "home", "time_estimation": "2 hours", "deadline": "2024-03-20", "about": "Laundry", '
    '"deadline_met": true, "start": "2024-03-20 09:00", "end": "2024-03-20 11:00"}, '
    '{"tag": "sports", "time_estimation": "1 hour", "deadline": "2024-03-21", "about": "Morning jog", '
    '"deadline_met": false, "start": "2024-03-21 08:00", "end": "2024-03-21 09:30"}]'
)

# Anonymous ID: "user_" followed by eight randomly chosen hex digits
hex_digits = list('0123456789ABCDEF')
user_id = "user_" + ''.join(np.random.choice(hex_digits, 8))

# One-row frame: the user's ID next to the two serialized task lists
df = pd.DataFrame(
    [[user_id, in_progress_str, done_str]],
    columns=['user_id', 'in_progress', 'done'],
)

# Output CSV path, relative to the current working directory
file_path = 'user_tasks.csv'

# Write the frame to that CSV file; index=False leaves the row index out of the file
df.to_csv(file_path, index=False)

# Render the frame as the cell output
display(df)

Unnamed: 0,user_id,in_progress,done
0,user_62454A20,"[{""tag"": ""home"", ""time_estimation"": ""2 hours"",...","[{""tag"": ""home"", ""time_estimation"": ""2 hours"",..."


In [None]:
import numpy as np
import pandas as pd
import json
from datetime import datetime, timedelta

np.random.seed(42)  # For reproducibility

tags = ["home", "bureaucracy", "studies", "work", "exercise", "fun", "health"]
num_users = 5  # Number of users to generate data for
tasks_per_user = 10  # Number of tasks to generate per user

# One timestamp for the whole run. Deadline, start and end are all derived from it,
# so they share a single reference time instead of three separate datetime.now() reads.
generated_at = datetime.now()

# Initialize a list to store user task data (one dict per user)
users_tasks_data = []

for user_index in range(num_users):
    # Generate an anonymous user ID: "user_" followed by eight random hex digits
    user_id = "user_" + ''.join(np.random.choice(list('0123456789ABCDEF'), 8))

    # Tasks are keyed by their index within the user; each task lands in exactly one dict
    in_progress_tasks_dict = {}
    done_tasks_dict = {}

    for task_index in range(tasks_per_user):
        # All random draws are made unconditionally and in a fixed order so the seeded
        # stream (and therefore the generated user IDs) does not depend on task status.
        tag = np.random.choice(tags)
        time_estimation = f"{np.random.randint(1, 4)} hours"
        deadline = (generated_at + timedelta(days=np.random.randint(1, 10))).strftime("%Y-%m-%d")
        about = f"Task description for {tag}"
        deadline_met = bool(np.random.choice([True, False]))  # Convert NumPy boolean to Python boolean
        start = generated_at.strftime("%Y-%m-%d %H:%M")
        end = (generated_at + timedelta(hours=np.random.randint(1, 4))).strftime("%Y-%m-%d %H:%M")
        in_progress_or_done = np.random.choice(["in_progress", "done"])

        # Fields every task carries, whatever its status
        task = {
            "tag": tag,
            "time_estimation": time_estimation,
            "deadline": deadline,
            "about": about
        }

        if in_progress_or_done == "done":
            # Completion fields belong to finished tasks only. They used to be attached
            # to in-progress tasks as well, which contradicted the data-structure notes
            # and the hand-written sample earlier in this notebook.
            task["deadline_met"] = deadline_met  # Python boolean, JSON-serializable
            task["start"] = start
            task["end"] = end
            done_tasks_dict[task_index] = task
        else:
            in_progress_tasks_dict[task_index] = task

    # Convert task dictionaries to JSON strings for each user (integer keys become strings)
    in_progress_str = json.dumps(in_progress_tasks_dict)
    done_str = json.dumps(done_tasks_dict)

    # Append the user's tasks to the users_tasks_data list
    users_tasks_data.append({"user_id": user_id, "in_progress": in_progress_str, "done": done_str})

# Convert the users tasks data to a DataFrame (one row per user)
df = pd.DataFrame(users_tasks_data)

# Render the per-user task frame as the cell output
display(df)


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.



Unnamed: 0,user_id,in_progress,done
0,user_63CEA7C4,"{""0"": {""tag"": ""health"", ""time_estimation"": ""2 ...","{""1"": {""tag"": ""exercise"", ""time_estimation"": ""..."
1,user_31D55935,"{""4"": {""tag"": ""work"", ""time_estimation"": ""1 ho...","{""0"": {""tag"": ""exercise"", ""time_estimation"": ""..."
2,user_75A2024E,"{""0"": {""tag"": ""fun"", ""time_estimation"": ""3 hou...","{""5"": {""tag"": ""studies"", ""time_estimation"": ""2..."
3,user_7A8D4029,"{""1"": {""tag"": ""home"", ""time_estimation"": ""2 ho...","{""0"": {""tag"": ""work"", ""time_estimation"": ""3 ho..."
4,user_3BA3292E,"{""5"": {""tag"": ""exercise"", ""time_estimation"": ""...","{""0"": {""tag"": ""studies"", ""time_estimation"": ""3..."


In [None]:
# Build one synthetic daily record (mood, sleep, medication) per user per day.
# Expects users_tasks_data to be a list of dicts that each carry a 'user_id' key.

# Number of consecutive days generated per user.
# `num_days` was read below without ever being assigned in this notebook, so the cell
# raised NameError on a fresh kernel.
# NOTE(review): 30 is an assumption based on the recorded output, which runs from
# 2024-01-01 into late January - confirm the intended value.
num_days = 30

# Extract user_ids from the list of dictionaries
user_ids = [task_data['user_id'] for task_data in users_tasks_data]

# np.unique de-duplicates and sorts the IDs
unique_user_ids = np.unique(user_ids)

# Continuing with the original task, generating daily inputs
daily_data = []
dates = pd.date_range(start="2024-01-01", periods=num_days, freq='D')

for user_id in unique_user_ids:
    # Loop variable is `entry_date`, not `date`, so it cannot shadow datetime.date
    for entry_date in dates:
        mood = np.random.randint(1, 6)  # Assuming mood is rated on a scale of 1 to 5
        sleep_hours = np.random.normal(7, 2)  # Assuming average sleep hours is 7 with some variability
        took_adhd_medication = bool(np.random.choice([True, False]))

        daily_entry = {
            "user_id": user_id,
            "date": entry_date,
            "mood": mood,
            "sleep_hours": round(sleep_hours, 1),
            "took_adhd_medication": took_adhd_medication
        }

        daily_data.append(daily_entry)

# Convert the daily data into a DataFrame indexed by (user_id, date).
# The columns are named explicitly so an empty user list still yields a valid frame,
# and set_index returns a new frame instead of mutating in place.
daily_df = pd.DataFrame(
    daily_data,
    columns=["user_id", "date", "mood", "sleep_hours", "took_adhd_medication"],
).set_index(['user_id', 'date'])

# Last expression of the cell: displays the frame
daily_df

Unnamed: 0_level_0,Unnamed: 1_level_0,mood,sleep_hours,took_adhd_medication
user_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
user_31D55935,2024-01-01,2,9.1,True
user_31D55935,2024-01-02,4,9.7,True
user_31D55935,2024-01-03,4,5.5,False
user_31D55935,2024-01-04,3,4.7,True
user_31D55935,2024-01-05,1,8.3,False
...,...,...,...,...
user_7A8D4029,2024-01-26,4,5.1,False
user_7A8D4029,2024-01-27,1,6.9,True
user_7A8D4029,2024-01-28,2,8.9,True
user_7A8D4029,2024-01-29,2,8.2,False


In [7]:
from datetime import date,datetime, timedelta
import numpy as np

# Capture the current local date and time as a datetime object
current_datetime = datetime.now()

# Bare expression as the last line: the notebook shows its value as the cell output
current_datetime

datetime.datetime(2024, 4, 2, 11, 39, 10, 179946)

In [2]:
# Pick a due date 1-9 days after the current moment and keep only its calendar date as text
days_until_due = np.random.randint(1, 10)
due_moment = datetime.now() + timedelta(days=days_until_due)
deadline = due_moment.strftime("%Y-%m-%d")
deadline

'2024-04-08'

In [3]:
# Take a timestamp of the current local time and print it with a label
now = datetime.now()

print("now =", now)

now = 2024-04-02 11:25:48.589935


In [5]:
# Inspect the runtime type of `now` (assigned in an earlier cell)
type(now)

datetime.datetime

In [10]:
# Rebind `deadline` to today's calendar date (a date object with no time-of-day part)
deadline = date.today()
print("today =", deadline)

today = 2024-04-02


In [14]:
# Attach the latest representable time of day to the deadline's date, giving the
# last instant of that day as a full datetime
end_of_day = datetime.max.time()
deadline_datetime = datetime.combine(deadline, end_of_day)

In [15]:
# Bare expression: the notebook shows the repr of deadline_datetime as the cell output
deadline_datetime

datetime.datetime(2024, 4, 2, 23, 59, 59, 999999)

In [16]:
# Print the same value in its human-readable str() form, with a label
print("deadline_datetime =", deadline_datetime)

deadline_datetime = 2024-04-02 23:59:59.999999
