Disclaimer: some of the code here reuses code Nathan submitted for CMPT 353, Excercise 11

In [None]:
import pandas as pd
import numpy as np
import sqlite3
from datetime import datetime, timezone, timedelta
from custom_process_domain import process_domain_normal
import matplotlib.pyplot as plt
from zoneinfo import ZoneInfo


from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR



## User Input

In [None]:
POSSIBLE_USERS = { # Just to make sure we're good
  "Juan": "Juan",
  "Nathan": "Nathan",
  "Sanyam": "Sanyam",
  "Nipun": "Nipun",
}

# User Input
USER = POSSIBLE_USERS["Nathan"]

# START_DATE
START_DATE_YEAR = 2022
START_DATE_MONTH = 9
START_DATE_DAY = 26

WEEKS_TO_INCREMENT = -1 # Add WEEKS_TO_INCREMENT * 7 days to START_TIME
DATE_INTERVAL_TYPE = 'month' # 'weekend', 'week', 'all_week', 'all_time

##############################
# Graphs
##############################

# Save Graphs
SAVE_GRAPH_1 = False
SAVE_GRAPH_2 = False
SAVE_GRAPH_3 = True

##############################
# Options
##############################
MIN_CHROME_VISIT_DURATION = 5
MAX_CHROME_VISIT_DURATION = 2.88e+10 # 8 hours in microseconds

MIN_SAFARI_SCORE = 5

### Config per User

In [None]:
if USER == "Nathan":
  DB_FILE = "../data/history--2022-10-21--Nathan-Tsai.sqlite3"
  RATING_FILE = "../rated-output/Ratings - CMPT 353 - Nathan Tsai (1).csv"
  USER_TIMEZONE_STRING = 'US/Pacific'
elif USER == "Juan":
  DB_FILE = "../data/history--2022-10-21--Juan-Gonzalez.sqlite3"
  RATING_FILE = "../rated-output/Ratings - CMPT 353 - Juan Gonzalez.csv"
  USER_TIMEZONE_STRING = 'US/Pacific'
elif USER == "Sanyam":
  DB_FILE = "../data/history--2022-11-25--Sanyam-Safari.db"
  RATING_FILE = "../rated-output/Ratings of Inputted Users - CMPT 353 - Sanyam (2).csv"
  USER_TIMEZONE_STRING = 'US/Pacific'
elif USER == "Nipun":
  DB_FILE = "../data/history--2022-11-26--Nipun-Safari.db"
  RATING_FILE = "../rated-output/Ratings of Inputted Users - CMPT 353 - Nipun.csv"
  USER_TIMEZONE_STRING = 'Asia/Colombo' # India standard time: UTC+5:30
else:
  assert(False and "User name is invalid")

USER_TIMEZONE = ZoneInfo(USER_TIMEZONE_STRING)
TIMEZONE_STRING = USER_TIMEZONE_STRING


START_TIME = datetime(
  START_DATE_YEAR,
  START_DATE_MONTH,
  START_DATE_DAY, 
  tzinfo=USER_TIMEZONE
) # the data to look at. Should be a Monday

# The values per user productivity
productivity_to_int_map = {
  'Always Distracted' : -2,
  'Mostly Distracted' : -1,
  'Neutrel' : 0,
  'Mostly Intentional' : 1,
  'Always Intentional' : 2,
}

## Archive

### Format User Input for Graphs / Files

In [None]:
# Increment by a number of weeks
START_TIME = START_TIME + timedelta(days=7 * WEEKS_TO_INCREMENT)

# Add 1 day if weekend, otherwise use weekday
date_types = {
  'weekend': 1,
  'week': 4,
  'all_week': 6,
  'month': 30,
  'all_time': 7 * 52 * 3, # 3 years
}
DAYS_TO_ADD = date_types[DATE_INTERVAL_TYPE]

if DATE_INTERVAL_TYPE == 'weekend':
  START_TIME = START_TIME + timedelta(days=-2)

END_TIME = START_TIME + timedelta(days=DAYS_TO_ADD)


In [None]:
if DATE_INTERVAL_TYPE == 'all_time':
  start_date_for_graph = START_TIME.strftime('%a, %b %d %Y')
else:
  start_date_for_graph = START_TIME.strftime('%a, %b %d')

  
end_date_for_graph = END_TIME.strftime('%a, %b %d, %Y')
time_for_graph = "{} - {}".format(start_date_for_graph, end_date_for_graph)
CUSTOM_PARAMS_FOR_GRAPH = "{}: {}".format(USER, time_for_graph)

start_date_for_file = START_TIME.strftime('%b-%d-%Y')
end_date_for_file = END_TIME.strftime('%b-%d-%Y')
time_for_file = "{}--{}".format(start_date_for_file, end_date_for_file)
CUSTOM_PARAMS_FOR_FILE = "{}--{}".format(USER, time_for_file)

In [None]:
# Constants
WINDOWS_EPOCH_MICROSECS      = -11644473600000 * 1000
SAFARI_TIME_UPDATE           = 978307200

In [None]:
# Process the type of Database
TYPE_IS_SAFARI = "TYPE_IS_SAFARI"
TYPE_IS_CHROME = "TYPE_IS_CHROME"
TYPE_OF_DB = {
  "db": TYPE_IS_SAFARI,
  "sqlite": TYPE_IS_CHROME,
  "sqlite3": TYPE_IS_CHROME,
}

extension = DB_FILE.split(".")[-1]
DB_TYPE = TYPE_OF_DB[extension]

### Helpful Functions

In [None]:
NUMBER_OF_INCREMENTS = 24 * 2
DAY_IN_SECONDS = 60 * 60 * 24
ROUND_TO = DAY_IN_SECONDS / NUMBER_OF_INCREMENTS

def get_half_hour(data: pd.Series) -> pd.Series:
  return (data / ROUND_TO).round().astype(np.int32)

### Bunch of Processing

#### Load History Database

In [None]:
if DB_TYPE == TYPE_IS_CHROME:
  query_get_urls_and_times = """
  SELECT v.id, v.visit_time, v.visit_duration, u.url
  FROM 'visits' as v 
  LEFT JOIN urls u ON u.id = v.url
  """

  TIME_CORRECTION_TO_ADD_TO_VISIT_TIME = WINDOWS_EPOCH_MICROSECS
elif DB_TYPE == TYPE_IS_SAFARI:
  query_get_urls_and_times = """
  SELECT v.id, v.visit_time, v.score, u.url
  FROM 'history_visits' as v 
  LEFT JOIN history_items u ON u.id = v.history_item
  """
  TIME_CORRECTION_TO_ADD_TO_VISIT_TIME = SAFARI_TIME_UPDATE
else:
  assert(False and "Extension of database is invalid")

In [None]:
with sqlite3.connect(DB_FILE) as con:
  visits = pd.read_sql_query(query_get_urls_and_times, con)

visits.head()

#### Process the time

In [None]:
# Visit time in microseconds (s/1,000,000)
# https://chromium.googlesource.com/chromium/src/+/lkgr/base/time/time.h

if DB_TYPE == TYPE_IS_CHROME:
  visit_time_in_ns = (visits['visit_time'] + TIME_CORRECTION_TO_ADD_TO_VISIT_TIME) * 1000
  visits['visit_time_epoch'] = pd.to_datetime(visit_time_in_ns, unit='ns', utc=True).map(lambda x: x.tz_convert(TIMEZONE_STRING))
elif DB_TYPE == TYPE_IS_SAFARI:
  visit_time_in_ns = (visits['visit_time'] + TIME_CORRECTION_TO_ADD_TO_VISIT_TIME)
  visits['visit_time_epoch'] = pd.to_datetime(visit_time_in_ns, unit='s', utc=True).map(lambda x: x.tz_convert(TIMEZONE_STRING))
else:
  assert(False and "Extension of database is invalid")

#### Process the domains

In [None]:
visits['domain'] = visits['url'].apply(process_domain_normal)

visits

In [None]:
visits.tail(n=10)

#### Load the Ratings

In [None]:
ratings = pd.read_csv(RATING_FILE, index_col='domain')
ratings.head()

#### Filter the Data based on Time + Visit Duration/Score

In [None]:
greaterthanStartTime = visits['visit_time_epoch'] >= START_TIME
lessThanEndTime = visits['visit_time_epoch'] <= END_TIME
visits_this_semester = visits[greaterthanStartTime & lessThanEndTime]

In [None]:
if 'visit_duration' in visits_this_semester.columns:
  min_visit_option = visits_this_semester[visits_this_semester['visit_duration'] >= MIN_CHROME_VISIT_DURATION]
  min_visit_option = min_visit_option[min_visit_option['visit_duration'] < MAX_CHROME_VISIT_DURATION]
elif 'score' in visits_this_semester.columns:
  min_visit_option = visits_this_semester[visits_this_semester['score'] >= MIN_SAFARI_SCORE]
else:
  min_visit_option = visits_this_semester

In [None]:

final_data = min_visit_option.join(ratings[['manual_rating']], on="domain", how='left')
final_data['productivity_scale'] = final_data['manual_rating'].map(productivity_to_int_map)
final_data['time_of_day'] = final_data['visit_time_epoch'].dt.hour * 60 * 60 + final_data['visit_time_epoch'].dt.minute * 60 + final_data['visit_time_epoch'].dt.second
final_data.head()

#### Fill empty productivity scores with 0

In [None]:
# Remove N/A
refined_data = final_data.copy()
refined_data['productivity_scale'] = refined_data['productivity_scale'].fillna(0)

#### Account for Visit Duration

In [None]:
HALF_HOUR = 60 * 30

refined_data['adjusted_time'] = refined_data[['visit_duration', 'visit_time_epoch']].apply(
  lambda x: np.arange(0, x['visit_duration'] // 1e6, HALF_HOUR) + x['visit_time_epoch'].value // 1e9, axis=1)

adjusted_data = refined_data.explode('adjusted_time')
adjusted_data.dropna(subset=['adjusted_time'], inplace=True)


In [None]:
adjusted_data['adjusted_datetime'] = pd.to_datetime(adjusted_data['adjusted_time'], unit='s', utc=True).map(lambda x: x.tz_convert(TIMEZONE_STRING))

adjusted_data['time_of_day'] = adjusted_data['adjusted_datetime'].dt.hour * 60 * 60 \
  + adjusted_data['adjusted_datetime'].dt.minute * 60 \
  + adjusted_data['adjusted_datetime'].dt.second

adjusted_data['half_hour'] = get_half_hour(adjusted_data['time_of_day'])

## Linear Regression

In [None]:
adjusted_data['adjusted_prod'] = adjusted_data['productivity_scale']
positive_count = adjusted_data[adjusted_data['productivity_scale'] > 0].size
adjusted_data.loc[adjusted_data['productivity_scale'] > 0, 'adjusted_prod'] /= positive_count

negative_count = adjusted_data[adjusted_data['productivity_scale'] < 0].size
adjusted_data.loc[adjusted_data['productivity_scale'] < 0, 'adjusted_prod'] /= negative_count

# adjusted_data_agg = adjusted_data.groupby('half_hour').agg(sum).reset_index()


In [None]:
X = adjusted_data[['half_hour']]
y = adjusted_data['adjusted_prod']

X_train, X_valid, y_train, y_valid = train_test_split(X, y)


In [None]:
models_pre = {}

models_pre['kneighbours_model'] = make_pipeline(
    StandardScaler(),
    KNeighborsRegressor(n_neighbors=10)
)

models_pre["rf_model_12_20"] = make_pipeline(
    StandardScaler(),
    RandomForestRegressor(
    ),
)


In [None]:
models = pd.DataFrame({
  'name': models_pre.keys(),
  'model': models_pre.values(),
})

models['model_trained'] = models['model'].apply(lambda x : x.fit(X_train, y_train))

In [25]:
def print_model_scores(row):
  print(row['name'])
  score_train = row['model_trained'].score(X_train, y_train)
  score_val = row['model_trained'].score(X_valid, y_valid)

  return (score_train, score_val)

models['score'] = models.apply(print_model_scores, axis=1)
models

kneighbours_model
rf_model_12_20


Unnamed: 0,name,model,model_trained,score
0,kneighbours_model,"(StandardScaler(), KNeighborsRegressor(n_neigh...","(StandardScaler(), KNeighborsRegressor(n_neigh...","(1.0, 1.0)"
1,rf_model_12_20,"(StandardScaler(), (DecisionTreeRegressor(max_...","(StandardScaler(), (DecisionTreeRegressor(max_...","(1.0, 1.0)"


In [26]:
x_data = np.arange(0, 48)

x_df = pd.DataFrame({
  "half_hour": x_data
})
y_data = models.loc[2, 'model_trained'].predict(x_df)

KeyError: 2

In [None]:
plt.figure(figsize=(12, 8))
plt.plot(x_df['half_hour'], y_data, 'r-')
plt.plot(x_df['half_hour'], y_data, 'r.')
plt.axhline(y=0, color='b', linestyle='--', alpha=0.5)
plt.xticks(np.arange(0, NUMBER_OF_INCREMENTS, NUMBER_OF_INCREMENTS/24), np.arange(0, 24, step=1)) # Half Hours
plt.title("Estimated productivity using Random Forest: \n ({})".format(CUSTOM_PARAMS_FOR_GRAPH))
plt.xlabel("Time (Hour)")
plt.ylabel("Sum of Productivity Level (-1 for Distracted. -2 for Very Distracted)")

if SAVE_GRAPH_3:
  plt.savefig("../plots/XX-ml--SOME-MODEL-predictions--{}".format(CUSTOM_PARAMS_FOR_FILE))