In [1]:
from pathlib import Path
from functools import reduce
import pandas as pd
import numpy as np
import os
from datetime import datetime

from nanoHUB.application import Application
from nanoHUB.rfm.model import LastUpdateRecord, TempUserDescriptors, UserDescriptors

from sqlalchemy import select
from sqlalchemy.orm import Session

application = Application.get_instance()

[1mnanoHUB - Serving Students, Researchers & Instructors[0m


In [2]:
CACHE_DIR = Path(Path(os.getenv('APP_DIR')), '.cache')
NANOHUB_CACHE = Path(CACHE_DIR, 'nanohub')
NANOHUB_METRICS_CACHE = Path(CACHE_DIR, 'nanohub_metrics')

In [3]:
df = pd.read_parquet(Path(NANOHUB_METRICS_CACHE, 'toolevents'))

In [4]:
dfc = df
display(dfc)

Unnamed: 0,entryID,source,job,superjob,sessnum,event,start,finish,user,tool,walltime,cputime,latitude,longitude,city,region,countryLong,countryShort,datetime
0,1,from toolstart,4904,0,0,Schred,2002-07-03 04:53:38,2002-07-03 04:53:56,aless,Schred,18.0000,-1.000000,44.82678,11.62071,Ferrara,Emilia-Romagna,Italy,IT,2002-07-03 04:53:56
1,2,from toolstart,4905,0,0,Schred,2002-07-03 04:56:33,2002-07-03 04:56:50,aless,Schred,17.0000,-1.000000,44.82678,11.62071,Ferrara,Emilia-Romagna,Italy,IT,2002-07-03 04:56:50
2,3,from toolstart,4906,0,0,Schred,2002-07-03 05:00:29,2002-07-03 05:00:59,aless,Schred,30.0000,-1.000000,44.82678,11.62071,Ferrara,Emilia-Romagna,Italy,IT,2002-07-03 05:00:59
3,4,from toolstart,4907,0,0,Schred,2002-07-03 05:11:47,2002-07-03 05:12:32,aless,Schred,45.0000,-1.000000,44.82678,11.62071,Ferrara,Emilia-Romagna,Italy,IT,2002-07-03 05:12:32
4,5,from toolstart,4908,0,0,Schred,2002-07-03 05:13:50,2002-07-03 05:14:10,aless,Schred,20.0000,-1.000000,44.82678,11.62071,Ferrara,Emilia-Romagna,Italy,IT,2002-07-03 05:14:10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200150852,200399647,from joblog,0,0,1873253,application,2021-08-06 01:04:05,2021-08-07 01:04:16,sachinkumarsaid01,mos_r14,6.0000,26.340000,52.18446,-0.68759,Warrington,England,United Kingdom,GB,2021-08-07 01:04:16
200150853,200399648,from joblog,8424134,0,1861987,[waiting],2021-08-07 01:03:10,2021-08-07 01:05:36,gridstat,,50.0000,0.000000,32.87610,-117.23180,La Jolla,California,United States,US,2021-08-07 01:05:36
200150854,200399649,from joblog,8424134,0,1861987,/probercacsite.sh,2021-08-07 01:03:10,2021-08-07 01:07:55,gridstat,,0.0000,0.020000,32.87610,-117.23180,La Jolla,California,United States,US,2021-08-07 01:07:55
200150855,200399650,from joblog,8422269,0,1873260,start_jupyter,2021-08-06 01:07:26,2021-08-07 01:09:13,sachinkumarsaid01,mos_r14,86507.4805,33.746471,52.18446,-0.68759,Warrington,England,United Kingdom,GB,2021-08-07 01:09:13


In [5]:
def filter_nulls(df: pd.DataFrame, col_name: str) -> pd.DataFrame:
    df[col_name] = df[col_name].str.strip()
    df = df[~df[col_name].isna()]
    return df[df[col_name] != '']


def get_data_for_user(username: str, df: pd.DataFrame):
    return df.loc[df['user'] == username]


def get_data_for_users(usernames: [], df: pd.DataFrame):
    return df.loc[df['user'].isin(usernames)]


class Merger:
    def __init__(self):
        self.dataframes = []
        
    def add_dataframe(self, df: pd.DataFrame):
        self.dataframes.append(df)
        
    def merge(self) -> pd.DataFrame:
        return reduce(lambda left,right: pd.merge(left, right, on=['user'], how='outer'), self.dataframes) 

In [6]:
dfc['start_datetime'] = pd.to_datetime(dfc['start'])
dfc['start_dates'] = dfc['start_datetime'].dt.date

df_grouped_users = dfc.groupby(['user'])

In [7]:
df_start_max_min = df_grouped_users.agg(
    first_start_date=('start', np.min), last_start_date=('start', np.max)
).reset_index()

df_finish_max_min = df_grouped_users.agg(
    first_finish_date=('finish', np.min), last_finish_date=('finish', np.max)
).reset_index()

df_jobs = df_grouped_users.agg(
    job_count=('job', 'count')
).reset_index()

In [8]:
df_count_event_days = df_grouped_users.agg(
    job_event_days=('start_dates', 'nunique')
).reset_index()

In [9]:
df_nonzero_superjobs = dfc[dfc['superjob']!= 0]

In [10]:
df_superjobs_grouped_users = df_nonzero_superjobs.groupby(['user'])

In [11]:
df_superjobs_count = df_superjobs_grouped_users.agg(
    superjob_count=('superjob', 'nunique')
).reset_index()

In [12]:
df_superjobs_days = df_nonzero_superjobs.groupby(['user', 'superjob']).first()
df_superjobs_days_groupedby_user = df_superjobs_days.groupby(['user'])

df_count_superjobs_days = df_superjobs_days_groupedby_user.agg(
    superjob_event_days=('start_dates', 'nunique')
).reset_index()
display(df_count_superjobs_days)

Unnamed: 0,user,superjob_event_days
0,02junho,2
1,082080112abk,10
2,0kt0pus.dr34m,1
3,14zac2,2
4,150_cm,1
...,...,...
9659,zzaa,1
9660,zzeng,4
9661,zzhan163,1
9662,zzhiheng,1


In [13]:
df_superjobs_last_start_date = df_superjobs_days_groupedby_user['start'].last().reset_index()
df_superjobs_last_start_date.columns = ['user', 'last_superjob_start_date']
display(df_superjobs_last_start_date)

Unnamed: 0,user,last_superjob_start_date
0,02junho,2018-08-04 10:01:18
1,082080112abk,2011-12-06 23:32:21
2,0kt0pus.dr34m,2020-12-25 10:13:56
3,14zac2,2021-07-19 06:09:53
4,150_cm,2018-11-24 21:53:01
...,...,...
9659,zzaa,2009-05-06 06:57:21
9660,zzeng,2017-10-11 10:03:17
9661,zzhan163,2020-04-20 12:36:06
9662,zzhiheng,2016-10-31 17:07:02


In [14]:
merger = Merger()

merger.add_dataframe(df_start_max_min)
merger.add_dataframe(df_finish_max_min)
merger.add_dataframe(df_jobs)
merger.add_dataframe(df_superjobs_count)
merger.add_dataframe(df_count_event_days)
merger.add_dataframe(df_count_superjobs_days)
merger.add_dataframe(df_superjobs_last_start_date)

df_merged = merger.merge()

In [15]:
df_merged['toolevents__lifetime'] = (df_merged['last_finish_date'] - df_merged['first_start_date']).dt.days
df_merged.loc[df_merged.toolevents__lifetime == 0, 'toolevents__lifetime'] = 1
display(df_merged)

Unnamed: 0,user,first_start_date,last_start_date,first_finish_date,last_finish_date,job_count,superjob_count,job_event_days,superjob_event_days,last_superjob_start_date,toolevents__lifetime
0,,2011-11-22 09:44:56,2013-05-28 15:46:14,2005-11-11 00:00:00,2013-05-28 15:46:17,885,,10,,NaT,553
1,0.yao.yuan,2017-03-08 10:15:23,2017-03-09 05:09:22,2017-03-08 10:16:34,2017-03-09 07:22:59,15,,2,,NaT,1
2,008dilip,2016-09-28 20:27:13,2016-10-12 23:50:56,2016-09-28 20:32:31,2016-10-13 00:08:08,14,,5,,NaT,14
3,00ff,2013-11-19 00:53:23,2013-11-19 00:58:56,2013-11-19 01:01:19,2013-11-19 01:04:21,2,,1,,NaT,1
4,00thamizharasi00,2017-03-20 09:46:36,2017-03-20 09:46:36,2017-03-20 09:46:37,2017-03-20 09:46:37,1,,1,,NaT,1
...,...,...,...,...,...,...,...,...,...,...,...
155302,zzz121243,2021-05-04 19:04:27,2021-05-04 19:21:22,2021-05-04 19:18:31,2021-05-05 19:37:43,17,,1,,NaT,1
155303,zzz1ttt,2015-10-21 21:42:30,2015-12-17 06:47:37,2015-10-21 21:44:26,2015-12-17 09:14:10,60,,6,,NaT,56
155304,zzz777,2016-08-08 15:35:26,2016-08-08 16:05:25,2016-08-08 15:35:56,2016-08-08 16:07:32,9,,1,,NaT,1
155305,zzzstas,2016-07-13 00:40:03,2016-07-13 06:33:44,2016-07-13 00:53:35,2016-07-13 08:23:48,40,,1,,NaT,1


In [16]:
df1 = df_merged[
    (df_merged.job_count != df_merged.superjob_count)
    & (df_superjobs_count.superjob_count != 'nan')
]
display(df1)

Unnamed: 0,user,first_start_date,last_start_date,first_finish_date,last_finish_date,job_count,superjob_count,job_event_days,superjob_event_days,last_superjob_start_date,toolevents__lifetime
0,,2011-11-22 09:44:56,2013-05-28 15:46:14,2005-11-11 00:00:00,2013-05-28 15:46:17,885,,10,,NaT,553
1,0.yao.yuan,2017-03-08 10:15:23,2017-03-09 05:09:22,2017-03-08 10:16:34,2017-03-09 07:22:59,15,,2,,NaT,1
2,008dilip,2016-09-28 20:27:13,2016-10-12 23:50:56,2016-09-28 20:32:31,2016-10-13 00:08:08,14,,5,,NaT,14
3,00ff,2013-11-19 00:53:23,2013-11-19 00:58:56,2013-11-19 01:01:19,2013-11-19 01:04:21,2,,1,,NaT,1
4,00thamizharasi00,2017-03-20 09:46:36,2017-03-20 09:46:36,2017-03-20 09:46:37,2017-03-20 09:46:37,1,,1,,NaT,1
...,...,...,...,...,...,...,...,...,...,...,...
9659,am2235,2020-12-08 12:03:20,2020-12-08 12:03:27,2020-12-08 12:06:21,2020-12-08 12:06:26,2,,1,,NaT,1
9660,am3107,2013-10-22 19:09:13,2013-12-05 00:35:25,2013-10-22 19:09:13,2013-12-05 01:09:38,35,,6,,NaT,43
9661,am5080,2020-09-18 08:46:26,2020-11-25 16:10:59,2020-09-18 09:01:48,2020-11-26 17:08:45,129,,8,,NaT,69
9662,am624138,2020-02-10 21:00:00,2020-02-10 21:00:00,2020-02-11 21:03:41,2020-02-11 21:03:41,230,,1,,NaT,1


In [17]:
users = ['ssahmed', 'k_dadesh']
display(get_data_for_users(users, df_merged))

Unnamed: 0,user,first_start_date,last_start_date,first_finish_date,last_finish_date,job_count,superjob_count,job_event_days,superjob_event_days,last_superjob_start_date,toolevents__lifetime
70032,k_dadesh,2000-07-05 02:46:44,2001-06-06 04:58:00,2000-07-05 02:46:51,2001-06-06 04:58:12,292,,41,,NaT,336
131665,ssahmed,2005-08-22 13:51:19,2021-07-09 12:59:57,2005-08-22 13:51:18,2021-07-12 07:47:35,8580,69.0,818,33.0,2021-04-01 21:06:08,5802


In [18]:
display(len(dfc[dfc.user == 'ssahmed']))

8580

In [19]:
file_name = 'toolevents_rfm.csv'

df_merged.to_csv(Path(CACHE_DIR, file_name))

In [None]:
%%capture 

bucket_name = 'nanohub_processed_data'
full_path = 'gs://%s/%s' % (bucket_name, file_name)
df_merged.to_csv(full_path, storage_options={"token": "nanohub-320518-a9f4878b9ea2.json"})