In [1]:
from pathlib import Path
from functools import reduce
import pandas as pd
import numpy as np
import os
from datetime import datetime

from nanoHUB.application import Application
from nanoHUB.rfm.model import LastUpdateRecord, TempUserDescriptors, UserDescriptors

from sqlalchemy import select
from sqlalchemy.orm import Session

# Obtain the nanoHUB Application instance. In the saved run this cell also
# printed the "nanoHUB - Serving Students, Researchers & Instructors" banner.
# NOTE(review): `application` is not referenced by any other cell visible in
# this notebook -- confirm whether the call is needed for its side effects.
application = Application.get_instance()

[1mnanoHUB - Serving Students, Researchers & Instructors[0m


In [2]:
# Cache locations, all rooted at $APP_DIR/.cache.
# APP_DIR must be set in the environment: os.getenv returns None otherwise
# and Path(None) raises TypeError.
CACHE_DIR = Path(os.getenv('APP_DIR')) / '.cache'
NANOHUB_CACHE = CACHE_DIR / 'nanohub'
NANOHUB_METRICS_CACHE = CACHE_DIR / 'nanohub_metrics'

In [3]:
# Load the cached tool-event records from the metrics cache.
df = pd.read_parquet(NANOHUB_METRICS_CACHE / 'toolevents')

In [4]:
# Preview the loaded tool-event frame (the rendered output is truncated to
# the first and last rows).
display(df)

Unnamed: 0,entryID,source,job,superjob,sessnum,event,start,finish,user,tool,walltime,cputime,latitude,longitude,city,region,countryLong,countryShort,datetime
0,1,from toolstart,4904,0,0,Schred,2002-07-03 04:53:38,2002-07-03 04:53:56,aless,Schred,18.0000,-1.000000,44.82678,11.62071,Ferrara,Emilia-Romagna,Italy,IT,2002-07-03 04:53:56
1,2,from toolstart,4905,0,0,Schred,2002-07-03 04:56:33,2002-07-03 04:56:50,aless,Schred,17.0000,-1.000000,44.82678,11.62071,Ferrara,Emilia-Romagna,Italy,IT,2002-07-03 04:56:50
2,3,from toolstart,4906,0,0,Schred,2002-07-03 05:00:29,2002-07-03 05:00:59,aless,Schred,30.0000,-1.000000,44.82678,11.62071,Ferrara,Emilia-Romagna,Italy,IT,2002-07-03 05:00:59
3,4,from toolstart,4907,0,0,Schred,2002-07-03 05:11:47,2002-07-03 05:12:32,aless,Schred,45.0000,-1.000000,44.82678,11.62071,Ferrara,Emilia-Romagna,Italy,IT,2002-07-03 05:12:32
4,5,from toolstart,4908,0,0,Schred,2002-07-03 05:13:50,2002-07-03 05:14:10,aless,Schred,20.0000,-1.000000,44.82678,11.62071,Ferrara,Emilia-Romagna,Italy,IT,2002-07-03 05:14:10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200150852,200399647,from joblog,0,0,1873253,application,2021-08-06 01:04:05,2021-08-07 01:04:16,sachinkumarsaid01,mos_r14,6.0000,26.340000,52.18446,-0.68759,Warrington,England,United Kingdom,GB,2021-08-07 01:04:16
200150853,200399648,from joblog,8424134,0,1861987,[waiting],2021-08-07 01:03:10,2021-08-07 01:05:36,gridstat,,50.0000,0.000000,32.87610,-117.23180,La Jolla,California,United States,US,2021-08-07 01:05:36
200150854,200399649,from joblog,8424134,0,1861987,/probercacsite.sh,2021-08-07 01:03:10,2021-08-07 01:07:55,gridstat,,0.0000,0.020000,32.87610,-117.23180,La Jolla,California,United States,US,2021-08-07 01:07:55
200150855,200399650,from joblog,8422269,0,1873260,start_jupyter,2021-08-06 01:07:26,2021-08-07 01:09:13,sachinkumarsaid01,mos_r14,86507.4805,33.746471,52.18446,-0.68759,Warrington,England,United Kingdom,GB,2021-08-07 01:09:13


In [61]:
def filter_nulls(df: pd.DataFrame, col_name: str) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``col_name`` value is non-null and non-blank.

    Values in ``col_name`` are whitespace-stripped in the returned frame.
    The caller's frame is left untouched: the stripped values are computed on
    a derived Series instead of being written back into ``df``.

    Args:
        df: Frame to filter. ``df[col_name]`` must support the ``.str`` accessor.
        col_name: Name of the string column to clean and filter on.

    Returns:
        A new DataFrame with the original index labels of the surviving rows.
    """
    stripped = df[col_name].str.strip()
    # Keep rows that are neither missing nor empty once whitespace is removed.
    keep = stripped.notna() & (stripped != '')
    return df.loc[keep].assign(**{col_name: stripped[keep]})


def get_data_for_user(username: str, df: pd.DataFrame):
    """Select the rows of ``df`` whose ``user`` column equals ``username``.

    The original index labels are preserved; an empty frame is returned when
    no row matches.
    """
    is_target_user = df['user'].eq(username)
    return df.loc[is_target_user]


def get_data_for_users(usernames: list, df: pd.DataFrame) -> pd.DataFrame:
    """Select the rows of ``df`` whose ``user`` column is one of ``usernames``.

    Args:
        usernames: List-like of usernames to keep. A single username given as
            a plain string is also accepted and treated as a one-element list
            (``Series.isin`` itself rejects a bare string).
        df: Frame containing a ``user`` column.

    Returns:
        The matching rows with their original index labels; empty when
        nothing matches.
    """
    if isinstance(usernames, str):
        usernames = [usernames]
    return df.loc[df['user'].isin(usernames)]


class Merger:
    """Accumulates DataFrames and joins them pairwise into one frame.

    Frames are merged left-to-right in the order they were added.
    """

    def __init__(self):
        # Frames queued for merging, in insertion order.
        self.dataframes = []

    def add_dataframe(self, df: pd.DataFrame):
        """Queue ``df`` to be included in the next ``merge()`` call."""
        self.dataframes.append(df)

    def merge(self, on=('user',), how: str = 'outer') -> pd.DataFrame:
        """Merge every queued frame on the shared key column(s).

        Args:
            on: Column name, or sequence of column names, to join on.
                Defaults to the ``user`` column.
            how: Join type passed to ``pd.merge``. Defaults to ``'outer'``.

        Returns:
            The merged DataFrame. With a single queued frame, that frame is
            returned as-is.

        Raises:
            ValueError: If no frame has been added yet.
        """
        if not self.dataframes:
            # reduce() on an empty sequence raises an opaque TypeError;
            # fail with a message that points at the real problem instead.
            raise ValueError('Merger.merge() called with no dataframes added')
        keys = [on] if isinstance(on, str) else list(on)
        return reduce(
            lambda left, right: pd.merge(left, right, on=keys, how=how),
            self.dataframes,
        )

In [21]:
# Group the tool-event rows by user once; the per-user aggregation cell
# reuses this GroupBy object for every metric it computes.
df_grouped_users = df.groupby(['user'])

In [63]:
# Per-user aggregates computed from the shared `df_grouped_users` GroupBy.
# Aggregations are named by string ('min' / 'max') rather than passing
# np.min / np.max, which recent pandas versions deprecate for GroupBy.agg.

# Earliest and latest `start` value per user.
df_start_max_min = df_grouped_users.agg(
    first_start_date=('start', 'min'), last_start_date=('start', 'max')
).reset_index()

# Earliest and latest `finish` value per user.
df_finish_max_min = df_grouped_users.agg(
    first_finish_date=('finish', 'min'), last_finish_date=('finish', 'max')
).reset_index()

# Number of non-null `job` values per user.
df_jobs = df_grouped_users.agg(
    job_count=('job', 'count')
).reset_index()

# Number of non-null, non-zero `superjob` values per user.
df_superjobs = df_grouped_users['superjob'].apply(lambda x: x[x != 0].count()).reset_index(name='superjob_count')

In [None]:
# Count the distinct calendar days on which each user started an event.
#
# Work on a narrow copy holding only the columns needed here. Binding
# `df_dummy = df` would alias the loaded frame, so the assignments below would
# overwrite df's existing `datetime` column and add a `dates` column to it as
# a side effect of running this cell.
df_dummy = df[['user', 'start']].copy()
df_dummy['datetime'] = pd.to_datetime(df_dummy['start'])
df_dummy['dates'] = df_dummy['datetime'].dt.date

df_count_event_days = df_dummy.groupby(['user']).agg(
    event_days_count=('dates', 'nunique')
).reset_index()

In [None]:
# Inspect the per-row calendar dates derived from the event start times.
display(df_dummy['dates'])

In [None]:
# Join all per-user aggregate frames into one wide frame keyed on `user`.
merger = Merger()

for per_user_frame in (
    df_start_max_min,
    df_finish_max_min,
    df_jobs,
    df_superjobs,
    df_count_event_days,
):
    merger.add_dataframe(per_user_frame)

df_merged = merger.merge()
display(df_merged)

In [None]:
# Users whose job count differs from their superjob count and who have at
# least one superjob. Both conditions are read from df_merged so the boolean
# mask is built from, and aligned to, the frame being filtered (the mask must
# not depend on another frame's row order matching).
df1 = df_merged[
    (df_merged.job_count != df_merged.superjob_count)
    & (df_merged.superjob_count != 0)
]
display(df1)

In [27]:

# Re-display the merged per-user frame.
# NOTE(review): the saved output under this cell shows columns named
# `jobs_count` / `superjobs_count`, while the aggregation cell in this
# notebook produces `job_count` / `superjob_count` -- the output appears to
# come from an earlier run; re-run to refresh it.
display(df_merged)

Unnamed: 0,user,first_start_date,last_start_date,first_finish_date,last_finish_date,jobs_count,superjobs_count
0,,2011-11-22 09:44:56,2013-05-28 15:46:14,2005-11-11 00:00:00,2013-05-28 15:46:17,885,885
1,0.yao.yuan,2017-03-08 10:15:23,2017-03-09 05:09:22,2017-03-08 10:16:34,2017-03-09 07:22:59,15,15
2,008dilip,2016-09-28 20:27:13,2016-10-12 23:50:56,2016-09-28 20:32:31,2016-10-13 00:08:08,14,14
3,00ff,2013-11-19 00:53:23,2013-11-19 00:58:56,2013-11-19 01:01:19,2013-11-19 01:04:21,2,2
4,00thamizharasi00,2017-03-20 09:46:36,2017-03-20 09:46:36,2017-03-20 09:46:37,2017-03-20 09:46:37,1,1
...,...,...,...,...,...,...,...
155302,zzz121243,2021-05-04 19:04:27,2021-05-04 19:21:22,2021-05-04 19:18:31,2021-05-05 19:37:43,17,17
155303,zzz1ttt,2015-10-21 21:42:30,2015-12-17 06:47:37,2015-10-21 21:44:26,2015-12-17 09:14:10,60,60
155304,zzz777,2016-08-08 15:35:26,2016-08-08 16:05:25,2016-08-08 15:35:56,2016-08-08 16:07:32,9,9
155305,zzzstas,2016-07-13 00:40:03,2016-07-13 06:33:44,2016-07-13 00:53:35,2016-07-13 08:23:48,40,40


In [62]:
# Spot-check the merged metrics for a single user.
display(get_data_for_user('k_dadesh', df_merged))

Unnamed: 0,user,first_start_date,last_start_date,first_finish_date,last_finish_date,job_count,superjob_count
70032,k_dadesh,2000-07-05 02:46:44,2001-06-06 04:58:00,2000-07-05 02:46:51,2001-06-06 04:58:12,292,0


In [None]:
# df_merged = reduce(lambda left,right: pd.merge(left, right, on=['user'], how='outer'), [
#         df_start_max_min,
#         df_finish_max_min,
#         df_jobs
#     ])

In [None]:
# Whole days between each user's first start and last finish.
# The subtraction requires both columns to hold datetime values.
lifetime_span = df_merged['last_finish_date'] - df_merged['first_start_date']
df_merged['toolevents__lifetime'] = lifetime_span.dt.days
display(df_merged)