In [1]:
from pathlib import Path
from functools import reduce
import pandas as pd
import numpy as np
import os
from datetime import datetime

from nanoHUB.application import Application
from nanoHUB.rfm.model import LastUpdateRecord, TempUserDescriptors, UserDescriptors

from sqlalchemy import select
from sqlalchemy.orm import Session

# Obtain the nanoHUB Application object through its `get_instance()` factory.
# The saved output shows a nanoHUB banner printed when this cell ran.
# NOTE(review): `application` is not referenced by any later cell visible here — confirm it is still needed.
application = Application.get_instance()

[1mnanoHUB - Serving Students, Researchers & Instructors[0m


In [11]:
# Cache locations, all rooted at `<APP_DIR>/.cache`.
# `os.environ[...]` is used instead of `os.getenv(...)` so that a missing APP_DIR
# fails with `KeyError: 'APP_DIR'` (naming the variable) rather than the opaque
# TypeError that `Path(None)` would raise.
CACHE_DIR = Path(os.environ['APP_DIR']) / '.cache'
NANOHUB_CACHE = CACHE_DIR / 'nanohub'
NANOHUB_METRICS_CACHE = CACHE_DIR / 'nanohub_metrics'

In [30]:
def get_toolevents_df() -> pd.DataFrame:
    """Read the cached `toolevents` parquet dataset into a DataFrame.

    The dataset is looked up as the `toolevents` entry directly under
    `NANOHUB_METRICS_CACHE`.

    Returns:
        The DataFrame produced by `pd.read_parquet` for that path.
    """
    toolevents_path = NANOHUB_METRICS_CACHE / 'toolevents'
    return pd.read_parquet(toolevents_path)

def filter_nulls(df: pd.DataFrame, col_name: str) -> pd.DataFrame:
    """Return the rows of `df` whose `col_name` value is neither missing nor blank.

    Values in `col_name` are whitespace-stripped first; rows whose stripped value
    is NaN/None or the empty string are dropped. The returned frame carries the
    stripped values in `col_name` and keeps the original index labels.

    The input frame is left unmodified: the stripped values are written only to
    the returned copy, so calling this on a shared notebook-level frame (or on a
    slice of one) has no side effects.

    Args:
        df: Frame to filter.
        col_name: Name of a string-valued column (accessed through `.str`).

    Returns:
        A new DataFrame containing only the rows with a non-empty value.
    """
    stripped = df[col_name].str.strip()
    # NaN compares unequal to '' (True), so the missing-value check must be explicit.
    keep = stripped.notna() & (stripped != '')
    return df.loc[keep].assign(**{col_name: stripped.loc[keep]})

In [13]:
# Load the complete tool-events table from the parquet cache. The saved outputs
# in this notebook report about 200 million rows, so this load is likely expensive.
# NOTE(review): the rendered preview includes usernames and location columns —
# consider clearing outputs before sharing the notebook.
df = get_toolevents_df()
display(df)

Unnamed: 0,entryID,source,job,superjob,sessnum,event,start,finish,user,tool,walltime,cputime,latitude,longitude,city,region,countryLong,countryShort,datetime
0,1,from toolstart,4904,0,0,Schred,2002-07-03 04:53:38,2002-07-03 04:53:56,aless,Schred,18.0000,-1.000000,44.82678,11.62071,Ferrara,Emilia-Romagna,Italy,IT,2002-07-03 04:53:56
1,2,from toolstart,4905,0,0,Schred,2002-07-03 04:56:33,2002-07-03 04:56:50,aless,Schred,17.0000,-1.000000,44.82678,11.62071,Ferrara,Emilia-Romagna,Italy,IT,2002-07-03 04:56:50
2,3,from toolstart,4906,0,0,Schred,2002-07-03 05:00:29,2002-07-03 05:00:59,aless,Schred,30.0000,-1.000000,44.82678,11.62071,Ferrara,Emilia-Romagna,Italy,IT,2002-07-03 05:00:59
3,4,from toolstart,4907,0,0,Schred,2002-07-03 05:11:47,2002-07-03 05:12:32,aless,Schred,45.0000,-1.000000,44.82678,11.62071,Ferrara,Emilia-Romagna,Italy,IT,2002-07-03 05:12:32
4,5,from toolstart,4908,0,0,Schred,2002-07-03 05:13:50,2002-07-03 05:14:10,aless,Schred,20.0000,-1.000000,44.82678,11.62071,Ferrara,Emilia-Romagna,Italy,IT,2002-07-03 05:14:10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200150852,200399647,from joblog,0,0,1873253,application,2021-08-06 01:04:05,2021-08-07 01:04:16,sachinkumarsaid01,mos_r14,6.0000,26.340000,52.18446,-0.68759,Warrington,England,United Kingdom,GB,2021-08-07 01:04:16
200150853,200399648,from joblog,8424134,0,1861987,[waiting],2021-08-07 01:03:10,2021-08-07 01:05:36,gridstat,,50.0000,0.000000,32.87610,-117.23180,La Jolla,California,United States,US,2021-08-07 01:05:36
200150854,200399649,from joblog,8424134,0,1861987,/probercacsite.sh,2021-08-07 01:03:10,2021-08-07 01:07:55,gridstat,,0.0000,0.020000,32.87610,-117.23180,La Jolla,California,United States,US,2021-08-07 01:07:55
200150855,200399650,from joblog,8422269,0,1873260,start_jupyter,2021-08-06 01:07:26,2021-08-07 01:09:13,sachinkumarsaid01,mos_r14,86507.4805,33.746471,52.18446,-0.68759,Warrington,England,United Kingdom,GB,2021-08-07 01:09:13


In [14]:
# Boolean mask: True for every row whose superjob id is non-zero.
# NOTE(review): the saved output for this cell is labelled `Name: job` and is
# followed by a DataFrame, so it appears to come from an earlier version of the
# cell — re-run to refresh it.
display(df['superjob'] != 0)

0             True
1             True
2             True
3             True
4             True
             ...  
200150852    False
200150853     True
200150854     True
200150855     True
200150856    False
Name: job, Length: 200150857, dtype: bool

Unnamed: 0,entryID,source,job,superjob,sessnum,event,start,finish,user,tool,walltime,cputime,latitude,longitude,city,region,countryLong,countryShort,datetime
0,1,from toolstart,4904,0,0,Schred,2002-07-03 04:53:38,2002-07-03 04:53:56,aless,Schred,18.000000,-1.000000,44.82678,11.62071,Ferrara,Emilia-Romagna,Italy,IT,2002-07-03 04:53:56
1,2,from toolstart,4905,0,0,Schred,2002-07-03 04:56:33,2002-07-03 04:56:50,aless,Schred,17.000000,-1.000000,44.82678,11.62071,Ferrara,Emilia-Romagna,Italy,IT,2002-07-03 04:56:50
2,3,from toolstart,4906,0,0,Schred,2002-07-03 05:00:29,2002-07-03 05:00:59,aless,Schred,30.000000,-1.000000,44.82678,11.62071,Ferrara,Emilia-Romagna,Italy,IT,2002-07-03 05:00:59
3,4,from toolstart,4907,0,0,Schred,2002-07-03 05:11:47,2002-07-03 05:12:32,aless,Schred,45.000000,-1.000000,44.82678,11.62071,Ferrara,Emilia-Romagna,Italy,IT,2002-07-03 05:12:32
4,5,from toolstart,4908,0,0,Schred,2002-07-03 05:13:50,2002-07-03 05:14:10,aless,Schred,20.000000,-1.000000,44.82678,11.62071,Ferrara,Emilia-Romagna,Italy,IT,2002-07-03 05:14:10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200150850,200399645,from joblog,8424133,0,1861987,/probercacsite.sh,2021-08-07 01:00:08,2021-08-07 01:02:34,gridstat,,0.000000,0.000000,32.87610,-117.23180,La Jolla,California,United States,US,2021-08-07 01:02:34
200150851,200399646,from joblog,8422264,0,1873253,start_jupyter,2021-08-06 01:04:11,2021-08-07 01:04:13,sachinkumarsaid01,mos_r14,86402.036749,11.993140,52.18446,-0.68759,Warrington,England,United Kingdom,GB,2021-08-07 01:04:13
200150853,200399648,from joblog,8424134,0,1861987,[waiting],2021-08-07 01:03:10,2021-08-07 01:05:36,gridstat,,50.000000,0.000000,32.87610,-117.23180,La Jolla,California,United States,US,2021-08-07 01:05:36
200150854,200399649,from joblog,8424134,0,1861987,/probercacsite.sh,2021-08-07 01:03:10,2021-08-07 01:07:55,gridstat,,0.000000,0.020000,32.87610,-117.23180,La Jolla,California,United States,US,2021-08-07 01:07:55


In [18]:
# Rows that belong to a superjob (non-zero id), excluding the two tools listed below.
df1 = df.loc[
    (df['superjob'] != 0)
    & ~df['tool'].isin(['nanowire', '1dhetero_r742'])
]
display(df1)

Unnamed: 0,entryID,source,job,superjob,sessnum,event,start,finish,user,tool,walltime,cputime,latitude,longitude,city,region,countryLong,countryShort,datetime
644049,644050,from joblog,575,574,60473,/nanowire.tganl64,2007-05-08 11:53:32,2007-05-08 13:33:53,clarksm,workspace-med,6021.0,143550.687500,40.368891,-86.877431,West Lafayette,Indiana,United States,US,2007-05-08 13:33:53
644050,644051,from joblog,575,574,60473,[waiting],2007-05-08 11:53:32,2007-05-08 11:53:50,clarksm,workspace-med,18.0,0.000000,40.368891,-86.877431,West Lafayette,Indiana,United States,US,2007-05-08 11:53:50
644051,644052,from joblog,576,574,60473,/nanowire.tganl64,2007-05-08 11:53:37,2007-05-08 13:38:46,clarksm,workspace-med,6309.0,150279.546875,40.368891,-86.877431,West Lafayette,Indiana,United States,US,2007-05-08 13:38:46
644052,644053,from joblog,576,574,60473,[waiting],2007-05-08 11:53:37,2007-05-08 11:53:51,clarksm,workspace-med,14.0,0.000000,40.368891,-86.877431,West Lafayette,Indiana,United States,US,2007-05-08 11:53:51
644054,644055,from joblog,578,577,60473,/nanowire.tganl64,2007-05-08 14:19:11,2007-05-08 15:59:23,clarksm,workspace-med,6012.0,143488.312500,40.368891,-86.877431,West Lafayette,Indiana,United States,US,2007-05-08 15:59:23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200150589,200399384,from joblog,8424047,8423431,1873627,:lammps-09Dec14-parallel:,2021-08-06 21:33:50,2021-08-06 21:35:38,johanmcastillo24,,3.0,2.120000,9.933330,-84.083330,San Jose,San Jose,Costa Rica,CR,2021-08-06 21:35:38
200150594,200399389,from joblog,8424048,8423431,1873627,[waiting],2021-08-06 21:35:41,2021-08-06 21:37:50,johanmcastillo24,,8.0,0.000000,9.933330,-84.083330,San Jose,San Jose,Costa Rica,CR,2021-08-06 21:37:50
200150595,200399390,from joblog,8424048,8423431,1873627,:lammps-09Dec14-parallel:,2021-08-06 21:35:41,2021-08-06 21:37:50,johanmcastillo24,,4.0,1.920000,9.933330,-84.083330,San Jose,San Jose,Costa Rica,CR,2021-08-06 21:37:50
200150822,200399617,from joblog,8423463,8423462,1872710,[waiting],2021-08-06 16:21:52,2021-08-07 00:17:14,carlheinzcb,,13195.0,0.000000,49.922620,4.082590,Hirson,Picardie,France,FR,2021-08-07 00:17:14


In [20]:
# Distinct values of the `source` column; the saved output shows two of them.
all_sources = df["source"].unique()
display(all_sources)

array(['from toolstart', 'from joblog'], dtype=object)

In [22]:
# Distinct superjob ids present in the table (0 is included among them in the saved output).
all_superjobs = df["superjob"].unique()
display(all_superjobs)

array([      0,    1218,    1325, ..., 8423979, 8424040, 8423462])

In [23]:
# Inspect every row of one example superjob; 1218 is the first non-zero id
# listed in the `all_superjobs` output.
# NOTE(review): this rebinds `df1`, replacing the frame built by the earlier filter cell.
df1 = df.loc[df['superjob'] == 1218]
display(df1)

Unnamed: 0,entryID,source,job,superjob,sessnum,event,start,finish,user,tool,walltime,cputime,latitude,longitude,city,region,countryLong,countryShort,datetime
639377,639378,from joblog,1219,1218,63842,/nanowire,2007-05-25 10:33:33,2007-05-25 10:38:26,clarksm,nanowire,293.0,292.279999,40.368891,-86.877431,West Lafayette,Indiana,United States,US,2007-05-25 10:38:26
639378,639379,from joblog,1219,1218,63842,[waiting],2007-05-25 10:33:33,2007-05-25 10:36:58,clarksm,nanowire,205.0,0.0,40.368891,-86.877431,West Lafayette,Indiana,United States,US,2007-05-25 10:36:58
639379,639380,from joblog,1220,1218,63842,/nanowire,2007-05-25 10:33:38,2007-05-25 11:34:53,clarksm,nanowire,3675.0,3623.209961,40.368891,-86.877431,West Lafayette,Indiana,United States,US,2007-05-25 11:34:53
639380,639381,from joblog,1220,1218,63842,[waiting],2007-05-25 10:33:38,2007-05-25 10:38:31,clarksm,nanowire,293.0,0.0,40.368891,-86.877431,West Lafayette,Indiana,United States,US,2007-05-25 10:38:31
639381,639382,from joblog,1221,1218,63842,/nanowire,2007-05-25 10:33:43,2007-05-25 10:38:57,clarksm,nanowire,314.0,312.820007,40.368891,-86.877431,West Lafayette,Indiana,United States,US,2007-05-25 10:38:57
639382,639383,from joblog,1221,1218,63842,[waiting],2007-05-25 10:33:43,2007-05-25 10:40:23,clarksm,nanowire,400.0,0.0,40.368891,-86.877431,West Lafayette,Indiana,United States,US,2007-05-25 10:40:23
639383,639384,from joblog,1222,1218,63842,/nanowire,2007-05-25 10:33:48,2007-05-25 10:57:56,clarksm,nanowire,1448.0,716.179993,40.368891,-86.877431,West Lafayette,Indiana,United States,US,2007-05-25 10:57:56
639384,639385,from joblog,1222,1218,63842,[waiting],2007-05-25 10:33:48,2007-05-25 10:39:09,clarksm,nanowire,321.0,0.0,40.368891,-86.877431,West Lafayette,Indiana,United States,US,2007-05-25 10:39:09
639385,639386,from joblog,1223,1218,63842,/nanowire,2007-05-25 10:33:53,2007-05-25 10:58:01,clarksm,nanowire,1448.0,715.210022,40.368891,-86.877431,West Lafayette,Indiana,United States,US,2007-05-25 10:58:01
639386,639387,from joblog,1223,1218,63842,[waiting],2007-05-25 10:33:53,2007-05-25 10:39:11,clarksm,nanowire,318.0,0.0,40.368891,-86.877431,West Lafayette,Indiana,United States,US,2007-05-25 10:39:11


In [26]:
# Number of rows per superjob id. In the saved output the id 0 accounts for
# about 170.9 million rows, i.e. the large majority of events.
display(df['superjob'].value_counts())

0          170898668
7632182       292329
7633035       187980
7632782       183160
7752705       174145
             ...    
7975953            1
7975891            1
6075454            1
6075505            1
1328210            1
Name: superjob, Length: 218949, dtype: int64

In [27]:
# Number of distinct superjob ids. `dropna=False` counts a missing value as its
# own entry, which matches the length of `.unique()`.
display(df['superjob'].nunique(dropna=False))

218949

In [28]:
# Number of distinct usernames before any cleaning. `dropna=False` counts a
# missing value as its own entry, which matches the length of `.unique()`.
display(df['user'].nunique(dropna=False))

155307

In [None]:
# Number of distinct usernames once rows with a missing or blank (after
# stripping) `user` value have been removed by `filter_nulls`.
df_filtered = filter_nulls(df, 'user')
display(df_filtered['user'].nunique(dropna=False))