In [1]:
from pathlib import Path
from functools import reduce
import pandas as pd
import numpy as np
import os
from datetime import datetime

from nanoHUB.application import Application
from nanoHUB.rfm.model import LastUpdateRecord, TempUserDescriptors, UserDescriptors

from sqlalchemy import select
from sqlalchemy.orm import Session

# Obtain the project's Application object through its get_instance() accessor.
# NOTE(review): `application` is not referenced by any of the visible cells — confirm it is still needed.
application = Application.get_instance()

[1mnanoHUB - Serving Students, Researchers & Instructors[0m


In [11]:
# Cache locations, all rooted at <APP_DIR>/.cache, where APP_DIR is read from the environment.
CACHE_DIR = Path(os.getenv('APP_DIR')) / '.cache'
NANOHUB_CACHE = CACHE_DIR / 'nanohub'
NANOHUB_METRICS_CACHE = CACHE_DIR / 'nanohub_metrics'

In [30]:
def get_toolevents_df() -> pd.DataFrame:
    """Read the 'toolevents' parquet dataset under NANOHUB_METRICS_CACHE into a DataFrame."""
    toolevents_path = NANOHUB_METRICS_CACHE / 'toolevents'
    return pd.read_parquet(toolevents_path)

def filter_nulls(df: pd.DataFrame, col_name: str) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``col_name`` value is neither null nor blank.

    Values in ``col_name`` are whitespace-stripped before the check, and the
    returned frame carries the stripped values in that column.

    The input frame is left untouched: the stripped values are computed on a
    separate Series rather than written back into ``df``, so calling this
    helper does not change what later cells see in the caller's frame.

    Args:
        df: Frame to filter.
        col_name: Name of a string column in ``df``.

    Returns:
        A new DataFrame with the same columns as ``df``, restricted to rows
        where the stripped ``col_name`` value is non-null and not ``''``.

    Raises:
        KeyError: If ``col_name`` is not a column of ``df``.
        AttributeError: If the column does not support the ``.str`` accessor.
    """
    stripped = df[col_name].str.strip()
    # notna() first so that missing values never survive the emptiness test.
    keep = stripped.notna() & stripped.ne('')
    # assign() on an existing column replaces it in place in the column order.
    return df.loc[keep].assign(**{col_name: stripped.loc[keep]})


In [13]:
# Load the tool-events table through get_toolevents_df(); the cells that follow slice this frame.
df = get_toolevents_df()
display(df)

Unnamed: 0,entryID,source,job,superjob,sessnum,event,start,finish,user,tool,walltime,cputime,latitude,longitude,city,region,countryLong,countryShort,datetime
0,1,from toolstart,4904,0,0,Schred,2002-07-03 04:53:38,2002-07-03 04:53:56,aless,Schred,18.0000,-1.000000,44.82678,11.62071,Ferrara,Emilia-Romagna,Italy,IT,2002-07-03 04:53:56
1,2,from toolstart,4905,0,0,Schred,2002-07-03 04:56:33,2002-07-03 04:56:50,aless,Schred,17.0000,-1.000000,44.82678,11.62071,Ferrara,Emilia-Romagna,Italy,IT,2002-07-03 04:56:50
2,3,from toolstart,4906,0,0,Schred,2002-07-03 05:00:29,2002-07-03 05:00:59,aless,Schred,30.0000,-1.000000,44.82678,11.62071,Ferrara,Emilia-Romagna,Italy,IT,2002-07-03 05:00:59
3,4,from toolstart,4907,0,0,Schred,2002-07-03 05:11:47,2002-07-03 05:12:32,aless,Schred,45.0000,-1.000000,44.82678,11.62071,Ferrara,Emilia-Romagna,Italy,IT,2002-07-03 05:12:32
4,5,from toolstart,4908,0,0,Schred,2002-07-03 05:13:50,2002-07-03 05:14:10,aless,Schred,20.0000,-1.000000,44.82678,11.62071,Ferrara,Emilia-Romagna,Italy,IT,2002-07-03 05:14:10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200150852,200399647,from joblog,0,0,1873253,application,2021-08-06 01:04:05,2021-08-07 01:04:16,sachinkumarsaid01,mos_r14,6.0000,26.340000,52.18446,-0.68759,Warrington,England,United Kingdom,GB,2021-08-07 01:04:16
200150853,200399648,from joblog,8424134,0,1861987,[waiting],2021-08-07 01:03:10,2021-08-07 01:05:36,gridstat,,50.0000,0.000000,32.87610,-117.23180,La Jolla,California,United States,US,2021-08-07 01:05:36
200150854,200399649,from joblog,8424134,0,1861987,/probercacsite.sh,2021-08-07 01:03:10,2021-08-07 01:07:55,gridstat,,0.0000,0.020000,32.87610,-117.23180,La Jolla,California,United States,US,2021-08-07 01:07:55
200150855,200399650,from joblog,8422269,0,1873260,start_jupyter,2021-08-06 01:07:26,2021-08-07 01:09:13,sachinkumarsaid01,mos_r14,86507.4805,33.746471,52.18446,-0.68759,Warrington,England,United Kingdom,GB,2021-08-07 01:09:13


In [14]:
# Boolean mask: True where the row's superjob id is non-zero.
display(df['superjob'].ne(0))

0             True
1             True
2             True
3             True
4             True
             ...  
200150852    False
200150853     True
200150854     True
200150855     True
200150856    False
Name: job, Length: 200150857, dtype: bool

Unnamed: 0,entryID,source,job,superjob,sessnum,event,start,finish,user,tool,walltime,cputime,latitude,longitude,city,region,countryLong,countryShort,datetime
0,1,from toolstart,4904,0,0,Schred,2002-07-03 04:53:38,2002-07-03 04:53:56,aless,Schred,18.000000,-1.000000,44.82678,11.62071,Ferrara,Emilia-Romagna,Italy,IT,2002-07-03 04:53:56
1,2,from toolstart,4905,0,0,Schred,2002-07-03 04:56:33,2002-07-03 04:56:50,aless,Schred,17.000000,-1.000000,44.82678,11.62071,Ferrara,Emilia-Romagna,Italy,IT,2002-07-03 04:56:50
2,3,from toolstart,4906,0,0,Schred,2002-07-03 05:00:29,2002-07-03 05:00:59,aless,Schred,30.000000,-1.000000,44.82678,11.62071,Ferrara,Emilia-Romagna,Italy,IT,2002-07-03 05:00:59
3,4,from toolstart,4907,0,0,Schred,2002-07-03 05:11:47,2002-07-03 05:12:32,aless,Schred,45.000000,-1.000000,44.82678,11.62071,Ferrara,Emilia-Romagna,Italy,IT,2002-07-03 05:12:32
4,5,from toolstart,4908,0,0,Schred,2002-07-03 05:13:50,2002-07-03 05:14:10,aless,Schred,20.000000,-1.000000,44.82678,11.62071,Ferrara,Emilia-Romagna,Italy,IT,2002-07-03 05:14:10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200150850,200399645,from joblog,8424133,0,1861987,/probercacsite.sh,2021-08-07 01:00:08,2021-08-07 01:02:34,gridstat,,0.000000,0.000000,32.87610,-117.23180,La Jolla,California,United States,US,2021-08-07 01:02:34
200150851,200399646,from joblog,8422264,0,1873253,start_jupyter,2021-08-06 01:04:11,2021-08-07 01:04:13,sachinkumarsaid01,mos_r14,86402.036749,11.993140,52.18446,-0.68759,Warrington,England,United Kingdom,GB,2021-08-07 01:04:13
200150853,200399648,from joblog,8424134,0,1861987,[waiting],2021-08-07 01:03:10,2021-08-07 01:05:36,gridstat,,50.000000,0.000000,32.87610,-117.23180,La Jolla,California,United States,US,2021-08-07 01:05:36
200150854,200399649,from joblog,8424134,0,1861987,/probercacsite.sh,2021-08-07 01:03:10,2021-08-07 01:07:55,gridstat,,0.000000,0.020000,32.87610,-117.23180,La Jolla,California,United States,US,2021-08-07 01:07:55


In [18]:
# Rows with a non-zero superjob id whose tool is neither of the two listed tool names.
df1 = df.loc[
    df['superjob'].ne(0)
    & ~df['tool'].isin(['nanowire', '1dhetero_r742'])
]
display(df1)

Unnamed: 0,entryID,source,job,superjob,sessnum,event,start,finish,user,tool,walltime,cputime,latitude,longitude,city,region,countryLong,countryShort,datetime
644049,644050,from joblog,575,574,60473,/nanowire.tganl64,2007-05-08 11:53:32,2007-05-08 13:33:53,clarksm,workspace-med,6021.0,143550.687500,40.368891,-86.877431,West Lafayette,Indiana,United States,US,2007-05-08 13:33:53
644050,644051,from joblog,575,574,60473,[waiting],2007-05-08 11:53:32,2007-05-08 11:53:50,clarksm,workspace-med,18.0,0.000000,40.368891,-86.877431,West Lafayette,Indiana,United States,US,2007-05-08 11:53:50
644051,644052,from joblog,576,574,60473,/nanowire.tganl64,2007-05-08 11:53:37,2007-05-08 13:38:46,clarksm,workspace-med,6309.0,150279.546875,40.368891,-86.877431,West Lafayette,Indiana,United States,US,2007-05-08 13:38:46
644052,644053,from joblog,576,574,60473,[waiting],2007-05-08 11:53:37,2007-05-08 11:53:51,clarksm,workspace-med,14.0,0.000000,40.368891,-86.877431,West Lafayette,Indiana,United States,US,2007-05-08 11:53:51
644054,644055,from joblog,578,577,60473,/nanowire.tganl64,2007-05-08 14:19:11,2007-05-08 15:59:23,clarksm,workspace-med,6012.0,143488.312500,40.368891,-86.877431,West Lafayette,Indiana,United States,US,2007-05-08 15:59:23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200150589,200399384,from joblog,8424047,8423431,1873627,:lammps-09Dec14-parallel:,2021-08-06 21:33:50,2021-08-06 21:35:38,johanmcastillo24,,3.0,2.120000,9.933330,-84.083330,San Jose,San Jose,Costa Rica,CR,2021-08-06 21:35:38
200150594,200399389,from joblog,8424048,8423431,1873627,[waiting],2021-08-06 21:35:41,2021-08-06 21:37:50,johanmcastillo24,,8.0,0.000000,9.933330,-84.083330,San Jose,San Jose,Costa Rica,CR,2021-08-06 21:37:50
200150595,200399390,from joblog,8424048,8423431,1873627,:lammps-09Dec14-parallel:,2021-08-06 21:35:41,2021-08-06 21:37:50,johanmcastillo24,,4.0,1.920000,9.933330,-84.083330,San Jose,San Jose,Costa Rica,CR,2021-08-06 21:37:50
200150822,200399617,from joblog,8423463,8423462,1872710,[waiting],2021-08-06 16:21:52,2021-08-07 00:17:14,carlheinzcb,,13195.0,0.000000,49.922620,4.082590,Hirson,Picardie,France,FR,2021-08-07 00:17:14


In [20]:
# Distinct values of the `source` column.
all_sources = df.source.unique()
display(all_sources)

array(['from toolstart', 'from joblog'], dtype=object)

In [22]:
# Distinct values of the `superjob` column.
all_superjobs = df.superjob.unique()
display(all_superjobs)

array([      0,    1218,    1325, ..., 8423979, 8424040, 8423462])

In [23]:
# All rows recorded under superjob id 1218.
df1 = df.loc[df['superjob'].eq(1218)]
display(df1)

Unnamed: 0,entryID,source,job,superjob,sessnum,event,start,finish,user,tool,walltime,cputime,latitude,longitude,city,region,countryLong,countryShort,datetime
639377,639378,from joblog,1219,1218,63842,/nanowire,2007-05-25 10:33:33,2007-05-25 10:38:26,clarksm,nanowire,293.0,292.279999,40.368891,-86.877431,West Lafayette,Indiana,United States,US,2007-05-25 10:38:26
639378,639379,from joblog,1219,1218,63842,[waiting],2007-05-25 10:33:33,2007-05-25 10:36:58,clarksm,nanowire,205.0,0.0,40.368891,-86.877431,West Lafayette,Indiana,United States,US,2007-05-25 10:36:58
639379,639380,from joblog,1220,1218,63842,/nanowire,2007-05-25 10:33:38,2007-05-25 11:34:53,clarksm,nanowire,3675.0,3623.209961,40.368891,-86.877431,West Lafayette,Indiana,United States,US,2007-05-25 11:34:53
639380,639381,from joblog,1220,1218,63842,[waiting],2007-05-25 10:33:38,2007-05-25 10:38:31,clarksm,nanowire,293.0,0.0,40.368891,-86.877431,West Lafayette,Indiana,United States,US,2007-05-25 10:38:31
639381,639382,from joblog,1221,1218,63842,/nanowire,2007-05-25 10:33:43,2007-05-25 10:38:57,clarksm,nanowire,314.0,312.820007,40.368891,-86.877431,West Lafayette,Indiana,United States,US,2007-05-25 10:38:57
639382,639383,from joblog,1221,1218,63842,[waiting],2007-05-25 10:33:43,2007-05-25 10:40:23,clarksm,nanowire,400.0,0.0,40.368891,-86.877431,West Lafayette,Indiana,United States,US,2007-05-25 10:40:23
639383,639384,from joblog,1222,1218,63842,/nanowire,2007-05-25 10:33:48,2007-05-25 10:57:56,clarksm,nanowire,1448.0,716.179993,40.368891,-86.877431,West Lafayette,Indiana,United States,US,2007-05-25 10:57:56
639384,639385,from joblog,1222,1218,63842,[waiting],2007-05-25 10:33:48,2007-05-25 10:39:09,clarksm,nanowire,321.0,0.0,40.368891,-86.877431,West Lafayette,Indiana,United States,US,2007-05-25 10:39:09
639385,639386,from joblog,1223,1218,63842,/nanowire,2007-05-25 10:33:53,2007-05-25 10:58:01,clarksm,nanowire,1448.0,715.210022,40.368891,-86.877431,West Lafayette,Indiana,United States,US,2007-05-25 10:58:01
639386,639387,from joblog,1223,1218,63842,[waiting],2007-05-25 10:33:53,2007-05-25 10:39:11,clarksm,nanowire,318.0,0.0,40.368891,-86.877431,West Lafayette,Indiana,United States,US,2007-05-25 10:39:11


In [26]:
# Row count per superjob id, most frequent first.
display(df.superjob.value_counts())

0          170898668
7632182       292329
7633035       187980
7632782       183160
7752705       174145
             ...    
7975953            1
7975891            1
6075454            1
6075505            1
1328210            1
Name: superjob, Length: 218949, dtype: int64

In [27]:
# Number of distinct superjob ids (dropna=False keeps a missing value counted, like len(unique())).
display(df['superjob'].nunique(dropna=False))

218949

In [28]:
# Number of distinct user values (dropna=False keeps a missing value counted, like len(unique())).
display(df['user'].nunique(dropna=False))

155307

In [31]:
# Drop rows whose user is null/blank, then count the distinct users that remain.
df_filtered = filter_nulls(df, 'user')
display(df_filtered['user'].nunique(dropna=False))

155306

In [33]:
# Rows whose source is 'from toolstart'.
df1 = df.loc[df['source'].eq('from toolstart')]
display(df1)

Unnamed: 0,entryID,source,job,superjob,sessnum,event,start,finish,user,tool,walltime,cputime,latitude,longitude,city,region,countryLong,countryShort,datetime
0,1,from toolstart,4904,0,0,Schred,2002-07-03 04:53:38,2002-07-03 04:53:56,aless,Schred,18.0,-1.0,44.826780,11.620710,Ferrara,Emilia-Romagna,Italy,IT,2002-07-03 04:53:56
1,2,from toolstart,4905,0,0,Schred,2002-07-03 04:56:33,2002-07-03 04:56:50,aless,Schred,17.0,-1.0,44.826780,11.620710,Ferrara,Emilia-Romagna,Italy,IT,2002-07-03 04:56:50
2,3,from toolstart,4906,0,0,Schred,2002-07-03 05:00:29,2002-07-03 05:00:59,aless,Schred,30.0,-1.0,44.826780,11.620710,Ferrara,Emilia-Romagna,Italy,IT,2002-07-03 05:00:59
3,4,from toolstart,4907,0,0,Schred,2002-07-03 05:11:47,2002-07-03 05:12:32,aless,Schred,45.0,-1.0,44.826780,11.620710,Ferrara,Emilia-Romagna,Italy,IT,2002-07-03 05:12:32
4,5,from toolstart,4908,0,0,Schred,2002-07-03 05:13:50,2002-07-03 05:14:10,aless,Schred,20.0,-1.0,44.826780,11.620710,Ferrara,Emilia-Romagna,Italy,IT,2002-07-03 05:14:10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
407236,407237,from toolstart,467076,0,508422,spice3f4,2006-02-15 02:12:49,2006-02-15 02:12:48,nelson11,spice3f4,-1.0,-1.0,40.395633,-74.110588,Middletown,New Jersey,United States,US,2006-02-15 02:12:48
407237,407238,from toolstart,467077,0,508422,spice3f4,2006-02-15 02:15:45,2006-02-15 02:15:44,nelson11,spice3f4,-1.0,-1.0,40.395633,-74.110588,Middletown,New Jersey,United States,US,2006-02-15 02:15:44
407238,407239,from toolstart,467078,0,508422,spice3f4,2006-02-15 02:16:30,2006-02-15 02:16:29,nelson11,spice3f4,-1.0,-1.0,40.395633,-74.110588,Middletown,New Jersey,United States,US,2006-02-15 02:16:29
407239,407240,from toolstart,467079,0,0,spice3f4,2006-02-15 02:27:25,2006-02-15 02:27:24,nelson11,spice3f4,-1.0,-1.0,40.395633,-74.110588,Middletown,New Jersey,United States,US,2006-02-15 02:27:24


In [39]:
# Per-user count of non-null `start` values.
df_users = df.groupby(['user'])['start'].agg('count')
display(df_users)

user
                    430
0.yao.yuan           15
008dilip             14
00ff                  2
00thamizharasi00      1
                   ... 
zzz121243            17
zzz1ttt              60
zzz777                9
zzzstas              40
zzzwmhq               5
Name: start, Length: 155307, dtype: int64

In [41]:
# Per-user count of non-null `finish` values.
df_users = df.groupby(['user'])['finish'].agg('count')
display(df_users)

user
                    885
0.yao.yuan           15
008dilip             14
00ff                  2
00thamizharasi00      1
                   ... 
zzz121243            17
zzz1ttt              60
zzz777                9
zzzstas              40
zzzwmhq               5
Name: finish, Length: 155307, dtype: int64

In [44]:
# Rows whose start is at or after the cutoff timestamp, followed by summary statistics.
df1 = df.loc[df['start'] >= '2008-07-03 04:53:38']
display(df1)
summary = df1.describe()
display(summary)
del summary  # keep the notebook namespace identical to the original cell

Unnamed: 0,entryID,source,job,superjob,sessnum,event,start,finish,user,tool,walltime,cputime,latitude,longitude,city,region,countryLong,countryShort,datetime
984059,984060,from joblog,0,0,131406,application,2008-07-03 06:52:14,2008-07-03 06:58:56,pcristea,adept_r16,401.710000,19.379000,44.43225,26.10626,Bucharest,Bucuresti,Romania,RO,2008-07-03 06:58:56
984060,984061,from joblog,1,0,131406,simulation,2008-07-03 06:52:53,2008-07-03 06:52:58,pcristea,adept_r16,4.544117,4.166366,44.43225,26.10626,Bucharest,Bucuresti,Romania,RO,2008-07-03 06:52:58
984061,984062,from joblog,2,0,131406,simulation,2008-07-03 06:54:21,2008-07-03 06:54:24,pcristea,adept_r16,3.336763,2.928555,44.43225,26.10626,Bucharest,Bucuresti,Romania,RO,2008-07-03 06:54:24
984140,984141,from joblog,0,0,131412,application,2008-07-03 08:23:23,2008-07-03 09:11:11,vinceR,cnia_r15,2867.930000,21.494000,48.68333,2.13333,Gif-sur-Yvette,Ile-de-France,France,FR,2008-07-03 09:11:11
984141,984142,from joblog,1,0,131412,simulation,2008-07-03 08:25:17,2008-07-03 08:25:23,vinceR,cnia_r15,6.053635,2.360642,48.68333,2.13333,Gif-sur-Yvette,Ile-de-France,France,FR,2008-07-03 08:25:23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200150852,200399647,from joblog,0,0,1873253,application,2021-08-06 01:04:05,2021-08-07 01:04:16,sachinkumarsaid01,mos_r14,6.000000,26.340000,52.18446,-0.68759,Warrington,England,United Kingdom,GB,2021-08-07 01:04:16
200150853,200399648,from joblog,8424134,0,1861987,[waiting],2021-08-07 01:03:10,2021-08-07 01:05:36,gridstat,,50.000000,0.000000,32.87610,-117.23180,La Jolla,California,United States,US,2021-08-07 01:05:36
200150854,200399649,from joblog,8424134,0,1861987,/probercacsite.sh,2021-08-07 01:03:10,2021-08-07 01:07:55,gridstat,,0.000000,0.020000,32.87610,-117.23180,La Jolla,California,United States,US,2021-08-07 01:07:55
200150855,200399650,from joblog,8422269,0,1873260,start_jupyter,2021-08-06 01:07:26,2021-08-07 01:09:13,sachinkumarsaid01,mos_r14,86507.480500,33.746471,52.18446,-0.68759,Warrington,England,United Kingdom,GB,2021-08-07 01:09:13


Unnamed: 0,entryID,job,superjob,sessnum,walltime,cputime,latitude,longitude
count,199163500.0,199163500.0,199163500.0,199163500.0,199163500.0,199163500.0,199163500.0,199163500.0
mean,100798100.0,5977640.0,1119753.0,1555635.0,278226.5,3190.128,33.88485,-65.76182
std,57523680.0,3168582.0,2714847.0,259610.0,20388560.0,3168760.0,11.1432,72.01113
min,984060.0,0.0,0.0,121936.0,0.0,0.0,-53.15,-165.1972
25%,51026090.0,6069316.0,0.0,1580509.0,0.0,0.0,32.8761,-117.2318
50%,100817000.0,7750343.0,0.0,1634861.0,10.0,2.32,32.8761,-86.87743
75%,150607800.0,7844466.0,0.0,1656518.0,60.0,14.55209,40.36889,-73.89125
max,200399700.0,8424134.0,8424040.0,1873762.0,1627324000.0,22749490000.0,69.6489,178.4415


In [None]:
# `user` values of the rows whose start is at or after the cutoff timestamp.
df1 = df.loc[df['start'] >= '2008-07-03 04:53:38', 'user']
display(df1)

In [50]:
# Distinct users with at least one row starting at or after the cutoff, and how many there are.
active_users = df['user'][df['start'] >= '2008-07-03 04:53:38'].unique()
display(active_users)
display(active_users.shape[0] if hasattr(active_users, 'shape') else len(active_users))

array(['pcristea', 'vinceR', 'gekco', ..., 'vivekchawla093',
       'ananya.vaidya110987', 'jacuna'], dtype=object)

141451

In [51]:
# Every row (of any date) that belongs to one of the users in active_users.
df2 = df.loc[df.user.isin(active_users)]

In [52]:
# Render df2, the rows of df whose user is in active_users.
display(df2)

Unnamed: 0,entryID,source,job,superjob,sessnum,event,start,finish,user,tool,walltime,cputime,latitude,longitude,city,region,countryLong,countryShort,datetime
18,19,from toolstart,4923,0,0,Schred,2002-07-05 14:33:43,2002-07-05 14:35:14,noise,Schred,91.0000,-1.000000,32.78306,-96.80667,Dallas,Texas,United States,US,2002-07-05 14:35:14
19,20,from toolstart,4924,0,0,Schred,2002-07-05 14:49:27,2002-07-05 14:51:10,noise,Schred,103.0000,-1.000000,32.78306,-96.80667,Dallas,Texas,United States,US,2002-07-05 14:51:10
20,21,from toolstart,4925,0,0,Schred,2002-07-05 14:58:02,2002-07-05 14:59:18,noise,Schred,76.0000,-1.000000,32.78306,-96.80667,Dallas,Texas,United States,US,2002-07-05 14:59:18
73,74,from toolstart,4978,0,0,MOSCV,2002-07-10 00:11:14,2002-07-10 00:11:13,noise,MOSCV,-1.0000,-1.000000,32.78306,-96.80667,Dallas,Texas,United States,US,2002-07-10 00:11:13
74,75,from toolstart,4979,0,0,MOSCV,2002-07-10 00:12:57,2002-07-10 00:12:56,noise,MOSCV,-1.0000,-1.000000,32.78306,-96.80667,Dallas,Texas,United States,US,2002-07-10 00:12:56
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200150852,200399647,from joblog,0,0,1873253,application,2021-08-06 01:04:05,2021-08-07 01:04:16,sachinkumarsaid01,mos_r14,6.0000,26.340000,52.18446,-0.68759,Warrington,England,United Kingdom,GB,2021-08-07 01:04:16
200150853,200399648,from joblog,8424134,0,1861987,[waiting],2021-08-07 01:03:10,2021-08-07 01:05:36,gridstat,,50.0000,0.000000,32.87610,-117.23180,La Jolla,California,United States,US,2021-08-07 01:05:36
200150854,200399649,from joblog,8424134,0,1861987,/probercacsite.sh,2021-08-07 01:03:10,2021-08-07 01:07:55,gridstat,,0.0000,0.020000,32.87610,-117.23180,La Jolla,California,United States,US,2021-08-07 01:07:55
200150855,200399650,from joblog,8422269,0,1873260,start_jupyter,2021-08-06 01:07:26,2021-08-07 01:09:13,sachinkumarsaid01,mos_r14,86507.4805,33.746471,52.18446,-0.68759,Warrington,England,United Kingdom,GB,2021-08-07 01:09:13


In [54]:
# Number of distinct users in df2 (dropna=False keeps a missing value counted, like len(unique())).
display(df2['user'].nunique(dropna=False))

141451

In [55]:
# Per-user count of non-null `start` values, restricted to df2.
df_users = df2.groupby(['user'])['start'].agg('count')
display(df_users)

user
                    430
0.yao.yuan           15
008dilip             14
00ff                  2
00thamizharasi00      1
                   ... 
zzz121243            17
zzz1ttt              60
zzz777                9
zzzstas              40
zzzwmhq               5
Name: start, Length: 141451, dtype: int64

In [60]:
# Earliest and latest `start` per user, one row per user.
# The 'min'/'max' string aliases are used instead of np.min/np.max: recent pandas
# emits a FutureWarning for the NumPy callables in groupby.agg and recommends the
# strings to keep the current behaviour.
df_start_max_min = (
    df.groupby(['user'])
    .agg(
        first_start_date=('start', 'min'),
        last_start_date=('start', 'max'),
    )
    .reset_index()
)
display(df_start_max_min)

Unnamed: 0,user,first_start_date,last_start_date
0,,2011-11-22 09:44:56,2013-05-28 15:46:14
1,0.yao.yuan,2017-03-08 10:15:23,2017-03-09 05:09:22
2,008dilip,2016-09-28 20:27:13,2016-10-12 23:50:56
3,00ff,2013-11-19 00:53:23,2013-11-19 00:58:56
4,00thamizharasi00,2017-03-20 09:46:36,2017-03-20 09:46:36
...,...,...,...
155302,zzz121243,2021-05-04 19:04:27,2021-05-04 19:21:22
155303,zzz1ttt,2015-10-21 21:42:30,2015-12-17 06:47:37
155304,zzz777,2016-08-08 15:35:26,2016-08-08 16:05:25
155305,zzzstas,2016-07-13 00:40:03,2016-07-13 06:33:44


In [61]:
# Earliest and latest `finish` per user, one row per user.
# The 'min'/'max' string aliases are used instead of np.min/np.max: recent pandas
# emits a FutureWarning for the NumPy callables in groupby.agg and recommends the
# strings to keep the current behaviour.
df_finish_max_min = (
    df.groupby(['user'])
    .agg(
        first_finish_date=('finish', 'min'),
        last_finish_date=('finish', 'max'),
    )
    .reset_index()
)
display(df_finish_max_min)

Unnamed: 0,user,first_finish_date,last_finish_date
0,,2005-11-11 00:00:00,2013-05-28 15:46:17
1,0.yao.yuan,2017-03-08 10:16:34,2017-03-09 07:22:59
2,008dilip,2016-09-28 20:32:31,2016-10-13 00:08:08
3,00ff,2013-11-19 01:01:19,2013-11-19 01:04:21
4,00thamizharasi00,2017-03-20 09:46:37,2017-03-20 09:46:37
...,...,...,...
155302,zzz121243,2021-05-04 19:18:31,2021-05-05 19:37:43
155303,zzz1ttt,2015-10-21 21:44:26,2015-12-17 09:14:10
155304,zzz777,2016-08-08 15:35:56,2016-08-08 16:07:32
155305,zzzstas,2016-07-13 00:53:35,2016-07-13 08:23:48


In [None]:
# NOTE(review): unexecuted cell. The columns shown for `df` in this notebook's outputs do not
# include 'max_datetime' or 'min_datetime', so this line would presumably raise KeyError on a
# fresh run — confirm and remove if obsolete. It also writes a new column into `df` in place.
df['toolevents__lifetime'] = (df['max_datetime'] - df['min_datetime']).dt.days

In [62]:
# Outer-join the per-user start and finish summaries on `user`.
df_merged = df_start_max_min.merge(
    df_finish_max_min,
    on=['user'],
    how='outer',
)
display(df_merged)

Unnamed: 0,user,first_start_date,last_start_date,first_finish_date,last_finish_date
0,,2011-11-22 09:44:56,2013-05-28 15:46:14,2005-11-11 00:00:00,2013-05-28 15:46:17
1,0.yao.yuan,2017-03-08 10:15:23,2017-03-09 05:09:22,2017-03-08 10:16:34,2017-03-09 07:22:59
2,008dilip,2016-09-28 20:27:13,2016-10-12 23:50:56,2016-09-28 20:32:31,2016-10-13 00:08:08
3,00ff,2013-11-19 00:53:23,2013-11-19 00:58:56,2013-11-19 01:01:19,2013-11-19 01:04:21
4,00thamizharasi00,2017-03-20 09:46:36,2017-03-20 09:46:36,2017-03-20 09:46:37,2017-03-20 09:46:37
...,...,...,...,...,...
155302,zzz121243,2021-05-04 19:04:27,2021-05-04 19:21:22,2021-05-04 19:18:31,2021-05-05 19:37:43
155303,zzz1ttt,2015-10-21 21:42:30,2015-12-17 06:47:37,2015-10-21 21:44:26,2015-12-17 09:14:10
155304,zzz777,2016-08-08 15:35:26,2016-08-08 16:05:25,2016-08-08 15:35:56,2016-08-08 16:07:32
155305,zzzstas,2016-07-13 00:40:03,2016-07-13 06:33:44,2016-07-13 00:53:35,2016-07-13 08:23:48


In [63]:
# Whole days between each user's first start and last finish.
df_merged['toolevents__lifetime'] = (
    df_merged['last_finish_date']
    .sub(df_merged['first_start_date'])
    .dt.days
)
display(df_merged)

Unnamed: 0,user,first_start_date,last_start_date,first_finish_date,last_finish_date,toolevents__lifetime
0,,2011-11-22 09:44:56,2013-05-28 15:46:14,2005-11-11 00:00:00,2013-05-28 15:46:17,553
1,0.yao.yuan,2017-03-08 10:15:23,2017-03-09 05:09:22,2017-03-08 10:16:34,2017-03-09 07:22:59,0
2,008dilip,2016-09-28 20:27:13,2016-10-12 23:50:56,2016-09-28 20:32:31,2016-10-13 00:08:08,14
3,00ff,2013-11-19 00:53:23,2013-11-19 00:58:56,2013-11-19 01:01:19,2013-11-19 01:04:21,0
4,00thamizharasi00,2017-03-20 09:46:36,2017-03-20 09:46:36,2017-03-20 09:46:37,2017-03-20 09:46:37,0
...,...,...,...,...,...,...
155302,zzz121243,2021-05-04 19:04:27,2021-05-04 19:21:22,2021-05-04 19:18:31,2021-05-05 19:37:43,1
155303,zzz1ttt,2015-10-21 21:42:30,2015-12-17 06:47:37,2015-10-21 21:44:26,2015-12-17 09:14:10,56
155304,zzz777,2016-08-08 15:35:26,2016-08-08 16:05:25,2016-08-08 15:35:56,2016-08-08 16:07:32,0
155305,zzzstas,2016-07-13 00:40:03,2016-07-13 06:33:44,2016-07-13 00:53:35,2016-07-13 08:23:48,0


In [None]:
# Whole days between each user's first start and last finish
# (recomputes the same `toolevents__lifetime` column as the executed cell before it).
df_merged['toolevents__lifetime'] = (
    df_merged['last_finish_date']
    .sub(df_merged['first_start_date'])
    .dt.days
)
display(df_merged)