In [1]:
from pathlib import Path
from functools import reduce
import pandas as pd
import numpy as np
import os
from datetime import datetime

from nanoHUB.application import Application
from nanoHUB.rfm.model import LastUpdateRecord, TempUserDescriptors, UserDescriptors

from sqlalchemy import select
from sqlalchemy.orm import Session

# Fetch the nanoHUB Application object through its get_instance() accessor
# (presumably a process-wide singleton -- confirm in nanoHUB.application).
# `application` is not referenced again in the cells visible here.
application = Application.get_instance()

nanoHUB - Serving Students, Researchers & Instructors


In [2]:
# Cache locations, all rooted at $APP_DIR/.cache (APP_DIR must be set in the
# environment before this cell runs).
CACHE_DIR = Path(os.getenv('APP_DIR')) / '.cache'
NANOHUB_CACHE = CACHE_DIR / 'nanohub'
NANOHUB_METRICS_CACHE = CACHE_DIR / 'nanohub_metrics'

In [3]:
def get_toolevents_df() -> pd.DataFrame:
    """Read the 'toolevents' parquet dataset from the nanoHUB metrics cache.

    Returns:
        The dataset stored under NANOHUB_METRICS_CACHE/'toolevents' as a DataFrame.
    """
    toolevents_path = Path(NANOHUB_METRICS_CACHE, 'toolevents')
    return pd.read_parquet(toolevents_path)

def filter_nulls(df: pd.DataFrame, col_name: str) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``col_name`` value is a non-blank string.

    Values are whitespace-stripped before the check, and the returned frame
    carries the stripped values in ``col_name``. Rows where the value is
    null, or empty after stripping, are dropped. The original index labels
    and column order are preserved.

    The input frame is left untouched: the stripped column is only written
    to the returned copy, so calling this on a shared frame does not change
    what other code sees.

    Args:
        df: Frame to filter; ``df[col_name]`` must support the ``.str`` accessor.
        col_name: Name of the string column to clean and filter on.

    Returns:
        A new DataFrame containing only the rows with a usable value.
    """
    stripped = df[col_name].str.strip()
    # notna() is needed as well: NaN != '' evaluates to True and would keep nulls.
    keep = stripped.notna() & (stripped != '')
    return df.loc[keep].assign(**{col_name: stripped[keep]})


In [None]:
# Load the cached tool-events table. `df` is the base frame that the
# following cells filter and aggregate.
df = get_toolevents_df()
display(df)

In [None]:
# Boolean mask: True for rows whose superjob value is not 0.
display(df['superjob'].ne(0))

In [None]:
# Rows with a non-zero superjob, leaving out the two listed tools.
has_superjob = df['superjob'] != 0
is_excluded_tool = df['tool'].isin(['nanowire', '1dhetero_r742'])
df1 = df[has_superjob & ~is_excluded_tool]
display(df1)

In [None]:
# Distinct values of the 'source' column, in order of first appearance.
all_sources = df["source"].unique()
display(all_sources)

In [None]:
# Distinct values of the 'superjob' column, in order of first appearance.
all_superjobs = df["superjob"].unique()
display(all_superjobs)

In [None]:
# Inspect every event row belonging to superjob 1218.
df1 = df.loc[df['superjob'].eq(1218)]
display(df1)

In [None]:
# Number of event rows per superjob value, most frequent first.
superjob_counts = df['superjob'].value_counts()
display(superjob_counts)

In [None]:
# Number of distinct superjob values; dropna=False keeps a null (if any)
# in the count, exactly as len(unique()) does.
display(df['superjob'].nunique(dropna=False))

In [None]:
# Number of distinct user values; dropna=False keeps a null (if any)
# in the count, exactly as len(unique()) does.
display(df['user'].nunique(dropna=False))

In [None]:
# Distinct users once rows with a null or blank user name are dropped.
df_filtered = filter_nulls(df, 'user')
filtered_users = df_filtered['user']
display(filtered_users.nunique(dropna=False))

In [None]:
# Event rows whose source is exactly 'from toolstart'.
df1 = df.loc[df['source'].eq('from toolstart')]
display(df1)

In [None]:
# Per-user count of rows with a non-null 'start' value.
start_by_user = df.groupby('user')['start']
df_users = start_by_user.count()
display(df_users)

In [None]:
# Per-user count of rows with a non-null 'finish' value.
finish_by_user = df.groupby('user')['finish']
df_users = finish_by_user.count()
display(df_users)

In [None]:
# Events that started at or after the cutoff timestamp, plus summary statistics.
started_since_cutoff = df['start'] >= '2008-07-03 04:53:38'
df1 = df.loc[started_since_cutoff]
display(df1, df1.describe())

In [None]:
# The 'user' column for events that started at or after the cutoff timestamp.
started_since_cutoff = df['start'] >= '2008-07-03 04:53:38'
df1 = df.loc[started_since_cutoff, 'user']
display(df1)

In [None]:
# Distinct users with at least one event starting at or after the cutoff,
# followed by how many there are.
started_since_cutoff = df['start'] >= '2008-07-03 04:53:38'
active_users = df.loc[started_since_cutoff, 'user'].unique()
display(active_users, len(active_users))

In [None]:
# Full event history (all dates) for the users listed in active_users.
is_active_user = df['user'].isin(active_users)
df2 = df.loc[is_active_user]

In [None]:
# Show the events restricted to users in active_users.
display(df2)

In [None]:
# Number of distinct users in df2; dropna=False matches len(unique()).
display(df2['user'].nunique(dropna=False))

In [None]:
# Per-user count of non-null 'start' values, limited to the rows in df2.
active_start_by_user = df2.groupby('user')['start']
df_users = active_start_by_user.count()
display(df_users)

In [None]:
# Earliest and latest 'start' value per user, one row per user.
# The aggregations are given as the strings 'min'/'max' rather than
# np.min/np.max: pandas >= 2.1 emits a FutureWarning for numpy callables in
# groupby.agg, and the string names select the same built-in reductions.
# Rows with a null user are dropped by groupby's default dropna=True.
df_start_max_min = (
    df.groupby(['user'])
    .agg(first_start_date=('start', 'min'), last_start_date=('start', 'max'))
    .reset_index()
)
display(df_start_max_min)

In [None]:
# Earliest and latest 'finish' value per user, one row per user.
# The aggregations are given as the strings 'min'/'max' rather than
# np.min/np.max: pandas >= 2.1 emits a FutureWarning for numpy callables in
# groupby.agg, and the string names select the same built-in reductions.
# Rows with a null user are dropped by groupby's default dropna=True.
df_finish_max_min = (
    df.groupby(['user'])
    .agg(first_finish_date=('finish', 'min'), last_finish_date=('finish', 'max'))
    .reset_index()
)
display(df_finish_max_min)

In [None]:
# Outer-join the per-user start and finish summaries on 'user', so a user
# present in only one of the two frames is still kept.
df_merged = pd.merge(
    df_start_max_min,
    df_finish_max_min,
    on=['user'],
    how='outer',
)
display(df_merged)

In [None]:
# Whole days between each user's first start and last finish.
lifetime_span = df_merged['last_finish_date'] - df_merged['first_start_date']
df_merged['toolevents__lifetime'] = lifetime_span.dt.days
display(df_merged)

In [None]:
# NOTE(review): this cell repeats the preceding cell verbatim; it recomputes
# the same column from the same inputs, so running it again changes nothing.
first_start = df_merged['first_start_date']
last_finish = df_merged['last_finish_date']
df_merged['toolevents__lifetime'] = last_finish.sub(first_start).dt.days
display(df_merged)

In [None]:
# NOTE(review): same filter as the earlier 'from toolstart' cell, re-run
# here at the end of the notebook.
from_toolstart = df['source'] == 'from toolstart'
df1 = df[from_toolstart]
display(df1)