##### Author: Praveen Saxena
##### Email: saxep01@gmail.com
##### Create Date: 7/24/2021
##### Purpose: Analyze the table _event_table_ in database _wang159_myrmekes_.

--------------------

# Input

In [1]:
database = 'wang159_myrmekes'
table = 'user_activity_blocks'

In [2]:
db_table = "%s.%s" % (database, table)
print(db_table)

wang159_myrmekes.user_activity_blocks


--------------------

# Preliminaries

In [3]:
%%capture 

import pandas as pd
from pprint import pprint
from IPython.display import display, Markdown
import os

from nanoHUB.application import Application

application = Application.get_instance()
engine = application.new_db_engine(database)

--------------------

# Table Information

## Table Indexes

In [4]:
sql = '''
SHOW INDEX FROM %s; 
'''

df = pd.read_sql(sql % db_table, engine)
display(df)

Unnamed: 0,Table,Non_unique,Key_name,Seq_in_index,Column_name,Collation,Cardinality,Sub_part,Packed,Null,Index_type,Comment,Index_comment
0,user_activity_blocks,1,ix_user_activity_blocks_index,1,index,A,1525,,,YES,BTREE,,


## Table Columns

In [5]:
sql = '''
SHOW FULL COLUMNS FROM %s;
'''
df = pd.read_sql(sql % db_table, engine)
display(df.head())

Unnamed: 0,Field,Type,Collation,Null,Key,Default,Extra,Privileges,Comment
0,index,bigint(20),,YES,MUL,,,select,
1,user,text,latin1_swedish_ci,YES,,,,select,
2,tool,text,latin1_swedish_ci,YES,,,,select,
3,start,datetime,,YES,,,,select,
4,end,datetime,,YES,,,,select,


--------------------

# Data Information

## Data Query

In [6]:
sql = '''
SELECT *
    FROM %s
LIMIT 100;
'''
df = pd.read_sql(sql % db_table, engine)

## Data Info

In [7]:
display(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   index         100 non-null    int64         
 1   user          100 non-null    object        
 2   tool          100 non-null    object        
 3   start         100 non-null    datetime64[ns]
 4   end           100 non-null    datetime64[ns]
 5   ip            100 non-null    object        
 6   lon           100 non-null    float64       
 7   lat           100 non-null    float64       
 8   cluster       80 non-null     float64       
 9   scanned_date  100 non-null    datetime64[ns]
dtypes: datetime64[ns](3), float64(3), int64(1), object(3)
memory usage: 7.9+ KB


None

## Data Values

In [8]:
display(df.head())
display(df.tail())

Unnamed: 0,index,user,tool,start,end,ip,lon,lat,cluster,scanned_date
0,0,bafflerbach,jupyter,2020-04-11,2020-04-15,68.117.135.85,-89.508,43.0736,12.0,2020-04-15
1,1,bsutton3,oof2,2020-04-08,2020-04-12,98.214.64.82,-89.6283,40.7122,3.0,2020-04-12
2,2,drkiraco,veda,2020-04-10,2020-04-16,208.102.33.153,-84.4296,39.139,,1900-01-01
3,3,erikzupa6,contactmaps,2020-04-12,2020-04-16,176.199.211.251,8.6865,50.1252,,1900-01-01
4,4,kguo34,ucb_compnano,2020-04-12,2020-04-16,174.63.7.166,-71.1282,42.2796,4.0,2020-04-16


Unnamed: 0,index,user,tool,start,end,ip,lon,lat,cluster,scanned_date
95,95,veeramohanrao.mareedu,funuq,2020-04-10,2020-04-14,157.44.252.159,78.4574,17.3846,,1900-01-01
96,96,veeramohanrao.mareedu,fgmbuilder,2020-04-10,2020-04-14,157.44.208.211,78.4574,17.3846,,1900-01-01
97,97,veeramohanrao.mareedu,tellurium,2020-04-10,2020-04-14,157.44.208.211,78.4574,17.3846,,1900-01-01
98,98,veeramohanrao.mareedu,nanoplasticity,2020-04-10,2020-04-14,157.44.252.159,78.4574,17.3846,,1900-01-01
99,99,bullock8,matdatarepo,2020-04-11,2020-04-15,128.211.186.107,-86.9147,40.4251,7.0,2020-04-15


## Tools and Clusters

In [9]:
timeline_domains_df = df.groupby('tool')['cluster'].unique().reset_index()
pd.options.display.max_colwidth = 5000
display('Tools Used', timeline_domains_df['tool'].unique())
display(timeline_domains_df)
pd.reset_option('max_colwidth')

'Tools Used'

array(['1dhetero', 'adeptnpt', 'atcadlab', 'bandstrlab', 'bjt',
       'cntbands-ext', 'cntfet', 'contactmaps', 'contourpv',
       'crystal_viewer', 'cvgraph', 'dftmatprop', 'dftqe', 'dualfoil',
       'echem', 'fettoy', 'fgmbuilder', 'fin', 'funuq', 'gprpiezo',
       'gpuheompop', 'jupyter', 'jupyter60', 'matdatarepo', 'matsimtk',
       'meep', 'moscntr', 'mosfetsat', 'mseml', 'msl', 'nacresimulator',
       'nanomatmech', 'nanomos', 'nanoplasticity', 'nmst_dft',
       'npassemblylab', 'oof2', 'padre', 'pc4covid19', 'pnjunctionlab',
       'prolabox', 'pvlimits', 'pvpanelsim', 'reaxff597a', 's4sim',
       'siesta', 'straincalc', 'tellurium', 'ucb_compnano', 'veda',
       'vides', 'workspace'], dtype=object)

Unnamed: 0,tool,cluster
0,1dhetero,[4.0]
1,adeptnpt,[0.0]
2,atcadlab,[4.0]
3,bandstrlab,[3.0]
4,bjt,"[8.0, 3.0, 5.0]"
5,cntbands-ext,[4.0]
6,cntfet,[1.0]
7,contactmaps,[nan]
8,contourpv,[0.0]
9,crystal_viewer,[5.0]


## First & Last Points by DateTime

In [10]:
sorted_start_datetime_df = df.sort_values(by='start')

In [11]:
first = sorted_start_datetime_df.iloc[0]
first = pd.Series(first, name='First Data Point').to_frame()
display(first)

last = sorted_start_datetime_df.iloc[-1]
last = pd.Series(last, name='Last Data Point').to_frame()
display(last)


Unnamed: 0,First Data Point
index,74
user,medhoorumouni123
tool,fin
start,2020-04-08 00:00:00
end,2020-04-12 00:00:00
ip,223.238.35.110
lon,83.2016
lat,17.6801
cluster,
scanned_date,1900-01-01 00:00:00


Unnamed: 0,Last Data Point
index,84
user,colewilloughby1125
tool,matdatarepo
start,2020-04-12 00:00:00
end,2020-04-16 00:00:00
ip,52.119.107.194
lon,-86.9256
lat,40.4444
cluster,10.0
scanned_date,2020-04-16 00:00:00
