In [1]:
import nimble_tk as ntk

2023-12-10 13:28:00.469683 :: MainThread ::  Log init with given logger


In [2]:
import pandas as pd
import numpy as np
import datetime
import random

# A simple function to create some random data for testing the functionality
def create_random_data(n_rows: int = 100) -> pd.DataFrame:
    """Build a DataFrame of random demo data.

    Columns:
        A: 'ham' or 'eggs'
        B: 'alpha', 'beta' or 'gamma'
        C: a random day between 2023-06-01 and 2023-10-01 (both inclusive)
        D: a draw from the standard normal distribution
        E: a random integer in [0, 4)

    Args:
        n_rows: number of rows to generate. Zero (or a negative value)
            yields an empty frame that still carries columns A-E.

    Returns:
        A DataFrame with ``n_rows`` rows and the columns listed above.
    """
    columns = ['A', 'B', 'C', 'D', 'E']

    # The candidate dates never change, so build the range once instead of per row.
    candidate_dates = pd.date_range(
        datetime.datetime(2023, 6, 1), datetime.datetime(2023, 10, 1))

    # Dict entries are evaluated top to bottom, so the random draws happen in
    # the same order (A, B, C, D, E) for every row.
    rows = [
        {
            'A': random.choice(['ham', 'eggs']),
            'B': random.choice(['alpha', 'beta', 'gamma']),
            'C': np.random.choice(candidate_dates),
            'D': np.random.randn(),
            'E': np.random.randint(0, 4),
        }
        for _ in range(n_rows)
    ]

    # Passing the columns explicitly keeps the schema stable when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

# Seed both RNGs used by create_random_data (stdlib `random` and NumPy) so a
# fresh "Restart & Run All" reproduces the same demo frame.
random.seed(42)
np.random.seed(42)

# Demo frame used by the analytics examples further down.
df = create_random_data(n_rows=100)
df.head()

Unnamed: 0,A,B,C,D,E
0,ham,alpha,2023-08-08,0.21,2
1,ham,beta,2023-08-27,0.089,2
2,eggs,alpha,2023-08-15,0.302,3
3,ham,alpha,2023-06-14,-0.012,0
4,ham,beta,2023-06-22,-0.064,2


&nbsp;

&nbsp;

# Logging

The cells below demonstrate some utility wrappers around Python's logging functionality.

- The following line will create a log file at `/tmp/try_nimble.log`
- As logs are written, the log file grows up to a maximum of `50 MB` by default and is then rolled over.
- At most, `10` such files are maintained before removing the earliest file.
- Logs will also be written to the console (in this case the notebook console) 


In [4]:
# Start file logging at /tmp/try_nimble.log; console_log_on=True also echoes
# each message to the notebook console.
ntk.init_file_logger(log_file_path='/tmp/try_nimble.log', console_log_on=True)

2023-12-10 13:28:00.710634 :: MainThread ::  Log file init at /tmp/try_nimble.log


Example of an info log:

In [5]:
# Info message: printed on the console and appended to the log file.
ntk.log_info("some log message")

2023-12-10 13:28:00.732400 :: MainThread ::  some log message


Example of an error log:

In [6]:
# Error message: printed on the console and appended to the log file.
ntk.log_error("some error message")

2023-12-10 13:28:00.748484 :: MainThread ::  some error message


Catching an exception and logging the stack trace:

In [7]:
try:
    tmp = 1/0
except Exception:
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit still propagate.
    # log_traceback logs the message together with the exception and its stack trace.
    ntk.log_traceback("Error while running operation")

2023-12-10 13:28:00.768018 :: MainThread ::  Error while running operation :: ZeroDivisionError: division by zero ::
 Traceback (most recent call last):
  File "/tmp/ipykernel_6837/4084614229.py", line 2, in <module>
    tmp = 1/0
ZeroDivisionError: division by zero


All logs are also written to the log file:

In [8]:
# Show the last 10 lines of the log file to confirm the messages above were written to it.
! tail -10 /tmp/try_nimble.log

2023-12-10 13:28:00,708 MainThread logger.py:88:  Log file init at /tmp/try_nimble.log
2023-12-10 13:28:00,727 MainThread logger.py:88:  some log message
2023-12-10 13:28:00,744 MainThread logger.py:88:  some error message
2023-12-10 13:28:00,764 MainThread logger.py:88:  Error while running operation :: ZeroDivisionError: division by zero ::
 Traceback (most recent call last):
  File "/tmp/ipykernel_6837/4084614229.py", line 2, in <module>
    tmp = 1/0
ZeroDivisionError: division by zero


&nbsp;

Logs can also be directly written to the file <b>without logging on the console</b>. This helps in cases where we do not want to flood the console with a lot of logs.

In [9]:
# The *_file variants write to the log file only; nothing is printed on the console.
ntk.log_info_file("some log message")
ntk.log_error_file("some error message")

try:
    tmp = 1/0
except Exception:
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit still propagate.
    # NOTE(review): in the recorded log output the message text appears to be
    # missing for this call (the line starts with " :: ZeroDivisionError"),
    # unlike log_traceback above - confirm whether log_traceback_file drops it.
    ntk.log_traceback_file("Error while running operation")

In [10]:
# Show the last 7 lines of the log file: the file-only messages logged in the previous cell.
! tail -7 /tmp/try_nimble.log

2023-12-10 13:28:00,928 MainThread logger.py:88:  some log message
2023-12-10 13:28:00,933 MainThread logger.py:88:  some error message
2023-12-10 13:28:00,938 MainThread logger.py:88:   :: ZeroDivisionError: division by zero ::
 Traceback (most recent call last):
  File "/tmp/ipykernel_6837/3515687902.py", line 5, in <module>
    tmp = 1/0
ZeroDivisionError: division by zero


&nbsp;

# Concurrent Processing

In [11]:
def analytic_function(customer_id):
    """Demo workload that is executed once per customer.

    Writes a progress line to the log file, rejects customer id 0 with a
    ``ValueError`` (to showcase error handling) and otherwise returns a
    five-row DataFrame holding the customer id next to random integers
    drawn from [0, 9).
    """
    ntk.log_info_file(f"Running analytic_function for customer_id: {customer_id}")

    is_invalid_customer = customer_id == 0
    if is_invalid_customer:
        # Deliberate failure so the caller's error handling can be demonstrated.
        raise ValueError(f"Invalid customer id {customer_id}")

    # A real implementation would, at this point:
    #   * query the DB
    #   * do feature engineering
    #   * run other analytics
    # and then write the result to disk or hand back a DataFrame.
    row_count = 5
    customer_column = [customer_id] * row_count
    other_data = np.random.randint(0, 9, row_count)
    return pd.DataFrame({'CUSTOMER_ID': customer_column, 'OTHER_DATA': other_data})

# One (function, kwargs) pair per customer id 0..9. A comprehension avoids
# leaking the loop variable into the notebook namespace.
functions = [
    (analytic_function, {'customer_id': customer_id})
    for customer_id in range(10)
]

# Run all calls with one worker per CPU. Calls that raise are collected in
# `errors` instead of aborting the run (customer 0 fails on purpose).
results, errors = ntk.run_concurrently(functions, max_workers=ntk.get_num_cpus(), fork=True)

# result[1] is the DataFrame returned by analytic_function for that call.
df_result = pd.concat([result[1] for result in results])
df_result.head().display(index=False)

2023-12-10 13:28:01.170778 :: MainThread ::  Ran: 10 functions - Successfull: 9, Failed: 1


CUSTOMER_ID,OTHER_DATA
1,1
1,6
1,3
1,7
1,1


In [14]:
# Each entry pairs a description of the failed call (error[0]) with the
# exception it raised (error[1]); print the exception as a trace string.
for error in errors:
    print('Error for', error[0], ntk.exception_to_trace_string(error[1]))

Error for customer_id = 0
 
"""
Traceback (most recent call last):
  File "/usr/lib/python3.10/concurrent/futures/process.py", line 246, in _process_worker
    r = call_item.fn(*call_item.args, **call_item.kwargs)
  File "/tmp/ipykernel_6837/4234218793.py", line 6, in analytic_function
    raise ValueError(f"Invalid customer id {customer_id}")
ValueError: Invalid customer id 0
"""


# Analytics

Some utility functions to print custom html and also display the pandas dataframe multiple times in a cell:

In [15]:
# Heading helper followed by a DataFrame display (index shown).
ntk.h3("Some data:")
df.head().display()

# Arbitrary HTML followed by a second display in the same cell;
# index=False leaves the index column out of the rendered table.
ntk.html("<h3>Some more data:</h3>")
df.tail().display(index=False)

Unnamed: 0,A,B,C,D,E
0,ham,alpha,2023-08-08,0.21,2
1,ham,beta,2023-08-27,0.089,2
2,eggs,alpha,2023-08-15,0.302,3
3,ham,alpha,2023-06-14,-0.012,0
4,ham,beta,2023-06-22,-0.064,2


A,B,C,D,E
eggs,gamma,2023-07-31,-0.606,3
ham,beta,2023-08-21,2.017,2
eggs,beta,2023-08-09,-0.599,1
ham,gamma,2023-08-31,1.04,0
eggs,alpha,2023-08-12,0.315,1


&nbsp;

## Some Pandas utility methods:

### `series.value_counts_percentage()` or `series.vcp()`

In [16]:
# Count and share (COUNT / PERC) of each distinct value in column A.
df.A.vcp()

Unnamed: 0_level_0,COUNT,PERC
A,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,50,0.5
eggs,50,0.5


### `df.to_map(col1, col2)` - creates a dictionary with col1 as keys and col2 as values

In [17]:
# Dictionary with column C as keys and column D as values, built from the first five rows.
df.head().to_map('C', 'D')

{Timestamp('2023-08-08 00:00:00'): 0.21002047388754322,
 Timestamp('2023-08-27 00:00:00'): 0.08918589087189265,
 Timestamp('2023-08-15 00:00:00'): 0.30204113925114556,
 Timestamp('2023-06-14 00:00:00'): -0.012436150595751778,
 Timestamp('2023-06-22 00:00:00'): -0.06412325027102769}

&nbsp; 

### `df.rsort(...)` - a shortcut for reverse sort

In [18]:
# Reverse (descending) sort on column B, then show the first rows.
df.rsort('B').head()

Unnamed: 0,A,B,C,D,E
61,eggs,gamma,2023-09-11,0.042,3
21,eggs,gamma,2023-07-05,0.628,3
90,eggs,gamma,2023-07-16,-0.706,0
60,eggs,gamma,2023-07-13,-0.703,2
25,eggs,gamma,2023-09-14,-0.832,2


&nbsp; 

### `series.mcut(...)` - a wrapper over the `pd.cut(..)` function.

This function makes it easier to divide the data into custom bins. The `bins` parameter is a list of numbers which form the boundaries of bins. 

In [19]:
# Bucket column D using the boundaries -2, -1, 0, 1, 2 ...
cuts = df.D.mcut(bins=[-2, -1, 0, 1, 2])
# ... then count the rows per bucket, ordered by bucket label.
cuts.value_counts().sort_index().to_frame()

Unnamed: 0_level_0,count
D,Unnamed: 1_level_1
-1 - 0,41
-2 - -1,13
0 - 1,28
1 - 2,13
>= 2,1
,4


# Files

Simple utilities to quickly read/write data in files.  

In [20]:
# Write the first text to /tmp/tmp_file.txt.
content = """
This is some text.
"""
ntk.write_to_file(filepath='/tmp/tmp_file.txt', content=content)

# Append a second text to the same file.
# NOTE(review): strip=False presumably keeps the leading/trailing newlines of
# `content` as they are - confirm against the library.
content = """
This is some more text. 
"""
ntk.append_to_file(filepath='/tmp/tmp_file.txt', content=content, strip=False)

In [21]:
# Read the file back to verify both the written and the appended text.
print(ntk.read_file(filepath='/tmp/tmp_file.txt'))

This is some text.

This is some more text. 



&nbsp;

`write_dfs_to_excel` - A utility function to make it easy to write a multi-sheet excel file

In [22]:
# Sheet name -> DataFrame; each entry is written as its own sheet.
dfs_map = {
    'sheet1': create_random_data(n_rows=20),
    'sheet2': create_random_data(n_rows=20),
    'sheet3': create_random_data(n_rows=20),
}

# Write all frames into a single workbook at /tmp/data.xlsx.
ntk.write_dfs_to_excel(dfs_map=dfs_map, file_name='/tmp/data.xlsx')

2023-12-10 13:28:03.344363 :: MainThread ::  Write to excel - /tmp/data.xlsx
2023-12-10 13:28:03.364009 :: MainThread ::  Write to excel - writing sheet - sheet1
2023-12-10 13:28:03.373395 :: MainThread ::  Write to excel - writing sheet - sheet2
2023-12-10 13:28:03.379992 :: MainThread ::  Write to excel - writing sheet - sheet3
2023-12-10 13:28:03.455109 :: MainThread ::  Write to excel - done


# Repeated Execution

We sometimes need to execute a particular task repeatedly at some fixed delay or interval. 
Any exceptions which occur are logged and do not affect subsequent executions. 

In [28]:
import time


def code_to_run_repeatedly():
    """Task body the scheduler invokes on every run; it only logs a message."""
    ntk.log_info('Running a repeated task')


# Run code_to_run_repeatedly every 4 seconds; log_prefix is prepended to the
# log lines written while the task runs.
scheduler = ntk.FixedDelayTaskScheduler(interval=4, function=code_to_run_repeatedly, log_prefix='[Some][Log][Prefix]')

try:
    # Keep the cell alive long enough for the task to fire several times.
    time.sleep(21)
finally:
    # Always stop the scheduler, even when the cell is interrupted, so the
    # repeated task does not keep running in the background.
    scheduler.shutdown()

2023-12-10 13:39:11.160126 :: Thread-26 :: [Some][Log][Prefix] Function Started
2023-12-10 13:39:11.165573 :: Thread-26 :: [Some][Log][Prefix] Running a repeated task
2023-12-10 13:39:11.171282 :: Thread-26 :: [Some][Log][Prefix] Function Done - took time 00:00:00.012
2023-12-10 13:39:15.160387 :: Thread-27 :: [Some][Log][Prefix] Function Started
2023-12-10 13:39:15.167263 :: Thread-27 :: [Some][Log][Prefix] Running a repeated task
2023-12-10 13:39:15.173637 :: Thread-27 :: [Some][Log][Prefix] Function Done - took time 00:00:00.012
2023-12-10 13:39:19.160709 :: Thread-28 :: [Some][Log][Prefix] Function Started
2023-12-10 13:39:19.165857 :: Thread-28 :: [Some][Log][Prefix] Running a repeated task
2023-12-10 13:39:19.169713 :: Thread-28 :: [Some][Log][Prefix] Function Done - took time 00:00:00.009
2023-12-10 13:39:23.163375 :: Thread-29 :: [Some][Log][Prefix] Function Started
2023-12-10 13:39:23.170748 :: Thread-29 :: [Some][Log][Prefix] Running a repeated task
2023-12-10 13:39:23.175528