In [1]:
import nimble_tk as ntk

2023-12-10 12:47:51.991628 :: MainThread ::  Log init with given logger


In [2]:
import pandas as pd
import numpy as np
import datetime
import random

# A simple function to create some random data for testing the functionality
def create_random_data(n_rows: int = 100) -> pd.DataFrame:
    """Build a DataFrame of random sample rows for trying out the utilities.

    Columns:
        A: 'ham' or 'eggs'.
        B: 'alpha', 'beta' or 'gamma'.
        C: a day drawn from 2023-06-01 through 2023-10-01.
        D: a float drawn from the standard normal distribution.
        E: an integer in [0, 4).

    Args:
        n_rows: number of rows to generate.

    Returns:
        A DataFrame with ``n_rows`` rows and columns A-E. Values are drawn
        from the global ``random`` and ``numpy.random`` generators, so seed
        both of them when reproducible output is needed.
    """
    columns = ['A', 'B', 'C', 'D', 'E']

    # The pool of candidate dates never changes, so build it once rather
    # than once per generated row.
    candidate_dates = pd.date_range(
        datetime.datetime(2023, 6, 1), datetime.datetime(2023, 10, 1))

    rows = []
    for _ in range(n_rows):
        # Keep the draw order A, B, C, D, E so seeded runs are unchanged.
        rows.append({
            'A': random.choice(['ham', 'eggs']),
            'B': random.choice(['alpha', 'beta', 'gamma']),
            'C': np.random.choice(candidate_dates),
            'D': np.random.randn(),
            'E': np.random.randint(0, 4),
        })

    # Passing `columns` keeps the schema intact even when n_rows == 0.
    return pd.DataFrame(rows, columns=columns)

df = create_random_data(n_rows=100)
df.head()

Unnamed: 0,A,B,C,D,E
0,ham,alpha,2023-08-05,0.358,0
1,ham,beta,2023-06-20,2.738,1
2,eggs,beta,2023-06-06,0.985,0
3,ham,alpha,2023-06-25,0.781,3
4,eggs,alpha,2023-06-09,-2.612,1


&nbsp;

&nbsp;

# Logging

The cells below demonstrate some utility wrappers around Python's logging functionality.

- The following line will create a log file at `/tmp/try_nimble.log`
- As logs are written, the log file grows up to a maximum of `50 MB` by default and is then rolled over.
- At most, `10` such files are maintained before removing the earliest file.
- Logs will also be written to the console (in this case the notebook console) 


In [4]:
ntk.init_file_logger(log_file_path='/tmp/try_nimble.log', console_log_on=True)

2023-12-10 12:47:52.273430 :: MainThread ::  Log file init at /tmp/try_nimble.log


Example of an info log:

In [5]:
ntk.log_info("some log message")

2023-12-10 12:47:52.303186 :: MainThread ::  some log message


Example of an error log:

In [6]:
ntk.log_error("some error message")

2023-12-10 12:47:52.324273 :: MainThread ::  some error message


Catching an exception and logging the stack trace:

In [7]:
try:
    tmp = 1/0
except Exception:
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt and SystemExit still propagate.
    ntk.log_traceback("Error while running operation")

2023-12-10 12:47:52.345504 :: MainThread ::  Error while running operation :: ZeroDivisionError: division by zero ::
 Traceback (most recent call last):
  File "/tmp/ipykernel_5851/4084614229.py", line 2, in <module>
    tmp = 1/0
ZeroDivisionError: division by zero


All logs are also written to the log file:

In [8]:
! tail -10 /tmp/try_nimble.log

2023-12-10 12:47:52,270 MainThread logger.py:88:  Log file init at /tmp/try_nimble.log
2023-12-10 12:47:52,289 MainThread logger.py:88:  some log message
2023-12-10 12:47:52,318 MainThread logger.py:88:  some error message
2023-12-10 12:47:52,343 MainThread logger.py:88:  Error while running operation :: ZeroDivisionError: division by zero ::
 Traceback (most recent call last):
  File "/tmp/ipykernel_5851/4084614229.py", line 2, in <module>
    tmp = 1/0
ZeroDivisionError: division by zero


&nbsp;

Logs can also be directly written to the file <b>without logging on the console</b>. This helps in cases where we do not want to flood the console with a lot of logs.

In [9]:
ntk.log_info_file("some log message")
ntk.log_error_file("some error message")

try:
    tmp = 1/0
except Exception:
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt and SystemExit still propagate.
    ntk.log_traceback_file("Error while running operation")

In [10]:
! tail -7 /tmp/try_nimble.log

2023-12-10 12:47:52,502 MainThread logger.py:88:  some log message
2023-12-10 12:47:52,508 MainThread logger.py:88:  some error message
2023-12-10 12:47:52,511 MainThread logger.py:88:   :: ZeroDivisionError: division by zero ::
 Traceback (most recent call last):
  File "/tmp/ipykernel_5851/3515687902.py", line 5, in <module>
    tmp = 1/0
ZeroDivisionError: division by zero


&nbsp;

# Concurrent Processing

In [11]:
def analytic_function(customer_id):
    """Stand-in for a per-customer analytics job.

    Writes a file-only log line, then returns a five-row DataFrame holding
    the customer id alongside random integers drawn from [0, 9).

    Raises:
        ValueError: when ``customer_id`` equals 0; this is deliberate, to
            show how failures are reported by the concurrent runner.
    """
    ntk.log_info_file(f"Running analytic_function for customer_id: {customer_id}")

    # Deliberate failure path used to demonstrate error handling.
    is_invalid = customer_id == 0
    if is_invalid:
        raise ValueError(f"Invalid customer id {customer_id}")

    # A real job would do its work here, for example:
    #   - query the DB
    #   - do feature engineering
    #   - run other analytics
    # and then write the result to disk or return a DataFrame.
    n_rows = 5
    id_column = [customer_id for _ in range(n_rows)]
    other_column = np.random.randint(0, 9, n_rows)
    return pd.DataFrame({'CUSTOMER_ID': id_column, 'OTHER_DATA': other_column})

# Build one (callable, kwargs) task per customer id in 0..9.
functions = []
for customer_id in range(10):
    functions.append((analytic_function, {'customer_id': customer_id}))

# Run every task through nimble_tk; successes and failures come back separately.
# customer_id 0 raises inside analytic_function, so one failure is expected.
results, errors = ntk.run_concurrently(functions, max_workers=ntk.get_num_cpus(), fork=True)

# Element 1 of each result is presumably the DataFrame returned by the task — TODO confirm.
df_result = pd.concat([result[1] for result in results])
df_result.head().display(index=False)

2023-12-10 12:47:52.749708 :: MainThread ::  Ran: 10 functions - Successfull: 9, Failed: 1


CUSTOMER_ID,OTHER_DATA
1,2
1,0
1,3
1,2
1,8


In [12]:
! pip install tabulate

Defaulting to user installation because normal site-packages is not writeable


In [13]:
from tabulate import tabulate
print(tabulate(df_result.head(5), headers=df_result.columns, tablefmt="github", showindex=False))

|   CUSTOMER_ID |   OTHER_DATA |
|---------------|--------------|
|             1 |            2 |
|             1 |            0 |
|             1 |            3 |
|             1 |            2 |
|             1 |            8 |


In [14]:
for error in errors:
    print('Error for', error[0], ntk.exception_to_trace_string(error[1]))

Error for customer_id = 0
 
"""
Traceback (most recent call last):
  File "/usr/lib/python3.10/concurrent/futures/process.py", line 246, in _process_worker
    r = call_item.fn(*call_item.args, **call_item.kwargs)
  File "/tmp/ipykernel_5851/4234218793.py", line 6, in analytic_function
    raise ValueError(f"Invalid customer id {customer_id}")
ValueError: Invalid customer id 0
"""


# Analytics

Some utility functions to print custom HTML and to display multiple pandas DataFrames from a single cell:

In [15]:
ntk.h3("Some data:")
df.head().display()

ntk.html("<h3>Some more data:</h3>")
df.tail().display(index=False)

Unnamed: 0,A,B,C,D,E
0,ham,alpha,2023-08-05,0.358,0
1,ham,beta,2023-06-20,2.738,1
2,eggs,beta,2023-06-06,0.985,0
3,ham,alpha,2023-06-25,0.781,3
4,eggs,alpha,2023-06-09,-2.612,1


A,B,C,D,E
eggs,beta,2023-09-19,1.446,2
eggs,beta,2023-07-25,1.134,3
ham,gamma,2023-07-18,1.899,0
eggs,beta,2023-09-03,0.881,3
ham,beta,2023-09-02,-0.164,3


&nbsp;

## Some Pandas utility methods:

### `series.value_counts_percentage()` or `series.vcp()`

In [16]:
df.A.vcp()

Unnamed: 0_level_0,COUNT,PERC
A,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,50,0.5
eggs,50,0.5


### `df.to_map(col1, col2)` - creates a dictionary with col1 as keys and col2 as values

In [19]:
df.head().to_map('C', 'D')

{Timestamp('2023-08-05 00:00:00'): 0.35816408722397347,
 Timestamp('2023-06-20 00:00:00'): 2.7380984240154396,
 Timestamp('2023-06-06 00:00:00'): 0.9853544910190851,
 Timestamp('2023-06-25 00:00:00'): 0.7812662462024029,
 Timestamp('2023-06-09 00:00:00'): -2.6120551186406185}

&nbsp; 

### `df.rsort(...)` - a shortcut for reverse sort

In [20]:
df.rsort('B').head()

Unnamed: 0,A,B,C,D,E
73,ham,gamma,2023-07-17,-1.202,1
26,eggs,gamma,2023-07-02,0.591,0
19,ham,gamma,2023-09-30,-0.416,0
59,eggs,gamma,2023-07-09,0.639,3
78,ham,gamma,2023-09-10,-1.407,2


&nbsp; 

### `series.mcut(...)` - a wrapper over the `pd.cut(...)` function.

This function makes it easier to divide the data into custom bins. The `bins` parameter is a list of numbers which form the boundaries of bins. 

In [21]:
cuts = df.D.mcut(bins=[-2, -1, 0, 1, 2])
cuts.to_frame().value_counts().sort_index()

D      
-1 - 0     27
-2 - -1    17
0 - 1      34
1 - 2      16
>= 2        1
nan         5
Name: count, dtype: int64

# Files

Simple utilities to quickly read/write data in files.  

In [22]:
content = """
This is some text.
"""
ntk.write_to_file(filepath='/tmp/tmp_file.txt', content=content)

content = """
This is some more text. 
"""
ntk.append_to_file(filepath='/tmp/tmp_file.txt', content=content, strip=False)

In [23]:
print(ntk.read_file(filepath='/tmp/tmp_file.txt'))

This is some text.

This is some more text. 



&nbsp;

`write_dfs_to_excel` - A utility function that makes it easy to write a multi-sheet Excel file

In [24]:
dfs_map = {
    'sheet1': create_random_data(n_rows=20),
    'sheet2': create_random_data(n_rows=20),
    'sheet3': create_random_data(n_rows=20),
}

ntk.write_dfs_to_excel(dfs_map=dfs_map, file_name='/tmp/data.xlsx')

2023-12-10 12:48:33.778629 :: MainThread ::  Write to excel - /tmp/data.xlsx
2023-12-10 12:48:33.799297 :: MainThread ::  Write to excel - writing sheet - sheet1
2023-12-10 12:48:33.807456 :: MainThread ::  Write to excel - writing sheet - sheet2
2023-12-10 12:48:33.815731 :: MainThread ::  Write to excel - writing sheet - sheet3
2023-12-10 12:48:33.892762 :: MainThread ::  Write to excel - done
