## AITD End to End Example Notebook

In [3]:
from src.transform_predict import transform, score
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
!pip install ipywidgets

Collecting ipywidgets
  Downloading ipywidgets-8.1.2-py3-none-any.whl.metadata (2.4 kB)
Collecting widgetsnbextension~=4.0.10 (from ipywidgets)
  Downloading widgetsnbextension-4.0.10-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab-widgets~=3.0.10 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.10-py3-none-any.whl.metadata (4.1 kB)
Downloading ipywidgets-8.1.2-py3-none-any.whl (139 kB)
   ---------------------------------------- 0.0/139.4 kB ? eta -:--:--
   -------------------------------- ------- 112.6/139.4 kB 3.3 MB/s eta 0:00:01
   ---------------------------------------- 139.4/139.4 kB 2.7 MB/s eta 0:00:00
Downloading jupyterlab_widgets-3.0.10-py3-none-any.whl (215 kB)
   ---------------------------------------- 0.0/215.0 kB ? eta -:--:--
   ---------------------------------------- 215.0/215.0 kB 6.6 MB/s eta 0:00:00
Downloading widgetsnbextension-4.0.10-py3-none-any.whl (2.3 MB)
   ---------------------------------------- 0.0/2.3 MB ? eta -:--:--
   -------------

In [19]:
## Read Dataset - Required Columns are: ['tx_description', 'sender_id', 'receiver_id', 'tx_date', 'amount']
data = pd.read_csv("data/example_input.csv")
data.head()

Unnamed: 0,tx_description,sender_id,receiver_id,tx_date,amount
0,lunch,1,101,2023-02-13 02:03:15,1190.63
1,dinner,1,101,2023-03-29 17:48:10,268.4
2,gift,1,100,2023-01-20 17:41:33,1276.55
3,rent,1,102,2023-01-17 08:13:35,1020.42
4,rent,1,101,2023-01-17 15:45:40,983.1


## preprocess
```
def pre_process_txn(df: pd.DataFrame) -> pd.DataFrame:
    """Reads transactions from a csv file into a pandas df
    Parameters:
        df (pd.DataFrame): input dataframe
    Returns:
        pd.DataFrame: Pandas DF containing all the transactions
    """
    dc.logger.info("Preprocessing data...")
    df[dc.txn_data_ind_col] = df.index
    df[dc.txn_data_amount_col] = (
        df[dc.txn_data_amount_col]
        .replace(",", "", regex=True)
        .apply(pd.to_numeric, errors="coerce")
    )
    df[["tx_date"]] = df[["tx_date"]].apply(pd.to_datetime)
    dc.logger.info(f"Converting amount to float; df is now has shape {str(df.shape)}")

    return df
```

In [20]:
import pandas as pd
import datetime
from dateutil.relativedelta import relativedelta

# local imports
from src import config as dc
from src.sentiment import *
from src.process_transactions import *

In [21]:
data.index

RangeIndex(start=0, stop=513, step=1)

In [22]:
dc.txn_data_ind_col

'Z_receipt_number'

In [23]:
data[dc.txn_data_ind_col] = data.index
data

Unnamed: 0,tx_description,sender_id,receiver_id,tx_date,amount,Z_receipt_number
0,lunch,1,101,2023-02-13 02:03:15,1190.63,0
1,dinner,1,101,2023-03-29 17:48:10,268.4,1
2,gift,1,100,2023-01-20 17:41:33,1276.55,2
3,rent,1,102,2023-01-17 08:13:35,1020.42,3
4,rent,1,101,2023-01-17 15:45:40,983.1,4
...,...,...,...,...,...,...
508,grocery share,1,101,2023-01-14 02:39:30,1146.08,508
509,transfer joint,2,101,2023-03-17 20:52:57,845.73,509
510,This makes me so damn angry. All of this over...,2,100,2023-01-08 22:46:26,202.58,510
511,"Let's hit it, Jake",1,100,2023-02-25 10:06:10,237.16,511


In [24]:
dc.txn_data_amount_col

'amount'

In [25]:
data["amount"].dtype

dtype('O')

In [26]:
data[dc.txn_data_amount_col] = (
    data[dc.txn_data_amount_col]
    .replace(",", "", regex=True)
    .apply(pd.to_numeric, errors="coerce")
)

In [27]:
data["amount"].dtype

dtype('float64')

In [28]:
data

Unnamed: 0,tx_description,sender_id,receiver_id,tx_date,amount,Z_receipt_number
0,lunch,1,101,2023-02-13 02:03:15,1190.63,0
1,dinner,1,101,2023-03-29 17:48:10,268.40,1
2,gift,1,100,2023-01-20 17:41:33,1276.55,2
3,rent,1,102,2023-01-17 08:13:35,1020.42,3
4,rent,1,101,2023-01-17 15:45:40,983.10,4
...,...,...,...,...,...,...
508,grocery share,1,101,2023-01-14 02:39:30,1146.08,508
509,transfer joint,2,101,2023-03-17 20:52:57,845.73,509
510,This makes me so damn angry. All of this over...,2,100,2023-01-08 22:46:26,202.58,510
511,"Let's hit it, Jake",1,100,2023-02-25 10:06:10,237.16,511


In [29]:
data[["tx_date"]] = data[["tx_date"]].apply(pd.to_datetime)

In [30]:
data

Unnamed: 0,tx_description,sender_id,receiver_id,tx_date,amount,Z_receipt_number
0,lunch,1,101,2023-02-13 02:03:15,1190.63,0
1,dinner,1,101,2023-03-29 17:48:10,268.40,1
2,gift,1,100,2023-01-20 17:41:33,1276.55,2
3,rent,1,102,2023-01-17 08:13:35,1020.42,3
4,rent,1,101,2023-01-17 15:45:40,983.10,4
...,...,...,...,...,...,...
508,grocery share,1,101,2023-01-14 02:39:30,1146.08,508
509,transfer joint,2,101,2023-03-17 20:52:57,845.73,509
510,This makes me so damn angry. All of this over...,2,100,2023-01-08 22:46:26,202.58,510
511,"Let's hit it, Jake",1,100,2023-02-25 10:06:10,237.16,511


## sanity check
```
def sanity_check(df: pd.DataFrame, score_month: int = None, score_year: int = None, lag: int = 2) -> None:
    """Warns user if some data will be discard
    Parameters:
        df (pd.DataFrame): input dataframe
    Returns:
        pd.DataFrame: Pandas DF containing all the transactions
    """
    
    dc.logger.info("Data sanity check...")
    if score_month is None: 
        date_r = df["tx_date"].max()
        dc.logger.info("Score month not Specified, using month " + str(date_r.month))
    else: 
        if score_year is None: 
            date_r = max(df[df['tx_date'].dt.month == score_month]['tx_date'])
        else: 
            date_r = max(df[(df['tx_date'].dt.month == score_month) & (df['tx_date'].dt.year == score_year)]['tx_date'])
    
    date_l = (date_r - relativedelta(months=(lag))).replace(day=1)
    if_discard = len(df) - len(df[(df["tx_date"] > date_l) & (df["tx_date"] <= date_r)])

    # Inform User about options selected
    if if_discard:
        msg = f"Found latest transaction date: {date_r:%Y-%m-%d}, dicard {if_discard} data points with date before {date_l:%Y-%m-%d}. If a sender-receiver relation only exist in data prior to {date_l:%Y-%m-%d} or after {date_r:%Y-%m-%d}, they won't appear in the final output"
        dc.logger.warn(msg)
        print(f"Warning: {msg}")
```

In [33]:
date_r = data["tx_date"].max()
date_r

Timestamp('2023-03-30 16:18:45')

In [34]:
from dateutil.relativedelta import relativedelta
lag=2
date_r - relativedelta(months=(lag))

Timestamp('2023-01-30 16:18:45')

In [35]:
date_l = (date_r - relativedelta(months=(lag))).replace(day=1)
date_l

Timestamp('2023-01-01 16:18:45')

In [37]:
data["tx_date"] > date_l

0      True
1      True
2      True
3      True
4      True
       ... 
508    True
509    True
510    True
511    True
512    True
Name: tx_date, Length: 513, dtype: bool

In [39]:
data["tx_date"] <= date_r

0      True
1      True
2      True
3      True
4      True
       ... 
508    True
509    True
510    True
511    True
512    True
Name: tx_date, Length: 513, dtype: bool

In [42]:
data[ ((data["tx_date"] <= date_l) | (data["tx_date"] > date_r))]

Unnamed: 0,tx_description,sender_id,receiver_id,tx_date,amount,Z_receipt_number
388,repayments 20,2,100,2023-01-01 02:14:41,641.48,388
432,saving for house,1,100,2023-01-01 13:18:57,1380.9,432
467,BBQ money,2,101,2023-01-01 09:55:34,932.83,467


In [36]:
if_discard = len(data) - len(data[(data["tx_date"] > date_l) & (data["tx_date"] <= date_r)])
if_discard

3

## get_emotion_feature

In [45]:
emo_copy=data.copy()

In [46]:
emo_copy

Unnamed: 0,tx_description,sender_id,receiver_id,tx_date,amount,Z_receipt_number
0,lunch,1,101,2023-02-13 02:03:15,1190.63,0
1,dinner,1,101,2023-03-29 17:48:10,268.40,1
2,gift,1,100,2023-01-20 17:41:33,1276.55,2
3,rent,1,102,2023-01-17 08:13:35,1020.42,3
4,rent,1,101,2023-01-17 15:45:40,983.10,4
...,...,...,...,...,...,...
508,grocery share,1,101,2023-01-14 02:39:30,1146.08,508
509,transfer joint,2,101,2023-03-17 20:52:57,845.73,509
510,This makes me so damn angry. All of this over...,2,100,2023-01-08 22:46:26,202.58,510
511,"Let's hit it, Jake",1,100,2023-02-25 10:06:10,237.16,511


In [47]:
emo_df = pd.DataFrame()

In [48]:
import pandas as pd
import h2o
import torch
import sys
from multiprocessing import Process, Queue
from typing import Any

from src import config, simpletext, publicmodels, sentiment
from src import process_transactions as pt
from src.process_transactions import *

In [57]:
transactions_copy=emo_copy.copy()
transactions_copy

Unnamed: 0,tx_description,sender_id,receiver_id,tx_date,amount,Z_receipt_number
0,lunch,1,101,2023-02-13 02:03:15,1190.63,0
1,dinner,1,101,2023-03-29 17:48:10,268.40,1
2,gift,1,100,2023-01-20 17:41:33,1276.55,2
3,rent,1,102,2023-01-17 08:13:35,1020.42,3
4,rent,1,101,2023-01-17 15:45:40,983.10,4
...,...,...,...,...,...,...
508,grocery share,1,101,2023-01-14 02:39:30,1146.08,508
509,transfer joint,2,101,2023-03-17 20:52:57,845.73,509
510,This makes me so damn angry. All of this over...,2,100,2023-01-08 22:46:26,202.58,510
511,"Let's hit it, Jake",1,100,2023-02-25 10:06:10,237.16,511


In [61]:
import pandas as pd
import numpy as np
import wordninja
import re
import GPUtil
import torch
from typing import Dict

# from pandarallel import pandarallel
from transformers import pipeline

# local imports
from src import detoxify
from src import config as dc

def unicode_escape(text: str) -> str:
    try:
        return text.encode("utf-8").decode("unicode_escape")
    except Exception:
        return text
        
def preprocessing(text: str) -> str:
    if text == "":
        return text
    text = unicode_escape(text)
    text = re.sub(r"\d+", "", text)  # Remove numbers

    # Split long messages that have no spaces
    tokens = []
    for word in text.split(" "):
        if len(word) > 5:
            tokens += wordninja.split(word)
        else:
            tokens.append(word)

    return " ".join(x for x in tokens)

In [62]:
transactions_copy["processed_description"] = transactions_copy["tx_description"].apply(preprocessing)

In [63]:
transactions_copy

Unnamed: 0,tx_description,sender_id,receiver_id,tx_date,amount,Z_receipt_number,processed_description
0,lunch,1,101,2023-02-13 02:03:15,1190.63,0,lunch
1,dinner,1,101,2023-03-29 17:48:10,268.40,1,dinner
2,gift,1,100,2023-01-20 17:41:33,1276.55,2,gift
3,rent,1,102,2023-01-17 08:13:35,1020.42,3,rent
4,rent,1,101,2023-01-17 15:45:40,983.10,4,rent
...,...,...,...,...,...,...,...
508,grocery share,1,101,2023-01-14 02:39:30,1146.08,508,grocery share
509,transfer joint,2,101,2023-03-17 20:52:57,845.73,509,transfer joint
510,This makes me so damn angry. All of this over...,2,100,2023-01-08 22:46:26,202.58,510,This makes me so damn angry Â All of this over...
511,"Let's hit it, Jake",1,100,2023-02-25 10:06:10,237.16,511,"Let's hit it, Jake"


In [64]:
transactions_copy = transactions_copy.reset_index(drop=True)
transactions_copy

Unnamed: 0,tx_description,sender_id,receiver_id,tx_date,amount,Z_receipt_number,processed_description
0,lunch,1,101,2023-02-13 02:03:15,1190.63,0,lunch
1,dinner,1,101,2023-03-29 17:48:10,268.40,1,dinner
2,gift,1,100,2023-01-20 17:41:33,1276.55,2,gift
3,rent,1,102,2023-01-17 08:13:35,1020.42,3,rent
4,rent,1,101,2023-01-17 15:45:40,983.10,4,rent
...,...,...,...,...,...,...,...
508,grocery share,1,101,2023-01-14 02:39:30,1146.08,508,grocery share
509,transfer joint,2,101,2023-03-17 20:52:57,845.73,509,transfer joint
510,This makes me so damn angry. All of this over...,2,100,2023-01-08 22:46:26,202.58,510,This makes me so damn angry Â All of this over...
511,"Let's hit it, Jake",1,100,2023-02-25 10:06:10,237.16,511,"Let's hit it, Jake"


In [65]:
n_procs = max(
    int(len(transactions_copy) / 100), 10
)

n_procs

10

In [66]:
trans_df_batches = np.array_split(transactions_copy, n_procs)
trans_df_batches

[                                       tx_description  sender_id  receiver_id  \
 0                                               lunch          1          101   
 1                                              dinner          1          101   
 2                                                gift          1          100   
 3                                                rent          1          102   
 4                                                rent          1          101   
 5                           Payment for Lucy birthday          2          102   
 6                                       repaying loan          1          102   
 7                                    saving for house          2          101   
 8                                   Birthday surprise          1          102   
 9                                                rent          1          101   
 10                                      rent Brisbane          2          102   
 11             

## emotion model

In [68]:
torch.cuda.is_available()

True

In [70]:
model_path = dc.emo_model_path
classifier = pipeline(
            "text-classification",
            model= model_path,
            return_all_scores=False,
            binary_output=True,
            device=-1,
        )   # -1 for cpu, 1 for gpu
classifier

<transformers.pipelines.text_classification.TextClassificationPipeline at 0x29f66ecf340>

In [93]:
emotions_results = pd.DataFrame(
    {"label": [], "score": [], dc.txn_data_ind_col: []}
)
emotions_results

Unnamed: 0,label,score,Z_receipt_number


In [94]:
batched_text = trans_df_batches[0]
batched_text

Unnamed: 0,tx_description,sender_id,receiver_id,tx_date,amount,Z_receipt_number,processed_description
0,lunch,1,101,2023-02-13 02:03:15,1190.63,0,lunch
1,dinner,1,101,2023-03-29 17:48:10,268.4,1,dinner
2,gift,1,100,2023-01-20 17:41:33,1276.55,2,gift
3,rent,1,102,2023-01-17 08:13:35,1020.42,3,rent
4,rent,1,101,2023-01-17 15:45:40,983.1,4,rent
5,Payment for Lucy birthday,2,102,2023-01-02 23:51:39,834.3,5,Payment for Lucy birthday
6,repaying loan,1,102,2023-03-10 22:39:23,808.32,6,repaying loan
7,saving for house,2,101,2023-02-26 16:10:38,1242.26,7,saving for house
8,Birthday surprise,1,102,2023-01-21 11:42:53,1308.43,8,Birthday surprise
9,rent,1,101,2023-03-08 20:24:27,1350.01,9,rent


In [95]:
test_result = classifier(list(batched_text.processed_description))

In [96]:
test_result

[{'label': 'anger', 'score': 0.5088672041893005},
 {'label': 'anger', 'score': 0.5708519816398621},
 {'label': 'joy', 'score': 0.9308176636695862},
 {'label': 'anger', 'score': 0.8230971693992615},
 {'label': 'anger', 'score': 0.8230971693992615},
 {'label': 'joy', 'score': 0.8535721302032471},
 {'label': 'joy', 'score': 0.8594326376914978},
 {'label': 'joy', 'score': 0.9208135008811951},
 {'label': 'joy', 'score': 0.9686031341552734},
 {'label': 'anger', 'score': 0.8230971693992615},
 {'label': 'anger', 'score': 0.8155257701873779},
 {'label': 'love', 'score': 0.9608993530273438},
 {'label': 'love', 'score': 0.9608993530273438},
 {'label': 'anger', 'score': 0.8230971693992615},
 {'label': 'joy', 'score': 0.6315029859542847},
 {'label': 'anger', 'score': 0.7851678729057312},
 {'label': 'anger', 'score': 0.7536722421646118},
 {'label': 'joy', 'score': 0.6825711727142334},
 {'label': 'anger', 'score': 0.43168649077415466},
 {'label': 'anger', 'score': 0.49687445163726807},
 {'label': 'an

In [97]:
temp = pd.DataFrame(test_result)
temp

Unnamed: 0,label,score
0,anger,0.508867
1,anger,0.570852
2,joy,0.930818
3,anger,0.823097
4,anger,0.823097
5,joy,0.853572
6,joy,0.859433
7,joy,0.920814
8,joy,0.968603
9,anger,0.823097


In [98]:
temp["Z_receipt_number"] = list(batched_text["Z_receipt_number"])
temp

Unnamed: 0,label,score,Z_receipt_number
0,anger,0.508867,0
1,anger,0.570852,1
2,joy,0.930818,2
3,anger,0.823097,3
4,anger,0.823097,4
5,joy,0.853572,5
6,joy,0.859433,6
7,joy,0.920814,7
8,joy,0.968603,8
9,anger,0.823097,9


In [99]:
emotions_results

Unnamed: 0,label,score,Z_receipt_number


In [100]:
emotions_results = pd.concat([emotions_results, temp])
emotions_results

Unnamed: 0,label,score,Z_receipt_number
0,anger,0.508867,0.0
1,anger,0.570852,1.0
2,joy,0.930818,2.0
3,anger,0.823097,3.0
4,anger,0.823097,4.0
5,joy,0.853572,5.0
6,joy,0.859433,6.0
7,joy,0.920814,7.0
8,joy,0.968603,8.0
9,anger,0.823097,9.0


In [101]:
dummy = pd.get_dummies(emotions_results["label"])
dummy

Unnamed: 0,anger,fear,joy,love
0,1,0,0,0
1,1,0,0,0
2,0,0,1,0
3,1,0,0,0
4,1,0,0,0
5,0,0,1,0
6,0,0,1,0
7,0,0,1,0
8,0,0,1,0
9,1,0,0,0


In [102]:
emotions = pd.concat([emotions_results, dummy], axis=1)
emotions

Unnamed: 0,label,score,Z_receipt_number,anger,fear,joy,love
0,anger,0.508867,0.0,1,0,0,0
1,anger,0.570852,1.0,1,0,0,0
2,joy,0.930818,2.0,0,0,1,0
3,anger,0.823097,3.0,1,0,0,0
4,anger,0.823097,4.0,1,0,0,0
5,joy,0.853572,5.0,0,0,1,0
6,joy,0.859433,6.0,0,0,1,0
7,joy,0.920814,7.0,0,0,1,0
8,joy,0.968603,8.0,0,0,1,0
9,anger,0.823097,9.0,1,0,0,0


In [103]:
emotions.drop(["label", "score"], axis=1, inplace=True)
emotions

Unnamed: 0,Z_receipt_number,anger,fear,joy,love
0,0.0,1,0,0,0
1,1.0,1,0,0,0
2,2.0,0,0,1,0
3,3.0,1,0,0,0
4,4.0,1,0,0,0
5,5.0,0,0,1,0
6,6.0,0,0,1,0
7,7.0,0,0,1,0
8,8.0,0,0,1,0
9,9.0,1,0,0,0


## put emotions results through a loop instead

In [107]:
def emotion_model(trans_df_batches):
    model_path = "bhadresh-savani/distilbert-base-uncased-emotion"

    classifier = pipeline(
        "text-classification",
        model= model_path,
        return_all_scores=False,
        binary_output=True,
        device=-1,
    )  # -1 for cpu, 1 for gpu
    emotions_results = pd.DataFrame(
        {"label": [], "score": [], "Z_receipt_number": []}
    )

    counts = 0
    for batched_text in trans_df_batches:
        try:
            emotions_result = classifier(list(batched_text.processed_description))
            temp = pd.DataFrame(emotions_result)
            temp["Z_receipt_number"] = list(batched_text["Z_receipt_number"])
            emotions_results = pd.concat([emotions_results, temp])
        except Exception as e:
            dc.logger.error(e)
            return emotions_result

        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        # if counts % 100 == 0:
        #     # print(counts)
        counts += 1

    dummy = pd.get_dummies(emotions_results["label"])
    emotions = pd.concat([emotions_results, dummy], axis=1)
    emotions.drop(["label", "score"], axis=1, inplace=True)

    dc.logger.info("Calculated Emotions Model Predictions")

    return emotions

In [108]:
emotions=emotion_model(trans_df_batches)

In [109]:
emotions

Unnamed: 0,Z_receipt_number,anger,fear,joy,love,sadness
0,0.0,1,0,0,0,0
1,1.0,1,0,0,0,0
2,2.0,0,0,1,0,0
3,3.0,1,0,0,0,0
4,4.0,1,0,0,0,0
...,...,...,...,...,...,...
46,508.0,1,0,0,0,0
47,509.0,0,0,1,0,0
48,510.0,1,0,0,0,0
49,511.0,0,0,1,0,0


## Detoxify features

In [115]:
burt_model_loc=dc.tox_model_dir
checkpoint=model_path=dc.tox_model_path
device="cpu"
burt_model_loc, checkpoint, device

('models/toxicity_model/transformers/',
 'models/toxicity_model/toxic_bias-4e693588.ckpt',
 'cpu')

In [117]:
if burt_model_loc[-1] != "/":
    burt_model_loc = burt_model_loc + "/"

In [118]:
loaded = torch.load(checkpoint)
loaded

{'state_dict': OrderedDict([('roberta.embeddings.position_ids',
               tensor([[  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
                         14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,
                         28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,
                         42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,
                         56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,
                         70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,
                         84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,
                         98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
                        112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
                        126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 1

In [119]:
class_names = loaded["config"]["dataset"]["args"]["classes"]
class_names

['toxicity',
 'severe_toxicity',
 'obscene',
 'identity_attack',
 'insult',
 'threat',
 'sexual_explicit']

In [123]:
args=loaded["config"]["arch"]["args"]
args

{'num_classes': 16,
 'model_type': 'roberta-base',
 'model_name': 'RobertaForSequenceClassification',
 'tokenizer_name': 'RobertaTokenizer'}

In [129]:
# get_model_and_tokenizer
model_type=args['model_type']
num_classes=args['num_classes']
tokenizer_name=args['tokenizer_name']
model_name=args['model_name']
state_dict=loaded["state_dict"]
model_type, num_classes, tokenizer_name, model_name, state_dict

('roberta-base',
 16,
 'RobertaTokenizer',
 'RobertaForSequenceClassification',
 OrderedDict([('roberta.embeddings.position_ids',
               tensor([[  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
                         14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,
                         28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,
                         42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,
                         56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,
                         70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,
                         84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,
                         98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
                        112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
                        

In [125]:
burt_model_location = burt_model_loc + model_type
burt_model_location

'models/toxicity_model/transformers/roberta-base'

In [127]:
import transformers

model_class = getattr(transformers, model_name)
model_class

transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification

In [130]:
model = model_class.from_pretrained(
    pretrained_model_name_or_path=burt_model_location,
    num_labels=num_classes,
    state_dict=state_dict,
)
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [131]:
tokenizer = getattr(transformers, tokenizer_name).from_pretrained(
    burt_model_loc + "roberta-base"
)
tokenizer

PreTrainedTokenizer(name_or_path='models/toxicity_model/transformers/roberta-base', vocab_size=50265, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'sep_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'cls_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True)})

In [133]:
toxicity = pd.DataFrame(
    {dc.txn_data_ind_col: [], **{cl: [] for cl in class_names}}
)
toxicity

Unnamed: 0,Z_receipt_number,toxicity,severe_toxicity,obscene,identity_attack,insult,threat,sexual_explicit


In [137]:
## Try first batch
batched_text=trans_df_batches[0]
descriptions=list(batched_text.processed_description)
list(enumerate(descriptions))

[(0, 'lunch'),
 (1, 'dinner'),
 (2, 'gift'),
 (3, 'rent'),
 (4, 'rent'),
 (5, 'Payment for Lucy birthday'),
 (6, 'repaying loan'),
 (7, 'saving for house'),
 (8, 'Birthday surprise'),
 (9, 'rent'),
 (10, 'rent Brisbane'),
 (11, 'I love you'),
 (12, 'I love you'),
 (13, 'rent'),
 (14, 'Online Mobile Re charge'),
 (15, 'school lunches'),
 (16, 'phone bill'),
 (17, 'joint account'),
 (18, 'rec ip t '),
 (19, 'Mackay West State School School Uniforms Richards'),
 (20, 'Mackay West State School School Uniforms Ryan'),
 (21, 'Mackay West State School School Uniforms James'),
 (22, 'school uniforms '),
 (23, 'Fucking good night'),
 (24, 'ILY xxx ooo o'),
 (25, 'lunch'),
 (26, 'Happy Birthday B**ch'),
 (27, 'BF FLY F LYSFM Happy Birthday'),
 (28, 'lunch friday'),
 (29, 'coffee'),
 (30, 'friday dinner'),
 (31, 'repayment groceries'),
 (32, 'savings holiday'),
 (33, 'Groceries'),
 (34, 'Savings'),
 (35, 'Hey whats going on?'),
 (36, 'LOVE you'),
 (37, 'lol'),
 (38, 'thanks for lunch'),
 (39, 're

In [138]:
model.eval()
out=None
with torch.inference_mode():
    inputs = tokenizer(
            descriptions, return_tensors="pt", truncation=True, padding=True
        )
    out = model(**inputs)[0]
out

tensor([[-3.9445e+00, -1.2834e+01, -7.5632e+00, -9.5036e+00, -4.3504e+00,
         -7.6955e+00, -9.3504e+00, -8.3783e+00, -9.4563e+00, -1.1282e+01,
         -9.9226e+00, -1.1423e+01, -9.8614e+00, -1.0885e+01, -1.0334e+01,
         -8.2181e+00],
        [-5.0089e+00, -1.2573e+01, -7.8921e+00, -1.0174e+01, -6.3873e+00,
         -7.2165e+00, -8.5878e+00, -7.9497e+00, -9.2169e+00, -1.1985e+01,
         -7.2448e+00, -1.1233e+01, -1.0299e+01, -1.1610e+01, -1.0563e+01,
         -8.9799e+00],
        [-7.4951e+00, -1.3582e+01, -9.7936e+00, -1.0558e+01, -8.6846e+00,
         -9.8648e+00, -1.0647e+01, -8.8375e+00, -9.1000e+00, -1.0785e+01,
         -9.3741e+00, -1.0226e+01, -9.8223e+00, -1.0807e+01, -1.0487e+01,
         -9.2158e+00],
        [-6.2486e+00, -1.3688e+01, -9.0994e+00, -1.0155e+01, -7.1321e+00,
         -9.4708e+00, -1.0523e+01, -8.8375e+00, -9.3977e+00, -1.0721e+01,
         -1.0377e+01, -1.0833e+01, -9.9236e+00, -1.1187e+01, -1.0502e+01,
         -9.2878e+00],
        [-6.2486e+00

In [139]:
scores = torch.sigmoid(out).cpu().detach().numpy()
scores

array([[1.89924948e-02, 2.66798929e-06, 5.18958492e-04, 7.45804064e-05,
        1.27370255e-02, 4.54646070e-04, 8.69264040e-05, 2.29749945e-04,
        7.81911440e-05, 1.25981005e-05, 4.90525526e-05, 1.09389048e-05,
        5.21464899e-05, 1.87277292e-05, 3.24993161e-05, 2.69648765e-04],
       [6.63378136e-03, 3.46474394e-06, 3.73548566e-04, 3.81596146e-05,
        1.67991302e-03, 7.33836088e-04, 1.86328602e-04, 3.52654402e-04,
        9.93320718e-05, 6.23489859e-06, 7.13390356e-04, 1.32274390e-05,
        3.36741286e-05, 9.07405138e-06, 2.58481741e-05, 1.25901337e-04],
       [5.55469655e-04, 1.26270811e-06, 5.58059983e-05, 2.59763474e-05,
        1.69147825e-04, 5.19703972e-05, 2.37769182e-05, 1.45158425e-04,
        1.11648398e-04, 2.07109697e-05, 8.48908167e-05, 3.62136598e-05,
        5.42237613e-05, 2.02532574e-05, 2.78832813e-05, 9.94457005e-05],
       [1.92942505e-03, 1.13575766e-06, 1.11725501e-04, 3.88909975e-05,
        7.98418361e-04, 7.70609695e-05, 2.69212233e-05, 1.451

In [141]:
results = {}
for i, cla in enumerate(class_names):
    results[cla] = (
        scores[0][i]
        if isinstance(descriptions, str)
        else [scores[ex_i][i].tolist() for ex_i in range(len(scores))]
    )
results

{'toxicity': [0.01899249479174614,
  0.006633781362324953,
  0.0005554696545004845,
  0.0019294250523671508,
  0.0019294250523671508,
  0.0004168666491750628,
  0.0007243338041007519,
  0.00065560732036829,
  0.0004707813204731792,
  0.0019294250523671508,
  0.0005230425158515573,
  0.0004565404378809035,
  0.0004565404378809035,
  0.0019294250523671508,
  0.0004641209961846471,
  0.11421597748994827,
  0.0005796686164103448,
  0.0003978454042226076,
  0.00042088821646757424,
  0.0006728973821736872,
  0.0007836280856281519,
  0.0008569672354497015,
  0.00048552590305916965,
  0.9569419622421265,
  0.0005164654576219618,
  0.018992485478520393,
  0.5195876359939575,
  0.0023087162990123034,
  0.005703770089894533,
  0.00046220028889365494,
  0.00046834078966639936,
  0.001038042944855988,
  0.0004416874435264617,
  0.012122184969484806,
  0.0008177227573469281,
  0.0005602777819149196,
  0.0007350444793701172,
  0.0007882024510763586,
  0.0005361041403375566,
  0.0019294250523671508,
 

In [145]:
results.keys()

dict_keys(['toxicity', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat', 'sexual_explicit'])

In [151]:
dict_to_append = {
                dc.txn_data_ind_col: batched_text[dc.txn_data_ind_col],
                **results,
            }


In [152]:
dict_to_append

{'Z_receipt_number': 0      0
 1      1
 2      2
 3      3
 4      4
 5      5
 6      6
 7      7
 8      8
 9      9
 10    10
 11    11
 12    12
 13    13
 14    14
 15    15
 16    16
 17    17
 18    18
 19    19
 20    20
 21    21
 22    22
 23    23
 24    24
 25    25
 26    26
 27    27
 28    28
 29    29
 30    30
 31    31
 32    32
 33    33
 34    34
 35    35
 36    36
 37    37
 38    38
 39    39
 40    40
 41    41
 42    42
 43    43
 44    44
 45    45
 46    46
 47    47
 48    48
 49    49
 50    50
 51    51
 Name: Z_receipt_number, dtype: int64,
 'toxicity': [0.01899249479174614,
  0.006633781362324953,
  0.0005554696545004845,
  0.0019294250523671508,
  0.0019294250523671508,
  0.0004168666491750628,
  0.0007243338041007519,
  0.00065560732036829,
  0.0004707813204731792,
  0.0019294250523671508,
  0.0005230425158515573,
  0.0004565404378809035,
  0.0004565404378809035,
  0.0019294250523671508,
  0.0004641209961846471,
  0.11421597748994827,
  0.000579668616

In [153]:
toxicity = pd.concat([toxicity, pd.DataFrame(dict_to_append)])
toxicity

Unnamed: 0,Z_receipt_number,toxicity,severe_toxicity,obscene,identity_attack,insult,threat,sexual_explicit
0,0.0,0.018992,2.667989e-06,0.000519,7.5e-05,0.012737,0.000455,8.7e-05
1,1.0,0.006634,3.464744e-06,0.000374,3.8e-05,0.00168,0.000734,0.000186
2,2.0,0.000555,1.262708e-06,5.6e-05,2.6e-05,0.000169,5.2e-05,2.4e-05
3,3.0,0.001929,1.135758e-06,0.000112,3.9e-05,0.000798,7.7e-05,2.7e-05
4,4.0,0.001929,1.135758e-06,0.000112,3.9e-05,0.000798,7.7e-05,2.7e-05
5,5.0,0.000417,1.261875e-06,3.8e-05,2.9e-05,0.000117,3.8e-05,1.9e-05
6,6.0,0.000724,1.262856e-06,6.8e-05,2.8e-05,0.0002,5.8e-05,2.7e-05
7,7.0,0.000656,9.600117e-07,4.3e-05,2.9e-05,0.000181,4.1e-05,2.1e-05
8,8.0,0.000471,1.046066e-06,3.1e-05,3e-05,0.000145,4e-05,1.6e-05
9,9.0,0.001929,1.135758e-06,0.000112,3.9e-05,0.000798,7.7e-05,2.7e-05


## sentiment

In [154]:
transactions_copy

Unnamed: 0,tx_description,sender_id,receiver_id,tx_date,amount,Z_receipt_number,processed_description
0,lunch,1,101,2023-02-13 02:03:15,1190.63,0,lunch
1,dinner,1,101,2023-03-29 17:48:10,268.40,1,dinner
2,gift,1,100,2023-01-20 17:41:33,1276.55,2,gift
3,rent,1,102,2023-01-17 08:13:35,1020.42,3,rent
4,rent,1,101,2023-01-17 15:45:40,983.10,4,rent
...,...,...,...,...,...,...,...
508,grocery share,1,101,2023-01-14 02:39:30,1146.08,508,grocery share
509,transfer joint,2,101,2023-03-17 20:52:57,845.73,509,transfer joint
510,This makes me so damn angry. All of this over...,2,100,2023-01-08 22:46:26,202.58,510,This makes me so damn angry Â All of this over...
511,"Let's hit it, Jake",1,100,2023-02-25 10:06:10,237.16,511,"Let's hit it, Jake"


In [155]:
sentiment_df = pd.DataFrame()

In [156]:
sent_obj = "Z_sentiment_object"
sent_neg_col = "neg"
sent_compound_col = "compound"
sent_obj, sent_neg_col, sent_compound_col

('Z_sentiment_object', 'neg', 'compound')

In [159]:
lexicon_file="vader_lexicon.txt"
emoji_lexicon="emoji_utf8_lexicon.txt"

In [161]:
import pandas as pd
import numpy as np
from typing import Tuple, Callable, Any
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [162]:
sent_analyzer = SentimentIntensityAnalyzer()

In [163]:
txn_df=data.copy()

In [164]:
txn_df

Unnamed: 0,tx_description,sender_id,receiver_id,tx_date,amount,Z_receipt_number
0,lunch,1,101,2023-02-13 02:03:15,1190.63,0
1,dinner,1,101,2023-03-29 17:48:10,268.40,1
2,gift,1,100,2023-01-20 17:41:33,1276.55,2
3,rent,1,102,2023-01-17 08:13:35,1020.42,3
4,rent,1,101,2023-01-17 15:45:40,983.10,4
...,...,...,...,...,...,...
508,grocery share,1,101,2023-01-14 02:39:30,1146.08,508
509,transfer joint,2,101,2023-03-17 20:52:57,845.73,509
510,This makes me so damn angry. All of this over...,2,100,2023-01-08 22:46:26,202.58,510
511,"Let's hit it, Jake",1,100,2023-02-25 10:06:10,237.16,511


In [165]:
txn_df[sent_obj] = txn_df[dc.txn_description].apply(
    lambda x: sent_analyzer.polarity_scores(x)
)

In [166]:
txn_df

Unnamed: 0,tx_description,sender_id,receiver_id,tx_date,amount,Z_receipt_number,Z_sentiment_object
0,lunch,1,101,2023-02-13 02:03:15,1190.63,0,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
1,dinner,1,101,2023-03-29 17:48:10,268.40,1,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
2,gift,1,100,2023-01-20 17:41:33,1276.55,2,"{'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound..."
3,rent,1,102,2023-01-17 08:13:35,1020.42,3,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
4,rent,1,101,2023-01-17 15:45:40,983.10,4,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
...,...,...,...,...,...,...,...
508,grocery share,1,101,2023-01-14 02:39:30,1146.08,508,"{'neg': 0.0, 'neu': 0.312, 'pos': 0.688, 'comp..."
509,transfer joint,2,101,2023-03-17 20:52:57,845.73,509,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
510,This makes me so damn angry. All of this over...,2,100,2023-01-08 22:46:26,202.58,510,"{'neg': 0.375, 'neu': 0.625, 'pos': 0.0, 'comp..."
511,"Let's hit it, Jake",1,100,2023-02-25 10:06:10,237.16,511,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."


In [167]:
txn_df[sent_neg_col] = txn_df[sent_obj].apply(lambda x: x["neg"])
txn_df

Unnamed: 0,tx_description,sender_id,receiver_id,tx_date,amount,Z_receipt_number,Z_sentiment_object,neg
0,lunch,1,101,2023-02-13 02:03:15,1190.63,0,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.000
1,dinner,1,101,2023-03-29 17:48:10,268.40,1,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.000
2,gift,1,100,2023-01-20 17:41:33,1276.55,2,"{'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound...",0.000
3,rent,1,102,2023-01-17 08:13:35,1020.42,3,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.000
4,rent,1,101,2023-01-17 15:45:40,983.10,4,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.000
...,...,...,...,...,...,...,...,...
508,grocery share,1,101,2023-01-14 02:39:30,1146.08,508,"{'neg': 0.0, 'neu': 0.312, 'pos': 0.688, 'comp...",0.000
509,transfer joint,2,101,2023-03-17 20:52:57,845.73,509,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.000
510,This makes me so damn angry. All of this over...,2,100,2023-01-08 22:46:26,202.58,510,"{'neg': 0.375, 'neu': 0.625, 'pos': 0.0, 'comp...",0.375
511,"Let's hit it, Jake",1,100,2023-02-25 10:06:10,237.16,511,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.000


In [168]:
txn_df[sent_compound_col] = txn_df[sent_obj].apply(lambda x: x["compound"])
txn_df

Unnamed: 0,tx_description,sender_id,receiver_id,tx_date,amount,Z_receipt_number,Z_sentiment_object,neg,compound
0,lunch,1,101,2023-02-13 02:03:15,1190.63,0,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.000,0.0000
1,dinner,1,101,2023-03-29 17:48:10,268.40,1,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.000,0.0000
2,gift,1,100,2023-01-20 17:41:33,1276.55,2,"{'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound...",0.000,0.4404
3,rent,1,102,2023-01-17 08:13:35,1020.42,3,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.000,0.0000
4,rent,1,101,2023-01-17 15:45:40,983.10,4,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.000,0.0000
...,...,...,...,...,...,...,...,...,...
508,grocery share,1,101,2023-01-14 02:39:30,1146.08,508,"{'neg': 0.0, 'neu': 0.312, 'pos': 0.688, 'comp...",0.000,0.2960
509,transfer joint,2,101,2023-03-17 20:52:57,845.73,509,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.000,0.0000
510,This makes me so damn angry. All of this over...,2,100,2023-01-08 22:46:26,202.58,510,"{'neg': 0.375, 'neu': 0.625, 'pos': 0.0, 'comp...",0.375,-0.9767
511,"Let's hit it, Jake",1,100,2023-02-25 10:06:10,237.16,511,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.000,0.0000


In [169]:
txn_df = txn_df.drop([sent_obj], axis=1)
txn_df

Unnamed: 0,tx_description,sender_id,receiver_id,tx_date,amount,Z_receipt_number,neg,compound
0,lunch,1,101,2023-02-13 02:03:15,1190.63,0,0.000,0.0000
1,dinner,1,101,2023-03-29 17:48:10,268.40,1,0.000,0.0000
2,gift,1,100,2023-01-20 17:41:33,1276.55,2,0.000,0.4404
3,rent,1,102,2023-01-17 08:13:35,1020.42,3,0.000,0.0000
4,rent,1,101,2023-01-17 15:45:40,983.10,4,0.000,0.0000
...,...,...,...,...,...,...,...,...
508,grocery share,1,101,2023-01-14 02:39:30,1146.08,508,0.000,0.2960
509,transfer joint,2,101,2023-03-17 20:52:57,845.73,509,0.000,0.0000
510,This makes me so damn angry. All of this over...,2,100,2023-01-08 22:46:26,202.58,510,0.375,-0.9767
511,"Let's hit it, Jake",1,100,2023-02-25 10:06:10,237.16,511,0.000,0.0000


In [170]:
sentiment_df=txn_df

## text features

In [172]:
text_feat_df = pd.DataFrame()
txn_df=data.copy()
transactions=txn_df

In [175]:
from src.simpletext import simple_text_features
st = simple_text_features(transactions.tx_description)

In [177]:
transactions["length_transaction"] = st.length()
transactions["mixedcase"] = st.mixed_case()
transactions["lowercase"] = st.lower()
transactions["upper_case"] = st.upper()
transactions["number_words"] = st.numWords()
transactions["punctuation_found"] = st.punctuationsjd()
transactions["all_punctuation_or_number"] = st.all_punctuation_or_number()
transactions["longest_word"] = st.find_longest_word()
transactions["number_contains"] = st.number_contains()
transactions["words_prop_length"] = st.proportion_word_length()
transactions["num_trans"] = 1

In [178]:
txn_df

Unnamed: 0,tx_description,sender_id,receiver_id,tx_date,amount,Z_receipt_number,length_transaction,mixedcase,lowercase,upper_case,number_words,punctuation_found,all_punctuation_or_number,longest_word,number_contains,words_prop_length,num_trans
0,lunch,1,101,2023-02-13 02:03:15,1190.63,0,5,0,1,0,1,0,1,5,1,0.200000,1
1,dinner,1,101,2023-03-29 17:48:10,268.40,1,6,0,1,0,1,0,1,6,1,0.166667,1
2,gift,1,100,2023-01-20 17:41:33,1276.55,2,4,0,1,0,1,0,1,4,1,0.250000,1
3,rent,1,102,2023-01-17 08:13:35,1020.42,3,4,0,1,0,1,0,1,4,1,0.250000,1
4,rent,1,101,2023-01-17 15:45:40,983.10,4,4,0,1,0,1,0,1,4,1,0.250000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
508,grocery share,1,101,2023-01-14 02:39:30,1146.08,508,13,0,1,0,2,0,1,7,1,0.153846,1
509,transfer joint,2,101,2023-03-17 20:52:57,845.73,509,14,0,1,0,2,0,1,8,1,0.142857,1
510,This makes me so damn angry. All of this over...,2,100,2023-01-08 22:46:26,202.58,510,271,1,0,0,48,1,1,13,1,0.177122,1
511,"Let's hit it, Jake",1,100,2023-02-25 10:06:10,237.16,511,18,1,0,0,4,1,1,5,1,0.222222,1


### Long Term Transaction Abuse Detection

In [5]:
feature_generation_3months = transform(data, lag=2, score_month=3, score_year=2023)
feature_generation_3months.head()

| ID | GPU | MEM |
------------------
|  0 |  6% |  5% |
| ID | GPU | MEM |
------------------
|  0 |  6% |  5% |


Unnamed: 0,sender_id,receiver_id,year_month,toxicity_percentile,severe_toxicity_percentile,obscene_percentile,identity_attack_percentile,insult_percentile,threat_percentile,sexual_explicit_percentile,...,month_min2_surprise_sum,month_min2_sadness_sum,month_min2_insult_percentile,month_min2_sexual_explicit_percentile,month_min2_neutral_sum,month_min2_toxicity_percentile,month_min2_identity_attack_percentile,month_min2_severe_toxicity_percentile,month_min2_obscene_percentile,month_min2_love_sum
0,1,100,202303,0.08569,0.370133,0.159817,0.106089,0.050668,0.461281,0.48631,...,0.0,0.0,0.068578,0.474642,0.0,0.08655,0.135917,0.558185,0.140661,0.0
1,1,101,202303,0.104996,0.055807,0.22705,0.155467,0.054306,0.164829,0.146896,...,0.0,0.0,0.05826,0.672668,0.0,0.093823,0.353545,0.553346,0.46611,0.375
2,1,102,202303,0.019711,0.019863,0.053331,0.0,0.018815,0.0,0.021091,...,0.0,0.0,0.095701,0.57132,0.0,0.158918,0.258648,0.322056,0.319087,0.28125
3,2,100,202303,0.084638,0.327041,0.191018,0.358058,0.048903,0.521809,0.444424,...,0.0,0.535714,0.068578,0.139542,0.0,0.08655,0.135917,0.164645,0.102347,0.642857
4,2,101,202303,0.411683,0.740516,0.68121,0.392687,0.256471,0.951393,0.978038,...,0.0,0.0,0.05826,0.259429,0.0,0.08655,0.179198,0.164645,0.161746,0.321429


In [4]:
predictions_3month = score(feature_generation_3months, model_loc='models/CBA_AITD_Long.zip')
predictions_3month.head()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_361"; Java(TM) SE Runtime Environment (build 1.8.0_361-b09); Java HotSpot(TM) 64-Bit Server VM (build 25.361-b09, mixed mode)
  Starting server from /Users/genevieverichards/opt/anaconda3/envs/aitd/lib/python3.10/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/ng/wrfk6d3n4p9gy7vj76hlf4z80000gn/T/tmpf8g4elha
  JVM stdout: /var/folders/ng/wrfk6d3n4p9gy7vj76hlf4z80000gn/T/tmpf8g4elha/h2o_genevieverichards_started_from_python.out
  JVM stderr: /var/folders/ng/wrfk6d3n4p9gy7vj76hlf4z80000gn/T/tmpf8g4elha/h2o_genevieverichards_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,12 secs
H2O_cluster_timezone:,Asia/Seoul
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.42.0.3
H2O_cluster_version_age:,2 months and 16 days
H2O_cluster_name:,H2O_from_python_genevieverichards_xyvzqe
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.096 Gb
H2O_cluster_total_cores:,0
H2O_cluster_allowed_cores:,0


generic Model Build progress: |██████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
generic prediction progress: |███████████████████████████████████████████████████| (done) 100%
H2O session _sid_b838 closed.


Unnamed: 0,sender_id,receiver_id,probability_non_abuse,probability_abuse
0,1,100,0.630252,0.369748
5,2,102,0.635904,0.364096
4,2,101,0.642964,0.357036
1,1,101,0.666243,0.333757
2,1,102,0.679424,0.320576


### Short Term Transaction Abuse Detection

In [5]:
# Note: For Short Term Abuse detection model, set lag to 0.
# Note: By default, the features will be generated for the latest month in the dataset, for the example data, this is March.
feature_generation_month = transform(data, lag=0)
feature_generation_month.head()



Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


0
2023
3


Unnamed: 0,sender_id,receiver_id,year_month,toxicity_percentile,severe_toxicity_percentile,obscene_percentile,identity_attack_percentile,insult_percentile,threat_percentile,sexual_explicit_percentile,...,recip_num_trans_sum,recip_identity_attack_percentile,recip_sadness_sum,recip_length_transaction_median,recip_surprise_sum,recip_joy_max,recip_day_between_max_min,recip_longest_word_median,recip_anger_sum,recip_love_sum
0,1,100,202303,0.08569,0.37013,0.159817,0.106089,0.050668,0.461281,0.48631,...,0.6,0.106089,1.0,0.666667,,0.033333,,0.666667,0.229039,0.6
1,1,101,202303,0.104995,0.055806,0.227049,0.155466,0.054305,0.164828,0.146895,...,1.0,0.155466,0.416667,0.583333,,0.027778,,0.666667,0.15985,1.0
2,1,102,202303,0.019711,0.019863,0.053331,0.0,0.018816,0.0,0.021091,...,0.6,0.0,0.0,1.0,,0.033333,,0.666667,0.644172,0.3
3,2,100,202303,0.084638,0.32704,0.191018,0.35806,0.048903,0.521809,0.444424,...,0.133333,0.35806,0.0,0.333333,,0.043478,,0.333333,0.806615,0.782609
4,2,101,202303,0.411683,0.740513,0.68121,0.392687,0.256472,0.951393,0.978037,...,0.666667,0.392687,0.0,0.666667,,0.032258,,0.333333,0.845043,0.580645


In [6]:
predictions_month_march = score(feature_generation_month, model_loc='models/CBA_AITD_Short.zip')
predictions_month_march.head()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_361"; Java(TM) SE Runtime Environment (build 1.8.0_361-b09); Java HotSpot(TM) 64-Bit Server VM (build 25.361-b09, mixed mode)
  Starting server from /Users/genevieverichards/opt/anaconda3/envs/aitd/lib/python3.10/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/ng/wrfk6d3n4p9gy7vj76hlf4z80000gn/T/tmp_p64132q
  JVM stdout: /var/folders/ng/wrfk6d3n4p9gy7vj76hlf4z80000gn/T/tmp_p64132q/h2o_genevieverichards_started_from_python.out
  JVM stderr: /var/folders/ng/wrfk6d3n4p9gy7vj76hlf4z80000gn/T/tmp_p64132q/h2o_genevieverichards_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,12 secs
H2O_cluster_timezone:,Asia/Seoul
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.42.0.3
H2O_cluster_version_age:,2 months and 16 days
H2O_cluster_name:,H2O_from_python_genevieverichards_2xfj41
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.096 Gb
H2O_cluster_total_cores:,0
H2O_cluster_allowed_cores:,0


generic Model Build progress: |██████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
generic prediction progress: |███████████████████████████████████████████████████| (done) 100%
H2O session _sid_94a3 closed.


Unnamed: 0,sender_id,receiver_id,probability_non_abuse,probability_abuse
0,1,100,0.628646,0.371354
4,2,101,0.661149,0.338851
1,1,101,0.679388,0.320612
5,2,102,0.695596,0.304404
2,1,102,0.720643,0.279357


#### Specifying the Score Month

In [7]:
# Note: For Short Term Abuse detection model, set lag to 0.
# Note: To Score for another month, you need to specify the month in score month, in the following example it will generate features for Feburary. 
# Note: You can also pass score_year if your data contains multiple years
feature_generation_month_feb = transform(data, lag=0, score_month=2, score_year=2023)
predictions_month_feb = score(feature_generation_month_feb, model_loc='models/CBA_AITD_Short.zip')
predictions_month_feb.head()



Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


0
2023
2
Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_361"; Java(TM) SE Runtime Environment (build 1.8.0_361-b09); Java HotSpot(TM) 64-Bit Server VM (build 25.361-b09, mixed mode)
  Starting server from /Users/genevieverichards/opt/anaconda3/envs/aitd/lib/python3.10/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/ng/wrfk6d3n4p9gy7vj76hlf4z80000gn/T/tmppfw9onhz
  JVM stdout: /var/folders/ng/wrfk6d3n4p9gy7vj76hlf4z80000gn/T/tmppfw9onhz/h2o_genevieverichards_started_from_python.out
  JVM stderr: /var/folders/ng/wrfk6d3n4p9gy7vj76hlf4z80000gn/T/tmppfw9onhz/h2o_genevieverichards_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,12 secs
H2O_cluster_timezone:,Asia/Seoul
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.42.0.3
H2O_cluster_version_age:,2 months and 16 days
H2O_cluster_name:,H2O_from_python_genevieverichards_rfr0qj
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.096 Gb
H2O_cluster_total_cores:,0
H2O_cluster_allowed_cores:,0


generic Model Build progress: |██████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
generic prediction progress: |███████████████████████████████████████████████████| (done) 100%
H2O session _sid_8aad closed.


Unnamed: 0,sender_id,receiver_id,probability_non_abuse,probability_abuse
5,2,102,0.644517,0.355483
2,1,102,0.721381,0.278619
0,1,100,0.731157,0.268843
1,1,101,0.73559,0.26441
3,2,100,0.742449,0.257551
