In [1]:
import os
import warnings

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

warnings.filterwarnings("ignore")

In [2]:
# Location of the processed, account-sorted transactions CSV.
# The path can be overridden with the SETTLEKING_DATA_PATH environment variable
# so the notebook runs on other machines; when the variable is unset or empty
# the original local path is used unchanged.
data_path = os.environ.get("SETTLEKING_DATA_PATH") or (
    r"C:\Users\rohit\settleking_test\data\processed\account_sorted.csv"
)

In [3]:
# Read the CSV at data_path; the "Date" column is parsed into datetimes on load.
sorted_acc_df = pd.read_csv(filepath_or_buffer=data_path, parse_dates=["Date"])

Transaction_Type: 

1 -> Credit

0 -> Debit

Is_Recurring_30 / Is_Recurring_60 / Is_Recurring_90: 

1 -> True

0 -> False

In [4]:
# Number of rows in the loaded frame.
sorted_acc_df.shape[0]

99

In [5]:
# Preview the first ten rows of the loaded frame.
sorted_acc_df.head(10)

Unnamed: 0,Date,Amount,abs_Amount,Cheque_Number,Transaction_Type,Cleaned_Desc,Description,Day,Month,Year,...,Gap_Days,Rec_30,Rec_60,Rec_90,RecurringStreak_30,RecurringStreak_60,RecurringStreak_90,Is_Recurring_30,Is_Recurring_60,Is_Recurring_90
0,2024-01-24,5000.0,5000.0,,1,9llabs llc direct dep 93153174682489f sv,9LLABS LLC DIRECT DEP 240124 93153174682489F SV,24,January,2024,...,,False,False,False,0,0,0,0,0,0
1,2024-02-01,20000.0,20000.0,,1,online transfer from 9llabs llc business check...,ONLINE TRANSFER FROM 9LLABS LLC BUSINESS CHECK...,1,February,2024,...,8.0,False,False,False,0,0,0,0,0,0
2,2024-02-27,-1457.25,1457.25,,0,business to business ach wa state dol wa st dm...,BUSINESS TO BUSINESS ACH WA STATE DOL WA ST DM...,27,February,2024,...,26.0,True,False,False,1,0,0,0,0,0
3,2024-05-01,25000.0,25000.0,,1,online transfer from 9llabs llc ref business c...,ONLINE TRANSFER FROM 9LLABS LLC REF #IB0N2W75G...,1,May,2024,...,64.0,False,True,False,1,1,0,0,0,0
4,2024-05-08,-20520.0,20520.0,,0,online transfer to 9llabs llc ref business che...,ONLINE TRANSFER TO 9LLABS LLC REF #IB0N5DX48M ...,8,May,2024,...,7.0,False,False,False,1,1,0,0,0,0
5,2024-07-01,12500.0,12500.0,,1,online transfer from 9llabs llc business check...,ONLINE TRANSFER FROM 9LLABS LLC BUSINESS CHECK...,1,July,2024,...,54.0,False,True,False,1,2,0,0,1,0
6,2024-07-08,-9975.0,9975.0,,0,online transfer to 9llabs llc ref business che...,ONLINE TRANSFER TO 9LLABS LLC REF #IB0NSKMG2J ...,8,July,2024,...,7.0,False,False,False,1,2,0,0,1,0
7,2024-07-24,-3875.0,3875.0,,0,online transfer to 9llabs llc ref business che...,ONLINE TRANSFER TO 9LLABS LLC REF #IB0NYCGZRL ...,24,July,2024,...,16.0,False,False,False,1,2,0,0,1,0
8,2024-02-27,-199.25,199.25,,0,business to business ach wa state dol wa st dm...,BUSINESS TO BUSINESS ACH WA STATE DOL WA ST DM...,27,February,2024,...,,False,False,False,0,0,0,0,0,0
9,2024-03-11,-525.0,525.0,,0,business to business ach ag fintax llc ooff tr...,BUSINESS TO BUSINESS ACH AG FINTAX LLC J2091 O...,11,March,2024,...,13.0,False,False,False,0,0,0,0,0,0


In [6]:
# Inspect the Previous_Day column on its own.
sorted_acc_df.loc[:, "Previous_Day"]

0            NaN
1     2024-01-24
2     2024-02-01
3     2024-02-27
4     2024-05-01
         ...    
94           NaN
95    2024-05-20
96           NaN
97    2024-02-01
98    2024-04-04
Name: Previous_Day, Length: 99, dtype: object

In [7]:
# Show every column name in the loaded frame before selecting model inputs.
sorted_acc_df.columns

Index(['Date', 'Amount', 'abs_Amount', 'Cheque_Number', 'Transaction_Type',
       'Cleaned_Desc', 'Description', 'Day', 'Month', 'Year', 'Category_1',
       'Category_2', 'Category_3', 'round_Amount', 'Previous_Day', 'Gap_Days',
       'Rec_30', 'Rec_60', 'Rec_90', 'RecurringStreak_30',
       'RecurringStreak_60', 'RecurringStreak_90', 'Is_Recurring_30',
       'Is_Recurring_60', 'Is_Recurring_90'],
      dtype='object')

In [8]:
# Columns carried forward for modelling: the candidate input columns plus the
# three Is_Recurring_* columns.
model_columns = [
    "Date", "Amount", "abs_Amount", "Transaction_Type", "Cleaned_Desc",
    "Day", "Month", "Category_1", "Category_2", "Category_3",
    "round_Amount", "Gap_Days",
    "Is_Recurring_30", "Is_Recurring_60", "Is_Recurring_90",
]
regression_data = sorted_acc_df.loc[:, model_columns]

In [9]:
# Preview the first ten rows of the selected columns.
regression_data.head(10)

Unnamed: 0,Date,Amount,abs_Amount,Transaction_Type,Cleaned_Desc,Day,Month,Category_1,Category_2,Category_3,round_Amount,Gap_Days,Is_Recurring_30,Is_Recurring_60,Is_Recurring_90
0,2024-01-24,5000.0,5000.0,1,9llabs llc direct dep 93153174682489f sv,24,January,9LLabs,Bank Transfer,,5000.0,,0,0,0
1,2024-02-01,20000.0,20000.0,1,online transfer from 9llabs llc business check...,1,February,9LLabs,Bank Transfer,,20000.0,8.0,0,0,0
2,2024-02-27,-1457.25,1457.25,0,business to business ach wa state dol wa st dm...,27,February,9LLabs,B2B Payment,Bank Transfer,-1457.25,26.0,0,0,0
3,2024-05-01,25000.0,25000.0,1,online transfer from 9llabs llc ref business c...,1,May,9LLabs,Bank Transfer,,25000.0,64.0,0,0,0
4,2024-05-08,-20520.0,20520.0,0,online transfer to 9llabs llc ref business che...,8,May,9LLabs,Bank Transfer,,-20520.0,7.0,0,0,0
5,2024-07-01,12500.0,12500.0,1,online transfer from 9llabs llc business check...,1,July,9LLabs,Bank Transfer,,12500.0,54.0,0,1,0
6,2024-07-08,-9975.0,9975.0,0,online transfer to 9llabs llc ref business che...,8,July,9LLabs,Bank Transfer,,-9975.0,7.0,0,1,0
7,2024-07-24,-3875.0,3875.0,0,online transfer to 9llabs llc ref business che...,24,July,9LLabs,Bank Transfer,,-3875.0,16.0,0,1,0
8,2024-02-27,-199.25,199.25,0,business to business ach wa state dol wa st dm...,27,February,B2B Payment,Bank Transfer,,-199.25,,0,0,0
9,2024-03-11,-525.0,525.0,0,business to business ach ag fintax llc ooff tr...,11,March,B2B Payment,Bank Transfer,,-525.0,13.0,0,0,0


In [10]:
# Keep only the rows that have a Gap_Days value.
has_gap_days = regression_data["Gap_Days"].notna()
regression_data = regression_data[has_gap_days]

In [11]:
# Columns removed from the model inputs: the three Is_Recurring_* columns plus
# Date, Cleaned_Desc, Category_2 and Category_3.
# NOTE(review): round_Amount stays in x_data and is not passed through the
# scaler in the later cells, unlike Amount / abs_Amount / Gap_Days.
non_feature_columns = [
    "Is_Recurring_30",
    "Is_Recurring_60",
    "Is_Recurring_90",
    "Date",
    "Cleaned_Desc",
    "Category_2",
    "Category_3",
]
x_data = regression_data.drop(columns=non_feature_columns)

# Target vector: the Is_Recurring_30 column.
y_data = regression_data["Is_Recurring_30"]

In [12]:
# Standardising scaler with its default behaviour spelled out:
# centre on the mean and scale to unit variance.
scaler = StandardScaler(with_mean=True, with_std=True)

In [13]:
# Standardise the three numeric columns and store the results under
# "scaled_"-prefixed names.
# NOTE(review): the scaler is fitted on every row here, before the later
# train/test split, so the test rows contribute to the fitted mean/variance.
numeric_columns = ["Amount", "abs_Amount", "Gap_Days"]
scaled_columns = [f"scaled_{name}" for name in numeric_columns]
x_data[scaled_columns] = scaler.fit_transform(x_data[numeric_columns])

In [14]:
# Remove the raw numeric columns now that their scaled copies exist.
x_data = x_data.drop(columns=["Amount", "abs_Amount", "Gap_Days"])

In [15]:
# Preview the first ten rows of the feature frame after scaling.
x_data.head(10)

Unnamed: 0,Transaction_Type,Day,Month,Category_1,round_Amount,scaled_Amount,scaled_abs_Amount,scaled_Gap_Days
1,1,1,February,9LLabs,20000.0,1.432743,0.975803,-0.59856
2,0,27,February,9LLabs,-1457.25,-0.005211,-0.431991,0.417631
3,1,1,May,9LLabs,25000.0,1.767818,1.35541,2.562924
4,0,8,May,9LLabs,-20520.0,-1.282698,1.015282,-0.655015
5,1,1,July,9LLabs,12500.0,0.930132,0.406392,1.998373
6,0,8,July,9LLabs,-9975.0,-0.576027,0.21469,-0.655015
7,0,24,July,9LLabs,-3875.0,-0.167236,-0.248432,-0.146919
9,0,11,March,B2B Payment,-525.0,0.057264,-0.502769,-0.316284
11,0,9,February,Bank Transfer,-13000.0,-0.778747,0.444352,-0.48565
12,0,13,February,Bank Transfer,-11000.0,-0.644717,0.292509,-0.82438


In [16]:
# One-hot encoder that returns a dense array, keeps every category level
# (nothing dropped), and ignores categories not seen during fit.
encoder = OneHotEncoder(
    drop=None,
    handle_unknown="ignore",
    sparse_output=False,
)

In [17]:
# Names of the object-dtype columns; these are the ones that get one-hot encoded.
object_typed = x_data.select_dtypes(include=["object"])
cols = object_typed.columns

In [18]:
# Fit the encoder on the object-dtype columns and encode them in one pass.
categorical_frame = x_data[cols]
encoded_data = encoder.fit_transform(categorical_frame)

# Generated names for the encoded output columns.
encoded_cols = encoder.get_feature_names_out(input_features=cols)

In [19]:
# Display the one-hot column names produced by the encoder.
encoded_cols

array(['Month_April', 'Month_February', 'Month_July', 'Month_June',
       'Month_March', 'Month_May', 'Category_1_9LLabs',
       'Category_1_B2B Payment', 'Category_1_Bank Transfer',
       'Category_1_Card Payment', 'Category_1_Fees',
       'Category_1_Insurance', 'Category_1_International Transfer',
       'Category_1_Investments/Finance', 'Category_1_Miscellaneous',
       'Category_1_P2P Payments', 'Category_1_Rewards',
       'Category_1_SettleKing', 'Category_1_Toll',
       'Category_1_Utility Payment'], dtype=object)

In [20]:
# Wrap the encoded array in a frame that reuses x_data's index, so the later
# join lines up row for row.
encoded_cols_df = pd.DataFrame(
    data=encoded_data,
    index=x_data.index,
    columns=encoded_cols,
)

In [21]:
# Swap the object-dtype columns for their one-hot encoded counterparts.
non_categorical = x_data.drop(columns=cols)
encoded_x_data = non_categorical.join(encoded_cols_df, how="left")

In [22]:
# Discard any rows whose scaled_Gap_Days is missing.
# NOTE(review): y_data is not filtered alongside this; if a row were ever
# removed here the features and labels would no longer have matching lengths.
encoded_x_data = encoded_x_data.dropna(axis="index", subset=["scaled_Gap_Days"])

In [23]:
# Display the final feature frame that is passed to the train/test split.
encoded_x_data

Unnamed: 0,Transaction_Type,Day,round_Amount,scaled_Amount,scaled_abs_Amount,scaled_Gap_Days,Month_April,Month_February,Month_July,Month_June,...,Category_1_Fees,Category_1_Insurance,Category_1_International Transfer,Category_1_Investments/Finance,Category_1_Miscellaneous,Category_1_P2P Payments,Category_1_Rewards,Category_1_SettleKing,Category_1_Toll,Category_1_Utility Payment
1,1,1,20000.00,1.432743,0.975803,-0.598560,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,27,-1457.25,-0.005211,-0.431991,0.417631,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,1,25000.00,1.767818,1.355410,2.562924,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,8,-20520.00,-1.282698,1.015282,-0.655015,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1,1,12500.00,0.930132,0.406392,1.998373,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,0,17,-2000.00,-0.041583,-0.390784,-0.429195,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
92,1,8,9895.00,0.755558,0.208616,0.135356,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
95,0,5,-1.25,0.092363,-0.542533,1.546733,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
97,0,4,-616.97,0.051100,-0.495786,2.506469,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [24]:
# Hold out 30% of the rows for evaluation, stratified on the label, with a
# fixed seed for the shuffle.
train_x, test_x, train_y, test_y = train_test_split(
    encoded_x_data,
    y_data,
    test_size=0.3,
    stratify=y_data,
    random_state=123,
)

In [25]:
# Report the (rows, columns) shape of each split.
train_shape, test_shape = train_x.shape, test_x.shape
print(f"train data size : {train_shape} \ntest data size : {test_shape}")

train data size : (58, 26) 
test data size : (25, 26)


In [27]:
# Fit a logistic regression with default settings on the training split.
# NOTE(review): warnings are globally ignored at the top of the notebook, so
# any warning emitted while fitting would not be shown here.
model = LogisticRegression().fit(train_x, train_y)

# Class predictions for the held-out rows.
y_pred = model.predict(test_x)

In [28]:
# Print per-class precision / recall / F1 for the held-out split.
# The heading uses a real newline escape ("\n"); the earlier "/n" was emitted
# literally and pushed the table header onto the same line as the title.
report_text = classification_report(test_y, y_pred)
print(f"Classification Report - \n {report_text}")

Classication Report - /n               precision    recall  f1-score   support

           0       0.83      1.00      0.90        19
           1       1.00      0.33      0.50         6

    accuracy                           0.84        25
   macro avg       0.91      0.67      0.70        25
weighted avg       0.87      0.84      0.81        25

