Feature engineering 

In [9]:
import os 
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the cleaned wallet export.
# The path is a raw string (r'...') so the Windows backslashes are taken
# literally; in a normal string "\D", "\F", "\M" and "\W" are invalid escape
# sequences, which newer Python versions warn about.
df = pd.read_csv(r'E:\Data analytics projects\Financial Tracker\Model training\Wallet data cleaned.csv')
# The file was sorted by the amount column; re-order it by date and rebuild
# the index so row labels follow the new order.
df = df.sort_values(by='date').reset_index(drop=True)
df.head()

Unnamed: 0,date,note,amount,category,type,transfer,account
0,2023-10-10,POS Purchase OPOS PRESTO AUTL TORON,-20.0,Public transport,Expenses,False,Current Account/Debit
1,2023-10-10,POS Purchase GPOS HONG'S ESSO NORTH,-11.72,Ciggerates,Expenses,False,Current Account/Debit
2,2023-10-10,Miscellaneous Payment TRANSFERWISE CANADA_____...,276.25,Income,Income,False,Current Account/Debit
3,2023-10-12,POS Purchase GPOS FRESHCHOICE IND ETOBI,-23.21,Groceries,Expenses,False,Current Account/Debit
4,2023-10-13,MATCHBOX CANNABIS - STE NORTH YORK ON,-34.21,Weed,Expenses,False,Credit


In [3]:
# Flag every transaction whose absolute amount is above $100.
# The $100 cut-off may need to be changed if a significant share of rows
# turn out to be above it.
df['big_amount'] = df['amount'].abs().gt(100)

# Build a text feature that joins the transaction note with its amount;
# the amount may give the model additional context.
df['combined'] = df['note'].add(': $').add(df['amount'].astype(str))
df.head()

Unnamed: 0,date,note,amount,category,type,transfer,account,big_amount,combined
0,2023-10-10,POS Purchase OPOS PRESTO AUTL TORON,-20.0,Public transport,Expenses,False,Current Account/Debit,False,POS Purchase OPOS PRESTO AUTL TORON: $-20.0
1,2023-10-10,POS Purchase GPOS HONG'S ESSO NORTH,-11.72,Ciggerates,Expenses,False,Current Account/Debit,False,POS Purchase GPOS HONG'S ESSO NORTH: $-11.72
2,2023-10-10,Miscellaneous Payment TRANSFERWISE CANADA_____...,276.25,Income,Income,False,Current Account/Debit,True,Miscellaneous Payment TRANSFERWISE CANADA_____...
3,2023-10-12,POS Purchase GPOS FRESHCHOICE IND ETOBI,-23.21,Groceries,Expenses,False,Current Account/Debit,False,POS Purchase GPOS FRESHCHOICE IND ETOBI: $-23.21
4,2023-10-13,MATCHBOX CANNABIS - STE NORTH YORK ON,-34.21,Weed,Expenses,False,Credit,False,MATCHBOX CANNABIS - STE NORTH YORK ON: $-34.21


In [4]:
# Create a separate data frame for training.
# Only three columns are kept, with 'category' as the target variable. The
# date column is dropped as well: per the original note, a few months of
# temporal patterns were not expected to affect training.
# .copy() makes p_df an independent frame, so the assignment below modifies
# p_df only and does not raise SettingWithCopyWarning.
p_df = df[['combined', 'big_amount', 'category']].copy()
# Lower-case the text in 'combined' so the same merchant text is not treated
# differently because of letter case.
p_df['combined'] = p_df['combined'].str.lower()
p_df.head()

Unnamed: 0,combined,big_amount,category
0,pos purchase opos presto autl toron: $-20.0,False,Public transport
1,pos purchase gpos hong's esso north: $-11.72,False,Ciggerates
2,miscellaneous payment transferwise canada_____...,True,Income
3,pos purchase gpos freshchoice ind etobi: $-23.21,False,Groceries
4,matchbox cannabis - ste north york on: $-34.21,False,Weed


In [5]:
# Row count per category in the original data frame; the counts show that
# the classes are imbalanced.
df['category'].value_counts()

category
Restaurant, fast-food           46
Ciggerates                      41
Public transport                32
Interact                        29
Alcohol                         20
Taxi                            20
TRANSFER                        17
Income                          16
Shopping                        16
Groceries                       13
Charges, Fees                   13
Bill payments, Subscriptions    12
Weed                            11
Unknown                          6
Rent                             6
Phone, cell phone                6
Child Support                    5
Education, development           4
Wellness, beauty                 2
Drug-store, chemist              2
Culture, sport events            1
Name: count, dtype: int64

In [6]:
# Remove the 'Culture, sport events' category, which has only a single row.
# The change is made in the processed data frame only; df is left untouched.
# .copy() detaches the filtered result so that columns added to p_df later
# do not raise SettingWithCopyWarning.
p_df = p_df[~p_df['category'].isin(['Culture, sport events'])].copy()
p_df.head()

Unnamed: 0,combined,big_amount,category
0,pos purchase opos presto autl toron: $-20.0,False,Public transport
1,pos purchase gpos hong's esso north: $-11.72,False,Ciggerates
2,miscellaneous payment transferwise canada_____...,True,Income
3,pos purchase gpos freshchoice ind etobi: $-23.21,False,Groceries
4,matchbox cannabis - ste north york on: $-34.21,False,Weed


In [7]:
# Number of distinct categories left in the processed frame.
# The class imbalance between these categories was not addressed in the
# first training iteration.
# dropna=False counts a missing category as its own value.
p_df['category'].nunique(dropna=False)

20

In [10]:
# 20 categories remain in p_df (21 in the raw data before the single-row
# class was removed), so the target is label-encoded instead of one-hot
# encoded; the original rationale is that one-hot needs more processing.
encoder = LabelEncoder()

# Learn the category -> integer mapping and encode the column in one step.
# fit_transform derives the classes from the unique values of the column, so
# the codes match a separate fit on the unique values followed by transform.
# assign() returns a new frame with the extra column, which avoids
# SettingWithCopyWarning even when p_df came from a filtered selection.
p_df = p_df.assign(category_encoded=encoder.fit_transform(p_df['category']))

In [11]:
# Preview of the preprocessed dataset (first five rows).
p_df.head(n=5)

Unnamed: 0,combined,big_amount,category,category_encoded
0,pos purchase opos presto autl toron: $-20.0,False,Public transport,11
1,pos purchase gpos hong's esso north: $-11.72,False,Ciggerates,4
2,miscellaneous payment transferwise canada_____...,True,Income,8
3,pos purchase gpos freshchoice ind etobi: $-23.21,False,Groceries,7
4,matchbox cannabis - ste north york on: $-34.21,False,Weed,18


In [14]:
# Write the feature-engineered frame to disk, replacing any existing file of
# the same name. index=True (the pandas default) is spelled out: the row index
# is saved as the first, unnamed column of the CSV.
output_path = "transaction_data_feature_engineered (python).csv"
p_df.to_csv(output_path, index=True)
print("File overwritten. Please note the changes you make in this file")
# Add a log entry every time this cell is run.
# Last run 03-15 -- initial commit

File overwritten. Please note the changes you make in this file
