#### Importing Libraries

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KDTree
import numpy as np
import json

#### Read the data

In [224]:
# Display option: do not truncate the number of rows shown for a frame.
pd.set_option("display.max_rows", None)
# data.csv is parsed with ':' as the field separator.
df = pd.read_csv('data.csv',sep=':')
# Preview the first rows to sanity-check the parse.
df.head()

Unnamed: 0,receipt_id,company_id,matched_transaction_id,feature_transaction_id,DateMappingMatch,AmountMappingMatch,DescriptionMatch,DifferentPredictedTime,TimeMappingMatch,PredictedNameMatch,ShortNameMatch,DifferentPredictedDate,PredictedAmountMatch,PredictedTimeCloseMatch
0,10000,10000,10468,10000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,10000,10000,10468,10001,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,10000,10000,10468,10003,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,10000,10000,10468,10004,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,10000,10000,10468,10005,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [20]:
# Column dtypes and non-null counts for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12034 entries, 0 to 12033
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   receipt_id               12034 non-null  object 
 1   company_id               12034 non-null  int64  
 2   matched_transaction_id   12034 non-null  object 
 3   feature_transaction_id   12034 non-null  object 
 4   DateMappingMatch         12034 non-null  float64
 5   AmountMappingMatch       12034 non-null  float64
 6   DescriptionMatch         12034 non-null  float64
 7   DifferentPredictedTime   12034 non-null  float64
 8   TimeMappingMatch         12034 non-null  float64
 9   PredictedNameMatch       12034 non-null  float64
 10  ShortNameMatch           12034 non-null  float64
 11  DifferentPredictedDate   12034 non-null  float64
 12  PredictedAmountMatch     12034 non-null  float64
 13  PredictedTimeCloseMatch  12034 non-null  float64
dtypes: float64(10), int64(

In [21]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
df.describe()

Unnamed: 0,company_id,DateMappingMatch,AmountMappingMatch,DescriptionMatch,DifferentPredictedTime,TimeMappingMatch,PredictedNameMatch,ShortNameMatch,DifferentPredictedDate,PredictedAmountMatch,PredictedTimeCloseMatch
count,12034.0,12034.0,12034.0,12034.0,12034.0,12034.0,12034.0,12034.0,12034.0,12034.0,12034.0
mean,27247.797906,0.217901,0.03166,0.021522,0.986455,0.013877,0.024215,0.037893,0.753532,0.001005,0.076533
std,12024.542988,0.384535,0.122611,0.116995,0.115597,0.116987,0.128646,0.190945,0.430972,0.020134,0.26586
min,10000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,20000.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
50%,30000.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,30000.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
max,50000.0,1.0,0.9,0.8,1.0,1.0,0.8,1.0,1.0,0.6,1.0


#### As per the documentation, the matched_transaction_id and feature_transaction_id columns have to be compared with each other, so we create a new column that flags whether they are equal

In [127]:
def map_transactionid_func(x):
    """Flag whether a row's matched and feature transaction ids agree.

    Parameters
    ----------
    x : mapping-like
        A row exposing the keys ``'matched_transaction_id'`` and
        ``'feature_transaction_id'``.

    Returns
    -------
    int
        ``1`` when the two ids compare equal, otherwise ``0``.
    """
    ids_agree = x['matched_transaction_id'] == x['feature_transaction_id']
    return 1 if ids_agree else 0
# Row-wise flag: 1 where matched_transaction_id equals feature_transaction_id, else 0.
df['TransactionIdMapping'] = df.apply(map_transactionid_func, axis=1)

#### Checking the value counts of receipt IDs to see whether each one has multiple instances

In [234]:
# Cap the number of displayed rows at 10000.
pd.set_option('display.max_rows', 10000)
# Number of rows per receipt id, shown as a one-column frame (top rows only).
df['receipt_id'].value_counts().to_frame().head()

Unnamed: 0,receipt_id
30081,25
30393,25
30303,25
30053,23
30203,23


#### Converting the receipt ID string to an int for easier classification

In [137]:
# Remove any ',' from the receipt id text, then parse it as an integer.
df['receipt_id_1'] = df['receipt_id'].apply(
    lambda raw_id: int(str(raw_id).replace(',', ''))
)

#### Chosen columns for unsupervised classification, as we don't have labeled data. All the remaining columns are just unique identifiers that are already captured by the derived columns

In [140]:
# Feature frame used for the neighbour search: the two id columns, the derived
# id-equality flag, and the ten match-score columns.
df1 = df[[
    'receipt_id_1',
    'company_id',
    'TransactionIdMapping',
    'DateMappingMatch',
    'AmountMappingMatch',
    'DescriptionMatch',
    'DifferentPredictedTime',
    'TimeMappingMatch',
    'PredictedNameMatch',
    'ShortNameMatch',
    'DifferentPredictedDate',
    'PredictedAmountMatch',
    'PredictedTimeCloseMatch',
]]
df1.head()

Unnamed: 0,receipt_id_1,company_id,TransactionIdMapping,DateMappingMatch,AmountMappingMatch,DescriptionMatch,DifferentPredictedTime,TimeMappingMatch,PredictedNameMatch,ShortNameMatch,DifferentPredictedDate,PredictedAmountMatch,PredictedTimeCloseMatch
0,10000,10000,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,10000,10000,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,10000,10000,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,10000,10000,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,10000,10000,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


#### Planning to use unsupervised classification based on the nearest-neighbours algorithm. Our data is not labeled, but we can still find similar rows based on matching feature vectors, as said in the PDF. KDTree works well for this kind of similarity matching. (We could even standardize the features beforehand if needed.)

In [228]:
# Alternative kept for reference: build the tree on a standardized array.
# `array_df1_std` is not defined anywhere in the visible notebook.
#kdt = KDTree(array_df1_std, leaf_size=30, metric='euclidean')
# Build the KD-tree directly on the raw (unscaled) feature frame.
# NOTE(review): receipt_id_1 and company_id are in the tens of thousands while
# the match scores lie in [0, 1], so unscaled distances are likely dominated by
# the id columns -- confirm this is intended.
kdt = KDTree(df1, leaf_size=30)

In [229]:
# Query the tree with the single row at position 152 and ask for its 5 nearest
# neighbours; the query row is itself in the tree, so it comes back at distance 0.
dist,ind = kdt.query(df1[152:153], k=5)
print(dist)
# Positions of the neighbours for the (only) query row.
k =list(ind[0])
print(k)

# Look up the neighbours in the full frame (all original columns) by position.
df.iloc[k,:]
#df.iloc[108:115]

[[0.  0.4 1.  1.  1. ]]
[152, 151, 136, 169, 153]


Unnamed: 0,receipt_id,company_id,matched_transaction_id,feature_transaction_id,DateMappingMatch,AmountMappingMatch,DescriptionMatch,DifferentPredictedTime,TimeMappingMatch,PredictedNameMatch,ShortNameMatch,DifferentPredictedDate,PredictedAmountMatch,PredictedTimeCloseMatch
152,10010,10000,10405,10404,0.95,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
151,10010,10000,10405,10403,0.95,0.4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
136,10009,10000,10484,10482,0.95,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
169,10011,10000,10659,10658,0.95,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
153,10010,10000,10405,10405,0.95,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


#### A sample function to directly find the top matches for any new input. It takes a matching vector and returns the top matches

In [230]:
def get_top_matches(sample_data, k=5):
    """Return the rows of ``df`` nearest to ``sample_data`` in the KD-tree.

    Relies on the notebook globals ``kdt`` (the fitted tree), ``df1`` (the
    feature frame the tree was built from) and ``df`` (the full frame whose
    rows are returned).

    Parameters
    ----------
    sample_data : dict
        One feature vector keyed by the column names of ``df1``. Key order
        does not matter; keys that are not feature columns are ignored.
        If several records are passed, only the first one is matched.
    k : int, default 5
        Number of nearest neighbours to return.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df`` in the order returned by ``kdt.query``,
        re-indexed from 0, with an extra ``distance`` column.

    Raises
    ------
    ValueError
        If ``sample_data`` lacks one or more of the feature columns.
    """
    feature_columns = list(df1.columns)
    query_frame = pd.json_normalize(sample_data)

    missing = [col for col in feature_columns if col not in query_frame.columns]
    if missing:
        raise ValueError(f"sample_data is missing feature column(s): {missing}")

    # The tree compares values by position, so the query must follow the same
    # column order as the frame the tree was built from.
    query_frame = query_frame[feature_columns]

    dist, ind = kdt.query(query_frame, k=k)

    # Row 0 of each result array belongs to the first (normally only) query row.
    return (
        df.iloc[ind[0]]
        .reset_index(drop=True)
        .assign(distance=dist[0])
    )

In [233]:

# Row 152 of the feature frame, round-tripped through JSON into a plain dict.
# Not used below: the call is made with the hand-written sample_data_1 instead.
sample_data = json.loads(df1[152:153].to_json(orient='records'))[0]
# Hand-written feature vector with one entry per df1 column.
sample_data_1 = {'receipt_id_1': 10010,
 'company_id': 10000,
 'TransactionIdMapping': 0,
 'DateMappingMatch': 0.95,
 'AmountMappingMatch': 0.0,
 'DescriptionMatch': 0.0,
 'DifferentPredictedTime': 1.0,
 'TimeMappingMatch': 0.0,
 'PredictedNameMatch': 0.0,
 'ShortNameMatch': 0.0,
 'DifferentPredictedDate': 0.0,
 'PredictedAmountMatch': 0.0,
 'PredictedTimeCloseMatch': 0.0}

# Look up the nearest rows for the hand-written vector and display them.
res = get_top_matches(sample_data_1)
print('Top matches:')
res


Top matches:


Unnamed: 0,receipt_id,company_id,matched_transaction_id,feature_transaction_id,DateMappingMatch,AmountMappingMatch,DescriptionMatch,DifferentPredictedTime,TimeMappingMatch,PredictedNameMatch,ShortNameMatch,DifferentPredictedDate,PredictedAmountMatch,PredictedTimeCloseMatch,TransactionIdMapping,receipt_id_1,distance
0,10010,10000,10405,10404,0.95,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,10010,0.0
1,10010,10000,10405,10403,0.95,0.4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,10010,0.4
2,10009,10000,10484,10482,0.95,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,10009,1.0
3,10011,10000,10659,10658,0.95,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,10011,1.0
4,10010,10000,10405,10405,0.95,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1,10010,1.0
