In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from imblearn.under_sampling import RandomUnderSampler

In [4]:
# Read the raw transaction log and show its first row and its (rows, columns) shape.
df = pd.read_csv('PS_20174392719_1491204439457_log.csv')
for summary in (df.head(1), df.shape):
    print(summary)

   step     type   amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1  PAYMENT  9839.64  C1231006815       170136.0       160296.36   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud  
0  M1979787155             0.0             0.0        0               0  
(6362620, 11)


In [5]:
# Distinct originating account ids, in order of first appearance.
uniqueIds = pd.unique(df['nameOrig']).tolist()

# Partition the transactions on the isFraud flag (1 = fraud, 0 = not fraud).
fraud_flag = df['isFraud']
dfFraud = df[fraud_flag.eq(1)]
dfNotFraud = df[fraud_flag.eq(0)]
print(dfFraud.shape)

(8213, 11)


In [6]:
# Draw a small working sample: N_FRAUD fraudulent rows followed by
# N_NOT_FRAUD non-fraudulent rows. The sampling is seeded so that
# Restart & Run All reproduces the same rows (and therefore the same corpus,
# vocabulary and training data) every time.
SAMPLE_SEED = 42
N_FRAUD = 50
N_NOT_FRAUD = 950
dfFinal = pd.concat([
    dfFraud.sample(n=N_FRAUD, random_state=SAMPLE_SEED),
    dfNotFraud.sample(n=N_NOT_FRAUD, random_state=SAMPLE_SEED),
])
print(dfFinal.shape)

(1000, 11)


In [7]:
#preprocess
def norm(df,cols):
    """Min-max scale the given columns of ``df`` to the range [0, 1].

    The frame is modified in place and also returned, so both
    ``norm(df, cols)`` and ``df = norm(df, cols)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are rescaled.
    cols : iterable of str
        Names of the numeric columns to rescale.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with each listed column replaced by
        ``(value - min) / (max - min)``. A column whose values are all equal
        is mapped to 0.0 instead of NaN.
    """
    for col in cols:
        col_min = df[col].min()
        col_range = df[col].max() - col_min
        if col_range == 0:
            # Constant column: (x - min) / 0 would be NaN for every row.
            # Subtracting the minimum yields 0 and keeps any missing values missing.
            df[col] = (df[col] - col_min).astype(float)
        else:
            df[col] = (df[col] - col_min) / col_range
    return df
# Rescale the transaction amount and confirm the resulting bounds (max, then min).
dfFinal = norm(dfFinal, ['amount'])
scaled_amount = dfFinal['amount']
print(scaled_amount.max())
print(scaled_amount.min())

1.0
0.0


In [8]:
def _amount_bucket(scaled_amount):
    """Map a min-max scaled amount to a coarse token: HIGH, MED, LOW or VLOW."""
    if scaled_amount >= 0.8:
        return "HIGH"
    if scaled_amount >= 0.6:
        return "MED"
    if scaled_amount >= 0.3:
        return "LOW"
    # Everything below 0.3 (and anything that fails the comparisons, e.g. NaN).
    return "VLOW"


# Turn each sampled transaction into a space-separated "sentence" of five tokens:
#   <type> <amount bucket> <nameOrig> <nameDest> <Fraud|NotFraud>
# Columns are addressed by name rather than by position so the result does not
# depend on the column order of the CSV.
listSent = [
    " ".join([
        txn_type,
        _amount_bucket(scaled_amount),
        source_id,
        dest_id,
        "Fraud" if fraud_flag == 1 else "NotFraud",
    ])
    for txn_type, scaled_amount, source_id, dest_id, fraud_flag in zip(
        dfFinal['type'],
        dfFinal['amount'],
        dfFinal['nameOrig'],
        dfFinal['nameDest'],
        dfFinal['isFraud'],
    )
]

In [9]:
# Corpus: one sentence per sampled transaction. Token order within a sentence is
# type, amount range, source id, destination id, fraud label.
corpus = listSent

In [10]:
# Vocabulary: the set of distinct tokens that occur anywhere in the corpus.
words = {token for text in corpus for token in text.split(' ')}

In [11]:
# Build the (input, label) training pairs: every token is paired with each
# different token that lies at most WINDOW_SIZE positions away in its sentence.
WINDOW_SIZE = 4

# Integer id for every vocabulary word, in the iteration order of `words`.
word2int = {token: position for position, token in enumerate(words)}
sentences = [line.split() for line in corpus]

data = []
for tokens in sentences:
    for idx, center in enumerate(tokens):
        window_start = max(idx - WINDOW_SIZE, 0)
        window_stop = idx + WINDOW_SIZE + 1  # slicing clamps this to len(tokens)
        data.extend(
            [center, neighbor]
            for neighbor in tokens[window_start:window_stop]
            if neighbor != center
        )
count = len(sentences)  # number of sentences processed

import pandas as pd
# NOTE(review): this rebinds `df`, which earlier held the raw transaction log.
df = pd.DataFrame(data, columns = ['input', 'label'])
print(df.head(10))
print(df.shape)

         input        label
0     TRANSFER          MED
1     TRANSFER  C1185526711
2     TRANSFER   C342454180
3     TRANSFER        Fraud
4          MED     TRANSFER
5          MED  C1185526711
6          MED   C342454180
7          MED        Fraud
8  C1185526711     TRANSFER
9  C1185526711          MED
(20000, 2)


In [12]:
#Draw graph using tensorflow
import tensorflow as tf
import numpy as np

  from ._conv import register_converters as _register_converters


In [13]:
ONE_HOT_DIM = len(words)

def to_one_hot_encoding(data_point_index):
    """Return a length-ONE_HOT_DIM vector of zeros with a 1 at ``data_point_index``."""
    one_hot_encoding = np.zeros(ONE_HOT_DIM)
    one_hot_encoding[data_point_index] = 1
    return one_hot_encoding

# One-hot encode every (input word, target word) pair.
# NOTE(review): these are dense (n_pairs x ONE_HOT_DIM) matrices; memory grows
# quickly with the vocabulary size.
X = [to_one_hot_encoding(word2int[input_word]) for input_word in df['input']]  # input word
Y = [to_one_hot_encoding(word2int[label_word]) for label_word in df['label']]  # target word
# convert them to numpy arrays
X_train = np.asarray(X)
Y_train = np.asarray(Y)

# making placeholders for X_train and Y_train
x = tf.placeholder(tf.float32, shape=(None, ONE_HOT_DIM))
y_label = tf.placeholder(tf.float32, shape=(None, ONE_HOT_DIM))

# word embedding will be 2 dimension for 2d visualization
EMBEDDING_DIM = 2

# hidden layer: which represents word vector eventually
W1 = tf.Variable(tf.random_normal([ONE_HOT_DIM, EMBEDDING_DIM]))
b1 = tf.Variable(tf.random_normal([1]))  # single bias value, broadcast over both embedding dimensions
hidden_layer = tf.add(tf.matmul(x, W1), b1)

# output layer: unnormalised scores (logits) over the vocabulary
W2 = tf.Variable(tf.random_normal([EMBEDDING_DIM, ONE_HOT_DIM]))
b2 = tf.Variable(tf.random_normal([1]))
logits = tf.add(tf.matmul(hidden_layer, W2), b2)
prediction = tf.nn.softmax(logits)

# loss function: cross entropy, averaged over the batch.
# It is computed from the logits with the fused op instead of
# -sum(y * log(softmax(logits))): taking the log of an explicit softmax
# produces log(0) = -inf (and then NaN) as soon as a probability underflows.
loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y_label, logits=logits)
)

# training operation
train_op = tf.train.GradientDescentOptimizer(0.05).minimize(loss)

In [None]:
# Training phase: full-batch gradient descent on the skip-gram pairs.
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

iteration = 5000
# Inputs are the one-hot encoded words, labels the one-hot encoded neighbor words.
train_feed = {x: X_train, y_label: Y_train}
for i in range(iteration):
    sess.run(train_op, feed_dict=train_feed)
    if i % 100 != 0:
        continue
    # Report the loss on the training set after every 100th update.
    print('iteration '+str(i)+' loss is : ', sess.run(loss, feed_dict=train_feed))

# For a one-hot input the hidden layer output is a row of W1 plus b1,
# so W1 + b1 serves as the word lookup table.
vectors = sess.run(W1 + b1)
print(vectors)

iteration 0 loss is :  9.349032
iteration 100 loss is :  8.006699
iteration 200 loss is :  7.568957


In [None]:
#word vector in a table
# Row i of `vectors` is the embedding of the word whose id in word2int is i.
# Order the labels by that id explicitly: assigning the `words` set directly
# relies on set iteration order and is rejected by current pandas
# ("'set' type is unordered").
index_to_word = sorted(word2int, key=word2int.get)
w2v_df = pd.DataFrame(vectors, columns = ['x1', 'x2'])
w2v_df['word'] = index_to_word
w2v_df = w2v_df[['word', 'x1', 'x2']]
w2v_df
    

In [None]:
#wordvector in 2d chart
import matplotlib.pyplot as plt
# Scatter the 2-d word embeddings: the "Fraud" token in red, the amount-range
# tokens in green, every other word as a small blue dot.
# The figure size must be chosen before the figure is created; setting
# rcParams afterwards only affects later figures.
plt.rcParams["figure.figsize"] = (10,10)
fig, ax = plt.subplots(figsize=(10, 10))

AMOUNT_TOKENS = ['HIGH','MED','LOW','VLOW']
is_fraud = w2v_df['word'] == "Fraud"
is_amount = w2v_df['word'].isin(AMOUNT_TOKENS)
is_other = ~(is_fraud | is_amount)

# One plot call per group instead of one per word; the highlighted groups are
# drawn last so they are not hidden under the blue dots.
for group_mask, marker_format in ((is_other, "b."), (is_amount, "go"), (is_fraud, "ro")):
    ax.plot(w2v_df.loc[group_mask, 'x1'], w2v_df.loc[group_mask, 'x2'], marker_format)

# Leave some room around the outermost points.
PADDING = 1.0
x_axis_min = np.amin(vectors, axis=0)[0] - PADDING
y_axis_min = np.amin(vectors, axis=0)[1] - PADDING
x_axis_max = np.amax(vectors, axis=0)[0] + PADDING
y_axis_max = np.amax(vectors, axis=0)[1] + PADDING
ax.set_xlim(x_axis_min, x_axis_max)
ax.set_ylim(y_axis_min, y_axis_max)
ax.set_xlabel('x1')
ax.set_ylabel('x2')
ax.set_title('Word embeddings (red: Fraud, green: amount range, blue: other)')
plt.show()

In [None]:
from sklearn.metrics.pairwise import euclidean_distances
from scipy.spatial import distance

# Rank every word by the Euclidean distance between its embedding and the
# embedding of the "Fraud" token (closest first).
fraud_rows = w2v_df.loc[w2v_df['word'] == 'Fraud', ['x1', 'x2']]
if fraud_rows.empty:
    raise ValueError("'Fraud' token not found in w2v_df; cannot compute distances")

# Keep the reference point as a (1, 2) array and compute all distances in one
# call; distance.euclidean expects 1-D vectors and was being handed a nested list.
fraud_xy = fraud_rows.values[:1]
all_xy = w2v_df[['x1', 'x2']].values
fraud_distances = distance.cdist(all_xy, fraud_xy).ravel()

# [distance, word] pairs sorted by distance (ties broken by word).
# Stored under a descriptive name so the builtin `list` is not shadowed.
distances_to_fraud = sorted(
    [dist, word] for dist, word in zip(fraud_distances.tolist(), w2v_df['word'])
)
print(distances_to_fraud)