<a href="https://colab.research.google.com/github/solankiharsh/Fraud_Detection/blob/master/tried_creating_edges.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [125]:
import argparse
import logging
import os
import pandas as pd
import numpy as np
from itertools import combinations

In [126]:
output_dir = '/content/drive/My Drive/Sunlife_ml/Experimenting_Graph/output_dir'

In [127]:
transaction_df = pd.read_csv('/content/drive/My Drive/Sunlife_ml/Experimenting_Graph/bs140513_032310.csv')
print("Shape of transaction data is {}".format(transaction_df.shape))
print("# Tagged transactions: {}".format(len(transaction_df) - transaction_df.fraud.isnull().sum()))

Shape of transaction data is (594643, 9)
# Tagged transactions: 594643


In [128]:
transaction_df.head()

Unnamed: 0,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud
0,'C1093826151','4','M','28007','M348934600','28007','es_transportation',4.55,0
1,'C352968107','2','M','28007','M348934600','28007','es_transportation',39.68,0
2,'C2054744914','4','F','28007','M1823072687','28007','es_transportation',26.89,0
3,'C1760612790','3','M','28007','M348934600','28007','es_transportation',17.25,0
4,'C757503768','5','M','28007','M348934600','28007','es_transportation',35.72,0


In [129]:
unique_print_str =  ""
for column in transaction_df:
    unique_print_str += " |{}: {}| ".format(column, transaction_df[column].unique().size)
    #print(df[column].unique().size)
print('---------- Number of unique values per feature ----------')
print(unique_print_str)

---------- Number of unique values per feature ----------
 |customer: 4112|  |age: 8|  |gender: 4|  |zipcodeOri: 1|  |merchant: 50|  |zipMerchant: 1|  |category: 15|  |amount: 23767|  |fraud: 2| 


As seen above, the zipcodeOri and zipMerchant features contain only one unique value.

In [130]:
total = transaction_df.shape[0]
normal = transaction_df[transaction_df.fraud == 0].customer.count()
fraudulent = total - normal

print("The total number of datapoints are {}".format(total))
print("The number of non-fraudulent datapoints are {}, equal to {} % of the dataset".format(normal, round(100 *normal/total, 2)))
print("The number of fraudulent datapoints are {}, equal to {} % of the dataset".format(fraudulent, round(100 *fraudulent/total,2)))

The total number of datapoints are 594643
The number of non-fraudulent datapoints are 587443, equal to 98.79 % of the dataset
The number of fraudulent datapoints are 7200, equal to 1.21 % of the dataset


In [131]:
train_data_ratio = 0.8

In [132]:
transaction_df =  transaction_df.drop(['zipcodeOri', 'zipMerchant'], axis = 1)
transaction_df.head()

Unnamed: 0,customer,age,gender,merchant,category,amount,fraud
0,'C1093826151','4','M','M348934600','es_transportation',4.55,0
1,'C352968107','2','M','M348934600','es_transportation',39.68,0
2,'C2054744914','4','F','M1823072687','es_transportation',26.89,0
3,'C1760612790','3','M','M348934600','es_transportation',17.25,0
4,'C757503768','5','M','M348934600','es_transportation',35.72,0


In [138]:
# Normalizing Numerical Features¶
from sklearn.preprocessing import MinMaxScaler

# Initialize a scaler, then apply it to the features
scaler = MinMaxScaler()

transaction_df[['amount', 'fraud']] = scaler.fit_transform(transaction_df[['amount', 'fraud']])
features.amount = transaction_df.amount


# Show an example of a record with scaling applied
transaction_df.head()

Unnamed: 0,customer,age,gender,merchant,category,amount,fraud
0,'C1093826151','4','M','M348934600','es_transportation',0.000546,0.0
1,'C352968107','2','M','M348934600','es_transportation',0.004764,0.0
2,'C2054744914','4','F','M1823072687','es_transportation',0.003228,0.0
3,'C1760612790','3','M','M348934600','es_transportation',0.002071,0.0
4,'C757503768','5','M','M348934600','es_transportation',0.004288,0.0


In [139]:
# split the features and labels
label = transaction_df.fraud
features = transaction_df.drop('fraud', axis = 1)
features.head()

Unnamed: 0,customer,age,gender,merchant,category,amount
0,'C1093826151','4','M','M348934600','es_transportation',0.000546
1,'C352968107','2','M','M348934600','es_transportation',0.004764
2,'C2054744914','4','F','M1823072687','es_transportation',0.003228
3,'C1760612790','3','M','M348934600','es_transportation',0.002071
4,'C757503768','5','M','M348934600','es_transportation',0.004288


In [140]:
features =  features.drop(['customer'], axis = 1)
features.head()

Unnamed: 0,age,gender,merchant,category,amount
0,'4','M','M348934600','es_transportation',0.000546
1,'2','M','M348934600','es_transportation',0.004764
2,'4','F','M1823072687','es_transportation',0.003228
3,'3','M','M348934600','es_transportation',0.002071
4,'5','M','M348934600','es_transportation',0.004288


In [145]:
non_feature_cols = ['fraud', 'merchant','customer']
feature_cols = [col for col in transaction_df.columns if col not in non_feature_cols]
print("Categorical columns: {}".format(feature_cols))

Categorical columns: ['age', 'gender', 'category', 'amount']


In [146]:
transaction_df.head()

Unnamed: 0,customer,age,gender,merchant,category,amount,fraud
0,'C1093826151','4','M','M348934600','es_transportation',0.000546,0.0
1,'C352968107','2','M','M348934600','es_transportation',0.004764,0.0
2,'C2054744914','4','F','M1823072687','es_transportation',0.003228,0.0
3,'C1760612790','3','M','M348934600','es_transportation',0.002071,0.0
4,'C757503768','5','M','M348934600','es_transportation',0.004288,0.0


In [155]:
 # Get features
non_feature_cols = ['fraud', 'merchant', 'customer'] 
feature_cols = [col for col in transaction_df.columns if col not in non_feature_cols]
print("Categorical columns: {}".format(feature_cols))

Categorical columns: ['age', 'gender', 'category', 'amount']


In [156]:
features_final = pd.get_dummies(feature_cols)

In [167]:
# Get relations
edge_types = feature_cols
print("Found the following distinct relation types: {}".format(edge_types))
id_cols = ['customer'] 

# extract edges
edges = {}
for etype in edge_types:
    edgelist = transaction_df[['customer', etype]].dropna()
    edgelist.to_csv(os.path.join(output_dir, 'relation_{}_edgelist.csv').format(etype), index=False, header=True)
    print("Wrote edgelist to: {}".format(os.path.join(output_dir, 'relation_{}_edgelist.csv').format(etype)))
    edges[etype] = edgelist
return edges

Found the following distinct relation types: ['age', 'gender', 'category', 'amount']
Wrote edgelist to: /content/drive/My Drive/Sunlife_ml/Experimenting_Graph/output_dir/relation_age_edgelist.csv
Wrote edgelist to: /content/drive/My Drive/Sunlife_ml/Experimenting_Graph/output_dir/relation_gender_edgelist.csv
Wrote edgelist to: /content/drive/My Drive/Sunlife_ml/Experimenting_Graph/output_dir/relation_category_edgelist.csv
Wrote edgelist to: /content/drive/My Drive/Sunlife_ml/Experimenting_Graph/output_dir/relation_amount_edgelist.csv


SyntaxError: ignored

In [None]:
homogeneous_edges = []
for etype, relations in edges.items():
  for edge_relation, frame in relations.groupby(etype):
    new_edges = [(a, b) for (a, b) in combinations(frame.customer.values, 2)
              if (a, b) not in homogeneous_edges and (b, a) not in homogeneous_edges]
    homogeneous_edges.extend(new_edges)

with open(os.path.join(output_dir, 'homogeneous_edgelist.csv'), 'w') as f:
  f.writelines(map(lambda x: "{}, {}\n".format(x[0], x[1]), homogeneous_edges))
print("Wrote homogeneous edgelist to file: {}".format(os.path.join(output_dir, 'homogeneous_edgelist.csv')))

In [164]:
def create_homogeneous_edgelist(edges, output_dir):
    homogeneous_edges = []
    for etype, relations in edges.items():
        for edge_relation, frame in relations.groupby(etype):
            new_edges = [(a, b) for (a, b) in combinations(frame.TransactionID.values, 2)
                         if (a, b) not in homogeneous_edges and (b, a) not in homogeneous_edges]
            homogeneous_edges.extend(new_edges)

    with open(os.path.join(output_dir, 'homogeneous_edgelist.csv'), 'w') as f:
        f.writelines(map(lambda x: "{}, {}\n".format(x[0], x[1]), homogeneous_edges))
    logging.info("Wrote homogeneous edgelist to file: {}".format(os.path.join(output_dir, 'homogeneous_edgelist.csv')))