### **1. Importing Liabraries**

In [2]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from imblearn.over_sampling import SMOTE
from sklearn import preprocessing

### **2. Loading Datasets**


In [18]:
data=pd.read_csv("../data/clean_data.csv",index_col=0,parse_dates=['trans_date_trans_time','dob'])
data.head(10)
print("Dataset has " , data.shape[0] , " rows and ",data.shape[1] , " columns")
print('column names: \n') 
print('\n'.join(list(data.columns)))

Dataset has  1852394  rows and  29  columns
column names: 

trans_date_trans_time
cc_num
merchant
category
amt
first
last
gender
street
city
state
zip
lat
long
city_pop
job
dob
trans_num
unix_time
merch_lat
merch_long
is_fraud
label
age
trans_year
trans_month
trans_days
trans_week_days
trans_hour


### **3. Deleting Unnecessary Columns**

In [19]:
# list of the columns to be dropped

drop_cols = ['trans_date_trans_time','street','merchant','zip','first','last','trans_num','job','dob','city','trans_days','trans_year']

In [20]:
data.drop(drop_cols, axis = 1, inplace = True)
data.reset_index(drop=True, inplace = True)
list(data.columns) # lets look at the remaining list of columns


['cc_num',
 'category',
 'amt',
 'gender',
 'state',
 'lat',
 'long',
 'city_pop',
 'unix_time',
 'merch_lat',
 'merch_long',
 'is_fraud',
 'label',
 'age',
 'trans_month',
 'trans_week_days',
 'trans_hour']

In [21]:
data.head()

Unnamed: 0,cc_num,category,amt,gender,state,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud,label,age,trans_month,trans_week_days,trans_hour
0,2703186189652095,misc_net,4.97,F,NC,36.0788,-81.1781,3495,1325376018,36.011293,-82.048315,0,Train,30.0,1,Tuesday,0
1,630423337322,grocery_pos,107.23,F,WA,48.8878,-118.2105,149,1325376044,49.159047,-118.186462,0,Train,40.0,1,Tuesday,0
2,38859492057661,entertainment,220.11,M,ID,42.1808,-112.262,4154,1325376051,43.150704,-112.154481,0,Train,56.0,1,Tuesday,0
3,3534093764340240,gas_transport,45.0,M,MT,46.2306,-112.1138,1939,1325376076,47.034331,-112.561071,0,Train,51.0,1,Tuesday,0
4,375534208663984,misc_pos,41.96,M,VA,38.4207,-79.4629,99,1325376186,38.674999,-78.632459,0,Train,32.0,1,Tuesday,0


### **6. Split the Train - Test Dataset**

In [22]:
train_data = data[data['label'] == 'Train'] # splitting data in Train dataset

test_data = data[data['label'] == 'Test'] # test in label


In [23]:
train_data.drop('label', axis =1, inplace = True) # dropping the label column in Train dataset

test_data.drop('label', axis =1, inplace = True) # dropping the label column in Test dataset

### **4. Creating Dummy Variables**

In [24]:
#convert category to dummy variables in both Train and Test datasets

train_data=pd.get_dummies(train_data, drop_first=True)
test_data=pd.get_dummies(test_data, drop_first=True)

In [26]:
test_data.head()

Unnamed: 0,cc_num,amt,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud,age,...,state_WA,state_WI,state_WV,state_WY,trans_week_days_Monday,trans_week_days_Saturday,trans_week_days_Sunday,trans_week_days_Thursday,trans_week_days_Tuesday,trans_week_days_Wednesday
1296675,2291163933867244,2.86,33.9659,-80.9355,333497,1371816865,33.986391,-81.200714,0,52.0,...,0,0,0,0,0,0,1,0,0,0
1296676,3573030041201292,29.84,40.3207,-110.436,302,1371816873,39.450498,-109.960431,0,30.0,...,0,0,0,0,0,0,1,0,0,0
1296677,3598215285024754,41.28,40.6729,-73.5365,34496,1371816893,40.49581,-74.196111,0,49.0,...,0,0,0,0,0,0,1,0,0,0
1296678,3591919803438423,60.05,28.5697,-80.8191,54767,1371816915,28.812398,-80.883061,0,32.0,...,0,0,0,0,0,0,1,0,0,0
1296679,3526826139003047,3.19,44.2529,-85.017,1126,1371816917,44.959148,-85.884734,0,64.0,...,0,0,0,0,0,0,1,0,0,0


### **5. Standardizing**

StandardScaler follows Standard Normal Distribution (SND). Therefore, it makes mean = 0 and scales the data to unit variance.

In [27]:
scaler = preprocessing.StandardScaler()
standard_df = scaler.fit_transform(train_data)
train_data = pd.DataFrame(standard_df,columns=train_data.columns)

In [28]:
scaler = preprocessing.StandardScaler()
standard_df = scaler.fit_transform(test_data)
test_data = pd.DataFrame(standard_df,columns=test_data.columns)


### **7. Saving Data**

In [29]:
# save the data to a new csv file
data.to_csv("../data/train_data.csv")
data.to_csv("../data/test_data.csv")

In [30]:
test_data

Unnamed: 0,cc_num,amt,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud,age,...,state_WA,state_WI,state_WV,state_WY,trans_week_days_Monday,trans_week_days_Saturday,trans_week_days_Sunday,trans_week_days_Thursday,trans_week_days_Tuesday,trans_week_days_Wednesday
0,-0.317252,-0.424463,-0.904377,0.677451,0.816521,-1.703871,-0.894145,0.657586,-0.062248,0.321667,...,-0.121741,-0.150885,-0.141034,-0.124289,-0.511201,-0.355237,2.229166,-0.346132,-0.4971,-0.324153
1,-0.316273,-0.252337,0.351182,-1.472454,-0.292685,-1.703869,0.178126,-1.436610,-0.062248,-0.940350,...,-0.121741,-0.150885,-0.141034,-0.124289,-0.511201,-0.355237,2.229166,-0.346132,-0.4971,-0.324153
2,-0.316254,-0.179353,0.420768,1.216667,-0.178853,-1.703865,0.383257,1.167640,-0.062248,0.149574,...,-0.121741,-0.150885,-0.141034,-0.124289,-0.511201,-0.355237,2.229166,-0.346132,-0.4971,-0.324153
3,-0.316259,-0.059605,-1.970539,0.685934,-0.111371,-1.703861,-1.909485,0.680717,-0.062248,-0.825621,...,-0.121741,-0.150885,-0.141034,-0.124289,-0.511201,-0.355237,2.229166,-0.346132,-0.4971,-0.324153
4,-0.316308,-0.422358,1.128092,0.380004,-0.289942,-1.703861,1.259139,0.316510,-0.062248,1.010040,...,-0.121741,-0.150885,-0.141034,-0.124289,-0.511201,-0.355237,2.229166,-0.346132,-0.4971,-0.324153
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
555714,-0.318978,-0.163467,0.385244,-0.120967,-0.291963,1.510350,0.275527,-0.080241,-0.062248,0.436396,...,-0.121741,-0.150885,-0.141034,-0.124289,-0.511201,-0.355237,-0.448598,2.889070,-0.4971,-0.324153
555715,-0.316286,0.270803,-1.877757,-0.379599,-0.198018,1.510351,-1.742946,-0.433644,-0.062248,-1.456630,...,-0.121741,-0.150885,-0.141034,-0.124289,-0.511201,-0.355237,-0.448598,2.889070,-0.4971,-0.324153
555716,-0.314411,0.111564,1.512121,-2.089408,-0.281427,1.510352,1.592587,-2.146912,-0.062248,-0.424070,...,8.214136,-0.150885,-0.141034,-0.124289,-0.511201,-0.355237,-0.448598,2.889070,-0.4971,-0.324153
555717,-0.318998,-0.391735,1.201709,-1.910685,-0.293261,1.510354,1.163252,-1.955100,-0.062248,0.493760,...,-0.121741,-0.150885,-0.141034,-0.124289,-0.511201,-0.355237,-0.448598,2.889070,-0.4971,-0.324153
