In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import reverse_geocoder as rg
from collections import Counter
from sklearn import preprocessing
import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
import pickle

#### Load data

In [2]:
file_path = '../fraudTrain.csv'

# first column useless
df = pd.read_csv(file_path,index_col = 0)

print(df.shape)
df.head()

(1296675, 22)


Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


#### Data Preprocess (Feature Engineering)

In [3]:
df.info() #No null data

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1296675 entries, 0 to 1296674
Data columns (total 22 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   trans_date_trans_time  1296675 non-null  object 
 1   cc_num                 1296675 non-null  int64  
 2   merchant               1296675 non-null  object 
 3   category               1296675 non-null  object 
 4   amt                    1296675 non-null  float64
 5   first                  1296675 non-null  object 
 6   last                   1296675 non-null  object 
 7   gender                 1296675 non-null  object 
 8   street                 1296675 non-null  object 
 9   city                   1296675 non-null  object 
 10  state                  1296675 non-null  object 
 11  zip                    1296675 non-null  int64  
 12  lat                    1296675 non-null  float64
 13  long                   1296675 non-null  float64
 14  city_pop          

In [4]:
## Drop irrelevant column
## we use UNIX timestamp instead of time
## city and state can represent location, identical number is useless
### Actually, it is necessary to keep them, but time is limited to process
drop_col = ['trans_date_trans_time','lat','long','merchant','street',
            'zip','city','first','trans_num','city','last']
df = df.drop(drop_col,axis=1)

In [5]:
### Transfer date of birth to age period
def map_dob(dob):
    now = 2020
    age = now - int(dob[0:4])
    return round(age / 10)

df['age_period'] = df['dob'].apply(map_dob)
df = df.drop(['dob'],axis=1)

In [6]:
##### check unique value of credit card number
print("Total unique number of credit card number: ",len(df['cc_num'].unique()))
### it is too many. Many be we can use the length of it to represent
def cc_num_to_length(cc_num):
    return len(str(cc_num))

df['cc_num_len'] = df['cc_num'].apply(cc_num_to_length)
df = df.drop(['cc_num'],axis=1)

Total unique number of credit card number:  983


In [7]:
### map merch_lat,merch_long to city
### TIME_CONSUMING!! NOT RUN
# geolocater = Nominatim(user_agent='tutorial')
# def map_lat_long(lat,long):
#     return rg.search((lat,long))[0]['cc']
    
# country = []
# for i in df.index:
#     lat,long = df.iloc[i]['merch_lat'],df.iloc[i]['merch_long']
#     country.append(map_lat_long(lat,long))

# df['merch_country'] = country
df = df.drop(['merch_lat','merch_long'],axis=1)

In [8]:
#### floating value standardization
df['amt'] = preprocessing.scale(np.array(df['amt']).reshape(-1,1))
df['city_pop'] = preprocessing.scale(np.array(df['city_pop']).reshape(-1,1))

In [9]:
### UNIX Time can be used to represent year,month,day,time_period
def get_week_of_month(year, month, day):
    begin = int(datetime.date(year, month, 1).strftime("%W"))
    end = int(datetime.date(year, month, day).strftime("%W"))
    
def extract_time_stamp(ts):
    string = datetime.datetime.utcfromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
    year,month,day,hour = int(string[0:4]),int(string[5:7]),int(string[8:10]),int(string[11:13])
    
    week_of_month = get_week_of_month(year,month,day)
    
    if 0 <= hour <= 6:
        time_period = 'EarlyMorning'
    elif 6< hour <= 12:
        time_period = 'Morning'
    elif 12 < hour <= 18:
        time_period = 'Afternoon'
    else:
        time_period = 'Night'
    
    return year,month,time_period

df['year'],df['month'],df['time_period'] = zip(*df['unix_time'].map(extract_time_stamp))
df = df.drop(['unix_time'],axis=1)

In [10]:
### Map job to inverse of frequency
dic = Counter(df['job'])
pickle.dump('../Q4_output/dict.pkl')
n = len(df)

def map_job(job):
    return 1 / (dic[job] / n)

df['job'] = df['job'].apply(map_job)

In [32]:
with open('../Q4_output/dict.pkl', "wb") as myprofile:
    #dump the dictionary for testing purpose
    pickle.dump(dic,myprofile)

In [12]:
#Since all str variables are nominal, use one-hot encoding
df = pd.get_dummies(df)

#### Split data

In [14]:
X = df[[col for col in df.columns if col != 'is_fraud']]
y = df['is_fraud']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1037340, 78) (259335, 78) (1037340,) (259335,)


### Test on validation data

In [33]:
clf = RandomForestClassifier(200,class_weight={0:0.05,1:0.95},random_state=1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))
print(f"Test set AUC score: {roc_auc_score(y_test, y_pred)}")

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    257815
           1       0.96      0.76      0.85      1520

    accuracy                           1.00    259335
   macro avg       0.98      0.88      0.93    259335
weighted avg       1.00      1.00      1.00    259335

Test set AUC score: 0.8808376595008456


#### The model look OK. Train on Full dataset and export

In [34]:
clf = RandomForestClassifier(200,class_weight={0:0.05,1:0.95},random_state=1)
clf.fit(X,y)

RandomForestClassifier(class_weight={0: 0.05, 1: 0.95}, n_estimators=200,
                       random_state=1)

In [36]:
with open('../Q4_output/model.pkl', 'wb') as f:
    #save model
    pickle.dump(clf, f)