In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import reverse_geocoder as rg
from collections import Counter
from sklearn import preprocessing
import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
import pickle

In [2]:
# Input CSV to run predictions on. Despite the "Train" in the file name,
# this notebook treats it as the test file.
file_path = '../fraudTrain.csv' #assume it is test file

# The first CSV column carries no feature information, so it is consumed as
# the DataFrame index instead of being kept as a data column.
df = pd.read_csv(file_path,index_col = 0)

In [3]:
## Drop columns that are not used as model features.
## - 'trans_date_trans_time' is redundant: the UNIX timestamp column is used
##   for all time-derived features instead.
## - The remaining columns (coordinates, merchant, address parts, names,
##   transaction id) are dropped as well.
### NOTE: some of these columns may be worth keeping, but they are dropped
### because there was not enough time to process them.
# Each label is listed exactly once (the original list repeated 'city').
drop_col = ['trans_date_trans_time','lat','long','merchant','street',
            'zip','city','first','trans_num','last']
df = df.drop(columns=drop_col)

In [4]:
### Convert a date of birth into a coarse age bucket (age in decades)
def map_dob(dob):
    """Return the age bucket for a date-of-birth string.

    The first four characters of ``dob`` are parsed as the birth year. The
    age is measured against the fixed reference year 2020, divided by 10 and
    rounded with the built-in ``round``, which rounds halves to the nearest
    even integer (age 25 -> 2, age 35 -> 4).
    """
    reference_year = 2020
    birth_year = int(dob[:4])
    return round((reference_year - birth_year) / 10)

# Derive the age bucket for every row, then discard the raw birth date.
age_buckets = df['dob'].apply(map_dob)
df['age_period'] = age_buckets
df = df.drop(columns=['dob'])

In [5]:
##### The credit card number has too many distinct values to use directly:
# print("Total unique number of credit card number: ",len(df['cc_num'].unique()))
### Maybe its length can be used to represent it instead.
def cc_num_to_length(cc_num):
    """Return the number of characters in ``cc_num`` once converted to a string."""
    as_text = str(cc_num)
    return len(as_text)

# Replace the raw card number with its length feature.
card_lengths = df['cc_num'].apply(cc_num_to_length)
df['cc_num_len'] = card_lengths
df = df.drop(columns=['cc_num'])

In [6]:
### Merchant coordinates (merch_lat, merch_long).
### The reverse-geocoding sketch below is DISABLED (fully commented out) and
### never runs, so no 'merch_country' column is created; the only live
### statement in this cell drops the two coordinate columns.
# NOTE(review): before re-enabling the sketch, note that it references
# `Nominatim`, which is not among the imports shown in this notebook, and that
# it indexes with `df.iloc[i]` while iterating over `df.index` (labels rather
# than positions). The ['cc'] field looks like a country code, not a city —
# TODO confirm.
# geolocater = Nominatim(user_agent='tutorial')
# def map_lat_long(lat,long):
#     return rg.search((lat,long))[0]['cc']

# country = []
# for i in df.index:
#     lat,long = df.iloc[i]['merch_lat'],df.iloc[i]['merch_long']
#     country.append(map_lat_long(lat,long))

# df['merch_country'] = country
df = df.drop(['merch_lat','merch_long'],axis=1)

In [7]:
#### Standardize the floating-point columns (zero mean, unit variance)
# NOTE(review): preprocessing.scale derives the mean/std from the data it is
# given (this file) — confirm that matches how the training features were scaled.
for scaled_col in ('amt', 'city_pop'):
    column_2d = np.array(df[scaled_col]).reshape(-1, 1)
    df[scaled_col] = preprocessing.scale(column_2d)

In [8]:
### UNIX time can be used to derive year, month, day and time period
def get_week_of_month(year, month, day):
    """Return the 1-based week of the month containing the given date.

    Weeks are counted with ``strftime("%W")`` (Monday is the first day of the
    week), so the week holding the 1st of the month is week 1 and the number
    increases at every following Monday.

    Parameters:
        year, month, day: integers forming a valid calendar date.

    Returns:
        int: week index within the month, starting at 1.

    Raises:
        ValueError: if the arguments do not form a valid date.
    """
    first_week = int(datetime.date(year, month, 1).strftime("%W"))
    this_week = int(datetime.date(year, month, day).strftime("%W"))
    # The original computed both week numbers but never returned anything.
    return this_week - first_week + 1
    
def extract_time_stamp(ts):
    """Split a UNIX timestamp into the (year, month, time_period) features.

    Parameters:
        ts: seconds since the epoch; interpreted in UTC.

    Returns:
        tuple: ``(year, month, time_period)`` where ``year`` and ``month`` are
        ints and ``time_period`` is one of ``'EarlyMorning'`` (hours 0-6),
        ``'Morning'`` (7-12), ``'Afternoon'`` (13-18) or ``'Night'`` (19-23).
    """
    # Timezone-aware conversion; replaces the deprecated
    # datetime.utcfromtimestamp and the string formatting/slicing round trip.
    moment = datetime.datetime.fromtimestamp(ts, tz=datetime.timezone.utc)
    hour = moment.hour

    # The week of the month is not part of the returned features, so it is no
    # longer computed here for every row (its result was never used).
    if 0 <= hour <= 6:
        time_period = 'EarlyMorning'
    elif 6 < hour <= 12:
        time_period = 'Morning'
    elif 12 < hour <= 18:
        time_period = 'Afternoon'
    else:
        time_period = 'Night'

    return moment.year, moment.month, time_period

# Expand every timestamp into its three features, transpose the per-row
# tuples into per-feature tuples, then discard the raw timestamp.
years, months, periods = zip(*df['unix_time'].map(extract_time_stamp))
df['year'] = years
df['month'] = months
df['time_period'] = periods
df = df.drop(columns=['unix_time'])

In [9]:
### Map job to inverse of frequency
# `dic` is read from a pickle and later indexed by job in map_job; presumably
# it holds per-job counts computed on the training data — TODO confirm.
# NOTE(review): pickle.load can execute arbitrary code, so only load files
# produced by this project.
with open('../Q4_output/dict.pkl', 'rb') as f:
    dic = pickle.load(f)

# Hard-coded denominator used to turn dic[job] into a relative frequency.
n = 1296675 #number of training examples

def map_job(job):
    """Return the inverse relative frequency of ``job``.

    Looks up ``dic[job]``, divides it by the module-level ``n`` and returns
    the reciprocal of that ratio. A ``job`` that is missing from ``dic`` is
    not handled here; the lookup error propagates to the caller.
    """
    share = dic[job] / n
    return 1 / share

# Replace each job title with its inverse-frequency weight.
job_weights = df['job'].apply(map_job)
df['job'] = job_weights

In [10]:
# Since all remaining string variables are nominal, use one-hot encoding.
# NOTE(review): the dummy columns are derived only from the values present in
# this file; verify that they match the feature columns the model was trained on.
df = pd.get_dummies(df)

### Call model to predict

In [11]:
# Feature matrix: every column except the 'is_fraud' label.
X = df.loc[:, df.columns != 'is_fraud']

In [12]:
# NOTE(review): pickle.load can execute arbitrary code, so only load model
# files produced by this project.
with open('../Q4_output/model.pkl', 'rb') as f:
    # load the previously saved model
    model = pickle.load(f)

In [13]:
# One-hot encoding this file can yield a different set or order of columns
# than the model was fitted on (categories missing here, or unseen in
# training). When the model exposes the fitted column names, align X to them:
# columns absent from this file are added as 0 and unseen ones are dropped.
expected_cols = getattr(model, 'feature_names_in_', None)
if expected_cols is not None:
    expected_set = set(expected_cols)
    missing_cols = [c for c in expected_cols if c not in X.columns]
    extra_cols = [c for c in X.columns if c not in expected_set]
    if missing_cols or extra_cols:
        # Report the adjustment so a preprocessing mismatch is not hidden.
        print("Aligning features: added", len(missing_cols),
              "missing column(s) as 0, dropped", len(extra_cols),
              "unseen column(s)")
    X = X.reindex(columns=expected_cols, fill_value=0)

pred = model.predict(X)

### Output to file

In [14]:
# Write one prediction per line, without an index column or header row.
# `index` and `header` are documented as booleans, so pass False explicitly
# instead of relying on None being falsy.
pd.Series(pred).to_csv('../Q4_output/Q4_predicted_results.csv',
                       index=False, header=False)