In [1]:
import pickle
import pandas as pd
from tqdm import tqdm
from pathlib import Path

In [2]:
# Register tqdm's pandas integration so `.progress_apply` (used for the IP → country lookup) is available.
tqdm.pandas()

# Load trained model

In [3]:
# Deserialize the fitted model from disk.
# NOTE: pickle.load can execute arbitrary code; only load model files from a trusted source.
with open('../Training/trained_model/the_best_randomforest.pickle', 'rb') as file:
    random_forest = pickle.load(file)

# Load target data

In [4]:
# Directory (relative to this notebook) that holds the input datasets.
data_path = Path("../Dataset/")

In [5]:
# Read the unlabeled holdout set from the shared dataset directory (`data_path`)
# instead of repeating the directory as a string literal, then drop the
# leftover 'Unnamed: 0' column that the CSV carries.
target_data = pd.read_csv(data_path / 'fraud_holdout_no_label.csv').drop(columns=['Unnamed: 0'])

In [6]:
# Preview the first rows to sanity-check the loaded columns.
target_data.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address
0,159135,2015-05-21 6:03,2015-07-09 8:05,42,ALEYXFXINSXLZ,Ads,Chrome,M,18,2809315000.0
1,50116,2015-08-01 22:40,2015-08-27 3:37,11,IWKVZHJOCLPUR,Ads,Chrome,F,19,3987484000.0
2,182338,2015-01-25 17:49,2015-03-23 23:05,62,NRFFPPHZYFUVC,Ads,IE,M,31,341674700.0
3,199700,2015-07-11 18:26,2015-10-28 21:59,13,TEPSJVVXGNTYR,Ads,Safari,F,35,1819009000.0
4,73884,2015-05-29 16:22,2015-06-16 5:45,58,ZTZZJUCRDOCJZ,Direct,Chrome,M,32,4038285000.0


# Preprocessing

## Time interval

In [7]:
# Parse the raw timestamp strings into datetime columns.
target_data['signup_time_dt'] = pd.to_datetime(target_data['signup_time'])
target_data['purchase_time_dt'] = pd.to_datetime(target_data['purchase_time'])
# Elapsed time between signup and purchase. Despite its name, this column holds
# the full timedelta (purchase minus signup), not a day count.
target_data['days_signup_purchase'] = target_data['purchase_time_dt'].sub(target_data['signup_time_dt'])
# The same gap expressed in minutes.
target_data['minutes_signup_purchase'] = target_data['days_signup_purchase'].dt.total_seconds().div(60)

## Device count

In [8]:
# Count how many rows share each device_id, then attach that count to every row.
# NOTE(review): counts are computed over this holdout file only — confirm this
# matches how 'device_count' was built for training.
device_ids = target_data['device_id']
device_map = device_ids.value_counts().to_dict()
target_data['device_count'] = device_ids.map(device_map)

## Country

In [9]:
def get_ip_country(ip, ip_ranges=None):
    """Return the country whose IP range contains ``ip``.

    Parameters
    ----------
    ip : number
        Numeric IP address to look up.
    ip_ranges : pandas.DataFrame, optional
        Lookup table with the columns ``lower_bound_ip_address``,
        ``upper_bound_ip_address`` and ``country``. When omitted, the
        notebook-level ``ip_address`` frame is used (resolved at call time,
        so it only needs to exist before the first call).

    Returns
    -------
    str
        The ``country`` of the first row whose inclusive
        [lower bound, upper bound] range contains ``ip``, or
        ``'Not available'`` when no row matches (this includes a NaN ``ip``,
        for which every comparison is False).

    Raises
    ------
    KeyError
        If the lookup table lacks one of the required columns. The previous
        bare ``except`` silently turned a missing ``country`` column into
        ``'Not available'``.
    """
    if ip_ranges is None:
        ip_ranges = ip_address

    # Build a single boolean mask instead of filtering the frame twice.
    in_range = (
        (ip_ranges['lower_bound_ip_address'] <= ip)
        & (ip_ranges['upper_bound_ip_address'] >= ip)
    )
    matches = ip_ranges.loc[in_range, 'country']

    # Only the "no matching range" case maps to the sentinel value.
    if matches.empty:
        return 'Not available'
    return matches.iloc[0]

In [10]:
# IP-range → country lookup table; get_ip_country reads this frame as a global.
ip_address = pd.read_excel(data_path/"Candidate_tech_evaluation_candidate_copy_datascience_IpAddress_to_Country.xlsx")

In [11]:
# Resolve each row's numeric IP to a country, with a tqdm progress bar.
# Every call filters the whole ip_address table, so this is the slow step of the notebook.
target_data['ip_country'] = target_data['ip_address'].progress_apply(get_ip_country)

100%|███████████████████████████████████| 31112/31112 [00:25<00:00, 1226.41it/s]


In [12]:
# Spot-check a random handful of resolved countries (no seed is set, so the rows shown vary between runs).
target_data['ip_country'].sample(10)

13303    United States
28872    United States
26906    Not available
15979    United States
22404    United States
24073    United States
28617    United States
4439     Not available
25716    United States
13960            China
Name: ip_country, dtype: object

## Load encoder

In [13]:
# Load the pickled country encoder and transform the resolved countries.
# NOTE: pickle.load can execute arbitrary code; only load encoder files from a trusted source.
with open("../Preprocessing/encoder/country_onehot.pickle", 'rb') as file:
    enc = pickle.load(file)
# Reuse target_data's index: the feature frames are combined later with an
# index-based join, so the encoded rows must carry the same labels as their
# source rows rather than a fresh 0..n-1 index.
country_one_hot = pd.DataFrame(
    enc.transform(target_data[['ip_country']]).toarray(),
    index=target_data.index,
).add_prefix('country_')
country_one_hot



Unnamed: 0,country_0,country_1,country_2,country_3,country_4,country_5,country_6,country_7,country_8,country_9,...,country_167,country_168,country_169,country_170,country_171,country_172,country_173,country_174,country_175,country_176
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31108,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31109,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Extract the hour of day from each parsed timestamp.
target_data['signup_time_h_dt'] = target_data['signup_time_dt'].dt.hour
target_data['purchase_time_h_dt'] = target_data['purchase_time_dt'].dt.hour

In [15]:
def set_time_group(time):
    """Map an hour of day to a coarse part-of-day label.

    Hours 6-11 give 'morning', 12-17 'afternoon', 18-23 'evening'.
    Any other value (including 0-5, values outside 0-23, and values that
    equal no whole hour in those ranges) gives 'night'.
    """
    hour_bands = (
        ('morning', range(6, 12)),
        ('afternoon', range(12, 18)),
        ('evening', range(18, 24)),
    )
    for label, hours in hour_bands:
        if time in hours:
            return label
    # Anything not matched above is treated as night.
    return 'night'

In [16]:
# Bucket the signup and purchase hours into part-of-day labels via set_time_group.
target_data['sign_time_range'] = target_data['signup_time_h_dt'].apply(set_time_group)
target_data['purchase_time_range'] = target_data['purchase_time_h_dt'].apply(set_time_group)

In [17]:
# Load the pickled signup-time encoder and transform the signup part-of-day labels.
# NOTE: pickle.load can execute arbitrary code; only load encoder files from a trusted source.
with open("../Preprocessing/encoder/signtime_onehot.pickle", 'rb') as file:
    enc = pickle.load(file)
# Reuse target_data's index so the later index-based join pairs each encoded
# row with its source row instead of relying on a default 0..n-1 index.
sign_time_one_hot = pd.DataFrame(
    enc.transform(target_data[['sign_time_range']]).toarray(),
    index=target_data.index,
).add_prefix('signTime_')
sign_time_one_hot

Unnamed: 0,signTime_0,signTime_1,signTime_2,signTime_3
0,0.0,0.0,1.0,0.0
1,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0
4,1.0,0.0,0.0,0.0
...,...,...,...,...
31107,1.0,0.0,0.0,0.0
31108,0.0,1.0,0.0,0.0
31109,0.0,1.0,0.0,0.0
31110,0.0,0.0,1.0,0.0


In [18]:
# Load the pickled purchase-time encoder and transform the purchase part-of-day labels.
# NOTE: pickle.load can execute arbitrary code; only load encoder files from a trusted source.
with open("../Preprocessing/encoder/purchasetime_onehot.pickle", 'rb') as file:
    enc = pickle.load(file)
# Reuse target_data's index so the later index-based join pairs each encoded
# row with its source row instead of relying on a default 0..n-1 index.
purchase_time_one_hot = pd.DataFrame(
    enc.transform(target_data[['purchase_time_range']]).toarray(),
    index=target_data.index,
).add_prefix('purchaseTime_')
purchase_time_one_hot

Unnamed: 0,purchaseTime_0,purchaseTime_1,purchaseTime_2,purchaseTime_3
0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,1.0
2,0.0,1.0,0.0,0.0
3,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,1.0
...,...,...,...,...
31107,0.0,0.0,0.0,1.0
31108,1.0,0.0,0.0,0.0
31109,0.0,0.0,1.0,0.0
31110,0.0,0.0,1.0,0.0


In [19]:
# Load the pickled source encoder and transform the 'source' column.
# NOTE: pickle.load can execute arbitrary code; only load encoder files from a trusted source.
with open("../Preprocessing/encoder/source_onehot.pickle", 'rb') as file:
    enc = pickle.load(file)
# Reuse target_data's index so the later index-based join pairs each encoded
# row with its source row instead of relying on a default 0..n-1 index.
source_one_hot = pd.DataFrame(
    enc.transform(target_data[['source']]).toarray(),
    index=target_data.index,
).add_prefix('source_')
source_one_hot

Unnamed: 0,source_0,source_1,source_2
0,1.0,0.0,0.0
1,1.0,0.0,0.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,0.0,1.0,0.0
...,...,...,...
31107,0.0,0.0,1.0
31108,0.0,0.0,1.0
31109,0.0,1.0,0.0
31110,0.0,1.0,0.0


In [20]:
# Assemble the feature matrix: numeric features first, then each one-hot block,
# joined on the row index in this fixed order.
preprocessed_data = (
    target_data[['minutes_signup_purchase', 'device_count', 'purchase_value']]
    .join(sign_time_one_hot)
    .join(purchase_time_one_hot)
    .join(source_one_hot)
    .join(country_one_hot)
)

In [21]:
# Score every holdout row with the loaded model.
# NOTE(review): presumably the column layout must match what the model was trained on — confirm.
pred = random_forest.predict(preprocessed_data)

In [22]:
# Wrap the predictions in a single-column frame; rows follow the order of preprocessed_data.
# NOTE(review): no identifier such as user_id is carried over — confirm the expected answer format.
answer = pd.DataFrame(pred, columns=['prediction'])

In [23]:
# Show how many rows received each predicted label.
answer.value_counts()

prediction
0             29528
1              1584
Name: count, dtype: int64

# Save results to answer_sheet.csv

In [24]:
# Write the predictions to the current working directory, without the index column.
answer.to_csv('answer_sheet.csv', index=False)