In [1]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


# Part 1 Read and Process Data

In [2]:
# Import packages
import pandas as pd

In [3]:
# read data
Fraud = pd.read_csv('Fraud_Data.csv')
IP = pd.read_csv('IpAddress_to_Country.csv')

In [4]:
# Check data types
Fraud.dtypes

user_id             int64
signup_time        object
purchase_time      object
purchase_value      int64
device_id          object
source             object
browser            object
sex                object
age                 int64
ip_address        float64
class               int64
dtype: object

In [5]:
# check if IP address is unique
Fraud[Fraud.duplicated(['ip_address'], keep=False)]

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2.621474e+09,1
24,171711,2015-01-11 01:51:15,2015-01-11 01:51:16,52,YPLQGKBAAULUV,Direct,IE,F,35,1.120619e+09,1
29,151705,2015-01-07 03:44:53,2015-01-07 03:44:54,48,URHCRIXOMLJMH,SEO,Chrome,F,27,2.836025e+09,1
64,181004,2015-01-07 17:45:16,2015-01-07 17:45:17,10,WETYPHOQVLWMK,Ads,FireFox,M,36,2.293333e+09,1
77,392706,2015-01-07 08:15:53,2015-01-07 08:15:54,65,VRYLIRAQJIIIE,SEO,Opera,F,30,1.213105e+09,1
83,67592,2015-01-10 23:23:25,2015-01-10 23:23:26,38,ITUMJCKWEYNDD,Ads,Safari,M,43,3.874758e+09,1
111,352206,2015-01-06 21:48:13,2015-01-06 21:48:14,45,FRITFBEXOSJEI,Ads,IE,F,22,1.367518e+09,1
134,104061,2015-01-05 00:09:57,2015-01-05 00:09:58,23,XSEQHFFOYFICY,SEO,IE,M,33,2.011989e+09,1
141,348260,2015-01-10 17:13:50,2015-01-10 17:13:51,18,VOQAAYLLJLUII,SEO,Safari,M,27,7.989386e+08,1
160,121703,2015-01-06 08:56:42,2015-01-06 08:56:43,28,XAVNTONBKCVJS,Direct,Safari,M,34,2.890012e+09,1


In [6]:
# check if data contain missing value
Fraud.isnull().values.any()

False

In [7]:
# create a dictionary to store ip address, use the range of lower and upper bond as the key
IP['IpRange'] = list(zip(IP['lower_bound_ip_address'], IP['upper_bound_ip_address']))
del IP['lower_bound_ip_address']
del IP['upper_bound_ip_address']
IpDic = IP.set_index('IpRange').to_dict().get('country')

In [8]:
# extract a sample to write code first
Fraud = Fraud.head(10000)

In [9]:
# Method to locate the key by a value
def get_rate(dic, value):
    for key in dic:
        if key[0] <= value <= key[1]:
            return dic[key]

In [10]:
# locate country by IP address
Fraud['country'] = 0
for index, ip in enumerate(Fraud['ip_address']):
    Fraud['country'][index] = get_rate(IpDic, ip)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [11]:
# summary country distributions
Fraud['country'].value_counts()

United States                      3858
China                               837
Japan                               447
United Kingdom                      309
Korea Republic of                   292
Germany                             237
France                              209
Canada                              195
Brazil                              193
Australia                           130
Italy                               124
Netherlands                         102
Russian Federation                   94
Taiwan; Republic of China (ROC)      86
India                                83
Mexico                               72
Spain                                64
Sweden                               62
South Africa                         54
Indonesia                            47
Poland                               46
Norway                               45
Argentina                            45
Viet Nam                             42
Colombia                             42


##### Conclusion:
1. We have duplicated IP addresses
2. Some IP addresses cannot be located to a known country

# Part 2 Build Machine Learning Model 

Before jumping into building a model, think about whether you can create new powerful variables. This is called feature engineering and it is the most important step in machine learning. However, feature engineering is quite time consuming. In a take-home you should just give an idea of how you would do it and emphasize that with more time you would go deeper into it.

A few obvious variables that can be created here could be:
1. Time difference between sign-up time and purchase time
2. If the device id is unique or certain users are sharing the same device (many different user ids using the same device could be an indicator of fake accounts)
3. Same for the ip address. Many different users having the same ip address could be an indicator of fake accounts
4. Usual week of the year and day of the week from time variables

In [12]:
# convert time to correct datatype
Fraud['signup_time'] =  pd.to_datetime(Fraud['signup_time'], format='%Y-%m-%d %H:%M:%S')
Fraud['purchase_time'] =  pd.to_datetime(Fraud['purchase_time'], format='%Y-%m-%d %H:%M:%S')

# calculate time differences between sign up and purchase
Fraud['time_diffrence'] = (Fraud['purchase_time'] - Fraud['signup_time']).astype('timedelta64[h]')

In [13]:
# check for each device id how many different users had it
Fraud[['device_id', 'user_id']].groupby('device_id').count().reset_index().sort_values(by=['user_id'], ascending=False).head(10)

Unnamed: 0,device_id,user_id
718,BWSMVSLCJXMCM,4
7639,UHCAPOHBEBXJW,4
1662,EIWWDKIBIWRWY,4
3917,KIPFSCNUGOLDP,4
3133,IGKYVZDBEGALB,4
1126,DABLZVBIMFUTE,4
909,CLOHCTJSXUXWA,4
2753,HGVNRPOHIBZLJ,3
1481,DWUZNKTPXIFZI,3
898,CJWVPNPXURMXB,3


In [14]:
# check for each ip address how many different users had it
Fraud[['ip_address', 'user_id']].groupby('ip_address').count().reset_index().sort_values(by=['user_id'], ascending=False).head(10)

Unnamed: 0,ip_address,user_id
1382,576060900.0,4
6300,2747355000.0,4
7872,3445652000.0,4
5436,2351656000.0,4
6720,2937899000.0,4
4144,1800550000.0,4
2330,1008028000.0,3
5146,2202518000.0,3
6426,2804811000.0,3
4671,2011989000.0,3


In [15]:
# day of the week
Fraud['signup_time_wd'] = Fraud['signup_time'].dt.weekday
Fraud['purchase_time_wd'] = Fraud['purchase_time'].dt.weekday

In [16]:
# week of the yr
Fraud['signup_time_wy'] = Fraud['signup_time'].dt.week
Fraud['purchase_time_wy'] = Fraud['purchase_time'].dt.week

In [19]:
# drop columns [0,1,2,4]
Fraud.drop(Fraud.columns[[0,1,2]], axis=1, inplace=True)
Fraud.drop(Fraud.columns[[1]], axis=1, inplace=True)

In [24]:
# Replace None value with value
Fraud['country'].fillna(value='Not Found', inplace = True)

In [36]:
#just keep the top 50 country, everything else is "other" ..... not sure what top 50 means
Top50Country = Fraud.groupby(['country']).size().sort_values(ascending=False).nlargest(50).index.values.tolist()
Fraud.loc[~Fraud['country'].isin(Top50Country), 'country'] = 'Other'

In [51]:
Fraud.dtypes

browser              object
age                   int64
ip_address          float64
class                 int64
country              object
time_diffrence      float64
signup_time_wd        int64
purchase_time_wd      int64
signup_time_wy        int64
purchase_time_wy      int64
dtype: object