# Preprocessing

## Import des données

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df1 = pd.read_csv("s3://full-stack-bigdata-datasets/Machine Learning Supervisé/projects/fraudulent_activity/Fraud_Data.csv")
ip_country = pd.read_csv("s3://full-stack-bigdata-datasets/Machine Learning Supervisé/projects/fraudulent_activity/IpAddress_to_Country.csv")

## Create useful features

In [3]:
def transform_ip(arg) :
    try :
        return ip_country.country[(ip_country.lower_bound_ip_address < arg) & (ip_country.upper_bound_ip_address > arg)].iloc[0]
    except IndexError :
        return "Pays inconnu"  
      

df1["country_name"] = df1.ip_address.apply(transform_ip)

In [4]:
df1.shape

(151112, 12)

In [5]:
df1.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,country_name
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0,Japan
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0,United States
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1,United States
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0,Pays inconnu
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0,United States


In [6]:
df1 = df1.iloc[:, 1:]
df1.head()

Unnamed: 0,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,country_name
0,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0,Japan
1,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0,United States
2,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1,United States
3,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0,Pays inconnu
4,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0,United States


## Date/Time

In [7]:
df1.signup_time = pd.to_datetime(df1.signup_time, format ="%Y-%m-%d %H:%M:%S")
df1.purchase_time = pd.to_datetime(df1.purchase_time, format ="%Y-%m-%d %H:%M:%S")
df1["time_delta"] = (df1.purchase_time - df1.signup_time)/np.timedelta64(1, 's')

In [8]:
df1.columns

Index(['signup_time', 'purchase_time', 'purchase_value', 'device_id', 'source',
       'browser', 'sex', 'age', 'ip_address', 'class', 'country_name',
       'time_delta'],
      dtype='object')

In [9]:
df1 = df1.rename(columns={"class": "y", "x": "country_name"})
df1.head()

Unnamed: 0,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,y,country_name,time_delta
0,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0,Japan,4506682.0
1,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0,United States,17944.0
2,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1,United States,1.0
3,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0,Pays inconnu,492085.0
4,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0,United States,4361461.0


In [10]:
df1['year'] = pd.DatetimeIndex(df1['purchase_time']).year
df1['month'] = pd.DatetimeIndex(df1['purchase_time']).month

df1.head()

Unnamed: 0,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,y,country_name,time_delta,year,month
0,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0,Japan,4506682.0,2015,4
1,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0,United States,17944.0,2015,6
2,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1,United States,1.0,2015,1
3,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0,Pays inconnu,492085.0,2015,5
4,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0,United States,4361461.0,2015,9


## Machine learning

In [44]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

In [16]:
print(df1.shape)
df1.describe(include='all')

(151112, 14)


  df1.describe(include='all')
  df1.describe(include='all')


Unnamed: 0,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,y,country_name,time_delta,year,month
count,151112,151112,151112.0,151112,151112,151112,151112,151112.0,151112.0,151112.0,151112,151112.0,151112.0,151112.0
unique,151112,150679,,137956,3,5,2,,,,182,,,
top,2015-06-30 03:58:38,2015-07-17 23:22:55,,ITUMJCKWEYNDD,SEO,Chrome,M,,,,United States,,,
freq,1,3,,20,60615,61432,88293,,,,58049,,,
first,2015-01-01 00:00:42,2015-01-01 00:00:44,,,,,,,,,,,,
last,2015-08-18 04:40:29,2015-12-16 02:56:05,,,,,,,,,,,,
mean,,,36.935372,,,,,33.140704,2152145000.0,0.093646,,4932029.0,2015.0,6.008629
std,,,18.322762,,,,,8.617733,1248497000.0,0.291336,,3126263.0,0.0,2.660637
min,,,9.0,,,,,18.0,52093.5,0.0,,1.0,2015.0,1.0
25%,,,22.0,,,,,27.0,1085934000.0,0.0,,2186754.0,2015.0,4.0


In [18]:
# Drop columns with too many unique values
df1 = df1.drop(['signup_time', 'purchase_time', 'device_id'], axis=1)

# Drop ip address (not useful as raw number, we will rather use the country_name deduced from the ip)
df1 = df1.drop(['ip_address'], axis=1)

# Drop year (because it's always 2015)
df1 = df1.drop(['year'], axis=1)

df1.head()

Unnamed: 0,purchase_value,source,browser,sex,age,y,country_name,time_delta,month
0,34,SEO,Chrome,M,39,0,Japan,4506682.0,4
1,16,Ads,Chrome,F,53,0,United States,17944.0,6
2,15,SEO,Opera,M,53,1,United States,1.0,1
3,44,SEO,Safari,M,41,0,Pays inconnu,492085.0,5
4,39,Ads,Safari,M,45,0,United States,4361461.0,9


In [29]:
df1['country_name'].value_counts()

United States     58049
Pays inconnu      21966
China             12038
Japan              7306
United Kingdom     4490
                  ...  
Cape Verde            1
Madagascar            1
Nauru                 1
Yemen                 1
Tajikistan            1
Name: country_name, Length: 182, dtype: int64

In [32]:
country_counts = df1['country_name'].value_counts()
print(country_counts[country_counts > 50].index.values)

['United States' 'Pays inconnu' 'China' 'Japan' 'United Kingdom'
 'Korea Republic of' 'Germany' 'France' 'Canada' 'Brazil' 'Italy'
 'Australia' 'Netherlands' 'Russian Federation' 'India'
 'Taiwan; Republic of China (ROC)' 'Mexico' 'Sweden' 'Spain'
 'South Africa' 'Switzerland' 'Poland' 'Argentina' 'Indonesia' 'Norway'
 'Colombia' 'Turkey' 'Viet Nam' 'Romania' 'Denmark' 'Hong Kong' 'Finland'
 'Austria' 'Ukraine' 'Chile' 'Belgium' 'Iran (ISLAMIC Republic Of)'
 'Egypt' 'Czech Republic' 'Thailand' 'New Zealand' 'Israel' 'Saudi Arabia'
 'Venezuela' 'Ireland' 'European Union' 'Greece' 'Portugal' 'Hungary'
 'Malaysia' 'Singapore' 'Pakistan' 'Philippines' 'Bulgaria' 'Morocco'
 'Algeria' 'Peru' 'Tunisia' 'United Arab Emirates' 'Ecuador' 'Lithuania'
 'Seychelles' 'Kenya' 'Kazakhstan' 'Costa Rica' 'Kuwait' 'Slovenia'
 'Slovakia (SLOVAK Republic)' 'Uruguay' 'Croatia (LOCAL Name: Hrvatska)'
 'Luxembourg' 'Belarus' 'Serbia' 'Nigeria' 'Latvia' 'Panama' 'Bolivia'
 'Dominican Republic']


In [34]:
# Drop lines with rare values of country_name (< 50 occurences)
country_counts = df1['country_name'].value_counts()
to_keep = country_counts[country_counts > 50].index.values
df1 = df1.loc[df1['country_name'].isin(to_keep),:]

In [35]:
# Separate target variable Y from features X
target_name = 'y'

print("Separating labels from features...")
Y = df1.loc[:,target_name]
X = df1.loc[:,[c for c in df1.columns if c!=target_name]] 
print("...Done.")
print(Y.head())
print()
print(X.head())
print()



Separating labels from features...
...Done.
0    0
1    0
2    1
3    0
4    0
Name: y, dtype: int64

   purchase_value source browser sex  age   country_name  time_delta  month
0              34    SEO  Chrome   M   39          Japan   4506682.0      4
1              16    Ads  Chrome   F   53  United States     17944.0      6
2              15    SEO   Opera   M   53  United States         1.0      1
3              44    SEO  Safari   M   41   Pays inconnu    492085.0      5
4              39    Ads  Safari   M   45  United States   4361461.0      9



In [36]:
# Convert pandas DataFrames to numpy arrays before using scikit-learn
print("Convert pandas DataFrames to numpy arrays...")
X = X.values
Y = Y.tolist()
print("...Done")
print(X[0:5,:])
print()
print(Y[0:5])


Convert pandas DataFrames to numpy arrays...
...Done
[[34 'SEO' 'Chrome' 'M' 39 'Japan' 4506682.0 4]
 [16 'Ads' 'Chrome' 'F' 53 'United States' 17944.0 6]
 [15 'SEO' 'Opera' 'M' 53 'United States' 1.0 1]
 [44 'SEO' 'Safari' 'M' 41 'Pays inconnu' 492085.0 5]
 [39 'Ads' 'Safari' 'M' 45 'United States' 4361461.0 9]]

[0, 0, 1, 0, 0]


In [37]:
# Divide dataset into train set & test set 
print("Dividing into train and test sets...")
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0, stratify=Y) # stratified splitting because we have an imbalanced dataset !!
print("...Done.")
print()

Dividing into train and test sets...
...Done.



In [38]:
# Create pipeline for numeric features
numeric_features = [0, 4, 6, 7] # Positions of numeric columns in X_train/X_test
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Create pipeline for categorical features
categorical_features = [1, 2, 3, 5] # Positions of categorical columns in X_train/X_test
categorical_transformer = Pipeline(
    steps=[
    ('encoder', OneHotEncoder(drop='first')) # first column will be dropped to avoid creating correlations between features
    ])

# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])


# Preprocessings on train set
print("Performing preprocessings on train set...")
print(X_train[0:5,:])
X_train = preprocessor.fit_transform(X_train)
print('...Done.')
print(X_train[0:5,:])
print()

# Preprocessings on test set
print("Performing preprocessings on test set...")
print(X_test[0:5,:])
X_test = preprocessor.transform(X_test) # Don't fit again !! The test set is used for validating decisions
print('...Done.')
print(X_test[0:5,:])
print()



Performing preprocessings on train set...
[[29 'SEO' 'FireFox' 'F' 24 'Japan' 399081.0 5]
 [19 'SEO' 'IE' 'M' 30 'United States' 6545091.0 10]
 [52 'Direct' 'IE' 'M' 37 'Pays inconnu' 1.0 1]
 [44 'Direct' 'Chrome' 'F' 31 'United States' 261978.0 5]
 [16 'Direct' 'Chrome' 'F' 42 'United States' 2949385.0 6]]
...Done.
  (0, 0)	-0.43296254252665767
  (0, 1)	-1.0591256639796414
  (0, 2)	-1.4478388621940075
  (0, 3)	-0.3783695098615592
  (0, 5)	1.0
  (0, 6)	1.0
  (0, 43)	1.0
  (1, 0)	-0.9785247924444799
  (1, 1)	-0.36227925966760166
  (1, 2)	0.5175229812597267
  (1, 3)	1.5001657574425975
  (1, 5)	1.0
  (1, 7)	1.0
  (1, 10)	1.0
  (1, 84)	1.0
  (2, 0)	0.8218306322843335
  (2, 1)	0.450708212029778
  (2, 2)	-1.5754560649784994
  (2, 3)	-1.8811977237048847
  (2, 4)	1.0
  (2, 7)	1.0
  (2, 10)	1.0
  (2, 60)	1.0
  (3, 0)	0.3853808323500757
  (3, 1)	-0.24613819228226172
  (3, 2)	-1.491681453537503
  (3, 3)	-0.3783695098615592
  (3, 4)	1.0
  (3, 84)	1.0
  (4, 0)	-1.1421934674198266
  (4, 1)	1.0314135

In [42]:
# Train baseline model
model = LogisticRegression(class_weight = 'balanced', max_iter = 1000) # change value of max_iter to avoid solver warning

print("Training model...")
model.fit(X_train, Y_train) # Training is always done on train set !!
print("...Done.")

# Predictions on training set
print("Predictions on training set...")
Y_train_pred = model.predict(X_train)
print("...Done.")
print(Y_train_pred[0:5])
print()

# Predictions on test set
print("Predictions on test set...")
Y_test_pred = model.predict(X_test)
print("...Done.")
print(Y_test_pred[0:5])
print()

# Print scores
print("f1-score on training set : ", f1_score(Y_train, Y_train_pred))
print("f1-score on test set : ", f1_score(Y_test, Y_test_pred))



Training model...
...Done.
Predictions on training set...
...Done.
[1 0 1 1 0]

Predictions on test set...
...Done.
[0 0 1 0 0]

f1-score on training set :  0.3108795485903746
f1-score on test set :  0.3051717004551096


In [48]:
# Train Random Forest with grid search
rf = RandomForestClassifier()

params = {
    'n_estimators': [100, 120, 140, 160, 180],
    'max_depth': [4, 6, 8]
}

model = GridSearchCV(rf, param_grid = params, verbose=2)

print("Training model...")
model.fit(X_train, Y_train) # Training is always done on train set !!
print("...Done.")

print("Best hyperparameters : ")
print(model.best_params_)

# Predictions on training set
print("Predictions on training set...")
Y_train_pred = model.predict(X_train)
print("...Done.")
print(Y_train_pred[0:5])
print()

# Predictions on test set
print("Predictions on test set...")
Y_test_pred = model.predict(X_test)
print("...Done.")
print(Y_test_pred[0:5])
print()

# Print scores
print("f1-score on training set : ", f1_score(Y_train, Y_train_pred))
print("f1-score on test set : ", f1_score(Y_test, Y_test_pred))


Training model...
Fitting 5 folds for each of 15 candidates, totalling 75 fits
[CV] max_depth=4, n_estimators=100 ...................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .................... max_depth=4, n_estimators=100, total=   3.1s
[CV] max_depth=4, n_estimators=100 ...................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.1s remaining:    0.0s


[CV] .................... max_depth=4, n_estimators=100, total=   3.2s
[CV] max_depth=4, n_estimators=100 ...................................
[CV] .................... max_depth=4, n_estimators=100, total=   3.2s
[CV] max_depth=4, n_estimators=100 ...................................
[CV] .................... max_depth=4, n_estimators=100, total=   2.9s
[CV] max_depth=4, n_estimators=100 ...................................
[CV] .................... max_depth=4, n_estimators=100, total=   2.8s
[CV] max_depth=4, n_estimators=120 ...................................
[CV] .................... max_depth=4, n_estimators=120, total=   3.5s
[CV] max_depth=4, n_estimators=120 ...................................
[CV] .................... max_depth=4, n_estimators=120, total=   3.5s
[CV] max_depth=4, n_estimators=120 ...................................
[CV] .................... max_depth=4, n_estimators=120, total=   3.6s
[CV] max_depth=4, n_estimators=120 ...................................
[CV] .

[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:  7.1min finished


...Done.
Best hyperparameters : 
{'max_depth': 4, 'n_estimators': 120}
Predictions on training set...
...Done.
[0 0 1 0 0]

Predictions on test set...
...Done.
[0 0 0 0 0]

f1-score on training set :  0.6991973205520587
f1-score on test set :  0.7023041474654378


**The f1-score has been multiplied by a factor 2 by introducing non-linearities in the model ! Combined with the grid_search, we converged to a model with quite good performances and no overfitting. Let's have a look to the confusion matrix and other classification scores :**

In [51]:
confusion_matrix?

[0;31mSignature:[0m
[0mconfusion_matrix[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0my_true[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0my_pred[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlabels[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msample_weight[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnormalize[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Compute confusion matrix to evaluate the accuracy of a classification.

By definition a confusion matrix :math:`C` is such that :math:`C_{i, j}`
is equal to the number of observations known to be in group :math:`i` and
predicted to be in group :math:`j`.

Thus in binary classification, the count of true negatives is
:math:`C_{0,0}`, false negatives is :math:`C_{1,0}`, true positives is
:math:`C_{1,1}` and false positives is :math:`C_{0,1}`.

Read 

In [50]:
from sklearn.metrics import precision_score, recall_score, confusion_matrix

print('--- Train set ---')
print('Precision : ', precision_score(Y_train, Y_train_pred))
print('Recall : ', recall_score(Y_train, Y_train_pred))
print()
print(confusion_matrix(Y_train, Y_train_pred))
print()
print('--- Test set ---')
print('Precision : ', precision_score(Y_test, Y_test_pred))
print('Recall : ', recall_score(Y_test, Y_test_pred))
print()
print(confusion_matrix(Y_test, Y_test_pred))
print()

--- Train set ---
Precision :  1.0
Recall :  0.5375122081150671

[[108683      0]
 [  5209   6054]]

--- Test set ---
Precision :  1.0
Recall :  0.5411931818181818

[[27171     0]
 [ 1292  1524]]



#### Interpreting the scores

* The precision represents the "purity" of the predictions "1". In other words, it's the ratio of examples being predicted as 1s that are indeed true 1s : $P = \frac{TP}{TP + FP}$ 
* The recall represents the ability to detect the true "1". It's the ratio of examples that are true 1s and have been predicted as 1s : $R = \frac{TP}{TP + FN}$ 
* From the formulae above, one can see the the precision is maximal when there are no false positives, and the recall is maximal when there are no false negatives

#### What does that mean in the context of fraud detection ?

* False positives represent transactions that are reported as frauds but in fact aren't
* False negatives represent frauds that are not detected (this is the kind of error that we want to avoid)
* In this case, we would like to detect as much frauds as possible while having a number of false positives that is as close as possible to 0 (because we want to avoid fake alerts that would be time-consuming for the company)

#### What is our model doing here ?
* Our model's precision is perfect (P=1) which means there are no false positives
* Our model's recall is lower (R=0.5) which means that we detect only 50% of the frauds

#### Is it satisfying ?
Yes, with this model we would detect 50% of the frauds without making any false alert (which is way better than detecting no frauds at all)

#### Next steps
To explain the model to your boss, you can :
* Plot the feature importances in your model
* Make some viz of the different features X and see if it changes whether Y=0 or Y=1