# Xente Fraud Detection Challenge
<h2 style='font-weight:bold'>Background:</h2>
<p style='font-size:20px'>
Fraud detection is an important application of machine learning in the financial services sector. This solution will help Xente provide improved and safer service to its customers. The objective of this competition is to create a machine learning model to detect fraudulent transactions.
</p>

## Exploratory Data Analysis

In [0]:
'''Importing necessary libraries'''
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, matthews_corrcoef
import scipy as sp
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import confusion_matrix, f1_score

In [0]:
# Colab-only step: mount Google Drive at /content/drive so the CSV files
# read in the next cell are reachable from the notebook's filesystem.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# Load the competition files (training set, test set, sample submission)
# from the mounted Drive folder.
path = "drive/My Drive/X_fraud_detection/data"
train, test, sub = (
    pd.read_csv(path + suffix)
    for suffix in ("/training.csv", "/test.csv", "/sample_submission.csv")
)


In [0]:
# Summary of the training frame: column names, dtypes and non-null counts.
train.info()

In [0]:
# Preview the first rows of the raw training data.
train.head()

In [0]:
# Summary of the test frame: column names, dtypes and non-null counts.
test.info()

In [0]:
# Preview the first rows of the raw test data.
test.head()

In [0]:
# Report the (rows, columns) shape of both datasets.
# `tr` and `te` are kept as names because the next cell deletes them.
tr, te = train.shape, test.shape
shape_report = "train_set_shape is: {} and test_set_shape is: {}".format(tr, te)
print(shape_report)

In [0]:
# Remove exact duplicate rows (keeping the first occurrence) from both sets.
train = train.drop_duplicates(keep="first")
test = test.drop_duplicates(keep="first")

# The shape tuples from the previous cell are no longer needed.
del tr, te

In [0]:
# Number of fraudulent (1) and non-fraudulent (0) transactions in the training set.
fraud_flags = train["FraudResult"]
n_fraud = (fraud_flags == 1).sum()
n_legit = (fraud_flags == 0).sum()
print("Fraudulent: {}".format(n_fraud))
print("Non-Fraudulent: {}".format(n_legit))

In [0]:
# Bar chart of the class balance: one bar per FraudResult value.
class_counts = train["FraudResult"].value_counts()
plt.bar("Fraudulent", class_counts[1], color="red")
plt.bar("Non_Fraudulent", class_counts[0], width=0.5, color="green")
plt.ylabel("Count", fontsize=14)
plt.title("Fraudulent VS Non-Fraudulent")

In [0]:
# Label-encode the columns at positions 1..10 of each frame.
#
# The encoder is fitted on the union of the train and test values of a column,
# so a given category receives the SAME integer code in both frames. Fitting
# separately on each frame assigns codes by each frame's own sorted unique
# values, which silently maps the same category to different codes whenever
# the two frames do not contain exactly the same set of values.
columns = train.columns.tolist()[1:11]
test_columns = test.columns.tolist()[1:11]

le = LabelEncoder()
for each in columns:
    if each in test_columns:
        # Shared column: learn one mapping from both frames.
        le.fit(pd.concat([train[each], test[each]], ignore_index=True))
        test[each] = le.transform(test[each])
    else:
        # Column only selected on the train side: encode it on its own.
        le.fit(train[each])
    train[each] = le.transform(train[each])

# Columns selected on the test side only (not expected when both files share
# a layout) are still encoded, as in the original cell.
for column in test_columns:
    if column not in columns:
        test[column] = le.fit_transform(test[column])

In [0]:
# Preview the training frame after label encoding.
train.head()

In [0]:
# Remember how many rows belong to the training set so the combined frame
# can be split back by position later.
len_train = train.shape[0]

# One-hot encode the categorical columns on the stacked train+test frame so
# both parts end up with the same dummy columns.
categorical_columns = ["ProviderId", "ProductCategory", "ProductId", "ChannelId"]
new_df = pd.get_dummies(
    pd.concat([train, test], sort=False),
    columns=categorical_columns,
)
new_df.head()

## Feature engineering(1)

In [0]:
#new_df

In [0]:
# Split the combined frame back into its train and test parts by row position.
# `.copy()` gives `train` its own data: without it `train` is a slice of
# `new_df`, and adding columns to it in later cells triggers pandas'
# SettingWithCopyWarning. `reset_index` already returns a new frame for `test`.
train = new_df.iloc[:len_train].copy()
test = new_df.iloc[len_train:].reset_index(drop=True)

In [0]:
# Remove the FraudResult column from the test part of the combined frame.
test = test.drop(columns="FraudResult")

In [0]:
# Derive hour, minute and day-of-week features from TransactionStartTime
# for both frames; the timestamp column is parsed once per frame.
for frame in (train, test):
    start_time = pd.to_datetime(frame["TransactionStartTime"])
    frame['hour'] = start_time.dt.hour
    frame['minute'] = start_time.dt.minute
    frame['day'] = start_time.dt.dayofweek

In [0]:
# The raw timestamp is no longer needed once hour/minute/day are extracted.
train, test = (
    frame.drop(columns=["TransactionStartTime"]) for frame in (train, test)
)

In [0]:
# Pairwise correlations between the numeric columns of the training set.
# numeric_only=True (available from pandas 1.5) skips any column that is still
# non-numeric at this point (presumably the un-encoded TransactionId - confirm
# against the data). Without it, pandas >= 2.0 raises instead of silently
# dropping such columns.
correlations = train.corr(numeric_only=True)

fig, ax = plt.subplots(figsize=(9, 6))
sns.heatmap(correlations, vmax=.8, square=True, ax=ax)
plt.show()

In [0]:
# Horizontal bar chart of each feature's correlation with FraudResult,
# sorted in descending order of correlation value.
fraud_corr = correlations["FraudResult"]
fraud_corr = fraud_corr.drop("FraudResult")  # the target's self-correlation is not informative
fraud_corr = fraud_corr.sort_values(ascending=False)
fraud_corr.plot.barh(figsize=(9, 7))
plt.title("correlation bar_hist")

## Feature Engineering

In [0]:
# Create an empty `period` column in both frames; it is filled in the next cell.
for frame in (train, test):
    frame["period"] = np.nan

In [0]:
# Day-part buckets for the `period` feature. Edges are left-inclusive, so for
# integer hours the mapping is:
#   em: hour < 7 | am: 7-10 | mid: 11-14 | eve: 15-18 | pm: 19-24
PERIOD_BINS = [-np.inf, 7, 11, 15, 19, 25]
PERIOD_LABELS = ["em", "am", "mid", "eve", "pm"]


def hour_to_period(hours):
    """Map integer hour-of-day values to day-part labels.

    Parameters
    ----------
    hours : pandas.Series
        Hour of day for each row.

    Returns
    -------
    pandas.Series
        Object-dtype series of labels from ``PERIOD_LABELS``, aligned with
        ``hours``; values outside every bucket (e.g. missing hours) stay NaN.
    """
    buckets = pd.cut(hours, bins=PERIOD_BINS, labels=PERIOD_LABELS, right=False)
    # Plain strings (not a categorical) so downstream encoders see ordinary labels.
    return buckets.astype(object)


# Assign the whole column at once. Writing strings through boolean masks into
# a float64 column of NaNs is an incompatible-dtype assignment that pandas >= 2.1
# deprecates.
train["period"] = hour_to_period(train.hour)
test["period"] = hour_to_period(test.hour)

In [0]:
# Encode the `period` labels as integers.
# The encoder is fitted once on the labels of both frames so a label gets the
# same code in train and test. Refitting on test alone would shift the codes
# whenever test does not contain every period seen in train.
le = LabelEncoder()
le.fit(pd.concat([train["period"], test["period"]], ignore_index=True))
train["period"] = le.transform(train["period"])
test["period"] = le.transform(test["period"])

In [0]:
# Columns removed from both frames before modelling (the original note says
# these were chosen for their negative correlation with FraudResult).
columns_to_drop = ["SubscriptionId"]

train, test = (frame.drop(columns=columns_to_drop) for frame in (train, test))

In [0]:
# Recompute the correlations now that the engineered features exist.
# numeric_only=True (available from pandas 1.5) skips any column that is still
# non-numeric (presumably the un-encoded TransactionId - confirm against the
# data). Without it, pandas >= 2.0 raises instead of silently dropping such columns.
new_correlations = train.corr(numeric_only=True)

# Horizontal bar chart of each feature's correlation with FraudResult,
# sorted in descending order of correlation value.
fraud_corr = (
    new_correlations["FraudResult"]
    .drop("FraudResult")  # the target's self-correlation is not informative
    .sort_values(ascending=False)
)
fraud_corr.plot.barh(figsize=(9, 7))
plt.title("correlation bar_hist")

In [0]:
# Keep the target and the test transaction ids aside, then remove the
# non-predictor columns (and the target, train only) from the feature frames.
target = train["FraudResult"]
test_id = test["TransactionId"]

train = train.drop(columns=["TransactionId", "FraudResult"])
test = test.drop(columns=["TransactionId"])

In [0]:
# Replace Value and Amount with their absolute values in both frames.
for frame in (train, test):
    for money_col in ("Value", "Amount"):
        frame[money_col] = frame[money_col].abs()

In [0]:
X = train
y = target

# Hold out 25% of the training data for validation.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25,
                                                  random_state=1)
# Work on independent copies so the column replacements below do not write
# into views of `train`.
X_train = X_train.copy()
X_val = X_val.copy()

# Box-Cox transform of the Value column.
# The lambda (`maxlog`) is estimated on the training split only; with `alpha`
# set, boxcox also returns its confidence interval.
value_boxcox, maxlog, (min_ci, max_ci) = sp.stats.boxcox(X_train['Value'],
                                                         alpha=0.01)
X_train['Value'] = value_boxcox

# Apply the SAME fitted lambda to every frame the models will see. Transforming
# only X_train would mean the models are trained on Box-Cox values but
# validated and used for prediction on raw values.
# NOTE: this mutates `test`; re-running the cell would transform it twice.
X_val['Value'] = sp.stats.boxcox(X_val['Value'], lmbda=maxlog)
test['Value'] = sp.stats.boxcox(test['Value'], lmbda=maxlog)

In [0]:
# Wrap the Matthews correlation coefficient as a scorer object so it can be
# passed as the `scoring` argument of the grid searches below.
MCC_scorer = make_scorer(matthews_corrcoef)

In [0]:
# Random forest: grid search over the number of trees, scored with MCC.
pipeline_rf = Pipeline(steps=[
    ('model', RandomForestClassifier(max_depth=9, n_jobs=-1, random_state=1)),
])

# Only the forest size is tuned; the `model__` prefix targets the pipeline step.
param_grid_rf = {'model__n_estimators': [50, 75, 100, 150, 200, 250, 300]}

grid_rf = GridSearchCV(
    estimator=pipeline_rf,
    param_grid=param_grid_rf,
    scoring=MCC_scorer,
    cv=8,
    n_jobs=-1,
    pre_dispatch='2*n_jobs',
    verbose=1,
    return_train_score=False,
)

grid_rf.fit(X_train, y_train)

In [0]:
# XGBoost: grid search over tree depth and number of boosting rounds, scored with MCC.
model = xgb.XGBClassifier()

# Candidate values per hyper-parameter (single-element lists are held fixed).
max_depth = range(1, 11, 2)
n_estimators = [50, 100, 150, 200, 250]
random_state = [1]
subsample = [0.9]
base_score = [0.5]

param_grid = {
    "max_depth": max_depth,
    "n_estimators": n_estimators,
    "random_state": random_state,
    "subsample": subsample,
    "base_score": base_score,
}

# Shuffled, stratified 8-fold cross-validation with a fixed seed.
kfold = StratifiedKFold(n_splits=8, shuffle=True, random_state=1)

grid_xgb = GridSearchCV(
    model,
    param_grid,
    scoring=MCC_scorer,
    cv=kfold,
    n_jobs=-1,
    pre_dispatch='2*n_jobs',
    verbose=1,
)

grid_xgb.fit(X_train, y_train)

In [0]:
def _report_search(header, search):
    """Print a header followed by the best CV score and parameters of a fitted search."""
    print(header)
    print(search.best_score_)
    print(search.best_params_)


# Best cross-validated MCC and the winning parameters of each grid search.
_report_search("RandomForest: \n", grid_rf)
print("\n")
_report_search("Xgboost: \n", grid_xgb)

In [0]:
# Random forest on the hold-out split: F1 score and confusion matrix.
y_pred = grid_rf.predict(X_val)
print(f'f1_score: {f1_score(y_val, y_pred)}\n')
rf_confusion = confusion_matrix(y_val, y_pred)
print(rf_confusion)
print("\n")

# XGBoost on the same hold-out split; its confusion matrix is the cell output.
val_pred = grid_xgb.predict(X_val)
print(f'f1_score: {f1_score(y_val, val_pred)}\n')
xgb_confusion = confusion_matrix(y_val, val_pred)
xgb_confusion

In [0]:
# Predict the test set with the tuned XGBoost search and assemble the
# submission frame (transaction id + predicted label).
print("Predicting...")
predictions = grid_xgb.predict(test)
submission = pd.DataFrame({'TransactionId': test_id, 'FraudResult': predictions})

# How many test transactions were (True) / were not (False) flagged as fraud.
submission['FraudResult'].eq(1).value_counts()

In [0]:
# Write the submission frame to sub.csv in the working directory, without the
# index column.
submission.to_csv('sub.csv',index=False) # author's note: this submission was still awaiting evaluation
print("done...")

In [0]:
#LBscore 0.76363 currently with xgboost. (only dropping "SubscriptionId"/no new features)