# <a id="2">Load packages</a>

In [None]:
import pandas as pd 
import pandas_profiling
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from collections import Counter
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, make_scorer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier

from sklearn.neighbors import KNeighborsClassifier

from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve

sns.set(style='white', context='notebook', palette='deep')


import gc
from datetime import datetime 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from catboost import CatBoostClassifier
from sklearn import svm
import lightgbm as lgb
from lightgbm import LGBMClassifier
import xgboost as xgb

from sklearn.metrics import classification_report


# Display up to 100 columns when a DataFrame is rendered.
pd.set_option('display.max_columns', 100)

# --- RandomForestClassifier settings ---
RFC_METRIC = 'gini'    # split-quality metric
NUM_ESTIMATORS = 100   # number of estimators
NO_JOBS = 4            # number of parallel jobs

# --- Train / validation / test split (train_test_split fractions) ---
VALID_SIZE = TEST_SIZE = 0.20

# --- Cross-validation ---
NUMBER_KFOLDS = 5      # number of folds

# Shared seed.
RANDOM_STATE = 2018

# --- LightGBM training schedule ---
MAX_ROUNDS = 1000      # boosting iterations
EARLY_STOP = 50        # early-stopping rounds
OPT_ROUNDS = 1000      # to be adjusted based on the best validation round
VERBOSE_EVAL = 50      # print the metric every N rounds

IS_LOCAL = False

import os

# Directory holding training.csv and test.csv.
PATH = "data"

# List the directory contents as a sanity check.
print(os.listdir(PATH))

# <a id="3">Read the data</a>

In [None]:
# Load the labelled training transactions.
data_df = pd.read_csv(f"{PATH}/training.csv")

In [None]:
# Load the transactions to be scored.
test_df = pd.read_csv(f"{PATH}/test.csv")

# <a id="4">Check the data</a>

In [None]:
# Report the dimensions of both frames. Each line reads from its own frame:
# the test line previously printed the training row count.
print("Training data -  rows:",data_df.shape[0]," columns:", data_df.shape[1])
print("Test data -  rows:",test_df.shape[0]," columns:", test_df.shape[1])

## <a id="41">Glimpse the data</a>

We start by looking at the data features (first 5 rows).

In [None]:
# First five rows of the raw training frame.
data_df.head(n=5)

Let's look at the data in more detail.

In [None]:
#data_df.describe()

In [None]:
#pandas_profiling.ProfileReport(data_df)

In [None]:
# Summarise the training frame: column names, non-null counts and dtypes.
data_df.info()

In [None]:
# Parse the raw timestamp strings into datetime values so the date arithmetic
# and .dt accessors used later in the notebook work.
# `infer_datetime_format` is no longer passed: it is deprecated since pandas 2.0,
# where format inference is the default behaviour.
data_df["TransactionStartTime"] = pd.to_datetime(data_df["TransactionStartTime"])

In [None]:
# Timestamp of the first transaction in file order.
data_df["TransactionStartTime"].iat[0]

In [None]:
# Time elapsed between the first and the last transaction in file order.
# iloc[-1] replaces the hard-coded row number 95661, so the cell keeps working
# (instead of raising IndexError or silently picking a middle row) when the
# number of rows changes.
data_df_Timedelta = data_df['TransactionStartTime'].iloc[-1] - data_df['TransactionStartTime'].iloc[0]
data_df_Timedelta

Looking at the **TransactionStartTime** feature, we can confirm that the data contains **95,662** transactions, spanning 90 consecutive days.

In [None]:
# Preview the frame now that TransactionStartTime has been parsed.
data_df.head(n=5)

## <a id="42">Check missing data</a>  

Let's check if there is any missing data.

In [None]:
# Boolean mask of missing cells, computed once and reused below.
null_mask = data_df.isnull()

# Per-column count of missing values, largest first.
total = null_mask.sum().sort_values(ascending=False)
# Per-column share of missing values in percent, largest first.
percent = (null_mask.sum() / null_mask.count() * 100).sort_values(ascending=False)

# One column per feature, with a 'Total' row and a 'Percent' row.
pd.concat([total, percent], axis=1, keys=['Total', 'Percent']).T

There is no missing data in the entire dataset.

Only **193** (about **0.2%**) of the transactions are fraudulent. That means the data is highly unbalanced with respect to the target variable **FraudResult**.

# Preprocess Training Data

### Extracting year, month, day, hour, minute and second from the TransactionStartTime column

In [None]:
# Preview the frame before the date-part columns are derived.
data_df.head(n=5)

In [None]:
# Build the DatetimeIndex once, then copy each calendar/clock component into
# a column of the same name.
transaction_times = pd.DatetimeIndex(data_df['TransactionStartTime'])
for date_part in ('year', 'month', 'day', 'hour', 'minute', 'second'):
    data_df[date_part] = getattr(transaction_times, date_part)

In [None]:
# Weekday number of each transaction (pandas numbers Monday as 0, Sunday as 6).
data_df["day_of_week"] = data_df["TransactionStartTime"].dt.weekday

In [None]:
# Preview the frame with the derived date features.
data_df.head(n=5)

### Split columns using delimiter

In [None]:
# Each ID column is split at "_": the first piece goes to "<column>_split" and
# the second piece replaces the original column. Both pieces remain strings.
id_columns = ['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId',
              'CustomerId', 'ProviderId', 'ProductId', 'ChannelId']
for id_column in id_columns:
    data_df[[f'{id_column}_split', id_column]] = data_df[id_column].str.split("_", expand=True)

In [None]:
# Preview the frame after the ID columns were split.
data_df.head(n=5)

### Drop Columns

In [None]:
# Columns removed before modelling: the first halves of the split IDs, the raw
# timestamp (its components were extracted into separate columns), and the
# currency and country codes.
split_leftovers = [
    f'{id_column}_split'
    for id_column in ('TransactionId', 'BatchId', 'AccountId', 'SubscriptionId',
                      'CustomerId', 'ProviderId', 'ProductId', 'ChannelId')
]
columns_to_drop = split_leftovers + ['TransactionStartTime', 'CurrencyCode', 'CountryCode']
data_df = data_df.drop(columns=columns_to_drop)

In [None]:
# Preview the frame after the unused columns were dropped.
data_df.head(n=5)

## To Categorical

In [None]:
# Encode ProductCategory as integer codes and keep the category labels in
# ProductCategory_categories so the codes can be mapped back to names.
product_category = pd.Categorical(data_df["ProductCategory"])
ProductCategory_categories = product_category.categories
data_df["ProductCategory"] = product_category.codes

In [None]:
# Preview the frame with ProductCategory replaced by its integer codes.
data_df.head(n=5)

# Process Test Data

In [None]:
# Parse the raw timestamp strings of the test set into datetime values.
# `infer_datetime_format` is no longer passed: it is deprecated since pandas 2.0,
# where format inference is the default behaviour.
test_df["TransactionStartTime"] = pd.to_datetime(test_df["TransactionStartTime"])

# Time elapsed between the first and the last test transaction in file order.
# iloc[-1] replaces the hard-coded row number 45018 so the cell does not depend
# on the exact number of rows. The bare `.iloc[0]` expression that used to sit
# in the middle of this cell was removed: only the last expression of a cell
# is displayed, so it had no effect.
test_df_Timedelta = test_df['TransactionStartTime'].iloc[-1] - test_df['TransactionStartTime'].iloc[0]
test_df_Timedelta

# Preprocess Data

### Extracting year, month, day, hour, minute and second from the TransactionStartTime column

test_df.head(n=5)

# Build the DatetimeIndex once, then copy each calendar/clock component into
# a column of the same name.
test_times = pd.DatetimeIndex(test_df['TransactionStartTime'])
for date_part in ('year', 'month', 'day', 'hour', 'minute', 'second'):
    test_df[date_part] = getattr(test_times, date_part)

# Weekday number of each transaction (pandas numbers Monday as 0, Sunday as 6).
test_df["day_of_week"] = test_df["TransactionStartTime"].dt.weekday

test_df.head(n=5)

### Split columns using delimiter

# Each ID column is split at "_": the first piece goes to "<column>_split" and
# the second piece replaces the original column. Both pieces remain strings.
test_id_columns = ['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId',
                   'CustomerId', 'ProviderId', 'ProductId', 'ChannelId']
for id_column in test_id_columns:
    test_df[[f'{id_column}_split', id_column]] = test_df[id_column].str.split("_", expand=True)

test_df.head(n=5)

### Drop Columns

# Columns removed before modelling: the first halves of the split IDs, the raw
# timestamp (its components were extracted into separate columns), and the
# currency and country codes.
test_split_leftovers = [
    f'{id_column}_split'
    for id_column in ('TransactionId', 'BatchId', 'AccountId', 'SubscriptionId',
                      'CustomerId', 'ProviderId', 'ProductId', 'ChannelId')
]
test_columns_to_drop = test_split_leftovers + ['TransactionStartTime', 'CurrencyCode', 'CountryCode']
test_df = test_df.drop(columns=test_columns_to_drop)

test_df.head(n=5)

## To Categorical

# Encode the test ProductCategory with the category list learned from the
# training frame (ProductCategory_categories), so that a given code means the
# same category in both frames. Building a fresh Categorical from the test
# values alone would number the categories independently whenever the two
# sets of category labels differ, and would also overwrite the training list.
# Categories that never occurred in training are encoded as -1.
# Run this cell once per data load: re-running it on the already-encoded
# column would map every value to -1.
test_df["ProductCategory"] = pd.Categorical(
    test_df["ProductCategory"], categories=ProductCategory_categories
).codes

# <a id="6">Predictive models</a>  



In [None]:
# Final look at the preprocessed training frame.
data_df.head(n=5)

In [None]:
# Final look at the preprocessed test frame.
test_df.head(n=5)

# Predictive models

In [None]:
def _to_model_features(frame, target='FraudResult'):
    """Return the model inputs of `frame`: every column except `target`, all numeric.

    The target column is dropped when it is present and ignored otherwise.
    Any remaining non-numeric column is converted with `pd.to_numeric`; the ID
    columns are such columns, because `str.split` leaves them as strings and
    XGBoost rejects object-dtype columns.

    Raises:
        ValueError: if a non-numeric column holds a value that cannot be
            parsed as a number.
    """
    features = frame.drop(columns=[target], errors='ignore')
    text_columns = features.select_dtypes(exclude=['number', 'bool']).columns
    if len(text_columns):
        features[text_columns] = features[text_columns].apply(pd.to_numeric)
    return features


y_train = data_df['FraudResult']
x_train = _to_model_features(data_df)

# NOTE(review): the test file presumably carries no labels (FraudResult is what
# the notebook predicts) - confirm. `.get` yields None in that case instead of
# raising KeyError, and the labels are used when they do exist.
y_test = test_df.get('FraudResult')
x_test = _to_model_features(test_df)

## Cross Validating

In [None]:
# Seed shared by the fold splitter and every seeded estimator below.
rs = 2
kfold = StratifiedKFold(n_splits=5, random_state=rs, shuffle=True)

# Label -> estimator. Keeping the label next to its estimator prevents the
# result table from drifting out of sync with the list of models.
named_classifiers = {
    "SVC": SVC(random_state=rs),
    "RF": RandomForestClassifier(random_state=rs),
    "ETC": ExtraTreesClassifier(random_state=rs),
    "GB": GradientBoostingClassifier(random_state=rs),
    # `random_state` is the documented scikit-learn-API name of the seed.
    "XGB": xgb.XGBClassifier(random_state=rs),
    "KNN": KNeighborsClassifier(),
}
classifiers = list(named_classifiers.values())

# One array of per-fold ROC-AUC scores per classifier.
# Fixed: the features variable is `x_train`; `X_train` was never defined and
# raised NameError.
cv_results = [
    cross_val_score(classifier, x_train, y=y_train, scoring='roc_auc', cv=kfold)
    for classifier in classifiers
]

cv_means = [fold_scores.mean() for fold_scores in cv_results]
cv_std = [fold_scores.std() for fold_scores in cv_results]

# Mean and spread of the fold scores per algorithm, best mean first.
cv_res = pd.DataFrame({"CrossValsMeans": cv_means, "CrossValerrors": cv_std,
                       "Alg": list(named_classifiers)})
cv_res.sort_values("CrossValsMeans", ascending=False)


## Making Predictions

In [None]:
# Final model: XGBoost with the seed `rs` set in the cross-validation cell.
# `random_state` is the documented scikit-learn-API name of the seed and
# replaces the legacy `seed` keyword.
model = xgb.XGBClassifier(random_state=rs)

In [None]:
# Train on the full training set; the fitted estimator is the cell output.
model.fit(X=x_train, y=y_train)

In [None]:
# Predicted class label for every test transaction.
pred = model.predict(X=x_test)

In [None]:
# Two-column submission table: transaction id and predicted label.
# `columns=` fixes the column order directly, replacing the separate re-selection.
# NOTE(review): TransactionId holds only the part after "_" from the split step;
# confirm whether the expected file needs the original, unsplit ids.
subs = pd.DataFrame(
    {"TransactionId": test_df['TransactionId'], "FraudResult": pred},
    columns=['TransactionId', 'FraudResult'],
)
# index=False keeps the row index out of the file; without it the CSV gets an
# extra unnamed first column in front of the two intended ones.
subs.to_csv("final.csv", index=False)