# Import Libraries

In [None]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

# Data Analysis

In [None]:
# Load the two training tables: per-transaction records and the identity
# records that are later joined to them on TransactionID.
train_transaction = pd.read_csv("../input/ieee-fraud-detection/train_transaction.csv")
train_id = pd.read_csv("../input/ieee-fraud-detection/train_identity.csv")

In [None]:
# Report the number of rows and columns of each table.
# `.shape` is a (rows, columns) tuple, so it can be unpacked straight into format().
print("Number of rows in train_transaction data {:,} and number of columns in train_transaction data {:,} ".format(*train_transaction.shape))
print("Number of rows in train_id data {:,} and number of columns in train_id data {:,} ".format(*train_id.shape))


In [None]:
# Preview the first rows of the identity table.
train_id.head()

In [None]:
# Count the missing values in every column of the identity table.
train_id.isna().sum()

In [None]:
# Report the percentage of missing values per column of train_id and collect
# the columns that are more than half empty.
total_rows = train_id.shape[0]
null_counts = train_id.isnull().sum()

# Only columns that actually contain nulls are reported.
null_percentages = null_counts[null_counts > 0] * 100 / total_rows
for cols, perc_null_values in null_percentages.items():
    print("{} column has {:.3f}% null values".format(cols, perc_null_values))

# Columns with more than 50% nulls are dropped in a later cell.
columns_to_drop = null_percentages[null_percentages > 50].index.tolist()

In [None]:
# Display the identity columns flagged as more than 50% null.
columns_to_drop

In [None]:
# Remove the mostly-empty columns collected in `columns_to_drop` from train_id.
# NOTE: `columns_to_drop` is reassigned later for train_transaction, so this
# cell must run before that one.
train_id.drop(columns=columns_to_drop, inplace=True)

**Almost every column has NaN values.**

In [None]:
# Fill the remaining NaNs with the sentinel value -999 (applied to every
# column, whatever its dtype).
train_id.fillna(-999, inplace=True)

In [None]:
# Summarise train_id (dtypes and non-null counts) after dropping and filling.
train_id.info()

In [None]:
# Integer-encode every object-typed (categorical) column of train_id.
cat_cols = [name for name, dtype in train_id.dtypes.items() if dtype == 'object']

# A single encoder instance is reused; fit_transform refits it for each column.
le = LabelEncoder()
for col in cat_cols:
    # Cast to str first so the -999 fill value and the original strings
    # share one type before encoding.
    train_id[col] = le.fit_transform(train_id[col].astype('str'))

In [None]:
# Show the (rows, columns) of train_id after cleaning and encoding.
train_id.shape

In [None]:
# Report the percentage of missing values per column of train_transaction and
# collect the columns that are more than half empty.
total_rows = train_transaction.shape[0]
null_counts = train_transaction.isnull().sum()

# Only columns that actually contain nulls are reported.
null_percentages = null_counts[null_counts > 0] * 100 / total_rows
for cols, perc_null_values in null_percentages.items():
    print("{} column has {:.3f}% null values".format(cols, perc_null_values))

# Columns with more than 50% nulls are dropped in a later cell.
columns_to_drop = null_percentages[null_percentages > 50].index.tolist()

In [None]:
# Number of transaction columns flagged as more than 50% null.
len(columns_to_drop)

In [None]:
# Remove the mostly-empty columns collected in `columns_to_drop` from
# train_transaction.
train_transaction.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Fill the remaining NaNs with the sentinel value -999 (applied to every
# column, whatever its dtype).
train_transaction.fillna(-999, inplace=True)

In [None]:
# Integer-encode every object-typed (categorical) column of train_transaction.
cat_cols2 = [name for name, dtype in train_transaction.dtypes.items() if dtype == 'object']
for cols in cat_cols2:
    # A fresh encoder per column; cast to str so the -999 fill value and the
    # original strings share one type before encoding.
    le = LabelEncoder()
    train_transaction[cols] = le.fit_transform(train_transaction[cols].astype('str'))

In [None]:
# Sanity check: per-column null counts after filling.
train_transaction.isna().sum()

In [None]:
# Preview the cleaned and encoded transaction table.
train_transaction.head()

In [None]:
# Preview the cleaned and encoded identity table.
train_id.head()

In [None]:
# Join identity and transaction records on TransactionID.
# The inner join keeps only transactions that have a matching identity row.
final_data = train_id.merge(train_transaction, on='TransactionID', how='inner')

In [None]:
# Preview the merged dataset.
final_data.head()

# XGBoost Model

In [None]:
# Separate the target variable (isFraud) from the feature matrix.
# NOTE(review): only 'isFraud' is removed, so the TransactionID join key stays
# in X as a feature — confirm that an identifier should be a model input.
y = final_data['isFraud']
X = final_data.drop(columns='isFraud')

In [None]:
# Preview the feature matrix.
X.head()

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Train an XGBoost classifier with library-default hyperparameters on the
# full-feature training split.
xgb_clf = XGBClassifier()
xgb_clf.fit(X_train, y_train)

In [None]:
# Accuracy of the full-feature model on the held-out test split.
accuracy = xgb_clf.score(X_test, y_test)
print(accuracy)

In [None]:
# ROC AUC of the full-feature model on the held-out test split.
prediction_probability = xgb_clf.predict_proba(X_test)
# Column 1 is the second class of the classifier — presumably isFraud == 1.
fraud_scores = prediction_probability[:, 1]
print(roc_auc_score(y_test, fraud_scores))

In [None]:
# Compute the F1 score from the hard class predictions on the test split.
prediction = xgb_clf.predict(X_test)
f1 = f1_score(y_test, prediction)
print(f1)

In [None]:
# Rank the features by the importance assigned by the fitted XGBoost model.
features = X.columns
feature_imp = xgb_clf.feature_importances_

feat_imp_df = pd.DataFrame(
    {'Features': features, 'Feature_Importance': feature_imp}
).sort_values(by='Feature_Importance', ascending=False)

# Horizontal bar chart of the ten most important features.
top_features = feat_imp_df.iloc[:10]
fig, ax = plt.subplots(figsize=(12, 10))
ax.barh(y=top_features['Features'], width=top_features['Feature_Importance'])
ax.set_xlabel("Feature Importance")
ax.set_ylabel("Features")
plt.show()

# PCA

**Reduce the dimensionality using PCA**

In [None]:
# PCA centres the data but does not rescale it, so on the raw features the
# columns with the largest numeric ranges would dominate the components.
# Standardise every feature to zero mean / unit variance first.
# NOTE(review): the scaler and PCA are fit on all rows, before the train/test
# split, so the held-out rows influence the projection.
X_scaled = StandardScaler().fit_transform(X)

# Keep the smallest number of components that explains 90% of the variance.
pca = PCA(n_components=0.90)
X_red = pca.fit_transform(X_scaled)

In [None]:
# pca.components_ has shape (n_components, n_features); row 0 is the first
# principal component, i.e. the direction of greatest variance.
# Taking argmax over the whole 2-D array would return an index into the
# flattened array, which is not a valid column position. Instead, pick the
# feature with the largest absolute loading on the first component (the sign
# of a loading only encodes direction).
first_pc_loadings = np.abs(pca.components_[0])
imp_col_no_after_pca = int(np.argmax(first_pc_loadings))
imp_col_after_pca = X.columns[imp_col_no_after_pca]

print("The feature with most variance after PCA {}".format(imp_col_after_pca))

# Train XGBoost Model on the lower dimension dataset

In [None]:
# Split the PCA-reduced features with the same test_size and seed as the
# full-feature split, so both models are evaluated on the same held-out rows
# and their scores are directly comparable.
X_train, X_test, y_train, y_test = train_test_split(X_red, y, test_size=0.2, random_state=42)

In [None]:
# Fit the classifier on the PCA-reduced training split (reuses the
# `xgb_clf` object created for the full-feature model).
xgb_clf.fit(X_train, y_train)

In [None]:
# Accuracy of the PCA-based model on its held-out test split.
xgb_clf.score(X_test, y_test)

In [None]:
# ROC AUC of the PCA-based model on its held-out test split.
prediction_probability = xgb_clf.predict_proba(X_test)
# Column 1 is the second class of the classifier — presumably isFraud == 1.
fraud_scores = prediction_probability[:, 1]
print(roc_auc_score(y_test, fraud_scores))

**The ROC AUC score dropped significantly after reducing the dimensionality of the dataset.**