In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV,cross_validate
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor,VotingClassifier
from sklearn.preprocessing import MinMaxScaler,LabelEncoder, StandardScaler, RobustScaler
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression,LinearRegression, Ridge, Lasso, ElasticNet
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
import warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor,VotingRegressor
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score,roc_auc_score
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import classification_report

In [None]:
# Notebook-wide settings: silence library warnings and stop pandas from
# truncating rows/columns when a frame is printed.
warnings.filterwarnings("ignore")
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [None]:
# All competition files are read from the same Kaggle input directory.
DATA_DIR = '../input/ieee-fraud-detection'

train_transaction = pd.read_csv(f'{DATA_DIR}/train_transaction.csv')
train_identity = pd.read_csv(f'{DATA_DIR}/train_identity.csv')
test_transaction = pd.read_csv(f'{DATA_DIR}/test_transaction.csv')
test_identity = pd.read_csv(f'{DATA_DIR}/test_identity.csv')
sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [None]:
# Left-join the identity table onto the transactions so every transaction row
# is kept, whether or not it has a matching identity record.
train_df = pd.merge(train_transaction, train_identity, on="TransactionID", how="left")
test_df = pd.merge(test_transaction, test_identity, on="TransactionID", how="left")

In [None]:
# Replace every "-" in the test column names with "_".
test_df = test_df.rename(columns=lambda name: name.replace("-", "_"))

# EDA

In [None]:
def check_df(dataframe, head=5):
    """Print a quick structural overview of ``dataframe``.

    The sections are printed in this order: shape, column dtypes, the first
    ``head`` rows, the last ``head`` rows and the per-column count of missing
    values. Nothing is returned.
    """
    sections = (
        ("Shape", dataframe.shape),
        ("Types", dataframe.dtypes),
        ("Head", dataframe.head(head)),
        ("Tail", dataframe.tail(head)),
        ("NA", dataframe.isnull().sum()),
    )
    for title, content in sections:
        print(f"##################### {title} #####################")
        print(content)

In [None]:
# Structural overview of the merged training frame.
check_df(dataframe=train_df)

In [None]:
# Structural overview of the merged test frame.
check_df(dataframe=test_df)

In [None]:
# source: https://www.kaggle.com/c/ieee-fraud-detection/discussion/101203#latest-592110
# Categorical Features:
# ProductCD
# card1 - card6
# addr1, addr2
# P_emaildomain
# R_emaildomain
# M1 - M9
# DeviceType
# DeviceInfo
# id_12 - id_38

In [None]:
# Columns treated as categorical: ProductCD, card1-card6, addr1/addr2, the two
# e-mail domains, M1-M9, the device fields and id_12-id_38.
cat_cols = (
    ["ProductCD"]
    + [f"card{i}" for i in range(1, 7)]
    + ["addr1", "addr2", "P_emaildomain", "R_emaildomain"]
    + [f"M{i}" for i in range(1, 10)]
    + ["DeviceType", "DeviceInfo"]
    + [f"id_{i}" for i in range(12, 39)]
)

In [None]:
# Every remaining column is treated as numerical, except the row identifier
# and the target. Column order follows train_df.
non_numeric_cols = set(cat_cols) | {"TransactionID", "isFraud"}
num_cols = [col for col in train_df.columns if col not in non_numeric_cols]

# Analysis of Categorical Variables 

In [None]:
def cat_summary(dataframe, col_name, plot=False):
    """Print the value counts of ``col_name`` and their share of all rows.

    ``Ratio`` is the count divided by the total number of rows (in percent),
    so missing values reduce the ratios rather than appearing as a category.
    When ``plot`` is true a count plot of the column is shown as well.
    """
    counts = dataframe[col_name].value_counts()
    summary = pd.DataFrame({col_name: counts,
                            "Ratio": 100 * counts / len(dataframe)})
    print(summary)
    print("\n")

    if not plot:
        return
    sns.countplot(x=dataframe[col_name], data=dataframe)
    plt.show()

In [None]:
# Frequency table for every categorical column of the training data.
for col in cat_cols:
    cat_summary(dataframe=train_df, col_name=col)

# Analysis of Numerical Variables 

In [None]:
def num_summary(dataframe, numerical_col, plot=False):
    """Print descriptive statistics of ``numerical_col`` with extra percentiles.

    When ``plot`` is true a 50-bin histogram of the column is shown before the
    closing separator line is printed.
    """
    # Percentiles reported in addition to describe()'s count/mean/std/min/max.
    quantiles = [0.05, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 0.99]
    column = dataframe[numerical_col]
    print(column.describe(percentiles=quantiles).T)

    if plot:
        column.hist(bins=50)
        plt.xlabel(numerical_col)
        plt.title(numerical_col)
        plt.show()

    # Separator line followed by two blank lines.
    print("#####################################", end="\n\n\n")

In [None]:
# Descriptive statistics for every numerical column of the training data.
for col in num_cols:
    num_summary(dataframe=train_df, numerical_col=col)

# Number of Unique Values in Numerical Variables

In [None]:
# Number of distinct non-missing values in each numerical column.
for col in num_cols:
    print(f"{col}: {train_df[col].nunique()}")

# Number of Unique Values in Categorical Variables

In [None]:
# Number of distinct non-missing values in each categorical column.
for col in cat_cols:
    print(f"{col}: {train_df[col].nunique()}")

# Analysis of Categorical Variables According to Target Variable

In [None]:
def target_summary_with_cat(dataframe, target, categorical_col):
    """Print the mean of ``target`` for each level of ``categorical_col``."""
    target_means = dataframe.groupby(categorical_col)[target].mean()
    print(target_means.to_frame("TARGET_MEAN"), end="\n\n\n")

In [None]:
# Fraud rate per level of every categorical column.
for col in cat_cols:
    target_summary_with_cat(dataframe=train_df, target="isFraud", categorical_col=col)

# Analysis of Numerical Variables According to Target Variable

In [None]:
def target_summary_with_num(dataframe, target, numerical_col):
    """Print the mean of ``numerical_col`` for each value of ``target``."""
    means_by_target = dataframe.groupby(target)[[numerical_col]].mean()
    print(means_by_target, end="\n\n")
    print("#################################", end="\n\n")

In [None]:
# Mean of every numerical column, split by fraud / non-fraud.
for col in num_cols:
    target_summary_with_num(dataframe=train_df, target="isFraud", numerical_col=col)

# Missing Values

In [None]:
def missing_values_table(dataframe, na_name=False):
    """Print a table of missing-value counts and percentages per column.

    Only columns with at least one missing value are listed, sorted from most
    to least missing. ``ratio`` is the percentage of rows that are missing,
    rounded to two decimals.

    Returns ``(na_columns, missing_df)`` when ``na_name`` is true, otherwise
    ``None``.
    """
    # Columns with at least one null, in the frame's column order.
    na_columns = dataframe.columns[dataframe.isnull().any()].tolist()

    null_counts = dataframe[na_columns].isnull().sum()
    n_miss = null_counts.sort_values(ascending=False)
    ratio = (null_counts / dataframe.shape[0] * 100).sort_values(ascending=False)

    missing_df = pd.concat([n_miss, np.round(ratio, 2)], axis=1, keys=['n_miss', 'ratio'])
    print(missing_df, end="\n")

    if na_name:
        return na_columns, missing_df

In [None]:
# Keep the list of columns with nulls and the summary table; reset_index turns
# the column names from the index into a regular column.
na_cols, missing_df = missing_values_table(train_df, na_name=True)
missing_df = missing_df.reset_index()

# Outliers

In [None]:
def outlier_thresholds(dataframe, col_name, q1=0.01, q3=0.99):
    """Return ``(low_limit, up_limit)`` outlier bounds for ``col_name``.

    The bounds sit 1.5 times the ``q1``-``q3`` inter-quantile range below the
    ``q1`` quantile and above the ``q3`` quantile.
    """
    lower_q, upper_q = (dataframe[col_name].quantile(q) for q in (q1, q3))
    spread = upper_q - lower_q
    return lower_q - 1.5 * spread, upper_q + 1.5 * spread

In [None]:
def check_outlier(dataframe, col_name):
    """Return True when ``col_name`` has at least one value outside its outlier bounds.

    The bounds come from ``outlier_thresholds`` with its default quantiles.
    The row mask itself is tested: selecting the flagged rows and asking
    whether any cell in them is truthy gives the wrong answer when those rows
    hold only zeros/NaNs, and needlessly scans every column of the frame.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    values = dataframe[col_name]
    is_outlier = (values > up_limit) | (values < low_limit)
    return bool(is_outlier.any())

In [None]:
# Report, per numerical column, whether any value falls outside the bounds.
for col in num_cols:
    print(col, check_outlier(dataframe=train_df, col_name=col))

# Some Important Graphs

## Missing values graph

In [None]:
# Percentage of missing values per column; columns without missing values are
# dropped and at most the 500 most-missing columns are kept.
null_variables = (
    train_df.isnull().sum()
    .div(len(train_df))
    .mul(100)
    .loc[lambda pct: pct != 0]
    .sort_values(ascending=False)
    .head(500)
)
null_variables

In [None]:
# Bar chart of the missing-value percentage per feature.
fig, ax = plt.subplots(figsize=(40, 10))
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=null_variables.index, y=null_variables.values, ax=ax)
# Rotate the labels after the bars exist so the setting applies to the final
# ticks; the rotation must be a number, not the string '90'.
ax.tick_params(axis='x', labelrotation=90)
ax.set_xlabel('Features', fontsize=20)
ax.set_ylabel('Missing rate', fontsize=20);

## Fraud Graph

In [None]:
# Class balance of the target (isFraud) in the training data.
fig, ax = plt.subplots(figsize=(10, 5))
# The column is passed as x=...: a positional Series is read as `data` by
# seaborn >= 0.12, which does not plot the class counts.
sns.countplot(x='isFraud', data=train_df, palette=["#FFD500", "#005BBB"], ax=ax)
plt.show()

n_rows = train_df.shape[0]
fraud_pct = np.round(train_df['isFraud'].eq(1).sum() / n_rows * 100, 2)
legit_pct = np.round(train_df['isFraud'].eq(0).sum() / n_rows * 100, 2)
print('From total data ', fraud_pct, '% contains fraud train')
print('From total data ', legit_pct, '% contains legit train')

## TransactionDT

In [None]:
# Overlay the TransactionDT histograms of train and test on one axis.
fig, ax = plt.subplots(figsize=(15, 5))
ax.hist(train_df['TransactionDT'], bins=50, color="#005BBB", label='train')
ax.hist(test_df['TransactionDT'], bins=50, color="#FFD500", label='test')
ax.legend()
ax.set_title('Transaction dates');


In [None]:
# Smallest and largest TransactionDT value in each split.
for split_name, split_df in (("Train", train_df), ("Test", test_df)):
    print(f"{split_name} TransactionDT min : {split_df.TransactionDT.min()}")
    print(f"{split_name} TransactionDT max : {split_df.TransactionDT.max()}")

#### If TransactionDT is measured in seconds, dividing it by 60 × 60 × 24 (= 86,400) converts it to days

Time interval of the total dataset is 394.9993634259259 days

Time interval of Train dataset is  181.99920138888888 days

Time interval of Test dataset is  182.99908564814814 days

The gap between train and test is 30.00107638888889 days

In [None]:
# Derive an hour-of-day feature (0-23), treating TransactionDT as seconds.
for frame in (train_df, test_df):
    frame['hour'] = frame['TransactionDT'].floordiv(3600).mod(24)

# Within each isFraud class, the percentage of transactions falling in each hour.
hour_share = train_df.groupby(['isFraud'])['hour'].value_counts(normalize=True)
train_hour = (hour_share * 100).rename('percentage').reset_index().sort_values('hour')

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=train_hour, x="hour", y="percentage", hue="isFraud",
            palette=["#FFD500", "#005BBB"], ax=ax);

## TransactionAMT

In [None]:
# Distribution of TransactionAmt on the raw scale (left) and log scale (right).
fig, ax = plt.subplots(1, 2, figsize=(18,4))

# NOTE: despite its name, `time_val` holds the transaction amounts; the name is
# kept unchanged in case later cells refer to it.
time_val = train_df['TransactionAmt'].values
log_val = np.log(time_val)  # computed once and reused for the plot and limits

sns.distplot(time_val, ax=ax[0], color='#FFD500')
ax[0].set_title('Distribution of TransactionAmt', fontsize=14)
# The raw-amount limits belong to the left axis. They were previously applied
# to ax[1] and then overwritten, so the left panel never got them.
ax[0].set_xlim([time_val.min(), time_val.max()])

sns.distplot(log_val, ax=ax[1], color='#005BBB')
ax[1].set_title('Distribution of LOG TransactionAmt', fontsize=14)
ax[1].set_xlim([log_val.min(), log_val.max()])

plt.show()

## ProductCD

In [None]:
# Within each isFraud class, the percentage of transactions per ProductCD.
fig, ax = plt.subplots(figsize=(12, 6))

product_share = train_df.groupby(['isFraud'])['ProductCD'].value_counts(normalize=True)
train_ProductCD = (product_share * 100).rename('percentage').reset_index().sort_values('ProductCD')

sns.barplot(data=train_ProductCD, x="ProductCD", y="percentage", hue="isFraud",
            palette=["#FFD500", "#005BBB"], ax=ax);


## card3

In [None]:
# card3 distribution: legit vs fraud on train (left), all rows on test (right).
fig, (ax_train, ax_test) = plt.subplots(1, 2, figsize=(11, 6))

sns.distplot(train_df.loc[train_df['isFraud'] == 0, 'card3'].dropna(), color="#FFD500", ax=ax_train)
sns.distplot(train_df.loc[train_df['isFraud'] == 1, 'card3'].dropna(), color="#005BBB", ax=ax_train)
ax_train.legend(['Legit', 'Fraud'])
ax_train.set_title('Train')

sns.distplot(test_df['card3'].dropna(), color="#005BBB", ax=ax_test)
ax_test.set_title('Test');

## card4

In [None]:
plt.figure(figsize=(12,6))

# Left panel: within each isFraud class, the percentage of each card4 value
# (rows with missing card4 are excluded).
plt.subplot(1,2,1)
train_card4 = (train_df[~train_df['card4'].isnull()].groupby(['isFraud'])['card4']
                     .value_counts(normalize=True)
                     .rename('percentage')
                     .mul(100)
                     .reset_index()
                     .sort_values('card4'))
sns.barplot(x="card4", y="percentage", hue="isFraud", data=train_card4, palette=["#FFD500", "#005BBB"])
plt.title('Train')

# Right panel: percentage of each card4 value in the test data.
plt.subplot(1,2,2)
# Name the index explicitly before reset_index(): pandas < 2.0 produced a
# column called "index", pandas >= 2.0 produces "card4". Naming it here makes
# the category column "card4" on every version.
test_card4 = (test_df['card4']
                     .dropna()
                     .value_counts(normalize=True)
                     .mul(100)
                     .rename('percentage')
                     .rename_axis('card4')
                     .reset_index())
sns.barplot(x="card4", y="percentage", data=test_card4, palette=["#FFD500", "#005BBB"])
plt.title('Test');

## D4 (for outliers)

In [None]:
# D4 against TransactionDT: train coloured by isFraud (left), test (right).
# Rows with missing D4 are left out of both panels.
fig, (ax_train, ax_test) = plt.subplots(1, 2, figsize=(11, 6))

sns.scatterplot(data=train_df.dropna(subset=['D4']), x="TransactionDT", y="D4",
                hue="isFraud", hue_order=[0, 1], alpha=0.7,
                palette=["#FFD500", "#005BBB"], ax=ax_train)
ax_train.set_title('Train')

sns.scatterplot(data=test_df.dropna(subset=['D4']), x="TransactionDT", y="D4", ax=ax_test)
ax_test.set_title('Test');