In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import gc
import datetime
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV,cross_validate
from sklearn.preprocessing import MinMaxScaler,LabelEncoder, StandardScaler, RobustScaler
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score,roc_auc_score, classification_report
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.under_sampling import RandomUnderSampler

### Suppress warnings to keep the notebook output readable

In [None]:
# Silence every warning category from here on. Note that this also hides
# deprecation and convergence warnings that can matter when debugging.
warnings.simplefilter("ignore")

### Adjust the pandas display options so that all rows and columns are displayed

In [None]:
# None lifts pandas' truncation limits, so displayed frames show every column and row.
# Only display small slices (e.g. .head()); a full frame would be rendered in its entirety.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

### Importing data from Dataset

In [None]:
# Load the competition CSVs from the input directory.
# The transaction and identity tables are joined on TransactionID in the next step.
train_transaction=pd.read_csv('../input/ieee-fraud-detection/train_transaction.csv')
train_identity=pd.read_csv('../input/ieee-fraud-detection/train_identity.csv')
test_transaction=pd.read_csv('../input/ieee-fraud-detection/test_transaction.csv')
test_identity=pd.read_csv('../input/ieee-fraud-detection/test_identity.csv')
# NOTE(review): sample_submission is not referenced again in the visible cells.
sample_submission=pd.read_csv('../input/ieee-fraud-detection/sample_submission.csv')

### The data sets given separately were merged

In [None]:
# Left-join the identity table onto the transaction table so that every
# transaction row is kept, whether or not it has identity information.
train_df = pd.merge(train_transaction, train_identity, how="left", on="TransactionID")

test_df = pd.merge(test_transaction, test_identity, how="left", on="TransactionID")

### Fix the column-name mismatch between the train and test datasets (hyphens in the test column names are replaced with underscores)

In [None]:
# Replace every hyphen in the test column names with an underscore.
test_df = test_df.rename(columns=lambda name: name.replace("-", "_"))

# Reduce Memory

### Downcast numeric columns to smaller data types to reduce the memory usage of the data set

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype covering their range.

    Integer columns are cast to the first of int8/int16/int32/int64 whose
    limits contain the column's min and max (limits inclusive). Float columns
    are cast to float16, then float32, falling back to float64. Columns whose
    dtype is not one of the listed numeric dtypes are left untouched.

    Note that narrowing floats keeps the value *range* but may round values,
    because float16/float32 carry fewer significant digits than float64.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place.
    verbose : bool, default True
        When true, print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a column spanning exactly [-128, 127] fits int8.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Reached when the range exceeds float32, or when min/max are
                # NaN (e.g. an all-missing column) so no comparison succeeds.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-byte starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# reduce_mem_usage reassigns the columns of the frame it is given and returns
# that same frame, so these assignments rebind the names to the shrunk frames.
train_df = reduce_mem_usage(train_df)
test_df  = reduce_mem_usage(test_df)

# Filling Missing Values

In [None]:
# Use -999 as a sentinel for missing values in every column. This includes the
# string (object) columns, where the sentinel is later label-encoded like any
# other category.
train_df = train_df.fillna(-999)
test_df = test_df.fillna(-999)

# Label Encoding

### Use LabelEncoder to convert the string (object) columns to integer codes so the models can use them

In [None]:
# Label Encoding
# One encoder per column, fitted on the union of the train and test values, so
# a given category maps to the same integer code in both frames. Fitting
# separate encoders per frame would assign codes from each frame's own sorted
# category list, and the model would then read test codes with the wrong meaning.
for f in train_df.columns:
    if train_df[f].dtype=='object':
        lbl = LabelEncoder()
        # Cast to str so the -999 missing-value sentinel and real strings
        # form one uniformly typed category set.
        train_values = list(train_df[f].astype(str))
        if f in test_df.columns and test_df[f].dtype=='object':
            test_values = list(test_df[f].astype(str))
            lbl.fit(train_values + test_values)
            test_df[f] = lbl.transform(test_values)
        else:
            lbl.fit(train_values)
        train_df[f] = lbl.transform(train_values)

In [None]:
# Integer-encode every test column that is still of object dtype, using an
# encoder fitted on that column's own values.
object_columns = [name for name in test_df.columns if test_df[name].dtype == 'object']
for name in object_columns:
    column_values = list(test_df[name].values)
    test_df[name] = LabelEncoder().fit_transform(column_values)

# Creating models

In [None]:
# Features: everything except the target and the row identifier.
X = train_df.drop(columns=["isFraud", "TransactionID"])
# Target: the isFraud column.
y = train_df["isFraud"]
# The test frame only carries the identifier, which is not a feature either.
test_df = test_df.drop(columns=["TransactionID"])

In [None]:
# Hold out 30% of the rows for evaluation; stratify on y so both splits keep
# the same class proportions, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
    stratify=y,
)

## Modeling LightGBM

In [None]:
# Baseline LightGBM classifier with the library's default hyperparameters,
# fitted on the training split.
model_lgb = LGBMClassifier()
model_lgb.fit(X_train,y_train) 

In [None]:
# Hard 0/1 predictions feed the per-class precision/recall report.
pred = model_lgb.predict(X_test)
print(classification_report(y_test, pred))
# ROC AUC is a ranking metric: roc_auc_score takes (y_true, y_score), and the
# score must be the predicted probability of the positive class, not the
# thresholded label.
pred_proba = model_lgb.predict_proba(X_test)[:, 1]
print(f"Auc: {round(roc_auc_score(y_test, pred_proba), 2)}")

In [None]:
def plot_importance(model, features, num=50, save=False):
    """Draw a horizontal bar chart of the model's most important features.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``feature_importances_``.
    features : pandas.DataFrame
        Only ``features.columns`` is read, to label the importances.
    num : int or None, default 50
        Number of top-ranked features to plot; ``None`` plots all of them.
        The default matches the 50 bars this function has always drawn.
        (Previously the argument was ignored and its default was computed
        from a global ``X`` at definition time.)
    save : bool, default False
        When true, also write the figure to ``importances.png``.
    """
    feature_imp = pd.DataFrame({'Value': model.feature_importances_, 'Feature': features.columns})
    top_features = feature_imp.sort_values(by="Value", ascending=False).iloc[:num]

    plt.figure(figsize=(10, 10))
    sns.set(font_scale=1)
    sns.barplot(x="Value", y="Feature", data=top_features)
    plt.title('Features')
    plt.tight_layout()
    # Save before show(): once the figure has been shown, savefig would
    # typically write an empty canvas.
    if save:
        plt.savefig('importances.png')
    plt.show()

In [None]:
# Feature importances of the baseline LightGBM model; X_train is passed only
# for its column names.
plot_importance(model_lgb, X_train)

In [None]:
# Score the competition test set: keep the second predict_proba column
# (class 1, i.e. fraud, when the labels are 0/1).
predictions = model_lgb.predict_proba(test_df)[:,1]

In [None]:
# Pair each test TransactionID (as an integer) with its predicted fraud probability.
submission = pd.DataFrame({
    'TransactionID': test_transaction['TransactionID'].astype(int),
    'isFraud': predictions,
})
submission.head()

### Saving submission file

In [None]:
# Write the submission without the index column.
filename = 'CIS Fraud Detection LGBM.csv'

submission.to_csv(filename,index=False)

print('Saved file: ' + filename)

In [None]:
# Score: 0.893605
# Public score: 0.918328

## Modeling XGBoost

In [None]:
# Baseline XGBoost classifier with the library's default hyperparameters,
# fitted on the training split.
model_xgb = XGBClassifier() 
model_xgb.fit(X_train,y_train) 

In [None]:
# Hard 0/1 predictions feed the per-class precision/recall report.
pred2 = model_xgb.predict(X_test)
print(classification_report(y_test, pred2))
# ROC AUC is a ranking metric: roc_auc_score takes (y_true, y_score), and the
# score must be the predicted probability of the positive class, not the
# thresholded label.
pred2_proba = model_xgb.predict_proba(X_test)[:, 1]
print(f"Auc: {round(roc_auc_score(y_test, pred2_proba), 2)}")

In [None]:
# Feature importances of the baseline XGBoost model; X_train is passed only
# for its column names.
plot_importance(model_xgb, X_train)

In [None]:
# Score the competition test set: keep the second predict_proba column
# (class 1, i.e. fraud, when the labels are 0/1).
predictions2 = model_xgb.predict_proba(test_df)[:,1]

In [None]:
# Pair each test TransactionID (as an integer) with its predicted fraud probability.
submission = pd.DataFrame({
    'TransactionID': test_transaction['TransactionID'].astype(int),
    'isFraud': predictions2,
})
submission.head()

### Saving submission file

In [None]:
# Write the submission without the index column.
filename = 'CIS Fraud Detection XGB.csv'

submission.to_csv(filename,index=False)

print('Saved file: ' + filename)

In [None]:
# Score: 0.891545
# Public score: 0.916079

# Applying Random Undersampling

In [None]:
# Class balance of the training split before resampling.
print("Before UnderSampling, counts of label '1': {}".format((y_train == 1).sum()))
print("Before UnderSampling, counts of label '0': {} \n".format((y_train == 0).sum()))

# sampling_strategy=0.2 asks for a minority/majority ratio of 0.2 after
# resampling; random_state fixes which majority rows are kept.
undersample = RandomUnderSampler(sampling_strategy=0.2, random_state=3)

# Resample the training split only. Resampling the full X, y would put
# held-out rows into the training data and inflate every score later
# computed on X_test.
X_train_res, y_train_res = undersample.fit_resample(X_train, y_train)

print('After UnderSampling, the shape of train_X: {}'.format(X_train_res.shape))
print('After UnderSampling, the shape of train_y: {} \n'.format(y_train_res.shape))

print("After UnderSampling, counts of label '1': {}".format((y_train_res == 1).sum()))
print("After UnderSampling, counts of label '0': {}".format((y_train_res == 0).sum()))

## Modeling LightGBM after Random Undersampling

In [None]:
# LightGBM with default hyperparameters, fitted on the undersampled data.
model_lgbm = LGBMClassifier()  
model_lgbm.fit(X_train_res,y_train_res) 

In [None]:
# Hard 0/1 predictions feed the per-class precision/recall report.
pred3 = model_lgbm.predict(X_test)
print(classification_report(y_test, pred3))
# ROC AUC is a ranking metric: roc_auc_score takes (y_true, y_score), and the
# score must be the predicted probability of the positive class, not the
# thresholded label.
pred3_proba = model_lgbm.predict_proba(X_test)[:, 1]
print(f"Auc: {round(roc_auc_score(y_test, pred3_proba), 2)}")

In [None]:
# Feature importances of the LightGBM model fitted on the undersampled data;
# X_train is passed only for its column names.
plot_importance(model_lgbm, X_train)

In [None]:
# Score the competition test set: keep the second predict_proba column
# (class 1, i.e. fraud, when the labels are 0/1).
predictions3 = model_lgbm.predict_proba(test_df)[:,1]

In [None]:
# Pair each test TransactionID (as an integer) with its predicted fraud probability.
submission = pd.DataFrame({
    'TransactionID': test_transaction['TransactionID'].astype(int),
    'isFraud': predictions3,
})
submission.head()

### Saving submission file

In [None]:
# Write the submission without the index column.
filename = 'CIS Fraud Detection under sampling lgbm.csv'

submission.to_csv(filename,index=False)

print('Saved file: ' + filename)

In [None]:
# Score: 0.901909
# Public score: 0.923298

## Modeling XGBoost after Random Undersampling

In [None]:
# XGBoost with default hyperparameters, fitted on the undersampled data.
# NOTE: this rebinds model_xgb, replacing the XGBoost model that was fitted
# before undersampling.
model_xgb = XGBClassifier()   
model_xgb.fit(X_train_res,y_train_res) 

In [None]:
# Hard 0/1 predictions feed the per-class precision/recall report.
pred4 = model_xgb.predict(X_test)
print(classification_report(y_test, pred4))
# ROC AUC is a ranking metric: roc_auc_score takes (y_true, y_score), and the
# score must be the predicted probability of the positive class, not the
# thresholded label.
pred4_proba = model_xgb.predict_proba(X_test)[:, 1]
print(f"Auc: {round(roc_auc_score(y_test, pred4_proba), 2)}")

In [None]:
# Feature importances of the XGBoost model currently bound to model_xgb;
# X_train is passed only for its column names.
plot_importance(model_xgb, X_train)

In [None]:
# Score the competition test set: keep the second predict_proba column
# (class 1, i.e. fraud, when the labels are 0/1).
predictions4 = model_xgb.predict_proba(test_df)[:,1]

In [None]:
# Pair each test TransactionID (as an integer) with its predicted fraud probability.
submission = pd.DataFrame({
    'TransactionID': test_transaction['TransactionID'].astype(int),
    'isFraud': predictions4,
})
submission.head()

### Saving submission file

In [None]:
# Write the submission without the index column.
filename = 'CIS Fraud Detection under sampling xgb.csv'

submission.to_csv(filename,index=False)

print('Saved file: ' + filename)

In [None]:
# Score: 0.892110
# Public score: 0.920813