Reference: https://www.kaggle.com/nroman/lgb-single-model-lb-0-9419
> https://www.kaggle.com/roydatascience/light-gbm-with-complete-eda

> Please give your feedback

**Importing the necessary libraries**

In [None]:
import pandas as pd
import numpy as np
import multiprocessing
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import gc
from time import time
import datetime
from tqdm import tqdm_notebook
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.metrics import roc_auc_score
warnings.simplefilter('ignore')
sns.set()
%matplotlib inline

**Importing datasets**

In [None]:
# Sample submission file; its 'isFraud' column is overwritten with the model's
# predictions before the file is written out at the end of the notebook.
sub = pd.read_csv("../input/ieee-fraud-detection/sample_submission.csv")

In [None]:
# Raw training tables: identity records and transactions. They are joined on
# TransactionID further down.
train_id = pd.read_csv("../input/ieee-fraud-detection/train_identity.csv")
train_tr = pd.read_csv("../input/ieee-fraud-detection/train_transaction.csv")

In [None]:
train_id.head(5)

In [None]:
# Preview the transaction table (the identity table is already previewed in
# the preceding cell, so repeating it here showed nothing new).
train_tr.head(5)

In [None]:
train_id.shape, train_tr.shape

In [None]:
# Raw test tables, loaded the same way as the training tables.
test_id = pd.read_csv("../input/ieee-fraud-detection/test_identity.csv")
test_tr = pd.read_csv("../input/ieee-fraud-detection/test_transaction.csv")

In [None]:
test_id.shape, test_tr.shape

**Merging transaction and identity data**

In [None]:
# Attach the identity columns to every transaction. A left join keeps
# transactions that have no identity record; their identity columns are NaN.
train = train_tr.merge(train_id, how='left', on='TransactionID')
test = test_tr.merge(test_id, how='left', on='TransactionID')

# The raw tables are no longer needed: drop the references and reclaim memory.
del train_id, train_tr
del test_id, test_tr
gc.collect()

**Negative Downsampling**

In [None]:
# Negative downsampling: keep every fraud row plus an equally sized random
# sample of non-fraud rows, then put the combined rows back in index order.
train_pos = train.loc[train['isFraud'] == 1]
train_neg = train.loc[train['isFraud'] == 0]

n_pos = len(train_pos)
train_neg = train_neg.sample(n=n_pos, random_state=42)
train = pd.concat([train_pos, train_neg]).sort_index()

In [None]:
train_pos.shape, train_neg.shape

In [None]:
# Row count of the downsampled training set (all positives plus an equal number
# of sampled negatives). Used later as the boundary when the concatenated
# train+test frame is sliced back into X and test.
l = 2*int(train_pos.shape[0])

In [None]:
train.shape

In [None]:
train.head(2)

In [None]:
del train_pos
del train_neg

> From the output below we can see that many features consist almost entirely (close to 99%) of NaN values

In [None]:
train.isna().sum()

> Sorting the rows by TransactionDT

In [None]:
# Order the rows chronologically. The label vector is extracted after this
# sort, and validation later uses TimeSeriesSplit, which splits by row position.
train = train.sort_values('TransactionDT')

**Taking all features**
> Initially I will start with all the features and then will drop most of the features on the basis of count

In [None]:
# Start with every column; the list is rebuilt later, once the engineered
# columns exist and the excluded ones are known.
useful_features = list(train.columns)

> From below we can see that length of features is 434

In [None]:
len(useful_features)

In [None]:
train.shape

In [None]:
train.isna().sum()

**Displaying all the columns**

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
train.shape, test.shape

In [None]:
train.head(10)

In [None]:
# Separate the label from the features: pop() returns the 'isFraud' column and
# removes it from the frame in place.
target = train.pop("isFraud")

**Time features (added before concatenating train and test into one dataframe)**

In [None]:
# Calendar-style features derived from TransactionDT.
# NOTE(review): the divisors assume TransactionDT is measured in seconds -- confirm.
seconds_per_hour = 3600
seconds_per_day = 3600 * 24
for frame in (train, test):
    frame['Transaction_day_of_week'] = np.floor((frame['TransactionDT'] / seconds_per_day - 1) % 7)
    frame['Transaction_hour'] = np.floor(frame['TransactionDT'] / seconds_per_hour) % 24


**Card feature**

In [None]:
# Composite keys built by joining card/address columns as strings with '_'.
# Each key extends the previous one, so the assignments must stay in this order.
for frame in (train, test):
    card3 = frame['card3'].astype(str)
    card5 = frame['card5'].astype(str)

    frame['uid'] = frame['card1'].astype(str) + '_' + frame['card2'].astype(str)
    frame['uid1'] = frame['uid'].astype(str) + '_' + card3
    frame['uid2'] = frame['uid'].astype(str) + '_' + card3 + '_' + card5
    frame['uid3'] = (
        frame['uid2'].astype(str)
        + '_' + frame['addr1'].astype(str)
        + '_' + frame['addr2'].astype(str)
    )
    frame['uid4'] = frame['card4'].astype(str) + '_' + frame['card6'].astype(str)

# Flag (1/0) whether a row's TransactionAmt also occurs in the other data set.
train_amt_in_test = train['TransactionAmt'].isin(test['TransactionAmt'])
test_amt_in_train = test['TransactionAmt'].isin(train['TransactionAmt'])
train['TransactionAmt_check'] = np.where(train_amt_in_test, 1, 0)
test['TransactionAmt_check'] = np.where(test_amt_in_train, 1, 0)


**Id Features**

In [None]:
# Composite identity keys: id_35..id_38 joined as strings with '_', each key
# extending the previous one by one more column.
for frame in (train, test):
    frame['id'] = frame['id_35'].astype(str) + '_' + frame['id_36'].astype(str)
    frame['id1'] = frame['id'].astype(str) + '_' + frame['id_37'].astype(str)
    frame['id2'] = frame['id1'].astype(str) + '_' + frame['id_38'].astype(str)


In [None]:
# Stack the test rows underneath the training rows so the following feature
# steps run over both. The training rows stay first, which is what allows them
# to be sliced back out by position (first `l` rows) later on.
train = pd.concat([train,test])

In [None]:
train.head(3)

In [None]:
# The transaction identifier and the raw timestamp are removed from the
# feature frame; the second line displays the resulting shape.
train.drop(columns=["TransactionID", "TransactionDT"], inplace=True)
train.shape

**Here I will treat all features as categorical except TransactionAmt, Transaction_day_of_week and Transaction_hour**

In [None]:
# Columns excluded from the count-encoding feature list that is built next.
neglect = ["TransactionAmt", 'Transaction_day_of_week', 'Transaction_hour']

In [None]:
# Every current column except the ones listed in `neglect`, in column order.
excluded = set(neglect)
useful_features = [name for name in train.columns if name not in excluded]

In [None]:
# Number of missing values in each row. With several hundred columns the count
# can exceed 127, which overflows int8 and wraps to a negative number; the old
# abs() only hid the sign and left a wrong count. int16 holds the true value.
train['M_na'] = train.isna().sum(axis=1).astype(np.int16)

In [None]:
# Features whose raw column is kept next to its count encoding:
# C1..C14, D1 and the per-row missing-value count.
non_nan = ["C{}".format(k) for k in range(1, 15)] + ["D1", "M_na"]

In [None]:
#"V96", "V97", "V98", "V99", "V100", "V101", "V102", "V103", "V104", "V105", "V106",\
#          "V107", "V108", "V109", "V110", "V111", "V112", "V113", "V114", "V115", "V116", "V117","V118",\
 #         "V119", "V120", "V121","V122","V123", "V124", "V125", "V126", "V127", "V128", "V129", "V130",\
  #        "V131", "V132", "V133", "V134", "V135", "V136", "V137", "V297", "V298", "V299", "V300",\
   #       "V301", "V301", "V302", "V303", "V304", "V305", "V306", "V307", "V308", "V309", "V310",\
    #      "V311", "V312", "V313", "V314", "V315", "V316", "V317", "V318", "V319", "V320", "V321",\
     #     "V279", "V280", "V281", "V282", "V283", "V284", "V285", "V286", "V287", "V288", "V289", "V290",\
      #    "V291", "V292", "V293", "V294", "V295", "V296"]

**This block of code count-encodes every feature and drops the original column (except for the features listed in `non_nan`)**

In [None]:
# Count (frequency) encoding. `train` holds the training and test rows together
# at this point, so each value is replaced by its frequency over the combined
# data -- not separately per data set. NaN is counted as a value of its own
# (dropna=False).
for i, feature in enumerate(useful_features):
    counts = train[feature].value_counts(dropna=False)
    train[feature + '_count_dist'] = train[feature].map(counts)
    # The raw column is kept only for the features listed in non_nan.
    if feature not in non_nan:
        train.drop([feature], axis=1, inplace=True)
    print("Done" + str(i))
        

**Dropping the features below, as they seem to be redundant**

In [None]:
# Count-encoded columns to discard. The names are generated from inclusive
# V-column ranges so that each one appears exactly once; the hand-written list
# repeated V205, V237, V238 and V239, which inflated len(dropping).
# NOTE(review): the repeated V237-V239 came right after V336 and may have been
# meant as V337-V339 -- confirm before extending the list.
_v_ranges = [(138, 141), (144, 166), (168, 168), (170, 216),
             (218, 219), (221, 278), (323, 336)]
_id_suffixes = ["04", "06", "08", "10", "22", "27", "29", "36", "37", "38"]

dropping = (
    ["D8_count_dist"]
    + ["V{}_count_dist".format(n) for lo, hi in _v_ranges for n in range(lo, hi + 1)]
    + ["id_{}_count_dist".format(s) for s in _id_suffixes]
)


In [None]:
# Remove the columns named in `dropping` from the feature frame.
train = train.drop(columns=dropping)

In [None]:
len(dropping)

**Log**

In [None]:
#useful_features = [col for col in train.columns]
#for feature in useful_features:
    
        # Count encoded separately for train and test
#    train[feature] = np.log(train[feature])
    
    
    

> Below we can see that all I am left with is count

In [None]:
train.head(4)

In [None]:
train.shape

In [None]:
#X = train.sort_values('TransactionDT').drop(['isFraud', 'TransactionDT', 'TransactionID'], axis=1)#
#y = train.sort_values('TransactionDT')['isFraud']
#test = test.sort_values('TransactionDT').drop(['TransactionDT', 'TransactionID'], axis=1)

In [None]:
train.isna().sum()

In [None]:
# Replace any remaining NaN with 0, modifying the frame in place.
train.fillna(0, inplace = True)

In [None]:
#del train
#gc.collect()

**Separating the data into train and test again**

In [None]:
# Undo the concatenation by position: the first `l` rows are the training
# rows, everything after them is the test set.
X = train.iloc[:l]
test = train.iloc[l:]

In [None]:
y=target

**Train-test split**

In [None]:
# Training and Validation Set
#from sklearn.model_selection import train_test_split
#X_train, X_valid, y_train, y_valid = train_test_split(train, target, test_size=0.20, random_state=23)

> **LightGBM**

In [None]:
from catboost import CatBoostRegressor  # NOTE(review): not used in the visible cells
# Positional indices of the columns whose dtype is not float64. The alias
# np.float was removed in NumPy 1.24; it was the builtin float, which compares
# equal to the float64 dtype, so np.float64 gives the same result.
categorical_var = np.where(train.dtypes != np.float64)[0]
print('\nCategorical Variables indices : ',categorical_var)

In [None]:
del train

In [None]:
# LightGBM hyper-parameters, shared by the cross-validation runs and the final
# model fitted for the submission.
params = dict(
    num_leaves=491,
    min_child_weight=0.03454472573214212,
    feature_fraction=0.3797454081646243,
    bagging_fraction=0.4181193142567742,
    min_data_in_leaf=106,
    objective='binary',
    max_depth=-1,
    learning_rate=0.006883242363721497,
    boosting_type="gbdt",
    bagging_seed=11,
    metric='auc',
    verbosity=-1,
    reg_alpha=0.3899927210061127,
    reg_lambda=0.6485237330340494,
    random_state=47,
    n_jobs=-1,
)

In [None]:
# Time-ordered cross-validation: TimeSeriesSplit makes every fold train on an
# initial run of rows and validate on the rows that follow it.
folds = TimeSeriesSplit(n_splits=10)

aucs = list()
feature_importances = pd.DataFrame()
feature_importances['feature'] = X.columns

training_start_time = time()
for fold, (trn_idx, test_idx) in enumerate(folds.split(X, y)):
    start_time = time()
    print('Training on fold {}'.format(fold + 1))

    trn_data = lgb.Dataset(X.iloc[trn_idx], label=y.iloc[trn_idx])
    val_data = lgb.Dataset(X.iloc[test_idx], label=y.iloc[test_idx])
    # Early stopping and periodic logging are passed as callbacks: the
    # `early_stopping_rounds` / `verbose_eval` keyword arguments are no longer
    # accepted by lgb.train in LightGBM 4.x. (lgb.log_evaluation needs
    # LightGBM >= 3.3.)
    clf = lgb.train(
        params,
        trn_data,
        num_boost_round=10000,
        valid_sets=[trn_data, val_data],
        callbacks=[
            lgb.early_stopping(stopping_rounds=500),
            lgb.log_evaluation(period=1000),
        ],
    )

    feature_importances['fold_{}'.format(fold + 1)] = clf.feature_importance()
    # valid_sets has no explicit names, so the second entry is reported as 'valid_1'.
    aucs.append(clf.best_score['valid_1']['auc'])

    print('Fold {} finished in {}'.format(fold + 1, str(datetime.timedelta(seconds=time() - start_time))))
print('-' * 30)
print('Training has finished.')
print('Total training time is {}'.format(str(datetime.timedelta(seconds=time() - training_start_time))))
print('Mean AUC:', np.mean(aucs))
print('-' * 30)

In [None]:
# Average the per-fold importances and save the table to disk.
fold_columns = ['fold_{}'.format(k) for k in range(1, folds.n_splits + 1)]
feature_importances['average'] = feature_importances[fold_columns].mean(axis=1)
feature_importances.to_csv('feature_importances.csv')

# Bar chart of the 50 features with the highest average importance.
top_features = feature_importances.sort_values(by='average', ascending=False).head(50)
plt.figure(figsize=(16, 16))
sns.barplot(data=top_features, x='average', y='feature');
plt.title('50 TOP feature importance over {} folds average'.format(folds.n_splits));

In [None]:
# clf is now the model from the last TimeSeriesSplit fold (10 splits), i.e. it
# was trained on the earlier rows and validated on the final segment -- not an
# 80%/20% split. Its best iteration is reused as the number of boosting rounds
# for the final model.
best_iter = clf.best_iteration

**Submission**

In [None]:
# Final model: same hyper-parameters plus the boosting-round count found during
# cross-validation, fitted on the whole downsampled training set.
final_params = dict(params, num_boost_round=best_iter)
clf = lgb.LGBMClassifier(**final_params)
clf.fit(X, y)

In [None]:
# Column 1 of predict_proba is the probability of the positive class
# (isFraud == 1). NOTE(review): this relies on the rows of `test` being in the
# same order as the sample submission -- confirm.
sub['isFraud'] = clf.predict_proba(test)[:, 1]

In [None]:
sub.to_csv('ieee_cis_fraud_detection_new.csv', index=False)

> Thank you all. Please let me know where I went wrong.
> Thank you