In [None]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn import ensemble,metrics,cross_decomposition,linear_model,model_selection,preprocessing
import os
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [None]:
trainIdentity=pd.read_csv('train_identity.csv')
trainTranscation=pd.read_csv('train_transaction.csv')
trainIdentity.shape,trainTranscation.shape

In [None]:
testIdentity=pd.read_csv('test_identity.csv')
testTranscation=pd.read_csv('test_transaction.csv')
testIdentity.shape,testTranscation.shape

In [None]:
combinedData=pd.merge(left=trainTranscation,right=trainIdentity,on='TransactionID',how='left')
combinedDataTest=pd.merge(left=testTranscation,right=testIdentity,on='TransactionID',how='left')

In [None]:
del testIdentity,testTranscation,trainIdentity,trainTranscation

In [None]:
[(2826, 'card1'),
 (2481, 'card2'),
 (1865, 'addr1'),
 (1801, 'TransactionAmt'),
 (1425, 'C13'),
 (1198, 'D15'),
 (1162, 'D2'),
 (1097, 'C1'),
 (1056, 'P_emaildomain'),
 (1024, 'D1'),
 (982, 'D10'),
 (977, 'card5'),
 (958, 'D4'),
 (884, 'transDay'),
 (851, 'dist1'),
 (842, 'C14'),
 (796, 'C11'),
 (794, 'transMonth'),
 (766, 'C2'),
 (753, 'D8')]

impVar=['card1', 'card2', 'addr1', 'TransactionAmt', 'C13', 'D15', 'D2', 'C1', 'P_emaildomain', 'D1', 'D10', 'card5', 'D4', 'transDay', 'dist1', 'C14', 'C11', 'transMonth', 'C2', 'D8']

In [None]:
combinedDataTest['diffName']='test'
combinedData['diffName']='train'

newSetData=pd.concat([combinedData,combinedDataTest],axis=0)

In [None]:
# Per-card TransactionAmt statistics computed on the training frame.
# The nested-dict renaming form of ``agg`` ({'col': {'new_name': func}}) is
# rejected by pandas >= 1.0 (SpecificationError), so each statistic is computed
# on the selected column and renamed explicitly instead.
def _groupAmtStat(frame, groupCol, stat, statName):
    """Aggregate ``TransactionAmt`` per value of ``groupCol``.

    Parameters
    ----------
    frame : DataFrame holding ``groupCol`` and ``TransactionAmt``.
    groupCol : name of the column to group by.
    stat : aggregation name understood by pandas (e.g. 'mean', 'std').
    statName : column name given to the aggregated values.

    Returns
    -------
    (table, lookup) : ``table`` has the columns [groupCol, statName];
    ``lookup`` maps each group value to its statistic.
    """
    table = (frame.groupby(groupCol)['TransactionAmt']
                  .agg(stat)
                  .rename(statName)
                  .reset_index())
    lookup = dict(zip(table[groupCol], table[statName]))
    return table, lookup

cardVsTransAmTest1, cardVsTransAmTest1Dict = _groupAmtStat(combinedData, 'card1', 'mean', 'Card1VsTrans')

cardVsTransAmTest1Std, cardVsTransAmTest1StdDict = _groupAmtStat(combinedData, 'card1', 'std', 'Card1VsTransStd')

cardVsTransAmTest4, cardVsTransAmTest4Dict = _groupAmtStat(combinedData, 'card4', 'mean', 'Card4VsTrans')

cardVsTransAmTest4Std, cardVsTransAmTest4StdDict = _groupAmtStat(combinedData, 'card4', 'std', 'Card4VsTransStd')

In [None]:
len(cardVsTransAmTest1Dict),len(cardVsTransAmTest1StdDict),len(cardVsTransAmTest4Dict),len(cardVsTransAmTest4StdDict)

In [None]:
def browserDef(brRec):
    """Collapse a raw browser string into a coarse browser family label.

    Parameters
    ----------
    brRec : str or NaN
        Raw browser description (applied to the ``id_31`` column).

    Returns
    -------
    str
        'other' for missing values, one of the family labels below when a
        marker is found, otherwise the input string unchanged.

    The checks run in order and the first match wins, so more specific
    markers (e.g. 'mobile safari') must stay ahead of generic ones ('safari').
    """
    if str(brRec)=='nan':
        return 'other'
    else:
        if 'samsung' in brRec:
            return 'Samsung Browser'
        elif 'mobile safari' in brRec:
            return 'Mobile Safari'
        elif 'chrome' in brRec:
            return 'Chrome Browser'
        elif 'edge' in brRec:
            return 'Edge Browser'
        # Match 'ie' as a whole token: a plain substring test also fires on
        # strings such as 'android webview', which would be mislabelled as IE.
        elif 'ie' in brRec.split():
            return 'IE Browser'
        elif 'firefox' in brRec:
            return 'Firefox Browser'
        elif 'opera' in brRec:
            return 'Opera Browser'
        elif ('Android' in brRec) or ('android' in brRec):
            return 'Android Browser'
        elif 'Mozilla' in brRec:
            return 'Mozilla Browser'
        elif 'safari' in brRec:
            return 'Safari  Browser'
        elif 'google' in brRec:
            return 'Google Browser'
        else:
            return brRec

In [None]:
def deviceDef(brRec):
    """Map a raw operating-system string to a coarse device family.

    Missing values (anything whose ``str`` is 'nan') become 'other'. The
    markers are tested in the listed order and the first one contained in
    ``brRec`` decides the label; strings without any marker are returned
    unchanged.
    """
    if str(brRec) == 'nan':
        return 'other'

    osFamilies = (
        ('Android', 'Android Device'),
        ('iOS', 'iOS Device'),
        ('Windows', 'Windows Device'),
        ('Mac', 'Mac OS Device'),
    )
    for marker, familyLabel in osFamilies:
        if marker in brRec:
            return familyLabel
    return brRec

In [None]:
def screenReso(xRes):
    """Bucket a 'WIDTHxHEIGHT' resolution string by its width.

    Returns 'No info' for missing values; otherwise the width (the integer
    before the first 'x') selects the bucket:
    <=850 'Small Screen', <=2050 'Med Screen', <=2250 '2K Screen',
    anything wider '4K Screen'.
    """
    if str(xRes) == 'nan':
        return 'No info'

    width = int(xRes.split('x')[0])
    # Upper bounds are inclusive and checked from narrowest to widest.
    for upperBound, bucket in ((850, 'Small Screen'),
                               (2050, 'Med Screen'),
                               (2250, '2K Screen')):
        if width <= upperBound:
            return bucket
    return '4K Screen'

In [None]:
def defGetCountryFromDomain(brRec):
    """Derive a coarse region label from an e-mail domain string.

    Missing values and domains without any known suffix yield 'Other'.
    Suffixes are tested in order, so country suffixes take precedence over
    the generic '.com' / '.net' ones (e.g. 'yahoo.com.mx' is Mexican).
    """
    if str(brRec) == 'nan':
        return 'Other'

    suffixLabels = (
        ('.mx', 'Mexico email'),
        ('.jp', 'Japan email'),
        ('.uk', 'UK email'),
        ('.de', 'Germany email'),
        ('.es', 'Spain email'),
        ('.fr', 'France email'),
        ('.com', 'Global email'),
        ('.net', 'Net email'),
    )
    for suffix, regionLabel in suffixLabels:
        if suffix in brRec:
            return regionLabel
    return 'Other'

In [None]:
# #added on 2nd trial
# combinedData['id_33']=combinedData['id_33'].apply(lambda x: screenReso(x))
# #added on 3rd trial
# # combinedData['id_31']=combinedData['id_31'].apply(lambda x: browserDef(x))
# #added on 4th trial
# #combinedData['id_30']=combinedData['id_30'].apply(lambda x: deviceDef(x))

# combinedData['CountryDomain']=combinedData['P_emaildomain'].apply(lambda x: defGetCountryFromDomain(x))
# # combinedData['R_emaildomain']=combinedData['R_emaildomain'].fillna('Other')
# # combinedData['P_emaildomain']=combinedData['P_emaildomain'].fillna('Other')

In [None]:
def defCard6(brRec):
    """Normalise a card-type value.

    Missing values become 'Other'; any string containing 'debit or credit'
    is folded into 'debit'; every other value is returned as is.
    """
    if str(brRec) == 'nan':
        return 'Other'
    return 'debit' if 'debit or credit' in brRec else brRec
        


In [None]:
newSetData['id_33']=newSetData['id_33'].apply(lambda x: screenReso(x))
newSetData['id_31']=newSetData['id_31'].apply(lambda x: browserDef(x))
newSetData['id_30']=newSetData['id_30'].apply(lambda x: deviceDef(x))
newSetData['card6']=newSetData['card6'].apply(lambda x: defCard6(x))
newSetData['CountryDomain']=newSetData['P_emaildomain'].apply(lambda x: defGetCountryFromDomain(x))

In [None]:
categoricalCola=['id_33','id_31','id_30','card6','CountryDomain']

getDummVar=pd.get_dummies(newSetData,columns=categoricalCola)

In [None]:
nP=np.percentile(getDummVar['TransactionAmt'],99)
getDummVar['TransactionAmt']=getDummVar['TransactionAmt'].apply(lambda x: nP if x >= nP else x)
# sns.distplot(combinedData['TransactionAmt'])
getDummVar.shape

In [None]:
getDummVar.head()

In [None]:
# Derive time-of-day / calendar-like features from TransactionDT.
# NOTE(review): the arithmetic treats TransactionDT as a count of seconds and
# removes a one-day (86400) offset first — confirm against the data description.
tempTimeSer=((getDummVar['TransactionDT']-86400)).map(int)
# Second within the minute, minute within the hour, hour within the day.
getDummVar['transSec']=tempTimeSer%60
getDummVar['transMin']=(tempTimeSer/60).map(int)%60
getDummVar['transHour']=(tempTimeSer/3600).map(int)%24
# "Months" are consecutive 30-day buckets numbered from 1 (not calendar months);
# the result stays a float because it comes from float division.
getDummVar['transMonth']=((tempTimeSer/86400)//30)+1
# Day index inside the 30-day bucket, numbered from 1.
getDummVar['transDay']=((tempTimeSer/86400)%30).map(int)+1

In [None]:
del getDummVar['TransactionDT']
del getDummVar['TransactionID']

In [None]:
getDummVar.shape

In [None]:
trainDataPart=getDummVar[getDummVar['diffName']=='train']
testDataPart=getDummVar[getDummVar['diffName']=='test']
del testDataPart['isFraud']
del getDummVar
del newSetData

In [None]:
del trainDataPart['diffName']
del testDataPart['diffName']

In [None]:
y_train = trainDataPart['isFraud'].copy()
# del train_transaction, train_identity, test_transaction, test_identity

# Drop target, fill in NaNs
X_train = trainDataPart.drop('isFraud', axis=1)
X_test = testDataPart.copy()

del trainDataPart, testDataPart

X_train = X_train.fillna(-999)
X_test = X_test.fillna(-999)

# Label Encoding
for f in X_train.columns:
    if X_train[f].dtype=='object' or X_test[f].dtype=='object': 
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(X_train[f].values) + list(X_test[f].values))
        X_train[f] = lbl.transform(list(X_train[f].values))
        X_test[f] = lbl.transform(list(X_test[f].values))

In [None]:
X_train.head()

In [None]:
X_train.shape,X_test.shape
#((590540, 436), (506691, 436))
#((590540, 436), (506691, 436))
#((590540, 449), (506691, 450))
#((590540, 496), (506691, 496))

In [None]:
[i for i in list(X_test.columns)  if i not in list(X_train.columns)]

In [None]:
import lightgbm as lgb

In [None]:
X_train['isFraud']=y_train

In [None]:
trainPart=X_train[X_train['transMonth']<=4]
valPart=X_train[X_train['transMonth']==5]
testPart=X_train[X_train['transMonth']>5]
trainPart.shape,valPart.shape,testPart.shape

In [None]:
len((pd.unique(trainPart['C1'])).astype(int))

In [None]:
trainPart[impVar].head(100)

## For train Val part

In [None]:
# cardVsTransAmtrainPart=trainPart.groupby('card1').agg({'TransactionAmt':{'Card1VsTrans':'mean'}})
# cardVsTransAmtrainPart.columns = cardVsTransAmtrainPart.columns.droplevel(level=0)
# cardVsTransAmtrainPart=cardVsTransAmtrainPart.reset_index()
# cardVsTransAmtrainPartDict={cardVsTransAmtrainPart['card1'].iloc[i]:cardVsTransAmtrainPart['Card1VsTrans'].iloc[i] for i in range(cardVsTransAmtrainPart.shape[0])}

# cardVsTransAmtrainPartStd1=trainPart.groupby('card1').agg({'TransactionAmt':{'Card1VsTransStd':'std'}})
# cardVsTransAmtrainPartStd1.columns = cardVsTransAmtrainPartStd1.columns.droplevel(level=0)
# cardVsTransAmtrainPartStd1=cardVsTransAmtrainPartStd1.reset_index()
# cardVsTransAmtrainPartStd1Dict={cardVsTransAmtrainPartStd1['card1'].iloc[i]:cardVsTransAmtrainPartStd1['Card1VsTransStd'].iloc[i] for i in range(cardVsTransAmtrainPartStd1.shape[0])}

# cardVsTransAmtrainPart4=trainPart.groupby('card4').agg({'TransactionAmt':{'Card4VsTrans':'mean'}})
# cardVsTransAmtrainPart4.columns = cardVsTransAmtrainPart4.columns.droplevel(level=0)
# cardVsTransAmtrainPart4=cardVsTransAmtrainPart4.reset_index()
# cardVsTransAmtrainPart4Dict={cardVsTransAmtrainPart4['card4'].iloc[i]:cardVsTransAmtrainPart4['Card4VsTrans'].iloc[i] for i in range(cardVsTransAmtrainPart4.shape[0])}

# cardVsTransAmtrainPart4Std4=trainPart.groupby('card4').agg({'TransactionAmt':{'Card4VsTransStd':'std'}})
# cardVsTransAmtrainPart4Std4.columns = cardVsTransAmtrainPart4Std4.columns.droplevel(level=0)
# cardVsTransAmtrainPart4Std4=cardVsTransAmtrainPart4Std4.reset_index()
# cardVsTransAmtrainPart4Std4Dict={cardVsTransAmtrainPart4Std4['card4'].iloc[i]:cardVsTransAmtrainPart4Std4['Card4VsTransStd'].iloc[i] for i in range(cardVsTransAmtrainPart4Std4.shape[0])}


In [None]:
# len(cardVsTransAmtrainPartDict),len(cardVsTransAmtrainPartStd1Dict),len(cardVsTransAmtrainPart4Dict),len(cardVsTransAmtrainPart4Std4Dict)

In [None]:
# trainPart['Card1VsTrans']=trainPart['card1'].map(cardVsTransAmtrainPartDict)
# trainPart['Card1VsTransStd']=trainPart['card1'].map(cardVsTransAmtrainPartStd1Dict)
# trainPart['Card4VsTrans']=trainPart['card4'].map(cardVsTransAmtrainPart4Dict)
# trainPart['Card4VsTransStd']=trainPart['card4'].map(cardVsTransAmtrainPart4Std4Dict)


In [None]:
# valPart['Card1VsTrans']=valPart['card1'].map(cardVsTransAmtrainPartDict)
# valPart['Card1VsTransStd']=valPart['card1'].map(cardVsTransAmtrainPartStd1Dict)
# valPart['Card4VsTrans']=valPart['card4'].map(cardVsTransAmtrainPart4Dict)
# valPart['Card4VsTransStd']=valPart['card4'].map(cardVsTransAmtrainPart4Std4Dict)

## For testpart local

In [None]:
# forCalFeatureData=trainPart.append(valPart)

In [None]:
# cardVsTransAmtestLocal1=forCalFeatureData.groupby('card1').agg({'TransactionAmt':{'Card1VsTrans':'mean'}})
# cardVsTransAmtestLocal1.columns = cardVsTransAmtestLocal1.columns.droplevel(level=0)
# cardVsTransAmtestLocal1=cardVsTransAmtestLocal1.reset_index()
# cardVsTransAmtestLocal1Dict={cardVsTransAmtestLocal1['card1'].iloc[i]:cardVsTransAmtestLocal1['Card1VsTrans'].iloc[i] for i in range(cardVsTransAmtestLocal1.shape[0])}

# cardVsTransAmtestLocal1Std1=forCalFeatureData.groupby('card1').agg({'TransactionAmt':{'Card1VsTransStd':'std'}})
# cardVsTransAmtestLocal1Std1.columns = cardVsTransAmtestLocal1Std1.columns.droplevel(level=0)
# cardVsTransAmtestLocal1Std1=cardVsTransAmtestLocal1Std1.reset_index()
# cardVsTransAmtestLocalStd1Dict={cardVsTransAmtestLocal1Std1['card1'].iloc[i]:cardVsTransAmtestLocal1Std1['Card1VsTransStd'].iloc[i] for i in range(cardVsTransAmtestLocal1Std1.shape[0])}

# cardVsTransAmtestLocal4=forCalFeatureData.groupby('card4').agg({'TransactionAmt':{'Card4VsTrans':'mean'}})
# cardVsTransAmtestLocal4.columns = cardVsTransAmtestLocal4.columns.droplevel(level=0)
# cardVsTransAmtestLocal4=cardVsTransAmtestLocal4.reset_index()
# cardVsTransAmtestLocal4Dict={cardVsTransAmtestLocal4['card4'].iloc[i]:cardVsTransAmtestLocal4['Card4VsTrans'].iloc[i] for i in range(cardVsTransAmtestLocal4.shape[0])}

# cardVsTransAmtestLocal4Std4=forCalFeatureData.groupby('card4').agg({'TransactionAmt':{'Card4VsTransStd':'std'}})
# cardVsTransAmtestLocal4Std4.columns = cardVsTransAmtestLocal4Std4.columns.droplevel(level=0)
# cardVsTransAmtestLocal4Std4=cardVsTransAmtestLocal4Std4.reset_index()
# cardVsTransAmtrainPart4Std4Dict={cardVsTransAmtestLocal4Std4['card4'].iloc[i]:cardVsTransAmtestLocal4Std4['Card4VsTransStd'].iloc[i] for i in range(cardVsTransAmtestLocal4Std4.shape[0])}


In [None]:
# del forCalFeatureData

In [None]:
# testPart['Card1VsTrans']=trainPart['card1'].map(cardVsTransAmtestLocal1Dict)
# testPart['Card1VsTransStd']=trainPart['card1'].map(cardVsTransAmtestLocalStd1Dict)
# testPart['Card4VsTrans']=trainPart['card4'].map(cardVsTransAmtestLocal4Dict)
# testPart['Card4VsTransStd']=trainPart['card4'].map(cardVsTransAmtrainPart4Std4Dict)

In [None]:
toUseCol=list(trainPart.columns)
toUseCol.remove('isFraud')
target='isFraud'

In [None]:
# trainPart[['card4','Card4VsTrans']].head()

In [None]:
# cardVsTransAmtrainPart4Dict

In [None]:
trainData=lgb.Dataset(trainPart[toUseCol],trainPart[target])
valData=lgb.Dataset(valPart[toUseCol],valPart[target])

In [None]:
lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
#         'metric':metrics,
        'learning_rate': 0.01,
        #'is_unbalance': 'true',  #because training data is unbalance (replaced with scale_pos_weight)
        'num_leaves': 31,  # we should let it be smaller than 2^(max_depth)
        'max_depth': -1,  # -1 means no limit
        'min_child_samples': 20,  # Minimum number of data need in a child(min_data_in_leaf)
        'max_bin': 255,  # Number of bucketed bin for feature values
        'subsample': 0.6,  # Subsample ratio of the training instance.
        'subsample_freq': 0,  # frequence of subsample, <=0 means no enable
        'colsample_bytree': 0.3,  # Subsample ratio of columns when constructing each tree.
        'min_child_weight': 5,  # Minimum sum of instance weight(hessian) needed in a child(leaf)
        'subsample_for_bin': 200000,  # Number of samples for constructing bin
        'min_split_gain': 0,  # lambda_l1, lambda_l2 and min_gain_to_split to regularization
        'reg_alpha': 0,  # L1 regularization term on weights
        'reg_lambda': 0,  # L2 regularization term on weights
        'nthread': 4,
        'verbose': 0,
        'metric':'auc'
    }

num_boost_round=8000
early_stopping_rounds=10


In [None]:
model3 = lgb.train(lgb_params, 
                     trainData, 
                     valid_sets=[trainData, valData], 
                   valid_names=['train','valid'],
#                      evals_result=evals_results, 
                     num_boost_round=num_boost_round,
                     early_stopping_rounds=early_stopping_rounds,
                     verbose_eval=50)

In [None]:
kakDict={k:j for k,j in zip(model3.feature_importance(),model3.feature_name())}

In [None]:
# kakDict is keyed by importance, so features that share a score overwrite one
# another there. Rank the (importance, name) pairs directly to get a true top 20.
[name for _, name in sorted(zip(model3.feature_importance(), model3.feature_name()), reverse=True)][:20]

In [None]:
kakDict

In [None]:
from sklearn import ensemble,metrics

In [None]:
predTrain3=model3.predict(trainPart[toUseCol])
scoreOftrain3=metrics.roc_auc_score(trainPart[target],predTrain3)
valPred3=model3.predict(valPart[toUseCol])
scoreOfVal3=metrics.roc_auc_score(valPart[target],valPred3)
testPred3=model3.predict(testPart[toUseCol])
scoreOfTest3=metrics.roc_auc_score(testPart[target],testPred3)
print('ROC AUC for train {} and for validation {} for test {}'.format(scoreOftrain3,scoreOfVal3,scoreOfTest3))

In [None]:
#1st ROC AUC for train 0.9517255925054339 and for validation 0.9114527600700442 for test 0.9018045350670539
#2nd ROC AUC for train 0.9554808315356411 and for validation 0.9136923607565761 for test 0.9023918756643375
#3rd ROC AUC for train 0.9540256532402678 and for validation 0.9124955852919424 for test 0.9021716726534883
#4th ROC AUC for train 0.9519575151753169 and for validation 0.9111383915618384 for test 0.9025403905942307
#5th ROC AUC for train 0.974518437878972 and for validation 0.922009759846402 for test 0.9049750781614286
#6th ROC AUC for train 0.9730115102991304 and for validation 0.9222004317665984 for test 0.9040898320555035
#7th ROC AUC for train 0.9763890969670054 and for validation 0.923294301337268 for test 0.905121158020004
#8th ROC AUC for train 0.9829982642003638 and for validation 0.92727537802398 for test 0.896619898435757
#9th ROC AUC for train 0.9851442813660034 and for validation 0.9268007129604092 for test 0.9041793505798761

In [None]:
def deviceInfoDef(brRec):
    """Collapse a raw ``DeviceInfo`` string into a coarse vendor/OS label.

    Parameters
    ----------
    brRec : str or NaN
        Raw device description.

    Returns
    -------
    str
        'Other' for missing values and for strings without a known marker,
        otherwise the label of the first marker found (checks run in order).

    Further vendors (Lenovo, Redmi, HTC, Android, Blade, Nokia, ASUS) were
    tried in earlier experiments and are intentionally not distinguished.
    """
    if str(brRec)=='nan':
        return 'Other'
    else:
        if ('SAMSUNG' in brRec) or ('SM' in brRec) or ('GT' in brRec):
            return 'SAMSUNG Device'
        elif 'iOS' in brRec:
            return 'iOS Device'
        elif 'Windows' in brRec:
            return 'Windows Device'
        elif 'LG' in brRec:
            return 'LG Device'
        elif 'HUAWEI' in brRec:
            return 'HUAWEI Device'
        elif 'MacOS' in brRec:
            return 'MacOS Device'
        elif ('Moto' in brRec) or ('moto' in brRec):
            return 'Moto Device'
        elif ('Nexus' in brRec) or ('Pixel' in brRec):
            # Previously returned 'MacOS Device' (copy-paste slip), which merged
            # Nexus/Pixel handsets into the Mac category.
            return 'Google Device'
        else:
            return 'Other'

In [None]:
combinedData['CountryDomain']=combinedData['P_emaildomain'].apply(lambda x: defGetCountryFromDomain(x))
combinedData['R_emaildomain']=combinedData['R_emaildomain'].fillna('Other')
combinedData['P_emaildomain']=combinedData['P_emaildomain'].fillna('Other')
combinedData['DeviceInfo']=combinedData['DeviceInfo'].apply(lambda x: deviceInfoDef(x))
combinedData['id_33']=combinedData['id_33'].apply(lambda x: screenReso(x))
combinedData['id_31']=combinedData['id_31'].apply(lambda x: browserDef(x))
combinedData['id_30']=combinedData['id_30'].apply(lambda x: deviceDef(x))

In [None]:
nP=np.percentile(combinedData['TransactionAmt'],99)
combinedData['TransactionAmt']=combinedData['TransactionAmt'].apply(lambda x: nP if x >= nP else x)
# sns.distplot(combinedData['TransactionAmt'])

In [None]:
combinedData.head()

## For C columns

In [None]:
# Standardise the C1..C14 columns and keep their first two principal components.
groupC=['C'+str(i) for i in range(1,15)]
from sklearn import decomposition,preprocessing
pp1=preprocessing.StandardScaler()
pp1.fit(combinedData[groupC])
newGroupCdata=pp1.transform(combinedData[groupC])
# The PCA object must be created before it is fitted; the original order
# (fit first, construct second) raises NameError on a fresh kernel.
pcaGroupC=decomposition.PCA()
pcaGroupC.fit(newGroupCdata)
# Share of variance captured by the two retained components.
sum(pcaGroupC.explained_variance_ratio_[:2])
pcaCColumns=['pcaC1','pcaC2']
pcaForGroupC=pd.DataFrame(pcaGroupC.fit_transform(newGroupCdata)[::,:2],columns=pcaCColumns)

## For D columns

In [None]:
groupD=['D'+str(i) for i in range(1,16)]

In [None]:
for i in groupD:
    print (len(pd.unique(combinedData[i])))

In [None]:
for i in groupD:
    combinedData[i]=combinedData[i].fillna(combinedData[i].mean())

In [None]:
pp2=preprocessing.StandardScaler()
pp2.fit(combinedData[groupD])
newGroupDdata=pp2.transform(combinedData[groupD])

In [None]:
pcaGroupD=decomposition.PCA()
pcaGroupD.fit(newGroupDdata)

In [None]:
sum(pcaGroupD.explained_variance_ratio_[:9])

In [None]:
pcaDColumns=['pcaD'+str(i) for i in range(1,10)]
pcaForGroupD=pd.DataFrame(pcaGroupD.fit_transform(newGroupDdata)[::,:9],columns=pcaDColumns)

## For Columns V

In [None]:
groupV=['V'+str(i) for i in range(1,340)]

In [None]:
for i in groupV:
    print (len(pd.unique(combinedData[i])))

In [None]:
for i in groupV:
    combinedData[i]=combinedData[i].fillna(combinedData[i].mean())

In [None]:
pp3=preprocessing.StandardScaler()
pp3.fit(combinedData[groupV])
newGroupVdata=pp3.transform(combinedData[groupV])

In [None]:
pcaGroupV=decomposition.PCA()
pcaGroupV.fit(newGroupVdata)

In [None]:
sum(pcaGroupV.explained_variance_ratio_[:50])

In [None]:
# First 50 principal components of the standardised V columns.
pcaVColumns=['pcaV'+str(i) for i in range(1,51)]
# Use the PCA object belonging to the V group; going through pcaGroupD here
# silently refitted (and overwrote) the D-group PCA on the V data.
pcaForGroupV=pd.DataFrame(pcaGroupV.fit_transform(newGroupVdata)[::,:50],columns=pcaVColumns)

In [None]:
combinedData.head()

## Group Columns for ID

In [None]:
groupID=['id_'+str(i).zfill(2) for i in range(1,12)]

In [None]:
for i in groupID:
    print (len(pd.unique(combinedData[i])))

In [None]:
for i in groupID:
    combinedData[i]=combinedData[i].fillna(combinedData[i].mean())

In [None]:
pp4=preprocessing.StandardScaler()
pp4.fit(combinedData[groupID])
newGroupIDdata=pp4.transform(combinedData[groupID])

In [None]:
pcaGroupID=decomposition.PCA()
pcaGroupID.fit(newGroupIDdata)

In [None]:
sum(pcaGroupID.explained_variance_ratio_[:8])

In [None]:
pcaIDColumns=['pcaID'+str(i) for i in range(1,9)]
pcaForGroupID=pd.DataFrame(pcaGroupID.fit_transform(newGroupIDdata)[::,:8],columns=pcaIDColumns)

## Process of the data

In [None]:

# Recode several id_* columns element-wise with the numDef* helper functions.
# NOTE(review): numDef, numDef13, numDef14, numDef18, numDef21, numDef22,
# numDef24, numDef25 and numDef32 are not defined in any cell visible in this
# review; on a fresh kernel this cell raises NameError unless they are defined
# elsewhere — confirm and restore their definitions above this cell.
combinedData['id_26']=combinedData['id_26'].apply(lambda x: numDef(x))
combinedData['id_25']=combinedData['id_25'].apply(lambda x: numDef25(x))
combinedData['id_21']=combinedData['id_21'].apply(lambda x: numDef21(x))
# NOTE(review): id_20 and id_19 reuse numDef21 rather than a helper of their
# own — confirm this is intentional.
combinedData['id_20']=combinedData['id_20'].apply(lambda x: numDef21(x))
combinedData['id_19']=combinedData['id_19'].apply(lambda x: numDef21(x))
combinedData['id_17']=combinedData['id_17'].apply(lambda x: numDef(x))
combinedData['id_13']=combinedData['id_13'].apply(lambda x: numDef13(x))

combinedData['id_14']=combinedData['id_14'].apply(lambda x: numDef14(x))
combinedData['id_22']=combinedData['id_22'].apply(lambda x: numDef22(x))
combinedData['id_24']=combinedData['id_24'].apply(lambda x: numDef24(x))
combinedData['id_18']=combinedData['id_18'].apply(lambda x: numDef18(x))
combinedData['id_32']=combinedData['id_32'].apply(lambda x: numDef32(x))


# Missing card4 values get their own 'Other' category.
combinedData['card4']=combinedData['card4'].fillna('Other')


In [None]:
# combinedData['id_33']=combinedData['id_33'].apply(lambda x: screenReso(x))
# combinedData['id_31']=combinedData['id_31'].apply(lambda x: browserDef(x))
# combinedData['id_30']=combinedData['id_30'].apply(lambda x: deviceDef(x))
# combinedData['id_26']=combinedData['id_26'].apply(lambda x: numDef(x))
# combinedData['id_25']=combinedData['id_25'].apply(lambda x: numDef25(x))
# combinedData['id_21']=combinedData['id_21'].apply(lambda x: numDef21(x))
# combinedData['id_20']=combinedData['id_20'].apply(lambda x: numDef21(x))
# combinedData['id_19']=combinedData['id_19'].apply(lambda x: numDef21(x))
# combinedData['id_17']=combinedData['id_17'].apply(lambda x: numDef(x))
# combinedData['id_13']=combinedData['id_13'].apply(lambda x: numDef13(x))
# combinedData['DeviceInfo']=combinedData['DeviceInfo'].apply(lambda x: deviceInfoDef(x))
# combinedData['id_14']=combinedData['id_14'].apply(lambda x: numDef14(x))
# combinedData['id_22']=combinedData['id_22'].apply(lambda x: numDef22(x))
# combinedData['id_24']=combinedData['id_24'].apply(lambda x: numDef24(x))
# combinedData['id_18']=combinedData['id_18'].apply(lambda x: numDef18(x))
# combinedData['id_32']=combinedData['id_32'].apply(lambda x: numDef32(x))

# combinedData['CountryDomain']=combinedData['P_emaildomain'].apply(lambda x: defGetCountryFromDomain(x))
# combinedData['R_emaildomain']=combinedData['R_emaildomain'].fillna('Other')
# combinedData['P_emaildomain']=combinedData['P_emaildomain'].fillna('Other')
# combinedData['card4']=combinedData['card4'].fillna('Other')
# combinedData['card6']=combinedData['card6'].apply(lambda x: defCard6(x))

In [None]:
categoricalCombinedData=['DeviceType','DeviceInfo']+['id_'+str(i) for i in range(12,39)]+\
                        ['M'+str(i) for i in range(1,10)]+\
                        ['ProductCD','P_emaildomain', 'R_emaildomain','CountryDomain','card4', 'card6',]#+\
                        #+['addr1','card1', 'card2', 'card3','card5', 'addr2']
    
leaveCols=['TransactionID', 'isFraud', 'TransactionDT']+\
            ['addr1','card1', 'card2', 'card3','card5', 'addr2']+\
            [ 'TransactionAmt',]
# tonormalize=['addr1','card1', 'card2', 'card3','card5', 'addr2']

In [None]:
for j in categoricalCombinedData:
    print ('>>>>>>>>>>>',j.zfill(12),'>>>>',len(pd.unique(combinedData[j])))

In [None]:
# towork='id_18'
# fig, ax = plt.subplots(figsize=(15,5))
# sns.countplot(trainIdentity[towork].fillna(0))
# # sns.countplot(trainIdentity['id_34'].fillna(0),ax=ax)

# trainIdentity['id_18']=trainIdentity['id_18'].apply(lambda x: numDef18(x))
# len(pd.unique(trainIdentity[towork]))
# list(pd.unique(trainIdentity[towork]))

In [None]:
# One-hot encode the categorical columns, dropping the first level of each.
# drop_first expects a boolean; the string 'True' only worked by being truthy.
cateDataTrainCombined=pd.get_dummies(combinedData[categoricalCombinedData],prefix_sep='_',drop_first=True)

In [None]:
toleaveTrainCombined=combinedData[leaveCols].fillna(-9999)
cateDataTrainCombined=cateDataTrainCombined.fillna('Other')

In [None]:
proProcess1=preprocessing.MinMaxScaler()
toleaveTrainCombined['addr1']=proProcess1.fit_transform(toleaveTrainCombined[['addr1']])

In [None]:
proProcess2=preprocessing.MinMaxScaler()
toleaveTrainCombined['card1']=proProcess2.fit_transform(toleaveTrainCombined[['card1']])

In [None]:
proProcess3=preprocessing.MinMaxScaler()
toleaveTrainCombined['card2']=proProcess3.fit_transform(toleaveTrainCombined[['card2']])

In [None]:
proProcess4=preprocessing.MinMaxScaler()
toleaveTrainCombined['card3']=proProcess4.fit_transform(toleaveTrainCombined[['card3']])

In [None]:
proProcess5=preprocessing.MinMaxScaler()
toleaveTrainCombined['card5']=proProcess5.fit_transform(toleaveTrainCombined[['card5']])

In [None]:
proProcess6=preprocessing.MinMaxScaler()
toleaveTrainCombined['addr2']=proProcess6.fit_transform(toleaveTrainCombined[['addr2']])

In [None]:
# toleaveTrainCombined[['addr1','card1', 'card2', 'card3','card5', 'addr2']]

In [None]:
# towork='addr2'
# # fig, ax = plt.subplots(figsize=(15,5))
# # sns.countplot(trainTranscation[towork].fillna(0))
# # sns.countplot(trainIdentity['id_34'].fillna(0),ax=ax)

# trainTranscation['card6']=trainTranscation['card6'].apply(lambda x: defCard6(x))
# len(pd.unique(trainTranscation[towork]))
# list(pd.unique(trainTranscation[towork]))

In [None]:
processedTrainTransaction=pd.concat([toleaveTrainCombined,cateDataTrainCombined,\
                                     pcaForGroupID,pcaForGroupV,pcaForGroupD,pcaForGroupC,\
                                    combinedData[groupID],combinedData[groupV],\
                                     combinedData[groupC],combinedData[groupD]],axis=1)
processedTrainTransaction.shape

In [None]:
tempTimeSer=((processedTrainTransaction['TransactionDT']-86400)).map(int)
processedTrainTransaction['transSec']=tempTimeSer%60
processedTrainTransaction['transMin']=(tempTimeSer/60).map(int)%60
processedTrainTransaction['transHour']=(tempTimeSer/3600).map(int)%24
processedTrainTransaction['transMonth']=((tempTimeSer/86400)//30)+1
processedTrainTransaction['transDay']=((tempTimeSer/86400)%30).map(int)+1

In [None]:
del processedTrainTransaction['TransactionDT']
del processedTrainTransaction['TransactionID']

In [None]:
processedTrainTransaction.tail()

In [None]:
# sns.countplot(processedTrainTransaction['isFraud'])
# # sns.countplot(processedTrainTransaction['transMonth'],hue=processedTrainTransaction['isFraud'])

# # sns.countplot(processedTrainTransaction['transMonth'])
# sns.countplot(processedTrainTransaction['transMonth'],hue=processedTrainTransaction['isFraud'])

# # sns.countplot(processedTrainTransaction['transDay'])
# sns.countplot(processedTrainTransaction['transDay'],hue=processedTrainTransaction['isFraud'])

# # sns.countplot(processedTrainTransaction['transHour'])
# sns.countplot(processedTrainTransaction['transHour'],hue=processedTrainTransaction['isFraud'])

In [None]:
# sns.countplot(processedTrainTransaction['transMin'])

In [None]:
# sns.countplot(processedTrainTransaction['transSec'])

In [None]:
target=['isFraud']
toUseCol=list(processedTrainTransaction.columns)
toUseCol.remove(target[0])

In [None]:
from collections import Counter 
Counter(toUseCol)

len(toUseCol),len(set(toUseCol))

In [None]:
trainPart=processedTrainTransaction[processedTrainTransaction['transMonth']<=5]
# valPart=processedTrainTransaction[processedTrainTransaction['transMonth']==5]
testPart=processedTrainTransaction[processedTrainTransaction['transMonth']>5]
trainPart.shape,testPart.shape#,valPart.shape

In [None]:
from sklearn import ensemble,metrics
import xgboost as xgb

In [None]:
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
X = trainPart[toUseCol]
y = trainPart[target]
tscv = TimeSeriesSplit(n_splits=5)
tscv

In [None]:
import lightgbm as lgb

In [None]:
# Time-ordered cross-validation: train one LightGBM booster per split and keep
# both the fitted model and its ROC AUC on the held-out part of the split.

# The training configuration is identical for every fold, so it is defined
# once instead of being rebuilt inside the loop.
lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'learning_rate': 0.01,
        'num_leaves': 31,  # we should let it be smaller than 2^(max_depth)
        'max_depth': -1,  # -1 means no limit
        'min_child_samples': 20,  # Minimum number of data need in a child(min_data_in_leaf)
        'max_bin': 255,  # Number of bucketed bin for feature values
        'subsample': 0.6,  # Subsample ratio of the training instance.
        'subsample_freq': 0,  # frequency of subsample, <=0 means not enabled
        'colsample_bytree': 0.3,  # Subsample ratio of columns when constructing each tree.
        'min_child_weight': 5,  # Minimum sum of instance weight(hessian) needed in a child(leaf)
        'subsample_for_bin': 200000,  # Number of samples for constructing bin
        'min_split_gain': 0,  # lambda_l1, lambda_l2 and min_gain_to_split to regularization
        'reg_alpha': 0,  # L1 regularization term on weights
        'reg_lambda': 0,  # L2 regularization term on weights
        'nthread': 4,
        'verbose': 0,
        'metric':'auc'
    }

num_boost_round=1000
early_stopping_rounds=10

clfList=[]    # fitted booster of each fold
scoreList=[]  # held-out ROC AUC of each fold
for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index.shape, "TEST:", test_index.shape)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    trainData=lgb.Dataset(X_train[toUseCol],y_train[target])
    valData=lgb.Dataset(X_test[toUseCol],y_test[target])

    model3 = lgb.train(lgb_params,
                       trainData,
                       valid_sets=[trainData, valData],
                       valid_names=['train','valid'],
                       num_boost_round=num_boost_round,
                       early_stopping_rounds=early_stopping_rounds,
                       verbose_eval=50)

    # Despite its name, this holds the predictions for the held-out fold.
    y_pred_train = model3.predict(X_test)
    score = metrics.roc_auc_score(y_test, y_pred_train)
    clfList.append(model3)
    # Store the fold score; the original appended the prediction array here.
    scoreList.append(score)
    print(f'ROC AUC {score}')

In [None]:
testScoreList=[]
for mo in clfList:
    scr=mo.predict(testPart[toUseCol])
    testScoreList.append(scr)
    print (metrics.roc_auc_score( testPart[target],scr))

In [None]:
scoreTestNewMean=pd.DataFrame(testScoreList).transpose().mean(axis=1)

In [None]:
metrics.roc_auc_score( testPart[target],scoreTestNewMean)

In [None]:
testPart[toUseCol].head()

In [None]:
# lgb_params = {
#         'boosting_type': 'gbdt',
#         'objective': 'binary',
# #         'metric':metrics,
#         'learning_rate': 0.01,
#         #'is_unbalance': 'true',  #because training data is unbalance (replaced with scale_pos_weight)
#         'num_leaves': 31,  # we should let it be smaller than 2^(max_depth)
#         'max_depth': -1,  # -1 means no limit
#         'min_child_samples': 20,  # Minimum number of data need in a child(min_data_in_leaf)
#         'max_bin': 255,  # Number of bucketed bin for feature values
#         'subsample': 0.6,  # Subsample ratio of the training instance.
#         'subsample_freq': 0,  # frequence of subsample, <=0 means no enable
#         'colsample_bytree': 0.3,  # Subsample ratio of columns when constructing each tree.
#         'min_child_weight': 5,  # Minimum sum of instance weight(hessian) needed in a child(leaf)
#         'subsample_for_bin': 200000,  # Number of samples for constructing bin
#         'min_split_gain': 0,  # lambda_l1, lambda_l2 and min_gain_to_split to regularization
#         'reg_alpha': 0,  # L1 regularization term on weights
#         'reg_lambda': 0,  # L2 regularization term on weights
#         'nthread': 4,
#         'verbose': 0,
#         'metric':'auc'
#     }

# num_boost_round=3000
# early_stopping_rounds=10

# model3 = lgb.train(lgb_params, 
#                      trainData, 
#                      valid_sets=[trainData, valData], 
#                    valid_names=['train','valid'],
# #                      evals_result=evals_results, 
#                      num_boost_round=num_boost_round,
#                      early_stopping_rounds=early_stopping_rounds,
#                      verbose_eval=10)

In [None]:
# Score a single booster on the train / validation / local-test partitions.
# NOTE(review): `bst` is not assigned in any cell visible in this review; the
# only training cell nearby is commented out and would have produced `model3`.
# On a fresh kernel this cell raises NameError — confirm which model is meant.
predTrain3=bst.predict(trainPart[toUseCol])
scoreOftrain3=metrics.roc_auc_score(trainPart[target],predTrain3)
# NOTE(review): in this section the re-split of `valPart` is commented out, so
# `valPart` still refers to the frame built in the earlier LightGBM section —
# verify that it carries the columns listed in `toUseCol`.
valPred3=bst.predict(valPart[toUseCol])
scoreOfVal3=metrics.roc_auc_score(valPart[target],valPred3)
testPred3=bst.predict(testPart[toUseCol])
scoreOfTest3=metrics.roc_auc_score(testPart[target],testPred3)
print('ROC AUC for train {} and for validation {} for test {}'.format(scoreOftrain3,scoreOfVal3,scoreOfTest3))

In [None]:
# ROC AUC for train 0.9543954336563735 and for validation 0.9114576197806739 for test 0.9040375652381945