In [1]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn import ensemble,metrics,cross_decomposition,linear_model,model_selection
import os
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [2]:
trainIdentity=pd.read_csv('train_identity.csv')
trainTranscation=pd.read_csv('train_transaction.csv')
trainIdentity.shape,trainTranscation.shape

((144233, 41), (590540, 394))

In [3]:
combinedData=pd.merge(left=trainTranscation,right=trainIdentity,on='TransactionID',how='left')

In [4]:
def screenReso(xRes):
    if str(xRes) == 'nan':
        return ('No info')
    else:
        widVal=int(xRes.split('x')[0])
        if widVal <=850:
            return ('Small Screen')
        elif widVal <=2050:
            return ('Med Screen')
        elif widVal <=2250:
            return ('2K Screen')
        elif widVal > 2250:
            return ('4K Screen')

In [5]:
def browserDef(brRec):
    if str(brRec)=='nan':
        return 'other'
    else:
        if 'samsung' in brRec:
            return 'Samsung Browser'
        elif 'mobile safari' in brRec:
            return 'Mobile Safari'
        elif 'chrome' in brRec:
            return 'Chrome Browser'
        elif 'edge' in brRec:
            return 'Edge Browser'
        elif 'ie' in brRec:
            return 'IE Browser'
        elif 'firefox' in brRec:
            return 'Firefox Browser'
        elif 'opera' in brRec:
            return 'Opera Browser'
        elif ('Android' in brRec) or ('android' in brRec):
            return 'Android Browser'
        elif 'Mozilla' in brRec:
            return 'Mozilla Browser'
        elif 'safari' in brRec:
            return 'Safari  Browser'
        elif 'google' in brRec:
            return 'Google Browser'
        else:
            return brRec

In [6]:
def deviceDef(brRec):
    if str(brRec)=='nan':
        return 'other'
    else:
        if 'Android' in brRec:
            return 'Android Device'
        elif 'iOS' in brRec:
            return 'iOS Device'
        elif 'Windows' in brRec:
            return 'Windows Device'
        elif 'Mac' in brRec:
            return 'Mac OS Device'
        else:
            return brRec

In [7]:
def deviceInfoDef(brRec):
    if str(brRec)=='nan':
        return 'Other'
    else:
        if ('SAMSUNG' in brRec) or ('SM' in brRec) or ('GT' in brRec):
            return 'SAMSUNG Device'
        elif 'iOS' in brRec:
            return 'iOS Device'
        elif 'Windows' in brRec:
            return 'Windows Device'
        elif 'LG' in brRec:
            return 'LG Device'
        elif 'HUAWEI' in brRec:
            return 'HUAWEI Device'
        elif 'MacOS' in brRec:
            return 'MacOS Device'
#         elif 'Lenovo' in brRec:
#             return 'Lenovo Device'
#         elif 'Redmi' in brRec:
#             return 'Redmi Device'
        elif ('Moto' in brRec) or ('moto' in brRec):
            return 'Moto Device'
        elif ('Nexus' in brRec) or ('Pixel' in brRec):
            return 'MacOS Device'
#         elif 'HTC' in brRec:
#             return 'HTC Device'
#         elif 'Android' in brRec:
#             return 'Android Device'
#         elif 'Blade' in brRec:
#             return 'Blade Device'
#         elif ('Nokia' in brRec) or ('NOKIA' in brRec):
#             return 'Nokia Device'
#         elif ('ASUS' in brRec) or ('Asus' in brRec):
#             return 'ASUS Device'
        else:
            return 'Other'

In [8]:
def defGetCountryFromDomain(brRec):
    if str(brRec)=='nan':
        return 'Other'
    else:
        if ('.mx' in brRec):
            return 'Mexico email'
        elif '.jp' in brRec:
            return 'Japan email'
        elif '.uk' in brRec:
            return 'UK email'
        elif '.de' in brRec:
            return 'Germany email'
        elif '.es' in brRec:
            return 'Spain email'
        elif '.fr' in brRec:
            return 'France email'
        elif '.com' in brRec:
            return 'Global email'
        elif '.net' in brRec:
            return 'Net email'
        else:
            return 'Other'

In [9]:
combinedData['CountryDomain']=combinedData['P_emaildomain'].apply(lambda x: defGetCountryFromDomain(x))
combinedData['R_emaildomain']=combinedData['R_emaildomain'].fillna('Other')
combinedData['P_emaildomain']=combinedData['P_emaildomain'].fillna('Other')
combinedData['DeviceInfo']=combinedData['DeviceInfo'].apply(lambda x: deviceInfoDef(x))
combinedData['id_33']=combinedData['id_33'].apply(lambda x: screenReso(x))
combinedData['id_31']=combinedData['id_31'].apply(lambda x: browserDef(x))
combinedData['id_30']=combinedData['id_30'].apply(lambda x: deviceDef(x))

In [10]:
nP=np.percentile(combinedData['TransactionAmt'],99)
combinedData['TransactionAmt']=combinedData['TransactionAmt'].apply(lambda x: nP if x >= nP else x)
# sns.distplot(combinedData['TransactionAmt'])

In [11]:
combinedData.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo,CountryDomain
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,No info,,,,,,,Other,Other
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,No info,,,,,,,Other,Global email
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,No info,,,,,,,Other,Global email
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,No info,,,,,,,Other,Global email
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,32.0,2K Screen,match_status:2,T,F,T,T,mobile,SAMSUNG Device,Global email


## For C columns

In [12]:
groupC=['C'+str(i) for i in range(1,15)]

In [13]:
from sklearn import decomposition,preprocessing

In [14]:
pp1=preprocessing.StandardScaler()
pp1.fit(combinedData[groupC])

StandardScaler(copy=True, with_mean=True, with_std=True)

In [15]:
newGroupCdata=pp1.transform(combinedData[groupC])

In [16]:
pcaGroupC=decomposition.PCA()
pcaGroupC.fit(newGroupCdata)

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [17]:
sum(pcaGroupC.explained_variance_ratio_[:2])

0.898930849837729

In [18]:
pcaCColumns=['pcaC1','pcaC2']
pcaForGroupC=pd.DataFrame(pcaGroupC.fit_transform(newGroupCdata)[::,:2],columns=pcaCColumns)

## For D columns

In [19]:
groupD=['D'+str(i) for i in range(1,16)]

In [20]:
for i in groupD:
    print (len(pd.unique(combinedData[i])))

642
642
650
809
689
830
598
12354
25
819
677
636
578
803
860


In [21]:
for i in groupD:
    combinedData[i]=combinedData[i].fillna(combinedData[i].mean())

In [22]:
pp2=preprocessing.StandardScaler()
pp2.fit(combinedData[groupD])
newGroupDdata=pp2.transform(combinedData[groupD])

In [23]:
pcaGroupD=decomposition.PCA()
pcaGroupD.fit(newGroupDdata)

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [24]:
sum(pcaGroupD.explained_variance_ratio_[:9])

0.8652006206715891

In [25]:
pcaDColumns=['pcaD'+str(i) for i in range(1,10)]
pcaForGroupD=pd.DataFrame(pcaGroupD.fit_transform(newGroupDdata)[::,:9],columns=pcaDColumns)

## For Columns V

In [26]:
groupV=['V'+str(i) for i in range(1,340)]

In [27]:
for i in groupV:
    print (len(pd.unique(combinedData[i])))

3
10
11
8
8
11
11
10
10
6
7
5
8
3
9
16
17
17
9
16
7
10
15
15
8
14
5
5
7
9
9
16
8
14
5
7
56
56
17
19
3
10
10
50
50
8
10
7
7
7
8
10
7
8
19
53
8
12
18
18
8
12
9
9
3
9
10
4
7
8
8
12
9
10
6
8
32
33
9
21
21
9
9
9
9
32
32
3
4
7
8
9
9
4
882
1411
977
14
90
30
871
1286
929
17
101
57
3
9
9
9
11
11
11
8
8
8
5
5
5
5
5
5
15
15
15
10300
24415
14508
1969
12333
4445
6561
9950
8179
3725
4853
4253
24
35
35
7
11
871
64
261
26
28
22
22
1997
57
40
20
20
26
26
26
26
6664
9622
80
186
107
1979
2548
988
874
966
21
50
63
33
9
10
16
50
863
1237
922
85
26
85
43
18
33
40
216
32
32
44
23
46
39
9
18
40
16
23
47
47
57
10971
14952
12859
2241
1781
3247
2553
3452
2837
7625
8869
8318
2283
2748
2533
305
402
380
27
78
77
18
80
36
82
51
56
92
67
295
339
334
123
25
47
41
25
25
7
6
22
44
24
59
47
20
24
24
20
20
26
67
46
47
49
50
68
69
10
42
22
10423
13359
11758
2179
3617
2757
152
2341
2788
2508
7178
8316
7777
2264
2541
2399
882
976
24
34
63
14
97
10
33
12
14
59
220
174
871
1287
929
95
14
95
51
13
15
18
22
18
3
16211
37368
2306

In [28]:
for i in groupV:
    combinedData[i]=combinedData[i].fillna(combinedData[i].mean())

In [29]:
pp3=preprocessing.StandardScaler()
pp3.fit(combinedData[groupV])
newGroupVdata=pp3.transform(combinedData[groupV])

In [30]:
pcaGroupV=decomposition.PCA()
pcaGroupV.fit(newGroupVdata)

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [31]:
sum(pcaGroupV.explained_variance_ratio_[:50])

0.8755492354806109

In [32]:
pcaVColumns=['pcaV'+str(i) for i in range(1,51)]
pcaForGroupV=pd.DataFrame(pcaGroupD.fit_transform(newGroupVdata)[::,:50],columns=pcaVColumns)

In [33]:
combinedData.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo,CountryDomain
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,No info,,,,,,,Other,Other
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,No info,,,,,,,Other,Global email
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,No info,,,,,,,Other,Global email
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,No info,,,,,,,Other,Global email
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,32.0,2K Screen,match_status:2,T,F,T,T,mobile,SAMSUNG Device,Global email


## Group Columns for ID

In [34]:
groupID=['id_'+str(i).zfill(2) for i in range(1,12)]

In [35]:
for i in groupID:
    print (len(pd.unique(combinedData[i])))

78
115656
25
16
94
102
85
95
47
63
366


In [36]:
for i in groupID:
    combinedData[i]=combinedData[i].fillna(combinedData[i].mean())

In [37]:
pp4=preprocessing.StandardScaler()
pp4.fit(combinedData[groupID])
newGroupIDdata=pp4.transform(combinedData[groupID])

In [38]:
pcaGroupID=decomposition.PCA()
pcaGroupID.fit(newGroupIDdata)

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [39]:
sum(pcaGroupID.explained_variance_ratio_[:8])

0.8394878565742937

In [40]:
pcaIDColumns=['pcaID'+str(i) for i in range(1,9)]
pcaForGroupID=pd.DataFrame(pcaGroupID.fit_transform(newGroupIDdata)[::,:8],columns=pcaIDColumns)

## Process of the data

In [41]:
def numDef(brRec):
    if str(brRec)=='nan':
        return 'other'
    else:
        if brRec < 120:
            return (' Less than 120')
        elif brRec < 140:
            return (' Less than 140')
        elif brRec < 160:
            return (' Less than 160')
        elif brRec < 180:
            return (' Less than 180')
        elif brRec < 200:
            return (' Less than 200')
        elif brRec > 200:
            return (' Greater than 140')
        else:
            return 'other'

In [42]:
def numDef25(brRec):
    if str(brRec)=='nan':
        return 'other'
    else:
        if brRec < 150:
            return (' Less than 150')
        elif brRec < 200:
            return (' Less than 200')
        elif brRec < 250:
            return (' Less than 250')
        elif brRec < 300:
            return (' Less than 300')
        elif brRec < 400:
            return (' Less than 400')
        elif brRec > 400:
            return (' Greater than 400')
        else:
            return 'other'

In [43]:
def numDef21(brRec):
    if str(brRec)=='nan':
        return 'other'
    else:
        if brRec < 150:
            return (' Less than 150')
        elif brRec < 200:
            return (' Less than 200')
        elif brRec < 300:
            return (' Less than 300')
        elif brRec < 400:
            return (' Less than 400')
        elif brRec < 600:
            return (' Less than 600')
        elif brRec > 600:
            return (' Greater than 600')
        else:
            return 'other'

In [44]:
def numDef13(brRec):
    if str(brRec)=='nan':
        return 'other'
    else:
        if brRec < 20:
            return (' Less than 20')
        elif brRec < 40:
            return (' Less than 40')
        elif brRec < 60:
            return (' Less than 60')
        elif brRec < 80:
            return (' Less than 80')
        elif brRec > 80:
            return (' Greater than 80')
        else:
            return 'other'

In [45]:
def numDef14(brRec):
    if str(brRec)=='nan':
        return 'other'
    else:
        if brRec < -470:
            return (' Less than -470')
        elif brRec < -290:
            return (' Less than -290')
        elif brRec < 1:
            return (' Less than 1')
        elif brRec < 60:
            return (' Less than 60')
        elif brRec > 60:
            return (' Greater than 60')
        else:
            return 'other'

In [46]:
def numDef22(brRec):
    if str(brRec)=='nan':
        return 'other'
    else:
        if brRec < 10:
            return ('Less than 10')
        else:
            return ('Greater than 10')

In [47]:
def numDef24(brRec):
    if str(brRec)=='nan':
        return 'other'
    else:
        if brRec < 12:
            return ('Less than 12')
        else:
            return ('Greater than 12')

In [48]:
def numDef18(brRec):
    if str(brRec)=='nan':
        return 'other'
    else:
        if brRec < 14:
            return ('Less than 14')
        else:
            return ('Greater than 14')

In [49]:
def numDef32(brRec):
    if str(brRec)=='nan':
        return 'other'
    else:
        return (int(brRec))

In [50]:
def defCard6(brRec):
    if str(brRec)=='nan':
        return 'Other'
    else:
        if ('debit or credit' in brRec):
            return 'debit'
        else:
            return (brRec)

In [51]:

combinedData['id_26']=combinedData['id_26'].apply(lambda x: numDef(x))
combinedData['id_25']=combinedData['id_25'].apply(lambda x: numDef25(x))
combinedData['id_21']=combinedData['id_21'].apply(lambda x: numDef21(x))
combinedData['id_20']=combinedData['id_20'].apply(lambda x: numDef21(x))
combinedData['id_19']=combinedData['id_19'].apply(lambda x: numDef21(x))
combinedData['id_17']=combinedData['id_17'].apply(lambda x: numDef(x))
combinedData['id_13']=combinedData['id_13'].apply(lambda x: numDef13(x))

combinedData['id_14']=combinedData['id_14'].apply(lambda x: numDef14(x))
combinedData['id_22']=combinedData['id_22'].apply(lambda x: numDef22(x))
combinedData['id_24']=combinedData['id_24'].apply(lambda x: numDef24(x))
combinedData['id_18']=combinedData['id_18'].apply(lambda x: numDef18(x))
combinedData['id_32']=combinedData['id_32'].apply(lambda x: numDef32(x))


combinedData['card4']=combinedData['card4'].fillna('Other')
combinedData['card6']=combinedData['card6'].apply(lambda x: defCard6(x))

In [52]:
# combinedData['id_33']=combinedData['id_33'].apply(lambda x: screenReso(x))
# combinedData['id_31']=combinedData['id_31'].apply(lambda x: browserDef(x))
# combinedData['id_30']=combinedData['id_30'].apply(lambda x: deviceDef(x))
# combinedData['id_26']=combinedData['id_26'].apply(lambda x: numDef(x))
# combinedData['id_25']=combinedData['id_25'].apply(lambda x: numDef25(x))
# combinedData['id_21']=combinedData['id_21'].apply(lambda x: numDef21(x))
# combinedData['id_20']=combinedData['id_20'].apply(lambda x: numDef21(x))
# combinedData['id_19']=combinedData['id_19'].apply(lambda x: numDef21(x))
# combinedData['id_17']=combinedData['id_17'].apply(lambda x: numDef(x))
# combinedData['id_13']=combinedData['id_13'].apply(lambda x: numDef13(x))
# combinedData['DeviceInfo']=combinedData['DeviceInfo'].apply(lambda x: deviceInfoDef(x))
# combinedData['id_14']=combinedData['id_14'].apply(lambda x: numDef14(x))
# combinedData['id_22']=combinedData['id_22'].apply(lambda x: numDef22(x))
# combinedData['id_24']=combinedData['id_24'].apply(lambda x: numDef24(x))
# combinedData['id_18']=combinedData['id_18'].apply(lambda x: numDef18(x))
# combinedData['id_32']=combinedData['id_32'].apply(lambda x: numDef32(x))

# combinedData['CountryDomain']=combinedData['P_emaildomain'].apply(lambda x: defGetCountryFromDomain(x))
# combinedData['R_emaildomain']=combinedData['R_emaildomain'].fillna('Other')
# combinedData['P_emaildomain']=combinedData['P_emaildomain'].fillna('Other')
# combinedData['card4']=combinedData['card4'].fillna('Other')
# combinedData['card6']=combinedData['card6'].apply(lambda x: defCard6(x))

In [53]:
pcaIDColumns,pcaVColumns,pcaCColumns,pcaDColumns

(['pcaID1',
  'pcaID2',
  'pcaID3',
  'pcaID4',
  'pcaID5',
  'pcaID6',
  'pcaID7',
  'pcaID8'],
 ['pcaV1',
  'pcaV2',
  'pcaV3',
  'pcaV4',
  'pcaV5',
  'pcaV6',
  'pcaV7',
  'pcaV8',
  'pcaV9',
  'pcaV10',
  'pcaV11',
  'pcaV12',
  'pcaV13',
  'pcaV14',
  'pcaV15',
  'pcaV16',
  'pcaV17',
  'pcaV18',
  'pcaV19',
  'pcaV20',
  'pcaV21',
  'pcaV22',
  'pcaV23',
  'pcaV24',
  'pcaV25',
  'pcaV26',
  'pcaV27',
  'pcaV28',
  'pcaV29',
  'pcaV30',
  'pcaV31',
  'pcaV32',
  'pcaV33',
  'pcaV34',
  'pcaV35',
  'pcaV36',
  'pcaV37',
  'pcaV38',
  'pcaV39',
  'pcaV40',
  'pcaV41',
  'pcaV42',
  'pcaV43',
  'pcaV44',
  'pcaV45',
  'pcaV46',
  'pcaV47',
  'pcaV48',
  'pcaV49',
  'pcaV50'],
 ['pcaC1', 'pcaC2'],
 ['pcaD1',
  'pcaD2',
  'pcaD3',
  'pcaD4',
  'pcaD5',
  'pcaD6',
  'pcaD7',
  'pcaD8',
  'pcaD9'])

In [54]:
categoricalCombinedData=['DeviceType','DeviceInfo']+['id_'+str(i) for i in range(12,39)]+\
                        ['M'+str(i) for i in range(1,10)]+\
                        ['ProductCD','P_emaildomain', 'R_emaildomain','CountryDomain','card4', 'card6',]#+\
                        #+['addr1','card1', 'card2', 'card3','card5', 'addr2']
    
leaveCols=['TransactionID', 'isFraud', 'TransactionDT']+\
            ['addr1','card1', 'card2', 'card3','card5', 'addr2']+\
            [ 'TransactionAmt',]
# tonormalize=['addr1','card1', 'card2', 'card3','card5', 'addr2']

In [55]:
for j in categoricalCombinedData:
    print ('>>>>>>>>>>>',j.zfill(12),'>>>>',len(pd.unique(combinedData[j])))

>>>>>>>>>>> 00DeviceType >>>> 3
>>>>>>>>>>> 00DeviceInfo >>>> 8
>>>>>>>>>>> 0000000id_12 >>>> 3
>>>>>>>>>>> 0000000id_13 >>>> 5
>>>>>>>>>>> 0000000id_14 >>>> 5
>>>>>>>>>>> 0000000id_15 >>>> 4
>>>>>>>>>>> 0000000id_16 >>>> 3
>>>>>>>>>>> 0000000id_17 >>>> 7
>>>>>>>>>>> 0000000id_18 >>>> 3
>>>>>>>>>>> 0000000id_19 >>>> 7
>>>>>>>>>>> 0000000id_20 >>>> 7
>>>>>>>>>>> 0000000id_21 >>>> 7
>>>>>>>>>>> 0000000id_22 >>>> 2
>>>>>>>>>>> 0000000id_23 >>>> 4
>>>>>>>>>>> 0000000id_24 >>>> 3
>>>>>>>>>>> 0000000id_25 >>>> 7
>>>>>>>>>>> 0000000id_26 >>>> 7
>>>>>>>>>>> 0000000id_27 >>>> 3
>>>>>>>>>>> 0000000id_28 >>>> 3
>>>>>>>>>>> 0000000id_29 >>>> 3
>>>>>>>>>>> 0000000id_30 >>>> 7
>>>>>>>>>>> 0000000id_31 >>>> 39
>>>>>>>>>>> 0000000id_32 >>>> 5
>>>>>>>>>>> 0000000id_33 >>>> 5
>>>>>>>>>>> 0000000id_34 >>>> 5
>>>>>>>>>>> 0000000id_35 >>>> 3
>>>>>>>>>>> 0000000id_36 >>>> 3
>>>>>>>>>>> 0000000id_37 >>>> 3
>>>>>>>>>>> 0000000id_38 >>>> 3
>>>>>>>>>>> 0000000000M1 >>>> 3
>>>>>>>>>>> 0000000000M2 >>>> 3
>>>>>>>

In [56]:
# towork='id_18'
# fig, ax = plt.subplots(figsize=(15,5))
# sns.countplot(trainIdentity[towork].fillna(0))
# # sns.countplot(trainIdentity['id_34'].fillna(0),ax=ax)

# trainIdentity['id_18']=trainIdentity['id_18'].apply(lambda x: numDef18(x))
# len(pd.unique(trainIdentity[towork]))
# list(pd.unique(trainIdentity[towork]))

In [57]:
cateDataTrainCombined=pd.get_dummies(combinedData[categoricalCombinedData],prefix_sep='_',drop_first='True')

In [58]:
toleaveTrainCombined=combinedData[leaveCols].fillna(-9999)
cateDataTrainCombined=cateDataTrainCombined.fillna('Other')

In [59]:
proProcess1=preprocessing.MinMaxScaler()
toleaveTrainCombined['addr1']=proProcess1.fit_transform(toleaveTrainCombined[['addr1']])

In [60]:
proProcess2=preprocessing.MinMaxScaler()
toleaveTrainCombined['card1']=proProcess2.fit_transform(toleaveTrainCombined[['card1']])

In [61]:
proProcess3=preprocessing.MinMaxScaler()
toleaveTrainCombined['card2']=proProcess3.fit_transform(toleaveTrainCombined[['card2']])

In [62]:
proProcess4=preprocessing.MinMaxScaler()
toleaveTrainCombined['card3']=proProcess4.fit_transform(toleaveTrainCombined[['card3']])

In [63]:
proProcess5=preprocessing.MinMaxScaler()
toleaveTrainCombined['card5']=proProcess5.fit_transform(toleaveTrainCombined[['card5']])

In [64]:
proProcess6=preprocessing.MinMaxScaler()
toleaveTrainCombined['addr2']=proProcess6.fit_transform(toleaveTrainCombined[['addr2']])

In [65]:
# toleaveTrainCombined[['addr1','card1', 'card2', 'card3','card5', 'addr2']]

In [66]:
# towork='addr2'
# # fig, ax = plt.subplots(figsize=(15,5))
# # sns.countplot(trainTranscation[towork].fillna(0))
# # sns.countplot(trainIdentity['id_34'].fillna(0),ax=ax)

# trainTranscation['card6']=trainTranscation['card6'].apply(lambda x: defCard6(x))
# len(pd.unique(trainTranscation[towork]))
# list(pd.unique(trainTranscation[towork]))

In [67]:
processedTrainTransaction=pd.concat([toleaveTrainCombined,cateDataTrainCombined,pcaForGroupID,pcaForGroupV,pcaForGroupD,pcaForGroupC],axis=1)
processedTrainTransaction.shape

(590540, 352)

In [68]:
tempTimeSer=((processedTrainTransaction['TransactionDT']-86400)).map(int)
processedTrainTransaction['transSec']=tempTimeSer%60
processedTrainTransaction['transMin']=(tempTimeSer/60).map(int)%60
processedTrainTransaction['transHour']=(tempTimeSer/3600).map(int)%24
processedTrainTransaction['transMonth']=((tempTimeSer/86400)//30)+1
processedTrainTransaction['transDay']=((tempTimeSer/86400)%30).map(int)+1

In [69]:
del processedTrainTransaction['TransactionDT']
del processedTrainTransaction['TransactionID']

In [70]:
processedTrainTransaction.tail()

Unnamed: 0,isFraud,addr1,card1,card2,card3,card5,addr2,TransactionAmt,DeviceType_mobile,DeviceInfo_LG Device,...,pcaD7,pcaD8,pcaD9,pcaC1,pcaC2,transSec,transMin,transHour,transMonth,transDay
590535,0,0.974571,0.319039,0.0,0.992082,0.998925,0.998515,49.0,0,0,...,0.3093,0.268705,-0.281415,-0.320605,-0.265991,27,57,23,7.0,2
590536,0,0.968118,0.542883,0.964619,0.992082,0.99873,0.998515,39.5,0,0,...,-0.452493,-0.179757,0.100485,-0.334589,-0.334193,29,57,23,7.0,2
590537,0,0.97068,0.634456,0.999528,0.992082,0.99873,0.998515,30.95,0,0,...,-0.452493,-0.179757,0.100485,-0.332253,-0.310536,59,57,23,7.0,2
590538,0,0.985482,0.392389,0.988773,0.992082,0.99873,0.998515,117.0,0,0,...,-0.081732,-0.153618,-0.036438,-0.311121,-0.286562,8,58,23,7.0,2
590539,0,0.977133,0.808577,0.95943,0.992082,0.986811,0.998515,279.95,0,0,...,-0.604862,-0.470756,0.21831,-0.326218,-0.274053,51,58,23,7.0,2


In [71]:
# sns.countplot(processedTrainTransaction['isFraud'])
# # sns.countplot(processedTrainTransaction['transMonth'],hue=processedTrainTransaction['isFraud'])

# # sns.countplot(processedTrainTransaction['transMonth'])
# sns.countplot(processedTrainTransaction['transMonth'],hue=processedTrainTransaction['isFraud'])

# # sns.countplot(processedTrainTransaction['transDay'])
# sns.countplot(processedTrainTransaction['transDay'],hue=processedTrainTransaction['isFraud'])

# # sns.countplot(processedTrainTransaction['transHour'])
# sns.countplot(processedTrainTransaction['transHour'],hue=processedTrainTransaction['isFraud'])

In [72]:
# sns.countplot(processedTrainTransaction['transMin'])

In [73]:
# sns.countplot(processedTrainTransaction['transSec'])

In [74]:
target='isFraud'
toUseCol=list(processedTrainTransaction.columns)
toUseCol.remove(target)

In [75]:
from collections import Counter 
Counter(toUseCol)

len(toUseCol),len(set(toUseCol))

(354, 354)

In [77]:
trainPart=processedTrainTransaction[processedTrainTransaction['transMonth']<=5]
# valPart=processedTrainTransaction[processedTrainTransaction['transMonth']==5]
testPart=processedTrainTransaction[processedTrainTransaction['transMonth']>5]
trainPart.shape,testPart.shape#,valPart.shape

((498113, 355), (92427, 355))

In [78]:
from sklearn import ensemble,metrics
import xgboost as xgb

In [79]:
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
X = trainPart[toUseCol]
y = trainPart[target]
tscv = TimeSeriesSplit(n_splits=5)
tscv

TimeSeriesSplit(max_train_size=None, n_splits=5)

In [80]:
clfList=[]
scoreList=[]
for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index.shape, "TEST:", test_index.shape)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    params = {"objective":"binary:logistic",'colsample_bytree': 0.3,'learning_rate': 0.1,
                'max_depth': 5, 'alpha': 10,'max_depth': 2}
    
    clf = xgb.XGBClassifier(
        n_estimators=300, random_state=4,
#         tree_method='gpu_hist',
        **params
    )
    
    clf.fit(X_train, y_train)
    
    y_pred_train = clf.predict_proba(X_test)[:,1]
    score = metrics.roc_auc_score(y_test, y_pred_train)
    clfList.append(clf)
    scoreList.append(y_pred_train)
    print(f'ROC AUC {score}')

TRAIN: (83023,) TEST: (83018,)
ROC AUC 0.8732250087610154
TRAIN: (166041,) TEST: (83018,)
ROC AUC 0.86290466960646
TRAIN: (249059,) TEST: (83018,)
ROC AUC 0.8741850771916289
TRAIN: (332077,) TEST: (83018,)
ROC AUC 0.8732050994607439
TRAIN: (415095,) TEST: (83018,)
ROC AUC 0.8591511940088825


In [81]:
scoreNewMean=pd.DataFrame(scoreList).transpose().mean(axis=1)

In [82]:
y_pred_train

array([0.00978512, 0.97844803, 0.03567712, ..., 0.00565478, 0.01259616,
       0.00455979], dtype=float32)

In [83]:
metrics.roc_auc_score( y_test,scoreNewMean)

0.7613150724952066

In [84]:
testScoreList=[]
for mo in clfList:
    scr=mo.predict_proba(testPart[toUseCol])[:,1]
    testScoreList.append(scr)

In [85]:
scoreTestNewMean=pd.DataFrame(testScoreList).transpose().mean(axis=1)

In [86]:
metrics.roc_auc_score( testPart[target],scoreTestNewMean)

0.8584773041340792

In [None]:
# lgb_params = {
#         'boosting_type': 'gbdt',
#         'objective': 'binary',
# #         'metric':metrics,
#         'learning_rate': 0.01,
#         #'is_unbalance': 'true',  #because training data is unbalance (replaced with scale_pos_weight)
#         'num_leaves': 31,  # we should let it be smaller than 2^(max_depth)
#         'max_depth': -1,  # -1 means no limit
#         'min_child_samples': 20,  # Minimum number of data need in a child(min_data_in_leaf)
#         'max_bin': 255,  # Number of bucketed bin for feature values
#         'subsample': 0.6,  # Subsample ratio of the training instance.
#         'subsample_freq': 0,  # frequence of subsample, <=0 means no enable
#         'colsample_bytree': 0.3,  # Subsample ratio of columns when constructing each tree.
#         'min_child_weight': 5,  # Minimum sum of instance weight(hessian) needed in a child(leaf)
#         'subsample_for_bin': 200000,  # Number of samples for constructing bin
#         'min_split_gain': 0,  # lambda_l1, lambda_l2 and min_gain_to_split to regularization
#         'reg_alpha': 0,  # L1 regularization term on weights
#         'reg_lambda': 0,  # L2 regularization term on weights
#         'nthread': 4,
#         'verbose': 0,
#         'metric':'auc'
#     }

# num_boost_round=3000
# early_stopping_rounds=10

# model3 = lgb.train(lgb_params, 
#                      trainData, 
#                      valid_sets=[trainData, valData], 
#                    valid_names=['train','valid'],
# #                      evals_result=evals_results, 
#                      num_boost_round=num_boost_round,
#                      early_stopping_rounds=early_stopping_rounds,
#                      verbose_eval=10)

In [None]:
predTrain3=bst.predict(trainPart[toUseCol])
scoreOftrain3=metrics.roc_auc_score(trainPart[target],predTrain3)
valPred3=bst.predict(valPart[toUseCol])
scoreOfVal3=metrics.roc_auc_score(valPart[target],valPred3)
testPred3=bst.predict(testPart[toUseCol])
scoreOfTest3=metrics.roc_auc_score(testPart[target],testPred3)
print('ROC AUC for train {} and for validation {} for test {}'.format(scoreOftrain3,scoreOfVal3,scoreOfTest3))

In [None]:
# ROC AUC for train 0.9543954336563735 and for validation 0.9114576197806739 for test 0.9040375652381945