In [2]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier

In [4]:
train = pd.read_csv('training.csv')

In [5]:
train.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15T02:18:49Z,2,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15T02:19:08Z,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15T02:44:21Z,2,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15T03:32:55Z,2,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15T03:34:21Z,2,0


In [6]:
y_train = train['FraudResult']

In [7]:
train.shape

(95662, 16)

In [8]:
ntrain = train.shape[0]

In [9]:
test = pd.read_csv('test.csv')

In [10]:
testId = test['TransactionId']

In [11]:
test.shape

(45019, 15)

In [12]:
ntest = test.shape[0]

In [13]:
all_data = pd.concat((train, test), sort=False).reset_index(drop=True)

In [14]:
Ids_fr = ['TransactionId','BatchId','AccountId','SubscriptionId','CustomerId','ProviderId','ProductId','TransactionStartTime','FraudResult','ChannelId']

In [15]:
X = all_data.drop(Ids_fr,axis=1)

In [16]:
X.head()

Unnamed: 0,CurrencyCode,CountryCode,ProductCategory,Amount,Value,PricingStrategy
0,UGX,256,airtime,1000.0,1000,2
1,UGX,256,financial_services,-20.0,20,2
2,UGX,256,airtime,500.0,500,2
3,UGX,256,utility_bill,20000.0,21800,2
4,UGX,256,financial_services,-644.0,644,2


In [17]:
X_transformed = pd.get_dummies(X)

In [18]:
X_transformed=X_transformed.fillna(0)

In [19]:
 X_transformed.head()

Unnamed: 0,CountryCode,Amount,Value,PricingStrategy,CurrencyCode_UGX,ProductCategory_airtime,ProductCategory_data_bundles,ProductCategory_financial_services,ProductCategory_movies,ProductCategory_other,ProductCategory_retail,ProductCategory_ticket,ProductCategory_transport,ProductCategory_tv,ProductCategory_utility_bill
0,256,1000.0,1000,2,1,1,0,0,0,0,0,0,0,0,0
1,256,-20.0,20,2,1,0,0,1,0,0,0,0,0,0,0
2,256,500.0,500,2,1,1,0,0,0,0,0,0,0,0,0
3,256,20000.0,21800,2,1,0,0,0,0,0,0,0,0,0,1
4,256,-644.0,644,2,1,0,0,1,0,0,0,0,0,0,0


In [20]:
X_train = X_transformed[:ntrain]
X_test = X_transformed[ntrain:]

In [21]:
X_train.shape

(95662, 15)

In [22]:
X_test.shape

(45019, 15)

## Logistic Regression

In [41]:
lr = LogisticRegression()

In [42]:
lr.fit(X_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [43]:
y_pred = lr.predict(X_test)

In [44]:
submission_lr = pd.DataFrame({'TransactionId':testId,'FraudResult':y_pred}).set_index('TransactionId')

In [45]:
submission_lr.to_csv('submission_lr.csv')

## Random Forest

In [56]:
rfr = RandomForestClassifier()

In [57]:
rfr.fit(X_train,y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [58]:
y_pred_rfr = rfr.predict(X_test)

In [59]:
submission_rfr10 = pd.DataFrame({'TransactionId':testId,'FraudResult':y_pred}).set_index('TransactionId')

In [60]:
submission_rfr10.to_csv('submission_rfr10.csv')

## CatBoost

In [34]:
cb = CatBoostClassifier()

In [35]:
cb.fit(X_train,y_train)

Learning rate set to 0.060908
0:	learn: 0.4447895	total: 162ms	remaining: 2m 42s
1:	learn: 0.2888936	total: 208ms	remaining: 1m 44s
2:	learn: 0.1885484	total: 256ms	remaining: 1m 25s
3:	learn: 0.1214680	total: 300ms	remaining: 1m 14s
4:	learn: 0.0794621	total: 343ms	remaining: 1m 8s
5:	learn: 0.0538059	total: 398ms	remaining: 1m 5s
6:	learn: 0.0370039	total: 447ms	remaining: 1m 3s
7:	learn: 0.0263490	total: 513ms	remaining: 1m 3s
8:	learn: 0.0190763	total: 569ms	remaining: 1m 2s
9:	learn: 0.0142447	total: 621ms	remaining: 1m 1s
10:	learn: 0.0109232	total: 666ms	remaining: 59.9s
11:	learn: 0.0086598	total: 709ms	remaining: 58.4s
12:	learn: 0.0070917	total: 758ms	remaining: 57.6s
13:	learn: 0.0059604	total: 809ms	remaining: 57s
14:	learn: 0.0051437	total: 863ms	remaining: 56.6s
15:	learn: 0.0046241	total: 957ms	remaining: 58.8s
16:	learn: 0.0041395	total: 1.07s	remaining: 1m 1s
17:	learn: 0.0037609	total: 1.14s	remaining: 1m 1s
18:	learn: 0.0034708	total: 1.22s	remaining: 1m 2s
19:	learn

164:	learn: 0.0017471	total: 8.99s	remaining: 45.5s
165:	learn: 0.0017466	total: 9.03s	remaining: 45.4s
166:	learn: 0.0017465	total: 9.08s	remaining: 45.3s
167:	learn: 0.0017458	total: 9.12s	remaining: 45.2s
168:	learn: 0.0017453	total: 9.17s	remaining: 45.1s
169:	learn: 0.0017449	total: 9.21s	remaining: 45s
170:	learn: 0.0017444	total: 9.26s	remaining: 44.9s
171:	learn: 0.0017443	total: 9.3s	remaining: 44.8s
172:	learn: 0.0017440	total: 9.35s	remaining: 44.7s
173:	learn: 0.0017435	total: 9.39s	remaining: 44.6s
174:	learn: 0.0017433	total: 9.44s	remaining: 44.5s
175:	learn: 0.0017423	total: 9.48s	remaining: 44.4s
176:	learn: 0.0017423	total: 9.52s	remaining: 44.3s
177:	learn: 0.0017420	total: 9.57s	remaining: 44.2s
178:	learn: 0.0017415	total: 9.62s	remaining: 44.1s
179:	learn: 0.0017409	total: 9.66s	remaining: 44s
180:	learn: 0.0017404	total: 9.71s	remaining: 43.9s
181:	learn: 0.0017401	total: 9.78s	remaining: 43.9s
182:	learn: 0.0017397	total: 9.84s	remaining: 43.9s
183:	learn: 0.001

324:	learn: 0.0017093	total: 17s	remaining: 35.4s
325:	learn: 0.0017093	total: 17.1s	remaining: 35.3s
326:	learn: 0.0017091	total: 17.1s	remaining: 35.3s
327:	learn: 0.0017091	total: 17.2s	remaining: 35.2s
328:	learn: 0.0017086	total: 17.2s	remaining: 35.2s
329:	learn: 0.0017085	total: 17.3s	remaining: 35.1s
330:	learn: 0.0017085	total: 17.4s	remaining: 35.1s
331:	learn: 0.0017083	total: 17.4s	remaining: 35.1s
332:	learn: 0.0017082	total: 17.5s	remaining: 35s
333:	learn: 0.0017082	total: 17.5s	remaining: 35s
334:	learn: 0.0017082	total: 17.6s	remaining: 34.9s
335:	learn: 0.0017081	total: 17.6s	remaining: 34.8s
336:	learn: 0.0017080	total: 17.7s	remaining: 34.8s
337:	learn: 0.0017079	total: 17.7s	remaining: 34.7s
338:	learn: 0.0017075	total: 17.8s	remaining: 34.6s
339:	learn: 0.0017074	total: 17.8s	remaining: 34.6s
340:	learn: 0.0017073	total: 17.9s	remaining: 34.5s
341:	learn: 0.0017073	total: 17.9s	remaining: 34.5s
342:	learn: 0.0017072	total: 18s	remaining: 34.4s
343:	learn: 0.001707

483:	learn: 0.0016985	total: 24.6s	remaining: 26.3s
484:	learn: 0.0016984	total: 24.7s	remaining: 26.2s
485:	learn: 0.0016984	total: 24.7s	remaining: 26.2s
486:	learn: 0.0016984	total: 24.8s	remaining: 26.1s
487:	learn: 0.0016983	total: 24.8s	remaining: 26s
488:	learn: 0.0016982	total: 24.9s	remaining: 26s
489:	learn: 0.0016982	total: 24.9s	remaining: 25.9s
490:	learn: 0.0016981	total: 25s	remaining: 25.9s
491:	learn: 0.0016981	total: 25s	remaining: 25.8s
492:	learn: 0.0016981	total: 25s	remaining: 25.8s
493:	learn: 0.0016981	total: 25.1s	remaining: 25.7s
494:	learn: 0.0016975	total: 25.1s	remaining: 25.6s
495:	learn: 0.0016975	total: 25.2s	remaining: 25.6s
496:	learn: 0.0016975	total: 25.2s	remaining: 25.5s
497:	learn: 0.0016974	total: 25.3s	remaining: 25.5s
498:	learn: 0.0016974	total: 25.3s	remaining: 25.4s
499:	learn: 0.0016974	total: 25.3s	remaining: 25.3s
500:	learn: 0.0016973	total: 25.4s	remaining: 25.3s
501:	learn: 0.0016973	total: 25.4s	remaining: 25.2s
502:	learn: 0.0016972	

645:	learn: 0.0016932	total: 31.8s	remaining: 17.4s
646:	learn: 0.0016932	total: 31.8s	remaining: 17.4s
647:	learn: 0.0016931	total: 31.9s	remaining: 17.3s
648:	learn: 0.0016931	total: 31.9s	remaining: 17.3s
649:	learn: 0.0016931	total: 32s	remaining: 17.2s
650:	learn: 0.0016931	total: 32s	remaining: 17.2s
651:	learn: 0.0016930	total: 32s	remaining: 17.1s
652:	learn: 0.0016930	total: 32.1s	remaining: 17s
653:	learn: 0.0016930	total: 32.1s	remaining: 17s
654:	learn: 0.0016930	total: 32.2s	remaining: 16.9s
655:	learn: 0.0016930	total: 32.2s	remaining: 16.9s
656:	learn: 0.0016930	total: 32.2s	remaining: 16.8s
657:	learn: 0.0016930	total: 32.3s	remaining: 16.8s
658:	learn: 0.0016930	total: 32.3s	remaining: 16.7s
659:	learn: 0.0016929	total: 32.4s	remaining: 16.7s
660:	learn: 0.0016929	total: 32.4s	remaining: 16.6s
661:	learn: 0.0016929	total: 32.4s	remaining: 16.6s
662:	learn: 0.0016929	total: 32.5s	remaining: 16.5s
663:	learn: 0.0016929	total: 32.5s	remaining: 16.5s
664:	learn: 0.0016929	

806:	learn: 0.0016899	total: 38.8s	remaining: 9.28s
807:	learn: 0.0016898	total: 38.9s	remaining: 9.23s
808:	learn: 0.0016898	total: 38.9s	remaining: 9.18s
809:	learn: 0.0016898	total: 39s	remaining: 9.14s
810:	learn: 0.0016898	total: 39s	remaining: 9.09s
811:	learn: 0.0016897	total: 39s	remaining: 9.04s
812:	learn: 0.0016897	total: 39.1s	remaining: 8.99s
813:	learn: 0.0016897	total: 39.1s	remaining: 8.94s
814:	learn: 0.0016896	total: 39.2s	remaining: 8.89s
815:	learn: 0.0016896	total: 39.2s	remaining: 8.84s
816:	learn: 0.0016895	total: 39.2s	remaining: 8.79s
817:	learn: 0.0016895	total: 39.3s	remaining: 8.74s
818:	learn: 0.0016895	total: 39.3s	remaining: 8.69s
819:	learn: 0.0016895	total: 39.4s	remaining: 8.64s
820:	learn: 0.0016894	total: 39.4s	remaining: 8.59s
821:	learn: 0.0016894	total: 39.4s	remaining: 8.54s
822:	learn: 0.0016894	total: 39.5s	remaining: 8.49s
823:	learn: 0.0016894	total: 39.5s	remaining: 8.44s
824:	learn: 0.0016893	total: 39.6s	remaining: 8.39s
825:	learn: 0.0016

966:	learn: 0.0016853	total: 46.6s	remaining: 1.59s
967:	learn: 0.0016853	total: 46.6s	remaining: 1.54s
968:	learn: 0.0016852	total: 46.6s	remaining: 1.49s
969:	learn: 0.0016852	total: 46.7s	remaining: 1.44s
970:	learn: 0.0016852	total: 46.7s	remaining: 1.4s
971:	learn: 0.0016852	total: 46.8s	remaining: 1.35s
972:	learn: 0.0016852	total: 46.8s	remaining: 1.3s
973:	learn: 0.0016851	total: 46.9s	remaining: 1.25s
974:	learn: 0.0016851	total: 47s	remaining: 1.2s
975:	learn: 0.0016851	total: 47s	remaining: 1.16s
976:	learn: 0.0016851	total: 47.1s	remaining: 1.11s
977:	learn: 0.0016850	total: 47.1s	remaining: 1.06s
978:	learn: 0.0016850	total: 47.2s	remaining: 1.01s
979:	learn: 0.0016849	total: 47.2s	remaining: 964ms
980:	learn: 0.0016849	total: 47.3s	remaining: 916ms
981:	learn: 0.0016849	total: 47.3s	remaining: 867ms
982:	learn: 0.0016848	total: 47.4s	remaining: 819ms
983:	learn: 0.0016848	total: 47.4s	remaining: 771ms
984:	learn: 0.0016848	total: 47.5s	remaining: 723ms
985:	learn: 0.00168

<catboost.core.CatBoostClassifier at 0x1c5f4abc208>

In [36]:
y_pred_cb = cb.predict(X_test)

In [37]:
submission_cb = pd.DataFrame({'TransactionId':testId,'FraudResult':y_pred_cb}).set_index('TransactionId')

In [61]:
submission_cb.to_csv('submission_cb.csv')