In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pylab as plt
import seaborn as sns
import warnings
import sklearn
from sklearn import impute

In [3]:
train_transaction = pd.read_csv('/Users/szeberinricsi/Documents/UPS/S2/MOST_AA/ieee-fraud-detection/train_transaction.csv')
train_identity = pd.read_csv("/Users/szeberinricsi/Documents/UPS/S2/MOST_AA/ieee-fraud-detection/train_identity.csv")
test_transaction = pd.read_csv("/Users/szeberinricsi/Documents/UPS/S2/MOST_AA/ieee-fraud-detection/test_transaction.csv")
test_identity = pd.read_csv("/Users/szeberinricsi/Documents/UPS/S2/MOST_AA/ieee-fraud-detection/test_identity.csv")

train_data = pd.merge(train_transaction, train_identity, on = 'TransactionID', how = 'left')
test_data = pd.merge(test_transaction, test_identity, on = 'TransactionID', how = 'left')


train_fraud = train_data[train_data.isFraud == 1]
train_nofraud = train_data[train_data.isFraud == 0]

In [4]:
test_data.columns = test_data.columns.str.replace('-','_')

## Memory usage reduction

In [5]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage(deep=True).sum() / 1024 ** 2 # just added 
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage(deep=True).sum() / 1024 ** 2
    percent = 100 * (start_mem - end_mem) / start_mem
    print('Mem. usage decreased from {:5.2f} Mb to {:5.2f} Mb ({:.1f}% reduction)'.format(start_mem, end_mem, percent))
    return df

In [6]:
reduce_mem_usage(train_data)

Mem. usage decreased from 2598.36 Mb to 1288.96 Mb (50.4% reduction)


Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.500000,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.000000,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.000000,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.000000,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.000000,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,3577535,0,15811047,49.000000,W,6550,,150.0,visa,226.0,...,,,,,,,,,,
590536,3577536,0,15811049,39.500000,W,10444,225.0,150.0,mastercard,224.0,...,,,,,,,,,,
590537,3577537,0,15811079,30.953125,W,12037,595.0,150.0,mastercard,224.0,...,,,,,,,,,,
590538,3577538,0,15811088,117.000000,W,7826,481.0,150.0,mastercard,224.0,...,,,,,,,,,,


## Delete columns with too much missing data or repetition

In [7]:
def get_too_many_null_attr(data):
    many_null_cols = [col for col in data.columns if data[col].isnull().sum() / data.shape[0] > 0.9]
    return many_null_cols

In [8]:
def get_too_many_repeated_val(data):
    big_top_value_cols = [col for col in train_data.columns if train_data[col].value_counts(dropna=False, normalize=True).values[0] > 0.9]
    return big_top_value_cols

In [9]:
def get_useless_columns(data):
    too_many_null = get_too_many_null_attr(data)
    print("More than 90% null: " + str(len(too_many_null)))
    too_many_repeated = get_too_many_repeated_val(data)
    print("More than 90% repeated value: " + str(len(too_many_repeated)))
    cols_to_drop = list(set(too_many_null + too_many_repeated))
    cols_to_drop.remove('isFraud')
    return cols_to_drop

In [10]:
cols_to_drop = get_useless_columns(train_data)

More than 90% null: 12
More than 90% repeated value: 67


In [11]:
cols_to_drop = get_useless_columns(train_data)

More than 90% null: 12
More than 90% repeated value: 67


In [12]:
train_data = train_data.drop(cols_to_drop, axis = 1)

In [13]:
test_data = test_data.drop(cols_to_drop, axis = 1)

## Separate categorical and numerical features

In [14]:
num_cols = list(train_data.select_dtypes(include = ['int64', 'int8', 'int16', 'int32', 'float32', 'float16', 'float64']).columns)

num_cols_test = num_cols

num_cols_test.remove('isFraud')

num_cols_test

['TransactionID',
 'TransactionDT',
 'TransactionAmt',
 'card1',
 'card2',
 'card3',
 'card5',
 'addr1',
 'addr2',
 'dist1',
 'C1',
 'C2',
 'C4',
 'C5',
 'C6',
 'C7',
 'C8',
 'C9',
 'C10',
 'C11',
 'C12',
 'C13',
 'C14',
 'D1',
 'D2',
 'D3',
 'D4',
 'D5',
 'D6',
 'D8',
 'D9',
 'D10',
 'D11',
 'D12',
 'D13',
 'D14',
 'D15',
 'V1',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9',
 'V10',
 'V11',
 'V12',
 'V13',
 'V14',
 'V15',
 'V16',
 'V17',
 'V18',
 'V19',
 'V20',
 'V21',
 'V22',
 'V23',
 'V24',
 'V25',
 'V26',
 'V27',
 'V28',
 'V29',
 'V30',
 'V31',
 'V32',
 'V33',
 'V34',
 'V35',
 'V36',
 'V37',
 'V38',
 'V39',
 'V40',
 'V41',
 'V42',
 'V43',
 'V44',
 'V45',
 'V46',
 'V47',
 'V48',
 'V49',
 'V50',
 'V51',
 'V52',
 'V53',
 'V54',
 'V55',
 'V56',
 'V57',
 'V58',
 'V59',
 'V60',
 'V61',
 'V62',
 'V63',
 'V64',
 'V65',
 'V66',
 'V67',
 'V68',
 'V69',
 'V70',
 'V71',
 'V72',
 'V73',
 'V74',
 'V75',
 'V76',
 'V77',
 'V78',
 'V79',
 'V80',
 'V81',
 'V82',
 'V83',
 'V84',
 'V85',
 'V8

In [15]:
cat_cols = list(train_data.select_dtypes(include=['object']).columns)

cat_cols

['ProductCD',
 'card4',
 'card6',
 'P_emaildomain',
 'R_emaildomain',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9',
 'id_12',
 'id_15',
 'id_16',
 'id_28',
 'id_29',
 'id_30',
 'id_31',
 'id_33',
 'id_34',
 'id_35',
 'id_36',
 'id_37',
 'id_38',
 'DeviceType',
 'DeviceInfo']

## We replace infinite values with 999

In [16]:
train_data = train_data.replace(np.inf,999)
test_data = test_data.replace(np.inf,999)

## We impute missing values with median (in case of numerical features) or mode (in case of categorical features)

In [17]:
from sklearn.impute import SimpleImputer, KNNImputer


imp_median = SimpleImputer(missing_values = np.nan, strategy = 'median')
imp_mode = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')



### Train data / NaN imputation


In [18]:
imp_mode.fit(train_data[cat_cols])

SimpleImputer(strategy='most_frequent')

In [19]:
imp_median.fit(train_data[num_cols])

SimpleImputer(strategy='median')

In [20]:
train_data[cat_cols] = imp_mode.transform(train_data[cat_cols])



In [21]:
train_data[num_cols] = imp_median.transform(train_data[num_cols])

### Test data / NaN imputation


In [22]:
imp_mode.fit(test_data[cat_cols])

SimpleImputer(strategy='most_frequent')

In [23]:
imp_median.fit(test_data[num_cols_test])

SimpleImputer(strategy='median')

In [24]:
test_data[cat_cols] = imp_mode.transform(test_data[cat_cols])

In [25]:
test_data[num_cols_test] = imp_median.transform(test_data[num_cols_test])

In [26]:
summary = pd.DataFrame(train_data[num_cols].dtypes, columns = ['Data Type'])
summary = summary.reset_index()
summary['Name'] = summary['index']
summary = summary.drop(columns=['index'])
summary['Missing(%)'] = train_data[num_cols].isnull().sum().values / train_data[num_cols].shape[0] * 100
summary['unique values'] = train_data[num_cols].nunique().values
summary


Unnamed: 0,Data Type,Name,Missing(%),unique values
0,float64,TransactionID,0.0,590540
1,float64,TransactionDT,0.0,573349
2,float64,TransactionAmt,0.0,8195
3,float64,card1,0.0,13553
4,float64,card2,0.0,500
...,...,...,...,...
333,float64,id_14,0.0,25
334,float64,id_17,0.0,104
335,float64,id_19,0.0,522
336,float64,id_20,0.0,394


In [27]:
train_data.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000.0,0,86400.0,68.5,W,13926.0,361.0,150.0,discover,142.0,...,chrome 63.0,24.0,1920x1080,match_status:2,T,F,T,F,desktop,Windows
1,2987001.0,0,86401.0,29.0,W,2755.0,404.0,150.0,mastercard,102.0,...,chrome 63.0,24.0,1920x1080,match_status:2,T,F,T,F,desktop,Windows
2,2987002.0,0,86469.0,59.0,W,4663.0,490.0,150.0,visa,166.0,...,chrome 63.0,24.0,1920x1080,match_status:2,T,F,T,F,desktop,Windows
3,2987003.0,0,86499.0,50.0,W,18132.0,567.0,150.0,mastercard,117.0,...,chrome 63.0,24.0,1920x1080,match_status:2,T,F,T,F,desktop,Windows
4,2987004.0,0,86506.0,50.0,H,4497.0,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


## Encoding train data


In [28]:
from sklearn import preprocessing
import category_encoders

X_le = train_data.copy()
for f in train_data.columns:
    if X_le[f].dtype == 'object': 
        le = preprocessing.LabelEncoder()
        le.fit(list(X_le[f].values))
        X_le[f] = le.transform(list(X_le[f].values))

X_be = train_data.copy()
for f in train_data.columns:
    if X_be[f].dtype == 'object': 
        if X_be[f].nunique() <= 2:
            le = preprocessing.LabelEncoder()
            le.fit(list(X_be[f].values))
            X_be[f] = le.transform(list(X_be[f].values))
        else:
            be = category_encoders.BinaryEncoder(cols=f)
            X_be = be.fit_transform(X_be)

## Encoding test data

In [29]:
X_le_test = test_data.copy()
for f in test_data.columns:
    if X_le_test[f].dtype == 'object': 
        le = preprocessing.LabelEncoder()
        le.fit(list(X_le_test[f].values))
        X_le_test[f] = le.transform(list(X_le_test[f].values))

X_be_test = test_data.copy()
for f in test_data.columns:
    if X_be_test[f].dtype == 'object': 
        if X_be_test[f].nunique() <= 2:
            le = preprocessing.LabelEncoder()
            le.fit(list(X_be_test[f].values))
            X_be_test[f] = le.transform(list(X_be_test[f].values))
        else:
            be = category_encoders.BinaryEncoder(cols=f)
            X_be_test = be.fit_transform(X_be_test)

In [30]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
import random
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression


In [31]:
X_le.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000.0,0,86400.0,68.5,4,13926.0,361.0,150.0,1,142.0,...,47,24.0,132,3,1,0,1,0,0,1598
1,2987001.0,0,86401.0,29.0,4,2755.0,404.0,150.0,2,102.0,...,47,24.0,132,3,1,0,1,0,0,1598
2,2987002.0,0,86469.0,59.0,4,4663.0,490.0,150.0,3,166.0,...,47,24.0,132,3,1,0,1,0,0,1598
3,2987003.0,0,86499.0,50.0,4,18132.0,567.0,150.0,2,117.0,...,47,24.0,132,3,1,0,1,0,0,1598
4,2987004.0,0,86506.0,50.0,1,4497.0,514.0,150.0,2,102.0,...,123,32.0,164,3,1,0,1,1,1,954


In [32]:
X_le_test.head()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,3663549.0,18403224.0,31.95,4,10409.0,111.0,150.0,3,226.0,2,...,53,24.0,171,1,1,0,1,0,0,2029
1,3663550.0,18403263.0,49.0,4,4272.0,111.0,150.0,3,226.0,2,...,53,24.0,171,1,1,0,1,0,0,2029
2,3663551.0,18403310.0,171.0,4,4476.0,574.0,150.0,3,226.0,2,...,53,24.0,171,1,1,0,1,0,0,2029
3,3663552.0,18403310.0,284.95,4,10989.0,360.0,150.0,3,166.0,2,...,53,24.0,171,1,1,0,1,0,0,2029
4,3663553.0,18403317.0,67.95,4,18018.0,452.0,150.0,2,117.0,2,...,53,24.0,171,1,1,0,1,0,0,2029


In [33]:
x_le = X_le.drop('isFraud', axis = 1 )
y_le = X_le['isFraud']

x_be = X_be.drop('isFraud', axis = 1)
y_be = X_be['isFraud']

In [34]:
x_le.shape

(590540, 367)

## Loading the test data set and creating submisssion dataframe

In [35]:
test_transaction = pd.read_csv('/Users/szeberinricsi/Documents/UPS/S2/MOST_AA/ieee-fraud-detection/test_transaction.csv')
submission = pd.DataFrame(test_transaction["TransactionID"])
submission_RF = pd.DataFrame(test_transaction["TransactionID"])
submission_be = pd.DataFrame(test_transaction["TransactionID"])
submission_RF_be = pd.DataFrame(test_transaction["TransactionID"])
del test_transaction

## Creating Y_train, X_train

In [None]:
pd.set_option('display.max_rows', 400)

In [37]:
Y_train = train_data["isFraud"]
X_train = train_data.drop("isFraud", axis = 1)
Xle_train = X_le.drop("isFraud", axis = 1)
Xbe_train = X_be.drop("isFraud", axis = 1)

In [37]:
X_train.dtypes

TransactionID     float64
TransactionDT     float64
TransactionAmt    float64
ProductCD          object
card1             float64
card2             float64
card3             float64
card4              object
card5             float64
card6              object
addr1             float64
addr2             float64
dist1             float64
P_emaildomain      object
R_emaildomain      object
C1                float64
C2                float64
C4                float64
C5                float64
C6                float64
C7                float64
C8                float64
C9                float64
C10               float64
C11               float64
C12               float64
C13               float64
C14               float64
D1                float64
D2                float64
D3                float64
D4                float64
D5                float64
D6                float64
D8                float64
D9                float64
D10               float64
D11               float64
D12         

In [38]:
train_data.dtypes

TransactionID     float64
isFraud              int8
TransactionDT     float64
TransactionAmt    float64
ProductCD          object
card1             float64
card2             float64
card3             float64
card4              object
card5             float64
card6              object
addr1             float64
addr2             float64
dist1             float64
P_emaildomain      object
R_emaildomain      object
C1                float64
C2                float64
C4                float64
C5                float64
C6                float64
C7                float64
C8                float64
C9                float64
C10               float64
C11               float64
C12               float64
C13               float64
C14               float64
D1                float64
D2                float64
D3                float64
D4                float64
D5                float64
D6                float64
D8                float64
D9                float64
D10               float64
D11         

## PCA

In [90]:
from sklearn.decomposition import PCA
pca = PCA()
x_pca = pd.DataFrame(pca.fit_transform(x_le))
x_pca.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,423,424,425,426,427,428,429,430,431,432
0,-7291825.0,-4784.363056,-19552.738405,2112.186754,-27246.536504,-4004.573547,206.073073,13.572158,-250.265983,-735.641398,...,0.000198,-0.00012,1.9e-05,0.000121,-3e-05,6e-05,-0.000158,-4e-06,7.3e-05,-2.042707e-05
1,-7291824.0,-4782.518486,-19556.412829,1772.174087,-27223.253503,7162.11621,172.333816,-39.229217,-195.920422,-717.939808,...,4.3e-05,-0.00015,1.8e-05,0.000118,0.000131,-0.000107,2.5e-05,-3.6e-05,3.8e-05,-1.464809e-05
2,-7291756.0,-4782.722398,-19555.711392,1808.206588,-27229.816703,5254.59382,166.012685,-46.152792,-205.222031,-719.583724,...,0.000142,-9.4e-05,5.5e-05,0.000273,0.00015,-7e-05,5.1e-05,-7.8e-05,6.7e-05,-2.804844e-05
3,-7291726.0,-4817.523297,-19552.848894,4510.55536,-27132.598806,-8176.913738,1499.589923,1475.122888,42.374934,-857.297017,...,-0.001125,-0.000681,0.000628,0.000119,0.000189,3e-06,-1.3e-05,-0.000655,-0.000809,-2.307865e-05
4,-7292075.0,-71846.484613,146341.408423,741.795584,-21874.557966,5439.527972,153.460947,-113.10365,99.366687,6220.953007,...,-0.000483,0.001004,0.000238,-0.000139,-0.000169,0.000156,0.000615,0.000257,-0.000662,-7.635384e-07


In [44]:
x_test_pca = pd.DataFrame(pca.fit_transform(X_le_test.iloc[0:1000]))

In [None]:
my_var = pca.explained_variance_ratio_
my_var
sum(my_var[0:10])

In [91]:
pca_xtrain = x_pca.iloc[:,0:10]
pca_ytrain = y_le
pca_test = x_test_pca.iloc[:,10]

## Logistic regression

In [43]:
lr = LogisticRegression(penalty='l2', max_iter=500, n_jobs=6, tol=1e-6, solver="sag")
lr.fit(x_le, np.ravel(y_le))




LogisticRegression(max_iter=500, n_jobs=6, solver='sag', tol=1e-06)

In [44]:
Yhat_lr = lr.predict_proba(X_le_test)
submission["isFraud"] = Yhat_lr[:, 1]


In [45]:
submission.to_csv("/Users/szeberinricsi/Documents/UPS/S2/MOST_AA/ieee-fraud-detection/submissionLR.csv", index = False)

## Random forest

### Parameters

In [None]:
nFeatures = x_le.shape[1]
oobErrList = list()
mList = [m for m in range(10, nFeatures+1, 30)]

for m in mList:
    rf = RandomForestClassifier(max_features=m, min_samples_leaf=1,\
                                oob_score=True, n_estimators=50)
    rf.fit(x_le, np.ravel(y_le))
    oobErrList.append(1-rf.oob_score_)
    print(m, 1-rf.oob_score_)

10 0.02025773021302535


In [40]:
rfauto = RandomForestClassifier(max_features="auto", min_samples_leaf=1, n_estimators=1000)
rfauto.fit(x_le, np.ravel(y_le))
Yhat_rfauto = rfauto.predict_proba(X_le_test)




NameError: name 'Yhat_rf' is not defined

In [None]:
treeCount = 1000
m = 15

rf = RandomForestClassifier(max_features=m, min_samples_leaf=1, n_estimators=treeCount)
rf.fit(x_le, np.ravel(y_le))


In [1]:
Yhat_rf = rf.predict_proba(X_le_test)

NameError: name 'rf' is not defined

In [41]:
submission_RF["isFraud"] = Yhat_rfauto[:, 1]
submission_RF.to_csv("/Users/szeberinricsi/Documents/UPS/S2/MOST_AA/ieee-fraud-detection/submissionRFauto.csv", index = False)


In [None]:
submission_RF["isFraud"] = Yhat_rf[:, 1]

In [None]:
submission

In [None]:
submission_RF.to_csv("/Users/szeberinricsi/Documents/UPS/S2/MOST_AA/ieee-fraud-detection/submissionRF.csv", index = False)

KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

#instanciation
knn = KNeighborsClassifier(n_neighbors=600)
#training
#for i in range (300, 1200, 300):
knn.fit(x_le.iloc[0:3000], y_le.iloc[0:3000])

#prediction 
prediction = pd.DataFrame(knn.predict(X_le_test))


In [None]:
prediction.describe()

## models with pCA

In [None]:
lr = LogisticRegression(penalty='l2', max_iter=500, n_jobs=6, tol=1e-6, solver="sag")
lr.fit(pca_xtrain, np.ravel(y_le))

In [93]:
treeCount = 1000
m = 15

rf = RandomForestClassifier(max_features=10, min_samples_leaf=1, n_estimators=treeCount)
rf.fit(pca_xtrain, np.ravel(pca_ytrain))

Yhat_pca_rf = lr.predict_proba(pca_test)
submission["isFraud"] = Yhat_pca_rf[:, 1]

ValueError: Expected 2D array, got 1D array instead:
array=[ 3.97793526e+02  4.15590066e+02  5.00897341e+01  7.11441978e+01
 -4.08246984e+02 -2.97935420e+02 -4.48034698e+02 -2.05121932e+02
  5.12666460e+02 -1.85680726e+02 -2.46278909e+02 -3.59339924e+01
 -3.02513968e+02 -1.50446329e+02 -1.37112392e+02 -2.40833917e+02
 -1.69412836e+02  2.77350403e+02 -2.96348492e+02  1.13942224e+02
 -6.49869339e+00  4.77548944e+01  3.67570649e+02  3.96052185e+02
 -2.40656186e+02  1.78892845e+02  2.04060605e+01 -2.93813384e+02
  3.95173921e+02 -4.59395883e+01 -9.73269334e+01 -2.49232244e+02
 -1.05597941e+02 -2.22692502e+02 -9.71775497e+01 -2.97680536e+01
 -1.00060883e+02 -2.08714905e+02 -3.47356837e+02  1.03394542e+01
 -3.87401622e+02  8.41348912e+02 -2.81948002e+02 -6.43033034e+01
 -5.51208591e+02  1.37226446e+02  6.98926122e+02 -3.01966737e+02
 -2.29330009e+02 -2.80555591e+02  6.63801487e+02 -1.00178114e+02
 -3.14578458e+01 -8.92910667e+00  6.61665346e+02  6.54420140e+02
 -3.22172837e+01 -5.61092996e+02  1.79634593e+02 -1.68511945e+02
 -3.31025144e+02 -2.02323906e+02  8.74484105e+02  4.88719422e+02
 -2.20689394e+02 -2.47817406e+02 -1.94208518e+02 -5.89912333e+00
 -1.31006704e+02  1.50334518e+02  3.13309199e+02 -3.30475971e+02
  4.19771297e+02 -2.77051223e+02 -3.13961103e+01 -2.68031283e+02
  6.76362982e+02 -2.22716470e+02 -7.04581043e+01 -1.44553601e+02
 -1.67096207e+02 -2.75805558e+02 -5.95452783e+01 -2.34648877e+02
  4.40215317e+02 -1.61113833e+02  6.58886288e+02  6.41336307e+01
 -1.09372327e+02  6.75365695e+02 -1.84523033e+02 -9.37309481e+01
  9.45489426e+01  4.55046988e+01 -2.71946159e+02 -1.34218716e+02
  3.44423638e+01 -9.66745198e+01  3.17070610e+02 -3.11980795e+02
 -2.13958808e+02 -1.00868803e+02 -1.13304478e+02  4.70764756e+02
 -2.77127534e+02 -2.24158437e+02 -4.38815198e+02  3.15590638e+02
 -3.57712320e+01 -1.71275325e+02  6.92667067e+02 -2.19611407e+02
 -2.16306670e+02 -1.46519766e+02 -1.69359912e+02 -2.61734564e+02
 -2.19666651e+02  4.98746795e+02 -3.43426443e+02 -1.65589065e+02
 -7.28065325e+01  8.84838006e+00 -2.29136438e+02 -2.18593814e+02
 -1.69170079e+02  3.12676754e+02  4.43664563e+02  1.39316870e+02
 -5.58734771e+01  1.82564594e+02 -7.81450987e+01  1.27563545e+01
 -2.15514282e+02 -3.30456561e+02  3.92239906e+02  3.79921452e+01
  2.17638493e+02  2.49097897e+02 -3.01799904e+02  5.87317275e+01
 -3.61390889e+00 -3.27296005e+01 -2.82541574e+02 -5.59963267e+01
 -2.08657597e+02 -2.70726468e+02 -1.91451438e+02 -5.83704538e+01
 -2.94424971e+02  2.53658342e+02  2.40034446e+02 -2.63432287e+02
  1.81036544e+02 -1.61786162e+02 -3.31763716e+02  5.66566865e+02
  7.02637832e+02 -1.22480822e+02  3.31424260e+02 -2.36963412e+02
 -2.74543348e+02  1.10972534e+03  3.32240092e+02  3.46915041e+00
 -3.62972568e+02 -1.70245250e+02 -2.69196109e+02 -5.15732660e+01
  3.81850943e+01 -1.17322524e+01  9.69285170e+02  8.83924326e+02
 -3.92406460e+02 -2.53161198e+01 -4.27130135e+02  2.75655831e+02
 -2.04716867e+02 -4.07029311e+02 -3.12101425e+02 -1.15635832e+02
 -3.01507119e+01  4.94865001e+02  6.61286474e+02  5.75834668e+02
  3.77175454e+01  5.95891974e+02 -3.61954872e+02 -3.59277570e+02
  3.52872861e+02 -2.65622558e+02 -3.51553434e+02 -2.89564428e+02
 -2.28281384e+02 -2.43675525e+02  6.30561863e+02  9.13874956e+01
 -3.76970190e+02 -3.22007038e+02 -1.21553531e+02  4.04200811e+02
 -1.94450939e+02 -2.14025738e+02 -2.63515205e+02 -1.00805023e+02
 -5.52448651e+01  2.92481350e+01 -3.41782731e+02 -2.65039587e+02
 -2.39127274e+00 -1.60836223e+02 -3.10820667e+02  2.31283765e+02
 -1.33302609e+02  2.28304564e+02 -5.63093141e+02 -9.81189654e+01
  8.58305406e+02  4.72689051e+02 -2.14851512e+02  2.38239996e+02
 -5.03633377e+01  6.19629950e+02 -2.89089086e+01 -6.74685031e+02
 -1.18644370e+02  1.91086861e+02 -2.93895147e+02 -3.13719450e+02
 -4.31405531e+02 -1.36530368e+02 -3.11127191e+02 -1.35624224e+02
 -3.69672153e+02 -2.60521587e+02  6.64249457e+02  3.02015776e+02
 -3.20216940e+02  7.34959035e+02 -1.43237629e+02 -2.52161365e+02
  7.82495007e+01 -3.09486831e+02  4.26920871e+02 -1.40607960e+03
 -1.35629624e+02 -1.90029830e+02 -1.77059226e+02  2.10964533e+02
  3.86473892e+02  2.83777784e+01 -2.63597394e+02 -4.64291027e+02
 -2.38849075e+02  3.74832957e+02  1.41776037e+02  3.69819873e+02
  1.55387956e+02 -1.89018718e+02 -1.97564085e+02 -3.30982053e+02
 -3.11448666e+02 -2.37069733e+02 -6.54987937e+01 -1.93003546e+01
  2.69125655e+01  3.85842182e+02 -7.02351759e+02 -2.54422269e+02
 -2.76468087e+02 -2.64493701e+02 -1.25014652e+02 -1.05373791e+02
 -3.86412958e+00  2.70598154e+01 -4.65562069e+01 -5.35497872e+01
 -2.29892004e+02  4.84092867e+02  7.41683414e+01  2.16324246e+02
  4.24625843e+02 -1.02390139e+02  5.88854446e+02 -1.38151210e+02
 -4.88498016e+02 -2.59449388e+02 -1.16440101e+02  4.91337078e+02
 -4.05203868e+01 -2.22746075e+02 -2.07850104e+02 -4.40278208e+01
  2.05228675e+02  3.80219599e+02 -4.58413888e+01 -5.91567874e+01
  3.74221141e+02 -6.19609833e+01  2.59050729e+02 -2.12988143e+02
 -1.56925595e+02 -1.55073602e+02  8.45529925e+01 -1.01488338e+02
  2.92443532e+02 -2.78854777e+02 -2.46840896e+02 -2.58404209e+02
 -3.05610868e+02  7.63186830e+01  1.30735015e+02 -2.16007399e+02
 -3.86257540e+02  9.19225526e+02 -2.65227472e+02  3.06199475e+02
  8.18852272e+02 -2.77219535e+02  2.53656956e+02 -1.34912483e+02
 -1.05375731e+02 -3.73630814e+02  4.33055033e+01 -5.21412038e+01
 -2.30522943e+02  2.53208178e+02 -8.92988775e+01 -1.87068689e+02
 -2.45803799e+02  4.59138058e+02 -1.31464876e+02 -2.66892108e+02
  2.15290342e+01 -2.68087230e+01 -2.56193569e+02 -2.96104805e+02
 -1.41106087e+02 -6.23313116e+01 -4.23302488e+02  2.81152674e+02
 -1.25393934e+02  2.80552410e+02 -1.52836928e+02  2.06239825e+02
 -1.54134528e+02  4.66158927e+02  2.36306957e+02  9.40921756e+01
  2.07742916e+02 -7.13740816e+01  1.34707084e+02 -2.97360598e+02
 -1.72646985e+02  6.26548296e+02 -3.52821149e+02 -2.09791903e+02
 -1.45008131e+02 -2.71916488e+02 -2.42721700e+02 -1.75844030e+01
 -1.50066801e+02 -3.13965854e+02  1.75113982e+02  3.18865682e+02
  1.98364889e+01 -2.38738733e+02  2.84089958e+02 -2.21031397e+02
  4.84647603e+02 -7.10883090e+01 -3.19488197e+02 -2.41763578e+02
 -4.67133973e+02 -3.41734054e+02  7.99970719e+02  6.27078668e+02
 -2.31960854e+02  4.81912133e+02 -2.24496125e+02 -1.50547840e+02
 -1.12842052e+02  6.92891804e+02 -2.62091093e+02 -1.88968497e+02
 -1.17925234e+02 -2.46897929e+02  5.90980501e+02 -2.32873483e+02
  4.59344564e+02 -7.07094853e+01  1.77211556e+02  7.07120205e+02
  3.20790459e+02 -8.53920178e+01  5.03921248e+02  1.21655163e+02
 -1.99529910e+02 -1.40289982e+02  7.85732987e+02  5.25318613e+01
 -8.24337158e+01 -2.83273334e+02  7.43029104e+02 -3.96401184e+02
  7.09420339e+02  2.10620722e+02  4.25334711e+02 -3.17365301e+02
 -1.69528761e+02 -3.12329770e+02 -1.10569119e+02  5.83538184e+00
  2.16907154e+02  4.59564721e+02  1.32166190e+02 -5.21066745e+02
 -2.38245975e+02  4.82875941e+01 -2.76301748e+02 -3.65782448e+02
 -2.77393285e+02  8.66918275e+02  2.49831813e+02  2.87638179e+02
 -2.45425333e+02  2.16166067e+00 -2.62457081e+02  5.74418938e+02
  1.26519221e+02 -2.88185390e+02 -3.33092657e+02 -2.37844064e+02
 -2.64476172e+02 -1.59856696e+02 -2.62465378e+02 -2.98760289e+02
 -3.34418716e+02 -1.32972547e+02 -3.37494967e+02 -3.33510848e+02
  2.04774155e+02 -3.69755551e+02 -4.34170588e+02 -6.84190113e+01
 -1.38915947e+03  4.84606223e+02  3.35099915e+02 -7.93204642e+02
  2.78668055e+02 -2.46765357e+02 -1.44949304e+02 -1.58906101e+02
  2.00574882e+02 -9.04277933e+01  2.55946103e+02  5.57996278e+02
 -1.50094213e+03 -7.65876254e+00  2.85523004e+02 -1.86433461e+00
  1.70498772e+02  1.40357358e+02 -2.24337046e+01  1.04744227e+02
 -4.55419471e+01 -3.03673399e+01 -9.18409691e+01  2.53650873e+02
  6.55110569e+02  5.38368505e+02  3.94842713e+02 -2.36210059e+02
  3.65920591e+02 -1.19173700e+02  4.43606207e+02  2.35882529e+02
 -1.72368583e+02  3.91236164e+02  3.09705049e+02  1.93479954e+02
  3.80365064e+02  3.57489743e+02 -3.38112399e+02  9.43189839e+01
  7.81643317e+01  1.64103604e+01  4.68786209e+02  1.36653156e+02
 -2.75061749e+02 -2.90270271e+02  1.17905896e+03  2.79900565e+02
  5.81600558e+02  5.78099174e+02 -2.79170223e+02 -7.62911534e+02
 -4.99811962e+02 -6.48418438e+01  2.27277458e+02  2.12081063e+02
 -2.92309271e+02 -2.86112874e+02 -2.34860082e+02 -4.28983428e+01
  4.72438065e+03 -2.19340670e+02 -2.97575372e+02  6.03134091e+01
 -2.69344572e+02 -2.70641462e+02 -7.67184863e+01  7.63201553e+02
  1.36749590e+02  4.50233154e+01  1.16568404e+02  2.50706873e+02
  6.89118739e+02  4.16432026e+02  4.78938409e+02 -8.80205975e+02
  2.42581988e+02 -1.95899492e+02 -1.27150296e+02 -4.29458247e+02
  1.70065930e+02  6.48497942e+02  4.52080586e+02  4.29174375e+02
 -4.05614245e+02  6.58044256e+02 -2.75370711e+02 -3.59732264e+02
 -5.54424510e+02  1.03312188e+02  7.17115184e+02 -3.19914848e+02
  4.41830884e+02  9.65363020e+01  4.46614570e+02 -2.85008021e+02
  6.37129552e+02  4.39257964e+02  6.16385804e+01  7.14744780e+01
  4.29715987e+02 -3.70234633e+00 -4.53328126e+01 -2.83159273e+02
  4.17910209e+02 -1.53730278e+01  2.11463690e+02 -1.55968179e+02
  8.83539636e+02 -2.34933989e+02 -1.75355664e+02 -3.13895428e+02
 -4.64570861e+00 -3.26534623e+02  4.38816176e+02  4.35988328e+02
 -5.90342721e+01  2.61528515e+02  4.58839569e+02 -8.56236266e+01
 -1.24386651e+02  4.57611406e+02 -4.78156497e+02 -1.78711570e+02
  5.88596480e+02  6.15682896e+02 -1.41303486e+02 -1.35127077e+02
  4.57657896e+02  7.14814156e+02 -1.68854024e+02 -1.62537458e+02
 -3.95134710e+02  2.67314783e+02 -3.13149672e+02  4.84409204e+02
  2.29098268e+02  4.64802430e+02 -3.69923842e+02  2.09817574e+02
 -1.92784329e+02 -4.45416421e+02 -2.05914817e+02  4.11047907e+01
 -4.70853665e+02 -2.64901875e+02 -8.37953548e+02  1.45535458e+02
 -2.14125578e+02  4.46782674e+01 -1.36674416e+02  4.34578338e+02
  7.78584398e+02 -3.75384294e+02  8.38140618e+00 -3.38407898e+02
 -2.56688505e+02 -3.20268677e+02  7.65165365e+02  2.34284562e+02
 -1.40971125e+02  9.43719499e+01  2.74112111e+02 -1.03072635e+02
 -2.75005801e+02  2.45703187e+02  8.35461100e+02 -2.57167548e+02
  1.75192438e+01 -1.36165491e+02 -3.23193780e+02  4.35923524e+02
 -3.59562178e+02 -3.25071867e+02  2.42428375e+02  2.93577402e+02
  2.36321210e+02 -3.72104798e+02  3.97561717e+02 -2.46130280e+02
  8.16881258e+01  3.95680886e+02 -3.14601795e+02 -3.39568184e+02
 -3.45843976e+02 -3.63079432e+02  1.21442935e+02 -3.15659952e+02
  7.70292069e+01  4.48382687e+02 -1.40595032e+02  3.56409482e+02
 -1.37975036e+02  9.44370678e+00  1.07515678e+02  2.58657939e+01
  1.09367288e+02 -2.08930801e+02 -9.71957998e+01  3.79260567e+02
 -5.17107135e+01  2.59814793e+02  6.43333678e+02 -1.28220648e+02
  7.58807348e+02 -2.84100068e+02 -2.91443426e+02 -1.81337306e+02
  3.97556016e+02  1.18740654e+01 -3.25196947e+02 -3.41118403e+02
  5.33325351e+02 -4.61116840e+02 -8.36692696e+01 -3.34349877e+02
  1.52287763e+02  4.03200781e+02 -2.51580191e+02  4.83386415e+02
  4.05715132e+02 -3.04892854e+02  4.44260165e+02 -5.59674068e+02
  5.95991084e+02 -3.83168903e+02  6.70088442e+02 -1.04518956e+03
 -2.88940564e+02  3.62955392e+02  4.77923837e+02 -3.65351363e+02
  4.61841181e+02 -1.71151583e+02  2.87363375e+02 -7.59760820e+01
 -1.81391292e+02 -1.39100398e+02  1.57624097e+02 -2.70907349e+02
  1.14171239e+02 -1.51972110e+02 -2.97048927e+02 -2.15206844e+02
 -1.59893891e+02  6.62537885e+02  4.47463129e+02 -9.97239425e+01
 -4.24064040e+02  5.99926572e+02 -3.91277977e+02  9.20929868e+01
  1.83395279e+02  1.12119894e+02 -3.21632006e+02  4.82642752e+02
 -3.78448158e+02 -3.18899318e+02 -2.17440111e+02 -1.46754982e+02
  5.48813375e+02 -2.09898128e+02 -1.60724219e+02  3.94069045e+02
 -2.10558638e+02 -2.98870299e+02 -6.41409491e+01 -3.87698795e+02
 -2.17374669e+02 -3.80971107e+02 -2.33698820e+02 -3.63144037e+02
 -3.22300830e+02 -4.66573184e+01  1.85589219e+02 -3.80188619e+02
 -3.71186871e+02 -3.57204807e+02 -2.28150706e+02  1.90412548e+01
 -4.00154646e+02  5.39002623e+02 -3.10810880e+02  3.92364291e+02
  6.50074685e+02  2.22906700e+02  6.42496787e+02 -2.75019559e+02
 -2.27253445e+02 -3.53006324e+02  3.86692009e+02  2.04952513e+02
  1.34476168e+02 -3.20033026e+02 -4.94865428e+02 -1.40145057e+02
  5.09107399e+02 -5.76016409e+02 -5.87803014e+02 -3.30834882e+02
 -4.40424310e+02  4.91214727e+02 -2.97000525e+02 -2.51007550e+02
 -2.86416955e+02  1.38361341e+02  4.80068976e+02 -3.82510147e+02
  1.21775368e+03 -3.09715987e+02 -6.71941325e+02 -3.32025524e+02
  8.21189261e+02 -4.40691299e+02 -2.93767650e+02 -4.15681032e+02
 -2.53781202e+02 -2.97087163e+02 -1.67015122e+02  7.80918882e+01
  6.74907057e+01 -1.85288302e+02 -8.12077742e+00 -3.28845738e+02
  8.51861269e+02  8.65456017e+02 -2.84577787e+02  2.25544092e+01
 -2.31669162e+02 -6.55926544e+00 -3.02473298e+02  2.22517966e+01
 -2.01756031e+02  2.75217266e+02 -2.39102753e+02 -1.66676978e+02
 -5.10736436e+02  1.67083910e+02  3.89399615e+02 -2.56646126e+02
 -3.49498542e+02 -3.27969058e+02  1.25105002e+02  3.73852738e+02
 -3.29575499e+02 -2.69549393e+02  1.28427282e+02  1.35484314e+01
  1.74506645e+02 -3.50023726e+02  2.86769936e+02  5.37391540e+02
  8.07017679e+02 -1.26258127e+02  3.41253259e+02  5.35180525e+02
  3.96708579e+02  8.35090293e+01 -2.46203327e+02  5.57098388e+02
 -4.22886878e+01 -9.82178819e+00  2.09707952e+02  5.55004160e+02
 -3.53826772e+02 -1.15620313e+02 -3.24477433e+02  6.04643378e+01
 -1.69852849e+02 -3.55832926e+02  1.70102577e+02 -4.53555524e+02
 -3.38739231e+02  4.29545764e+02  8.05610188e+02  3.87105044e+02
  2.29261061e+02 -3.79329891e+02  6.47863595e+02  1.20071488e+01
  3.95157045e+02  2.89153834e+02 -3.51352943e+02 -1.42535854e+02
  4.50271351e+02 -1.14067848e+02 -3.78828922e+02  8.42705424e+02
 -2.47333472e+02 -5.12141973e+02 -1.66937090e+02  5.69154346e+02
  7.07673043e+02 -3.09080093e+02 -3.74226394e+02  7.64466687e+02
 -3.79078949e+02 -4.98326694e+01 -3.86957585e+02  5.22571154e+02
 -3.66122061e+02  4.40700498e+02 -2.93826833e+02  8.76964854e+02
 -3.65184405e+02  4.25483852e+02 -4.06891358e+02 -2.43372209e+02
 -2.66722547e+02  1.61710159e+02 -1.78405411e+02  3.25752448e+02
 -2.99945410e+02  4.80057938e+02  2.13279557e+02  4.98240108e+02
 -3.37323003e+02  1.67251277e+02 -2.74305820e+02  2.29683537e+02
  4.06391877e+02 -4.33827073e+02 -1.92843408e+02  2.70385503e+02
 -3.98355576e+01  3.13012313e+02 -7.75449241e+01 -6.35133160e+01
 -9.93500315e+01 -3.28557325e+02 -3.97387794e+02 -3.18184417e+02
 -3.85016400e+02 -3.65743136e+02  2.20802268e+02 -2.78381397e+02
 -9.55686801e+02 -8.77600006e+00 -3.49727861e+02  1.30345558e+02
  5.71545147e+02 -4.65746537e+02 -3.03843595e+02  6.05735745e+02
 -4.98903467e+01 -8.40873400e+02 -2.68689786e+02  6.13856248e+02
  8.34919584e+02 -5.65965050e+01  2.74257220e+02 -6.20935488e+01
  2.68814727e+02  2.66236602e+02  4.81575075e+02 -9.68032732e+02
  2.50510725e+02  2.65606837e+01  6.68978451e+02  2.34571254e+02
  4.51680935e+01 -4.54938676e+02  5.02749800e+01 -1.50902382e+03
 -6.99746715e+01 -3.66538752e+02  3.45352676e+01 -9.39611000e+02
  8.44271098e+02 -2.66197438e+02  1.27751494e+02 -5.66861935e+02
 -7.32428749e+01 -3.72886598e+02 -1.04235662e+02 -1.97642315e+02
  3.57353595e+02 -7.87824131e+01 -9.19367268e+01 -4.94890153e+01
 -2.12387974e+02 -1.08689329e+02  5.00937831e+02 -4.40602142e+02
 -3.84933278e+02 -3.40680764e+02  3.55940441e+02 -1.51938407e+03
  3.40056156e+02  8.20523454e+01 -3.27811648e+02  4.94804430e+01
  3.24348947e+02  5.24029429e+02  6.06639067e+01  1.37512366e+02
 -1.08683031e+02  2.60665313e+02 -2.50788048e+02 -2.32620221e+02
 -7.58662137e+01 -9.59524781e+02  9.76071653e+02 -1.60221661e+02
 -3.99372288e+02  7.74381438e+01 -5.02892027e+02  1.23448111e+02
  4.90263589e+02 -2.38513267e+02 -4.50870500e+02  4.53184841e+02
 -3.94687965e+02 -3.63371845e+02  1.58254869e+02 -3.45949822e+02
 -1.41199538e+02  8.19303457e+02 -4.73900664e+02  7.95954430e+02
  8.25550869e+02  8.47512304e+02 -1.19886103e+02 -2.88345680e+02
  8.61824860e+02  8.83786295e+02 -3.94568675e+02 -4.71188450e+01
 -8.19145892e+01 -1.57081570e+02 -3.54887480e+02  1.61313589e+02
 -1.70355413e+02 -2.65663703e+02 -2.73833268e+02  3.90880491e+02
 -4.29456198e+02 -2.88509855e+02 -3.44245874e+02 -1.43812123e+02
 -3.46574142e+02 -4.17639575e+02 -4.09327907e+01 -3.05441053e+02
 -1.38313938e+02 -2.74257110e+02 -3.31670380e+02  5.80467384e+02
 -2.31296303e+02 -2.86743456e+02  5.09585536e+02 -2.94809307e+02
 -2.64547483e+02 -4.67892014e+02  2.65346963e+02 -3.89167350e+02].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

## models with binary encoding

### RF

In [79]:
x_be = x_be.drop(['id_34_0', 'id_34_2', 'id_34_1'],axis = 1)
X_be_test = X_be_test.drop(['id_34', 'DeviceInfo_12'], axis = 1)

In [80]:
treeCount = 1000
m = 15

rf = RandomForestClassifier(max_features=m, min_samples_leaf=1, n_estimators=treeCount)
rf.fit(x_be, np.ravel(y_be))

Yhat_rf_be = rf.predict_proba(X_be_test)

submission_RF_be["isFraud"] = Yhat_rf_be[:, 1]

submission_RF_be.to_csv("/Users/szeberinricsi/Documents/UPS/S2/MOST_AA/ieee-fraud-detection/submissionRFbe.csv", index = False)

### LR

In [82]:
lr = LogisticRegression(penalty='l2', max_iter=500, n_jobs=6, tol=1e-6, solver="sag")
lr.fit(x_be, np.ravel(y_be))

Yhat_lr_be = lr.predict_proba(X_be_test)
submission_be["isFraud"] = Yhat_lr_be[:, 1]

submission_be.to_csv("/Users/szeberinricsi/Documents/UPS/S2/MOST_AA/ieee-fraud-detection/submissionLRbe.csv", index = False)




NameError: name 'Yhat_lr' is not defined

In [84]:
submission_be["isFraud"] = Yhat_lr_be[:, 1]
submission_be.to_csv("/Users/szeberinricsi/Documents/UPS/S2/MOST_AA/ieee-fraud-detection/submissionLRbe.csv", index = False)
