In [37]:
import numpy as np
import pandas as pd

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [38]:
# Identity tables for the train and test splits.
train_id_df, test_id_df = (
    pd.read_csv('data/train_identity.csv'),
    pd.read_csv('data/test_identity.csv'),
)

# Transaction tables for the train and test splits.
train_transaction_df, test_transaction_df = (
    pd.read_csv('data/train_transaction.csv'),
    pd.read_csv('data/test_transaction.csv'),
)



In [39]:
# Preview the first rows of the test transaction table as loaded from disk.
test_transaction_df.head()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,3663549,18403224,31.95,W,10409,111.0,150.0,visa,226.0,debit,...,,,,,,,,,,
1,3663550,18403263,49.0,W,4272,111.0,150.0,visa,226.0,debit,...,,,,,,,,,,
2,3663551,18403310,171.0,W,4476,574.0,150.0,visa,226.0,debit,...,,,,,,,,,,
3,3663552,18403310,284.95,W,10989,360.0,150.0,visa,166.0,debit,...,,,,,,,,,,
4,3663553,18403317,67.95,W,18018,452.0,150.0,mastercard,117.0,debit,...,,,,,,,,,,


In [40]:
# Separate the label from the features. `pop` returns the `isFraud` column and
# removes it from the training frame in place, so this cell is not re-runnable
# without reloading the data first.
train_transaction_y = train_transaction_df.pop('isFraud')

In [41]:
#train_id_df.head()

In [42]:
#train_id_df.describe().transpose()

In [43]:
# Drop transaction columns that are mostly empty, and show which ones were removed.

missing_threshold = 0.8  # fraction (0-1) of missing values above which a column is dropped

# The same filter for the identity tables is currently disabled:
# missing_id_columns = train_id_df.isnull().sum()/len(train_id_df) 
# drop_columns =  (missing_id_columns[missing_id_columns > missing_threshold]).axes[0]
# train_id_df.drop(drop_columns, axis=1, inplace=True)
# test_id_df.drop(drop_columns, axis=1, inplace=True)
# print(drop_columns)

# Share of nulls per column, measured on the training frame only.
null_counts = train_transaction_df.isnull().sum()
missing_transaction_columns = null_counts / len(train_transaction_df)
too_sparse = missing_transaction_columns > missing_threshold
drop_columns = missing_transaction_columns[too_sparse].index

# Remove the selected columns from both splits so their schemas stay aligned.
for frame in (train_transaction_df, test_transaction_df):
    frame.drop(columns=drop_columns, inplace=True)
print(drop_columns)

Index(['dist2', 'D6', 'D7', 'D8', 'D9', 'D12', 'D13', 'D14', 'V138', 'V139',
       'V140', 'V141', 'V142', 'V143', 'V144', 'V145', 'V146', 'V147', 'V148',
       'V149', 'V150', 'V151', 'V152', 'V153', 'V154', 'V155', 'V156', 'V157',
       'V158', 'V159', 'V160', 'V161', 'V162', 'V163', 'V164', 'V165', 'V166',
       'V322', 'V323', 'V324', 'V325', 'V326', 'V327', 'V328', 'V329', 'V330',
       'V331', 'V332', 'V333', 'V334', 'V335', 'V336', 'V337', 'V338', 'V339'],
      dtype='object')


In [44]:
# Preview the test frame again to see its current set of columns.
test_transaction_df.head()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321
0,3663549,18403224,31.95,W,10409,111.0,150.0,visa,226.0,debit,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3663550,18403263,49.0,W,4272,111.0,150.0,visa,226.0,debit,...,77.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3663551,18403310,171.0,W,4476,574.0,150.0,visa,226.0,debit,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,263.0,0.0
3,3663552,18403310,284.95,W,10989,360.0,150.0,visa,166.0,debit,...,0.0,282.540009,282.540009,282.540009,0.0,0.0,0.0,0.0,0.0,0.0
4,3663553,18403317,67.95,W,18018,452.0,150.0,mastercard,117.0,debit,...,67.949997,67.949997,183.850006,67.949997,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
# Drop categorical columns, i.e. every column stored with the `object` dtype.
# The equivalent step for the identity tables is currently disabled:
# cat_cols = [cname for cname in train_id_df.columns if train_id_df[cname].dtype == 'object']

# train_id_df.drop(cat_cols, axis=1, inplace=True)
# test_id_df.drop(cat_cols, axis=1, inplace=True)


# The column list is derived from the training frame and applied to both splits.
cat_cols = [
    name
    for name, dtype in train_transaction_df.dtypes.items()
    if dtype == 'object'
]

for frame in (train_transaction_df, test_transaction_df):
    frame.drop(columns=cat_cols, inplace=True)

test_transaction_df.head()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,...,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321
0,3663549,18403224,31.95,10409,111.0,150.0,226.0,170.0,87.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3663550,18403263,49.0,4272,111.0,150.0,226.0,299.0,87.0,4.0,...,77.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3663551,18403310,171.0,4476,574.0,150.0,226.0,472.0,87.0,2635.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,263.0,0.0
3,3663552,18403310,284.95,10989,360.0,150.0,166.0,205.0,87.0,17.0,...,0.0,282.540009,282.540009,282.540009,0.0,0.0,0.0,0.0,0.0,0.0
4,3663553,18403317,67.95,18018,452.0,150.0,117.0,264.0,87.0,6.0,...,67.949997,67.949997,183.850006,67.949997,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:

# Column labels of the training frame; display every label except the
# `TransactionID` identifier.
column_list = train_transaction_df.columns
not_id = column_list != 'TransactionID'
column_list[not_id]

Index(['TransactionDT', 'TransactionAmt', 'card1', 'card2', 'card3', 'card5',
       'addr1', 'addr2', 'dist1', 'C1',
       ...
       'V312', 'V313', 'V314', 'V315', 'V316', 'V317', 'V318', 'V319', 'V320',
       'V321'],
      dtype='object', length=323)

In [49]:
# Impute missing values with the per-column median. The imputer is fitted on
# the training frame only and then re-used, unchanged, on the test frame.
my_imputer = SimpleImputer(strategy='median')
column_list = train_transaction_df.columns

# Every column except the identifier; `TransactionID` is left untouched.
feature_cols = column_list[column_list != 'TransactionID']

# Write the transformed ndarray straight back into the selected columns. The
# assignment is positional, whereas wrapping the array in a bare pd.DataFrame
# gives it a fresh 0..n-1 index and makes the write-back depend on index
# alignment (rows turn into NaN if the frame's index is anything else).
train_transaction_df[feature_cols] = my_imputer.fit_transform(train_transaction_df[feature_cols])
test_transaction_df[feature_cols] = my_imputer.transform(test_transaction_df[feature_cols])

test_transaction_df.head()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,...,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321
0,3663549.0,18403224.0,31.95,10409.0,111.0,150.0,226.0,170.0,87.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3663550.0,18403263.0,49.0,4272.0,111.0,150.0,226.0,299.0,87.0,4.0,...,77.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3663551.0,18403310.0,171.0,4476.0,574.0,150.0,226.0,472.0,87.0,2635.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,263.0,0.0
3,3663552.0,18403310.0,284.95,10989.0,360.0,150.0,166.0,205.0,87.0,17.0,...,0.0,282.540009,282.540009,282.540009,0.0,0.0,0.0,0.0,0.0,0.0
4,3663553.0,18403317.0,67.95,18018.0,452.0,150.0,117.0,264.0,87.0,6.0,...,67.949997,67.949997,183.850006,67.949997,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
# Standardise the feature columns. The scaler is fitted on the training frame
# only and then applied, unchanged, to the test frame.
my_scaler = StandardScaler()

# Every column except the identifier; `TransactionID` is left unscaled.
feature_cols = column_list[column_list != 'TransactionID']

# Assign the transformed ndarray directly (positional write-back) instead of
# wrapping it in a bare pd.DataFrame, whose fresh 0..n-1 index would make the
# assignment depend on index alignment with the target frame.
train_transaction_df[feature_cols] = my_scaler.fit_transform(train_transaction_df[feature_cols])
test_transaction_df[feature_cols] = my_scaler.transform(test_transaction_df[feature_cols])



In [51]:
# Preview the test frame to inspect its current values.
test_transaction_df.head()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,...,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321
0,3663549.0,2.389081,-0.430993,0.104111,-1.606256,-0.281425,0.644557,-1.267894,0.069833,-0.212851,...,-0.227583,-0.222385,-0.249222,-0.229148,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142
1,3663550.0,2.38909,-0.359702,-1.14804,-1.606256,-0.281425,0.644557,0.076566,0.069833,-0.200472,...,0.219762,-0.222385,-0.249222,-0.229148,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142
2,3663551.0,2.3891,0.150412,-1.106417,1.350412,-0.281425,0.644557,1.879602,0.069833,10.655594,...,-0.227583,-0.222385,-0.249222,-0.229148,-0.048377,-0.062211,-0.058049,-0.055287,0.46659,-0.074142
3,3663552.0,2.3891,0.626866,0.22245,-0.016169,-0.281425,-0.813255,-0.903118,0.069833,-0.146831,...,-0.227583,2.726734,1.379778,2.191225,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142
4,3663553.0,2.389101,-0.280467,1.656599,0.571333,-0.281425,-2.003802,-0.28821,0.069833,-0.19222,...,0.167184,0.486869,0.810775,0.352944,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142


In [52]:
# Re-measure the fraction of nulls in every training column and display it.
null_counts = train_transaction_df.isnull().sum()
missing_transaction_columns = null_counts.div(len(train_transaction_df))
missing_transaction_columns

TransactionID     0.0
TransactionDT     0.0
TransactionAmt    0.0
card1             0.0
card2             0.0
                 ... 
V317              0.0
V318              0.0
V319              0.0
V320              0.0
V321              0.0
Length: 324, dtype: float64

In [11]:
# # Convert low-cardinality categorical columns and drop high-cardinality categorical columns

# cardinality_threshold =  10

# cat_id_cols = [cname for cname in train_id_df.columns if train_id_df[cname].dtype == 'object']
# low_id_cat_cols = [cname for cname in cat_id_cols if train_id_df[cname].nunique() < cardinality_threshold]
# high_id_cat_cols = list(set(cat_id_cols) - set(low_id_cat_cols))

# print(high_id_cat_cols)

# train_id_df.drop(high_id_cat_cols, axis=1, inplace=True)
# test_id_df.drop(high_id_cat_cols, axis=1, inplace=True)


# cat_cols = [cname for cname in train_transaction_df.columns if train_transaction_df[cname].dtype == 'object']
# low_cat_cols = [cname for cname in cat_cols if train_transaction_df[cname].nunique() < cardinality_threshold]
# high_cat_cols = list(set(cat_cols) - set(low_cat_cols))


# print(high_cat_cols)

# train_transaction_df.drop(high_cat_cols, axis=1, inplace=True)
# test_transaction_df.drop(high_cat_cols, axis=1, inplace=True)


In [12]:
# # One hot encoder
# oh_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
# oh_cols_train = pd.DataFrame(oh_encoder.fit_transform(train_transaction_df[low_cat_cols]))



In [53]:
# Exclude high correlation, step 1: absolute pairwise correlation between all
# columns of the training frame (the default method of `DataFrame.corr`).
pairwise_corr = train_transaction_df.corr()
transaction_corr = pairwise_corr.abs()


In [54]:
# Select columns to prune: a column is dropped when its absolute correlation
# with any column that appears BEFORE it in the frame exceeds the threshold.
high_corr_threshold = 0.65

# Strict upper triangle (k=1 skips the diagonal) so each pair is looked at once.
# The builtin `bool` replaces the `np.bool` alias, which was deprecated in
# NumPy 1.20 and removed in 1.24 (AttributeError on current NumPy).
upper_mask = np.triu(np.ones(transaction_corr.shape, dtype=bool), k=1)
upper_transaction_corr = transaction_corr.where(upper_mask)

# NaN entries (the masked lower triangle and diagonal) compare as False.
# NOTE(review): if `transaction_corr` still contains the `TransactionID`
# column, features that merely track the identifier are pruned here as well --
# confirm that this is intended.
exceeds_threshold = (upper_transaction_corr > high_corr_threshold).any(axis=0)
to_drop = exceeds_threshold[exceeds_threshold].index.tolist()


In [55]:
# Remove the highly correlated columns from the training frame, in place.
# Only the training frame is pruned in this cell.
train_transaction_df.drop(columns=to_drop, inplace=True)

In [56]:
# Hold out part of the labelled data for validation.
train_ratio = 0.8  # share of rows used for fitting; the remainder is validation

train_ids = train_transaction_df.TransactionID
y = train_transaction_y
X = train_transaction_df  # still contains TransactionID; later cells read X.columns

# Fixed random_state keeps the split reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=train_ratio, test_size=1 - train_ratio, random_state=0
)

# Keep the identifiers of each partition before removing them from the features.
X_train_ids = X_train['TransactionID']
X_valid_ids = X_valid['TransactionID']

# Rebind to new frames instead of dropping in place: the split results are
# slices of X, and an in-place drop on them raises SettingWithCopyWarning.
X_train = X_train.drop(columns='TransactionID')
X_valid = X_valid.drop(columns='TransactionID')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [57]:
# Fit a gradient-boosted classifier with library-default settings and score it
# on the validation split.
model_xg = XGBClassifier()
model_xg.fit(X=X_train, y=y_train)
preds = model_xg.predict(X=X_valid)

# NOTE(review): `predict` presumably returns hard 0/1 labels, in which case
# this mean absolute error is simply the share of misclassified rows -- confirm
# that this is the metric you want to report.
mean_absolute_error(y_true=y_valid, y_pred=preds)

0.02790666169946151

In [58]:
# Line up the validation predictions (column 0) with the true labels (column
# `isFraud`) on a fresh 0..n-1 index so they can be compared row by row.
preds_df = pd.DataFrame(preds)
y_valid_df = y_valid.reset_index(drop=True).to_frame()

result = pd.concat([preds_df, y_valid_df], axis=1)

# Count how many true fraud rows fall on each side of the 0.5 cut-off.
# NOTE(review): `preds` comes from `model_xg.predict`, so these are presumably
# hard class labels rather than probabilities -- confirm.
actual_fraud = result.isFraud == 1
above_cutoff = result[0] > 0.5
at_or_below_cutoff = result[0] <= 0.5

print(len(X_valid.columns))                            # number of model features
print(len(result[actual_fraud & above_cutoff]))        # fraud rows above the cut-off
print(len(result[actual_fraud & at_or_below_cutoff]))  # fraud rows at or below it

83
944
3108


In [59]:
# Build the test feature matrix with exactly the columns of X, minus the identifier.
test_ids = test_transaction_df.TransactionID

# Select and drop in one non-inplace chain: an in-place drop on the
# column-selected slice raises SettingWithCopyWarning.
X_test = test_transaction_df[X.columns].drop(columns='TransactionID')

X_test.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


(506691, 83)

In [60]:
# Predicted class probabilities for the test features, wrapped in a DataFrame
# with default integer column labels.
test_proba = model_xg.predict_proba(X_test)
test_pred_df = pd.DataFrame(test_proba)

In [61]:
# Pair each test TransactionID with column 1 of the predicted probabilities.
# The two pieces are joined side by side on their row index. The stray
# `pd.DataFrame(test_ids)` statement was removed: its result was discarded.
output_df = pd.concat([test_ids.to_frame(), test_pred_df[1]], axis=1)

In [62]:
# Name the two columns, cast the identifier to an integer dtype, and write the
# submission file without the row index.
output_df.columns = ['TransactionID', 'isFraud']
output_df = output_df.astype({'TransactionID': int})
output_df.to_csv('submission2.csv', index=False)

In [63]:
# Display the first rows of the test frame once more.
test_transaction_df.head()


Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,...,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321
0,3663549.0,2.389081,-0.430993,0.104111,-1.606256,-0.281425,0.644557,-1.267894,0.069833,-0.212851,...,-0.227583,-0.222385,-0.249222,-0.229148,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142
1,3663550.0,2.38909,-0.359702,-1.14804,-1.606256,-0.281425,0.644557,0.076566,0.069833,-0.200472,...,0.219762,-0.222385,-0.249222,-0.229148,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142
2,3663551.0,2.3891,0.150412,-1.106417,1.350412,-0.281425,0.644557,1.879602,0.069833,10.655594,...,-0.227583,-0.222385,-0.249222,-0.229148,-0.048377,-0.062211,-0.058049,-0.055287,0.46659,-0.074142
3,3663552.0,2.3891,0.626866,0.22245,-0.016169,-0.281425,-0.813255,-0.903118,0.069833,-0.146831,...,-0.227583,2.726734,1.379778,2.191225,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142
4,3663553.0,2.389101,-0.280467,1.656599,0.571333,-0.281425,-2.003802,-0.28821,0.069833,-0.19222,...,0.167184,0.486869,0.810775,0.352944,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142
