In [1]:
import numpy as np
import pandas as pd

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [2]:
train_id_df = pd.read_csv('data/train_identity.csv')
test_id_df = pd.read_csv('data/test_identity.csv')

train_transaction_df = pd.read_csv('data/train_transaction.csv')
test_transaction_df = pd.read_csv('data/test_transaction.csv')



In [3]:
train_transaction_y = train_transaction_df.isFraud
train_transaction_df.drop('isFraud', axis=1, inplace=True)

In [4]:
#train_id_df.head()

In [5]:
#train_id_df.describe().transpose()

In [6]:
train_transaction_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
TransactionID,590540.0,3.282270e+06,1.704744e+05,2987000.000,3134634.750,3282269.500,3429904.25,3.577539e+06
TransactionDT,590540.0,7.372311e+06,4.617224e+06,86400.000,3027057.750,7306527.500,11246620.00,1.581113e+07
TransactionAmt,590540.0,1.350272e+02,2.391625e+02,0.251,43.321,68.769,125.00,3.193739e+04
card1,590540.0,9.898735e+03,4.901170e+03,1000.000,6019.000,9678.000,14184.00,1.839600e+04
card2,581607.0,3.625555e+02,1.577932e+02,100.000,214.000,361.000,512.00,6.000000e+02
...,...,...,...,...,...,...,...,...
V335,82351.0,5.916455e+01,3.876295e+02,0.000,0.000,0.000,0.00,5.512500e+04
V336,82351.0,2.853090e+01,2.745769e+02,0.000,0.000,0.000,0.00,5.512500e+04
V337,82351.0,5.535242e+01,6.684868e+02,0.000,0.000,0.000,0.00,1.040600e+05
V338,82351.0,1.511605e+02,1.095034e+03,0.000,0.000,0.000,0.00,1.040600e+05


In [7]:
# show drop columns 

missing_threshold = 0.6  #percentage of values missing to consider drop

# missing_id_columns = train_id_df.isnull().sum()/len(train_id_df) 
# drop_columns =  (missing_id_columns[missing_id_columns > missing_threshold]).axes[0]
# train_id_df.drop(drop_columns, axis=1, inplace=True)
# test_id_df.drop(drop_columns, axis=1, inplace=True)
# print(drop_columns)

missing_transaction_columns = train_transaction_df.isnull().sum()/len(train_transaction_df) 
drop_columns =  (missing_transaction_columns[missing_transaction_columns > missing_threshold]).axes[0]
train_transaction_df.drop(drop_columns, axis=1, inplace=True)
test_transaction_df.drop(drop_columns, axis=1, inplace=True)
print(drop_columns)

Index(['dist2', 'R_emaildomain', 'D6', 'D7', 'D8', 'D9', 'D12', 'D13', 'D14',
       'V138',
       ...
       'V330', 'V331', 'V332', 'V333', 'V334', 'V335', 'V336', 'V337', 'V338',
       'V339'],
      dtype='object', length=168)


In [8]:
# train_id_df.describe().transpose()

In [9]:
train_transaction_df.head()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321
0,2987000,86400,68.5,W,13926,,150.0,discover,142.0,credit,...,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0
1,2987001,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2987002,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2987003,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,...,135.0,0.0,0.0,0.0,50.0,1404.0,790.0,0.0,0.0,0.0
4,2987004,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
#drop categorical cols
# cat_cols = [cname for cname in train_id_df.columns if train_id_df[cname].dtype == 'object']

# train_id_df.drop(cat_cols, axis=1, inplace=True)
# test_id_df.drop(cat_cols, axis=1, inplace=True)


cat_cols = [cname for cname in train_transaction_df.columns if train_transaction_df[cname].dtype == 'object']

train_transaction_df.drop(cat_cols, axis=1, inplace=True)
test_transaction_df.drop(cat_cols, axis=1, inplace=True)

In [11]:
test_transaction_df.columns

Index(['TransactionID', 'TransactionDT', 'TransactionAmt', 'card1', 'card2',
       'card3', 'card5', 'addr1', 'addr2', 'dist1',
       ...
       'V312', 'V313', 'V314', 'V315', 'V316', 'V317', 'V318', 'V319', 'V320',
       'V321'],
      dtype='object', length=212)

In [12]:
train_transaction_df.columns

Index(['TransactionID', 'TransactionDT', 'TransactionAmt', 'card1', 'card2',
       'card3', 'card5', 'addr1', 'addr2', 'dist1',
       ...
       'V312', 'V313', 'V314', 'V315', 'V316', 'V317', 'V318', 'V319', 'V320',
       'V321'],
      dtype='object', length=212)

In [13]:
# impute missing values
my_imputer = SimpleImputer(strategy='median')
column_list = train_transaction_df.columns

train_transaction_df = pd.DataFrame(my_imputer.fit_transform(train_transaction_df))
train_transaction_df.columns = column_list

test_transaction_df = pd.DataFrame(my_imputer.transform(test_transaction_df))
test_transaction_df.columns = column_list

In [14]:
# Scale values
my_scaler = StandardScaler()

train_transaction_df = pd.DataFrame(my_scaler.fit_transform(train_transaction_df))
train_transaction_df.columns = column_list

test_transaction_df = pd.DataFrame(my_scaler.transform(test_transaction_df))
test_transaction_df.columns = column_list


In [15]:
missing_transaction_columns = train_transaction_df.isnull().sum()/len(train_transaction_df) 
missing_transaction_columns

TransactionID     0.0
TransactionDT     0.0
TransactionAmt    0.0
card1             0.0
card2             0.0
                 ... 
V317              0.0
V318              0.0
V319              0.0
V320              0.0
V321              0.0
Length: 212, dtype: float64

In [16]:
# #convert low cardinality categorical columns and drop high cardinality categorical colums

# cardinality_threshold =  10

# cat_id_cols = [cname for cname in train_id_df.columns if train_id_df[cname].dtype == 'object']
# low_id_cat_cols = [cname for cname in cat_id_cols if train_id_df[cname].nunique() < cardinality_threshold]
# high_id_cat_cols = list(set(cat_id_cols) - set(low_id_cat_cols))

# print(high_id_cat_cols)

# train_id_df.drop(high_id_cat_cols, axis=1, inplace=True)
# test_id_df.drop(high_id_cat_cols, axis=1, inplace=True)


# cat_cols = [cname for cname in train_transaction_df.columns if train_transaction_df[cname].dtype == 'object']
# low_cat_cols = [cname for cname in cat_cols if train_transaction_df[cname].nunique() < cardinality_threshold]
# high_cat_cols = list(set(cat_cols) - set(low_cat_cols))


# print(high_cat_cols)

# train_transaction_df.drop(high_cat_cols, axis=1, inplace=True)
# test_transaction_df.drop(high_cat_cols, axis=1, inplace=True)


In [17]:
# # One hot encoder
# oh_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
# oh_cols_train = pd.DataFrame(oh_encoder.fit_transform(train_transaction_df[low_cat_cols]))



In [18]:
# Exclude high correlation
transaction_corr = train_transaction_df.corr().abs()


In [19]:
high_corr_threshold = 0.60
upper_transaction_corr = transaction_corr.where(np.triu(np.ones(transaction_corr.shape), k=1).astype(np.bool))
to_drop = [col for col in upper_transaction_corr.columns if any(upper_transaction_corr[col] > high_corr_threshold)]


In [20]:
train_transaction_df.drop(to_drop, axis=1, inplace=True)

In [21]:
def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

In [22]:
train_ratio = 0.8

train_ids = train_transaction_df.TransactionID
y = train_transaction_y
X = train_transaction_df

X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=train_ratio, test_size=1-train_ratio,
                                                      random_state=0)
X_train_ids = X_train['TransactionID']
X_valid_ids = X_valid['TransactionID']

X_train.drop('TransactionID', axis=1, inplace=True)
X_valid.drop('TransactionID', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [23]:
model_rf = RandomForestRegressor(n_estimators=50, random_state=0, n_jobs=-1)
model_rf.fit(X_train, y_train)
preds = model_rf.predict(X_valid)
mean_absolute_error(y_valid, preds)

0.04018570323332449

In [24]:
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(y_valid, preds, pos_label=2)



In [25]:
model_lg = LogisticRegression(tol=1e-4, solver='lbfgs', random_state=0, n_jobs=-1, max_iter=1000, verbose=1)
model_lg.fit(X_train, y_train)
preds = model_lg.predict(X_valid)
mean_absolute_error(y_valid, preds)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   33.6s finished


0.033122227114166694

In [29]:
preds = model_lg.predict_proba(X_valid)
preds_df = pd.DataFrame(preds)
preds_df
#preds = model_lg.predict_proba(X_valid)
#mean_absolute_error(y_valid, preds)

Unnamed: 0,0,1
0,0.970186,0.029814
1,0.960551,0.039449
2,0.947688,0.052312
3,0.960576,0.039424
4,0.980347,0.019653
...,...,...
118103,0.953204,0.046796
118104,0.972562,0.027438
118105,0.971808,0.028192
118106,0.960720,0.039280


In [27]:
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(y_valid, preds, pos_label=1)

In [30]:
pd.DataFrame(preds)[1]

0         0.029814
1         0.039449
2         0.052312
3         0.039424
4         0.019653
            ...   
118103    0.046796
118104    0.027438
118105    0.028192
118106    0.039280
118107    0.022511
Name: 1, Length: 118108, dtype: float64

In [31]:
y_valid_df = pd.DataFrame(y_valid)
y_valid_df.reset_index(inplace=True)
y_valid_df.drop('index', axis=1, inplace=True)

result =pd.concat([pd.DataFrame(preds)[1], y_valid_df], axis=1)

# Check how many fraud cases have predicted probability > 0.5
len(result[(result.isFraud == 1) & (result[1] > 0.5)])

249

In [None]:
model_xg = XGBClassifier()
model_xg.fit(X_train, y_train)
preds = model_xg.predict(X_valid)
mean_absolute_error(y_valid, preds)

In [None]:
preds_df = pd.DataFrame(preds)
y_valid_df = pd.DataFrame(y_valid)
y_valid_df.reset_index(inplace=True)
y_valid_df.drop('index', axis=1, inplace=True)

result =pd.concat([preds_df, y_valid_df], axis=1)

# Check how many fraud cases have predicted probability > 0.5
len(result[(result.isFraud == 1) & (result[0] > 0.5)])

In [None]:
test_ids = test_transaction_df.TransactionID
X_test = test_transaction_df[X.columns]
X_test.drop('TransactionID', axis=1, inplace=True)

X_test.shape

In [None]:
test_pred_df = pd.DataFrame(model_xg.predict_proba(X_test))

In [None]:
pd.DataFrame(test_ids)
output_df = pd.concat([pd.DataFrame(test_ids), test_pred_df[1]], axis=1)

In [None]:
output_df.columns = ['TransactionID', 'isFraud']
output_df['TransactionID'] = output_df['TransactionID'].astype(int)
output_df.to_csv('submission.csv', index=False)