In [1]:
import matplotlib as mpl
print(mpl.get_cachedir())

/Users/prabhatturlapati/.matplotlib


In [2]:
# Imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas_profiling as pp



In [3]:
#load data

train_identity = pd.read_csv("data/train_identity.csv")
train_transaction = pd.read_csv("data/train_transaction.csv")
test_identity = pd.read_csv("data/test_identity.csv")
test_transaction = pd.read_csv("data/test_transaction.csv")


In [4]:
print("train_transaction: "+str(train_transaction.shape)+" *** train_identity: "+str(train_identity.shape))
print("test_transaction: "+str(test_transaction.shape)+" *** test_identity: "+str(test_identity.shape))



train_transaction: (590540, 394) *** train_identity: (144233, 41)
test_transaction: (506691, 393) *** test_identity: (141907, 41)


In [5]:
train = train_transaction.join(train_identity.set_index('TransactionID'), on=['TransactionID'], how='left')
test = test_transaction.join(test_identity.set_index('TransactionID'), on=['TransactionID'], how='left')

In [116]:
train[['card1', 'card2','addr1','addr2']].head()

Unnamed: 0,card1,card2,addr1,addr2
0,13926,,315.0,87.0
1,2755,404.0,325.0,87.0
2,4663,490.0,330.0,87.0
3,18132,567.0,476.0,87.0
4,4497,514.0,420.0,87.0


In [7]:
# Compress data
# https://pythonspeed.com/articles/pandas-load-less-data/

def reduce_memory_nostr(df):
    """The objective of this function is to reduce numeric column memory usage"""
    for column in df.columns:
        # ignore strings
        if df[column].dtype =='int64':
            max_val = df[column].max()
            min_val = df[column].min()
            
            # compress to either int8, int16
            # int8 compression if data within -128 to 127
            if max_val<=127 and min_val>=-127:
                df[column] = df[column].astype("int8")
            # int16 compression if data within -32768 to 32767
            if max_val<=32767 and min_val>=-32768:
                df[column] = df[column].astype("int16")
            
        elif df[column].dtype =='float64':
            df[column] = df[column].astype('float32')
        else:
            pass
    return df


In [8]:
# Reduce memory usage for train
train_r = reduce_memory_nostr(train)
train_r.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 590540 entries, 0 to 590539
Columns: 434 entries, TransactionID to DeviceInfo
dtypes: float32(399), int16(2), int64(2), object(31)
memory usage: 1.0+ GB


In [87]:
X_train_int = train_r.loc[:, ~train_r.columns.isin(['isFraud'])]
Y_train = train_r.loc[:, train_r.columns.isin(['isFraud'])]
print("X_train Shape : ", X_train_int.shape)
print("Y_train Shape : ", Y_train.shape)

X_train Shape :  (590540, 433)
Y_train Shape :  (590540, 1)


In [88]:
# Reduce memory usage for test
test_r = reduce_memory_nostr(test)
# Rename Id columns as they are not consistent
col_dict = {}
for column in test_r.columns:
    if column.startswith( 'id' ):
        col_dict[column] = column.replace("-","_")
print(col_dict)
test_r.rename(columns=col_dict,  inplace = True)
X_test_int = test_r.copy()
test_r.info()


{'id_01': 'id_01', 'id_02': 'id_02', 'id_03': 'id_03', 'id_04': 'id_04', 'id_05': 'id_05', 'id_06': 'id_06', 'id_07': 'id_07', 'id_08': 'id_08', 'id_09': 'id_09', 'id_10': 'id_10', 'id_11': 'id_11', 'id_12': 'id_12', 'id_13': 'id_13', 'id_14': 'id_14', 'id_15': 'id_15', 'id_16': 'id_16', 'id_17': 'id_17', 'id_18': 'id_18', 'id_19': 'id_19', 'id_20': 'id_20', 'id_21': 'id_21', 'id_22': 'id_22', 'id_23': 'id_23', 'id_24': 'id_24', 'id_25': 'id_25', 'id_26': 'id_26', 'id_27': 'id_27', 'id_28': 'id_28', 'id_29': 'id_29', 'id_30': 'id_30', 'id_31': 'id_31', 'id_32': 'id_32', 'id_33': 'id_33', 'id_34': 'id_34', 'id_35': 'id_35', 'id_36': 'id_36', 'id_37': 'id_37', 'id_38': 'id_38'}
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506691 entries, 0 to 506690
Columns: 433 entries, TransactionID to DeviceInfo
dtypes: float32(399), int16(1), int64(2), object(31)
memory usage: 899.8+ MB


In [114]:
X_train_int[['card1', 'card2','addr1','addr2']].head()


Unnamed: 0,card1,card2,addr1,addr2
0,13926,,315.0,87.0
1,2755,404.0,325.0,87.0
2,4663,490.0,330.0,87.0
3,18132,567.0,476.0,87.0
4,4497,514.0,420.0,87.0


In [89]:
# Check the train dataset and preprocess it
from sklearn.preprocessing import StandardScaler
from category_encoders import TargetEncoder
columns_to_remove = []

# Step 1 : Check for empty columns and delete them (more than 50 % empty) 
def find_empty_cols(df):
    empty_cols = []
    for column in df.columns:
        if df[column].isna().sum()/len(df[column])>=0.5:
            empty_cols.append(column)
    return empty_cols
    
# Step 2 : Check for columns with constant values 
def find_constant_cols(df):
    const_cols = []
    for column in df.columns:
        if len(df[column].unique())==1:
            const_cols.append(column)
    return const_cols       
    
# Step 3 : Scale numeric values
def scale_features(df):
    scaler = StandardScaler()
    num_cols = []
    for column in df.columns:
        if df[column].dtype!='object':
            num_cols.append(column)
            
    df[num_cols] = scaler.fit_transform(df[num_cols])
    return df

# Step 4.1 : Train Target Encoding
def target_encoding(df, y):
    cols = []
    for column in df.columns:
        if df[column].dtype=='object':
            cols.append(column)
    model_te = TargetEncoder()
    df[cols] = model_te.fit_transform(df[cols],y)
    return df, model_te
            
# Step 4.1 : Test Target Encoding
def target_encoding_test(df, model_te):
    cols = []
    for column in df.columns:
        if df[column].dtype=='object':
            cols.append(column)
    df[cols] = model_te.transform(df[cols])
    return df

# Step 5 : fill nulls in the data
def fill_na(df):
    for column in df.columns:
        if df[column].dtype=='object':
            df[column].fillna(df[column].mode(), inplace=True)
        else:
            df[column].fillna(df[column].mean(), inplace=True)
    return df

# drop the empty and const val columns
empty_cols = find_empty_cols(X_train_int)
const_cols = find_constant_cols(X_train_int)
columns_to_remove.extend(empty_cols)
columns_to_remove.extend(const_cols)
columns_to_remove.append('TransactionID')

# Keep Transaction ID aside
X_train_tranid = X_train_int.loc[:, X_train_int.columns.isin(['TransactionID'])]
X_test_tranid = X_test_int.loc[:, X_test_int.columns.isin(['TransactionID'])]
print("X_train_tranid Shape : ", X_train_tranid.shape)
print("X_test_tranid Shape : ", X_test_tranid.shape)

print("X_train Shape : ", X_train_int.shape)
print("X_test Shape : ", X_test_int.shape)

X_train_tranid Shape :  (590540, 1)
X_test_tranid Shape :  (506691, 1)
X_train Shape :  (590540, 433)
X_test Shape :  (506691, 433)


In [96]:
X_train = X_train_int.loc[:, ~X_train_int.columns.isin(columns_to_remove)]
X_test = X_test_int.loc[:, ~X_test_int.columns.isin(columns_to_remove)]
print("X_train Shape : ", X_train.shape)
print("X_test Shape : ", X_test.shape)

X_train Shape :  (590540, 218)
X_test Shape :  (506691, 218)


In [97]:
# Preprocess the X_train and X_test data
X_train = fill_na(X_train)
# X_train = scale_features(X_train)
X_train,model_te = target_encoding(X_train,Y_train)


# Preprocess Test (target encoding is using the TE model used for train)
X_test = fill_na(X_test)
# X_test = scale_features(X_test)
X_test = target_encoding_test(X_test, model_te)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,
  elif pd.api.types.is_categorical(cols):
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


In [98]:
print("Class Balance")
print("Total Fraud : ", len(Y_train.loc[Y_train['isFraud']==1]))
print("Total Not Fraud : ", len(Y_train.loc[Y_train['isFraud']==0]))
print("Ratio of Fraud/Not Fraud ", len(Y_train.loc[Y_train['isFraud']==1])/ len(Y_train.loc[Y_train['isFraud']==0]))
print("Only %s percent is fraud, hence the dataset is imbalanced "%(100*len(Y_train.loc[Y_train['isFraud']==1])/ len(Y_train.loc[Y_train['isFraud']==0])))







Class Balance
Total Fraud :  20663
Total Not Fraud :  569877
Ratio of Fraud/Not Fraud  0.03625870143908247
Only 3.625870143908247 percent is fraud, hence the dataset is imbalanced 


In [99]:
# Feature Selection and reduction
profile = pp.ProfileReport(X_train, minimal=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [100]:
# profile

In [101]:
# Reduce features that done explain much. Use PCA to do this. We can implement this partially as well.


# Perform PCA on V features
import numpy as np
from sklearn.decomposition import PCA
pca_v = PCA(n_components=1)
pca_v.fit(X_train.loc[:,v_cols])
X_pca_v = pca_v.transform(X_train.loc[:,v_cols])
print(pca_v.explained_variance_ratio_, X_pca_v)

# Drop all the V columns and add the PC to the X_train dataset
X_train.drop(columns = v_cols, inplace=True)
X_train['v_pc'] = X_pca_v

# Similar operation is to be done on test
X_test_pca_v = pca_v.transform(X_test.loc[:,v_cols])
X_test.drop(columns = v_cols, inplace=True)
X_test['v_pc'] = X_test_pca_v



### card columns
card_cols = [x for x in X_train.columns if x.startswith("card")]
pca_card = PCA(n_components=1)
pca_card.fit(X_train.loc[:,card_cols])
X_pca_card = pca_card.transform(X_train.loc[:,card_cols])
print(pca_card.explained_variance_ratio_, X_pca_card)

# Drop all the card columns and add the PC to the X_train dataset
X_train.drop(columns = card_cols, inplace=True)
X_train['card_pc'] = X_pca_card

# Similar operation is to be done on test
X_test_pca_card = pca_card.transform(X_test.loc[:,card_cols])
X_test.drop(columns = card_cols, inplace=True)
X_test['card_pc'] = X_test_pca_card


[0.95255442] [[-578.9837 ]
 [-752.82965]
 [-752.82965]
 ...
 [-752.82983]
 [2036.6052 ]
 [-236.20943]]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


[0.99890536] [[-578.9837 ]
 [-752.82965]
 [-752.82965]
 ...
 [-752.82983]
 [2036.6052 ]
 [-236.20943]]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [102]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506691 entries, 0 to 506690
Data columns (total 34 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   TransactionDT   506691 non-null  int64  
 1   TransactionAmt  506691 non-null  float32
 2   ProductCD       506691 non-null  float64
 3   addr1           506691 non-null  float32
 4   addr2           506691 non-null  float32
 5   P_emaildomain   506691 non-null  float64
 6   C1              506691 non-null  float32
 7   C2              506691 non-null  float32
 8   C3              506691 non-null  float32
 9   C4              506691 non-null  float32
 10  C5              506691 non-null  float32
 11  C6              506691 non-null  float32
 12  C7              506691 non-null  float32
 13  C8              506691 non-null  float32
 14  C9              506691 non-null  float32
 15  C10             506691 non-null  float32
 16  C11             506691 non-null  float32
 17  C12       

In [117]:
# Use a model with K-Fold Cross validation

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from numpy import mean
from numpy import std
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedShuffleSplit

model_lr = LogisticRegression()
# model_xgb = GradientBoostingClassifier(n_estimators=100)
model_xgb = xgb.XGBClassifier(n_estimators = 500, eval_metric = 'auc')
# cv = KFold(n_splits=2, random_state=2)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=1)

scores = cross_val_score(model_xgb, X_train, Y_train, scoring='f1', cv=cv, n_jobs=-1)
print('F1 scores: %.3f (%.3f)' % (mean(scores), std(scores)))

F1 scores: 0.721 (0.006)


In [None]:
model_xgb.fit(X_train, Y_train)

  return f(**kwargs)


In [None]:
y_preds = model_xgb.predict(X_test)