# Prepare Environment

Import packages and libraries needed to perform the processes in this notebook.

In [None]:
import os
import pandas as pd 
import numpy as np

import joblib
import gc

pd.set_option('display.max_columns', 100)

# Load and Process Test Data

The test data is split into two files, identity and transaction, which are joined on "TransactionID". We will load both files into the notebook.

In [None]:
# Read the identity and transaction halves of the test set.
test_id = pd.read_csv('../input/ieee-fraud-detection/test_identity.csv')
test_transaction = pd.read_csv('../input/ieee-fraud-detection/test_transaction.csv')

# Report the dimensions of each table.
id_rows, id_cols = test_id.shape
txn_rows, txn_cols = test_transaction.shape
print(f'Size of test_id - rows: {id_rows}, columns: {id_cols}')
print(f'Size of test_transaction - rows: {txn_rows}, columns: {txn_cols}')

We will merge the two test files into one.

In [None]:
# Left join on TransactionID with the transaction table on the left, so every
# transaction row is kept even when it has no matching identity row.
test = pd.merge(test_transaction, test_id, how='left', on='TransactionID')
merged_rows, merged_cols = test.shape
print(f'Size of test - rows : {merged_rows}, columns : {merged_cols}')

In [None]:
# Preview the first rows of the merged frame.
test.head()

With such a large dataset, it is important to keep track of memory usage and incorporate methods to optimize it.

We will begin by looking at the size of the test dataframe.

In [None]:
# Deep memory usage also counts the contents of object columns; convert the
# byte total to GiB for reporting.
total_bytes = test.memory_usage(deep=True).sum()
test_GB = total_bytes / 1024**3
print(f'test dataframe is using {test_GB:.2f} GB of memory storage')

To help optimize memory use, we will delete the initial identity and transaction dataframes, as we no longer need them.

In [None]:
# Drop the references to the two source frames, then explicitly run a garbage
# collection pass. `gc.collect` must be *called*; a bare reference to the
# function does nothing.
del test_id
del test_transaction
gc.collect()

# Reduce Memory Usage

From the code cell above, we see that the merged dataframe is using quite a bit of memory, i.e. over 2 GB. The function below performs memory optimization and will help reduce the memory usage of the test dataset.

In [None]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to shrink its memory footprint.

    Rules applied per column:

    * object / pandas string columns are converted to ``category``;
    * signed-integer columns are cast to the narrowest of int8/16/32/64 whose
      range contains the column's min and max (bounds inclusive, since an
      integer downcast is lossless);
    * float columns are cast to float16 or float32 when min and max lie
      strictly inside that type's finite range, otherwise to float64. Note
      that narrowing floats rounds the stored values;
    * every other dtype (bool, unsigned int, datetime, existing categoricals,
      nullable extension types, ...) is left untouched.

    Memory usage before and after, and the percentage saved, are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimize. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call-chaining convenience.
    """
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    def _total_bytes(frame):
        # Deep usage so that the payload of object columns is counted too.
        return frame.memory_usage(index=True, deep=True).sum()

    start_bytes = _total_bytes(df)
    start_mem = start_bytes / 1024**2
    start_mem_GB = start_bytes / 1024**3
    print(f'Initial memory usage of dataframe is {start_mem:.2f} MB/{start_mem_GB:.2f} GB')

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy signed-int / float columns are downcast. Dispatching
        # on dtype.kind keeps bool ('b'), unsigned ('u') and datetime ('M')
        # columns out of the float branch, and extension dtypes (not np.dtype
        # instances) are skipped entirely.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None
        if kind not in ('i', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if kind == 'i':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Comparisons are False for NaN (empty column) -> no cast.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Strict thresholds are kept for floats: the chosen width decides
            # how values get rounded, so the cut-offs must stay the same as in
            # earlier runs of this routine.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_bytes = _total_bytes(df)
    end_mem = end_bytes / 1024**2
    end_mem_GB = end_bytes / 1024**3
    # Guard against a zero-byte starting frame.
    reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print(f'Memory usage after optimization is: {end_mem:.2f} MB/{end_mem_GB:.2f} GB')
    print(f'Decreased by {reduction:.1f}%')

    return df

# Downcast the merged test frame; the function mutates and returns the same object.
test = reduce_mem_usage(test)

# Load the Model

Next, we will load the final model generated in our DT/RF training notebook.

In [None]:
# Load the persisted model artifact and show its type as a quick sanity check.
# NOTE(review): joblib artifacts are pickle-based, so only load files from a trusted source.
model = joblib.load('../input/ieee-data/ieee_lgbm_optuna_model.joblib')
print(type(model))

# Preprocessing

We will apply the same preprocessing steps that were applied to the training dataset. We will first drop the TransactionID column, rename the "id" columns in the test dataset so that they match the names in the training dataset, identify categorical and numerical features, drop the categorical features (as we are only interested in numerical features for now), and perform imputation to address missing values.

In [None]:
# Remove the identifier column from the frame (in place).
test.drop(columns=['TransactionID'], inplace=True)

In [None]:
# Rename the identity columns from their hyphenated names ('id-01' .. 'id-38')
# to the underscore form ('id_01' .. 'id_38'). The mapping is generated rather
# than typed out, and `inplace` is passed a real boolean instead of the integer 1.
id_column_renames = {f'id-{i:02d}': f'id_{i:02d}' for i in range(1, 39)}
test.rename(columns=id_column_renames, inplace=True)

In [None]:
# Columns treated as categorical, listed in a fixed order.
cat_features = (
    ['ProductCD']
    + [f'card{i}' for i in range(1, 7)]
    + ['addr1', 'addr2', 'P_emaildomain', 'R_emaildomain']
    + [f'M{i}' for i in range(1, 10)]
    + ['DeviceType', 'DeviceInfo']
    + [f'id_{i}' for i in range(12, 39)]
)

# Every remaining column of `test` (in column order) is treated as numerical.
cat_lookup = set(cat_features)
num_features = [column for column in test.columns.values if column not in cat_lookup]

features = num_features + cat_features

print('Categorical features :', len(cat_features))
print('Numerical features : ',len(num_features))

In [None]:
# Remove the categorical columns in place, then show the remaining shape.
test.drop(columns=cat_features, inplace=True)

print(test.shape)

**Check for missing values**

In [None]:
# Build the null mask once and reuse it for both statistics.
missing_mask = test.isnull()

total_mv = missing_mask.sum().to_frame()                         # missing count per column
percent_mv = missing_mask.sum() / missing_mask.count() * 100     # missing share per column, in percent (not rounded)

# One column per feature, one row per statistic.
pd.concat([total_mv, percent_mv], axis=1, keys=['Total Missing Values', 'Percent']).transpose()

**Imputation**

Load Imputer

In [None]:
# Load the persisted imputer and apply it as-is. It must NOT be refitted on the
# test set: calling fit() here would replace the loaded statistics with ones
# learned from the test data, so the test features would be imputed differently
# from what the persisted artifact encodes.
# NOTE(review): this assumes the artifact was saved already fitted (presumably
# in the training notebook); if it was not, transform() will fail loudly rather
# than silently learning from the test data.
imputer = joblib.load('../input/ieee-data/ieee_imputer.joblib')
X_test = imputer.transform(test)
print(X_test.shape)

# Test Predictions

Now we will use the final model from the training notebook, loaded into this notebook earlier, to generate predictions for the test dataset.

In [None]:
# Score the imputed test matrix and show the shape of the returned array.
test_pred= model.predict_proba(X_test)
print(test_pred.shape)

# Submission

Finally, we will load the sample submission file provided for this competition, apply the predictions generated with our final model to it, and save the new file for submission to the competition.

In [None]:
# Read the sample submission file and preview its first rows.
submission = pd.read_csv('../input/ieee-fraud-detection/sample_submission.csv')
submission.head()

In [None]:
# Write column 1 of the prediction array into the 'isFraud' column.
# Bracket assignment is used because attribute-style assignment would only set
# a plain Python attribute (not a column) if 'isFraud' were ever missing.
# NOTE(review): presumably column 1 is the positive (fraud) class and the rows
# of the sample submission are in the same order as the test frame -- confirm.
submission['isFraud'] = test_pred[:, 1]
submission.head()

In [None]:
# Write the submission to CSV with a header row and without the index column.
submission.to_csv('ieee_lgbm_optuna_sub2.csv', index=False, header=True)