# Optuna 

## Prepare Environment

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import optuna 
import optuna.visualization as optvis
from sklearn.model_selection import cross_val_score

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import joblib

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import gc

In [None]:
def opt_plot(study, plot):
    """Return one Optuna visualization of ``study``, selected by ``plot``.

    ``plot`` picks the figure by position: 0 optimization history, 1 slice,
    2 parallel coordinate, 3 contour, 4 parameter importances. Any other
    value matches nothing and the function returns ``None``.
    """
    plotter_names = (
        'plot_optimization_history',
        'plot_slice',
        'plot_parallel_coordinate',
        'plot_contour',
        'plot_param_importances',
    )
    # The plotting function is looked up only once a position matches.
    for position, plotter_name in enumerate(plotter_names):
        if plot == position:
            return getattr(optvis, plotter_name)(study)
    return None

## Load and Prepare Data

In [None]:
# Read the identity and transaction training tables from the input directory.
train_id = pd.read_csv('../input/ieee-fraud-detection/train_identity.csv')
train_transaction = pd.read_csv('../input/ieee-fraud-detection/train_transaction.csv')

# Report the row and column count of each table.
print(f'Size of train_id - rows: {train_id.shape[0]}, columns: {train_id.shape[1]}')
print(f'Size of train_transaction - rows:{train_transaction.shape[0]}, columns: {train_transaction.shape[1]}')

In [None]:
# Left-join the identity table onto the transactions by TransactionID, with
# the transaction table as the left side of the join.
train = train_transaction.merge(train_id,on=['TransactionID'],how='left') 
print(f'Size of train - rows : {train.shape[0]}, columns : {train.shape[1]}')

In [None]:
# Preview the first rows of the merged training frame.
train.head()

In [None]:
# Total size of the merged frame: bytes summed over all columns, divided by
# 1024**3 to express the figure in GB.
train_GB = (train.memory_usage(deep = True).sum()/1024**3)


print(f'The train dataframe is taking up about {train_GB:.2f} GB of memory storage')

# Memory Optimization

In [None]:
def reduce_mem_usage(df, use_float16=True):
    """Shrink ``df`` in place by downcasting its columns, and return it.

    Per column:
      * signed / unsigned integers are cast to the narrowest integer type of
        the same signedness whose range contains the column's min and max;
      * floats are cast to float16 (only when ``use_float16`` is true) or
        float32 when min and max fit;
      * object and pandas string columns become ``category``;
      * every other dtype (bool, datetime, category, nullable extension
        types, ...) is left untouched, so the function can safely be called
        again on a frame it has already processed.

    A column is never widened. Memory usage before and after is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimize; it is modified in place.
    use_float16 : bool, default True
        Allow float16 as a target type. float16 keeps only about three
        significant decimal digits, so pass ``False`` when that loss of
        precision matters.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # deep=True walks object columns, so measure once and derive both units.
    start_bytes = df.memory_usage(index=True, deep=True).sum()
    start_mem = start_bytes / 1024**2
    start_mem_GB = start_bytes / 1024**3
    print(f'Initial memory usage of dataframe is {start_mem:.2f} MB/{start_mem_GB:.2f} GB')

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue
        if not isinstance(col_type, np.dtype):
            # Category and other extension dtypes: nothing to downcast.
            continue

        kind = col_type.kind
        if kind == 'O':
            df[col] = df[col].astype('category')
            continue
        if kind == 'i':
            candidates, limits = signed_types, np.iinfo
        elif kind == 'u':
            candidates, limits = unsigned_types, np.iinfo
        elif kind == 'f':
            candidates, limits = float_types, np.finfo
        else:
            # bool, datetime, timedelta, complex, ...: leave as is.
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        for candidate in candidates:
            # Candidates are ordered by size; stop once they no longer shrink.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            bounds = limits(candidate)
            # Comparisons with NaN are False, so an all-NaN or empty column
            # keeps its dtype.
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    end_bytes = df.memory_usage(index=True, deep=True).sum()
    end_mem = end_bytes / 1024**2
    end_mem_GB = end_bytes / 1024**3
    reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print(f'Memory usage after optimization is: {end_mem:.2f} MB/{end_mem_GB:.2f} GB')
    print(f'Decreased by {reduction:.1f}%')

    return df

# Downcast the merged training frame; the helper modifies the frame it is
# given and returns that same frame.
train = reduce_mem_usage(train)

### Check for Missing Values

In [None]:
# Count the missing entries per column once, then derive the percentage from
# that count and the row count instead of scanning the whole frame again.
total_mv = train.isnull().sum().to_frame()                        # whole-number count per column
percent_mv = (total_mv.iloc[:, 0] / len(train) * 100).round(2)    # share of rows missing, 2 dp

pd.concat([total_mv, percent_mv], axis=1, keys=['Total Missing Values', 'Percent']).transpose()

### Label Distribution

In [None]:
# Class counts, and the same counts as a percentage of all rows so the
# 'Percent' column really holds percentages (as in the missing-values table).
target_count = train['isFraud'].value_counts()
target_percent = (target_count / len(train) * 100).round(2)

print('Target Column : isFraud')
pd.concat([ target_count, target_percent], axis=1, keys=['Count', 'Percent'])

In [None]:
# Bar chart of how many rows fall into each isFraud class.
train['isFraud'].value_counts().plot(kind='bar', 
                                     figsize=(7, 5), 
                                     xlabel = "Fraudulent(Yes/No)",
                                     ylabel ="Count of Transactions",
                                     title= "Count of Fraudulent vs Non-Fraudulent Transactions")

The target feature is imbalanced and needs to be balanced.

### Feature Engineering and Preprocessing

In [None]:
# Pull the target out as an array, then remove the identifier and the target
# from the frame in place.
y_train = train.isFraud.values
train.drop(['TransactionID', 'isFraud'], axis=1, inplace=True)
# X_train is a second name for the same DataFrame object, not a copy.
X_train = train

In [None]:
# Columns treated as categorical.
cat_features = ['ProductCD', 'card1','card2','card3','card4','card5','card6', 'addr1','addr2', 'P_emaildomain', 'R_emaildomain',
                'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'DeviceType', 'DeviceInfo', 'id_12', 'id_13','id_14','id_15',
                'id_16','id_17','id_18','id_19','id_20','id_21','id_22','id_23','id_24','id_25','id_26','id_27','id_28','id_29','id_30','id_31',
                'id_32','id_33','id_34','id_35','id_36','id_37','id_38']

# Every remaining column is numerical, kept in the frame's column order.
# TransactionID and isFraud have already been dropped from `train`, so no
# slicing is needed here.
num_features = train.columns[~train.columns.isin(cat_features)].tolist()

features = num_features + cat_features

print('Categorical features :', len(cat_features))
print('Numerical features : ',len(num_features))

In [None]:
# Stratified hold-out split over row positions rather than over the data.
# NOTE(review): test_size=0.8 puts 80% of the rows in valid_idx and only 20%
# in train_idx. Confirm this is intentional (e.g. to keep tuning fast) and
# was not meant to be 0.2.
train_idx, valid_idx = train_test_split(range(len(X_train)), test_size=0.8, random_state=1, stratify=y_train)

print(len(train_idx))
print(len(valid_idx))

# A single (train, validation) pair, later passed as `cv` to cross_val_score.
indices = [(train_idx, valid_idx)]

In [None]:
# Remove the `train` name and run a garbage-collection pass. X_train was
# bound to the same DataFrame earlier (`X_train = train`), so the data itself
# remains referenced.
del train
gc.collect()

### LightGBM

In [None]:
%%time 

def lgbm_objective(trial):
    """Optuna objective: ROC AUC of one LightGBM configuration.

    Samples a configuration from ``trial``, fits an ``LGBMClassifier`` on the
    training part of the hold-out split in ``indices`` and scores it on the
    validation part.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean ROC AUC over the splits in ``indices``.
    """
    n  = trial.suggest_int('n_estimators', 20, 150)
    md = trial.suggest_int('max_depth', 2, 40)
    nl = trial.suggest_int('num_leaves', 50, 500)
    # LightGBM requires learning_rate > 0, so the range must not start at 0.
    # Sampling on a log scale gives small rates as much coverage as large ones.
    lr = trial.suggest_float('learning_rate', 1e-3, 1.0, log=True)
    # NOTE(review): LightGBM documents that bagging (subsample) only takes
    # effect when subsample_freq is non-zero, which is not set here. Confirm
    # whether this parameter is meant to influence the model.
    ss = trial.suggest_float('subsample', 0.6, 1, log=False)
    bt = trial.suggest_categorical('boosting_type', ['gbdt', 'dart', 'goss'])

    # n_jobs only controls how many threads are used, so it is fixed rather
    # than searched over.
    clf = LGBMClassifier(n_estimators=n, max_depth=md, learning_rate=lr, boosting_type=bt,
                         num_leaves=nl, n_jobs=-1, subsample=ss, random_state=1)

    # `indices` holds a single split, so there is nothing to parallelize at
    # the fold level; the classifier's own threads do the work.
    scores = cross_val_score(clf, X_train, y_train, cv=indices, scoring='roc_auc')
    return scores.mean()
    
# Seed the sampler so the search is repeatable, in line with the fixed
# random_state=1 used for the split and the models.
lgbm_study = optuna.create_study(direction='maximize',
                                 sampler=optuna.samplers.TPESampler(seed=1))
lgbm_study.optimize(lgbm_objective, n_trials=20)

# Best score found and the parameters that produced it.
print()
print(lgbm_study.best_value)
print(lgbm_study.best_params)

In [None]:
# Show the best objective value and the parameters of the best trial.
print(lgbm_study.best_value)
print(lgbm_study.best_params)

In [None]:
# plot=0 selects the optimization-history figure.
opt_plot(lgbm_study, plot=0)

In [None]:
# plot=1 selects the slice figure.
opt_plot(lgbm_study, plot=1)

# Train Final Model

In [None]:
# Build a classifier from the best parameters of the study and fit it on all
# training rows. `final_model` and `lgb_model` name the same estimator object.
final_model = lgb_model = LGBMClassifier(random_state=1, **lgbm_study.best_params)
final_model.fit(X_train, y_train)

# NOTE(review): this score is computed on the data the model was fitted on,
# so it does not estimate generalization. Presumably it is mean accuracy (the
# scikit-learn classifier default), not the ROC AUC used in tuning; confirm.
print(final_model.score(X_train, y_train))

# Load and Process Test Data

In [None]:
# Read the identity and transaction test tables from the input directory.
test_id = pd.read_csv('../input/ieee-fraud-detection/test_identity.csv')
test_transaction = pd.read_csv('../input/ieee-fraud-detection/test_transaction.csv')

# Report the row and column count of each table.
print(f'Size of test_id - rows: {test_id.shape[0]}, columns: {test_id.shape[1]}')
print(f'Size of test_transaction - rows: {test_transaction.shape[0]}, columns: {test_transaction.shape[1]}')

In [None]:
# Left-join the identity table onto the test transactions by TransactionID,
# with the transaction table as the left side of the join.
test = test_transaction.merge(test_id,on=['TransactionID'],how='left') 
print(f'Size of test - rows : {test.shape[0]}, columns : {test.shape[1]}')

In [None]:
# Preview the first rows of the merged test frame.
test.head()

In [None]:
# Total size of the merged test frame: bytes summed over all columns, divided
# by 1024**3 to express the figure in GB.
test_GB = test.memory_usage(deep = True).sum()/1024**3 
print(f'test dataframe is using {test_GB:.2f} GB of memory storage')

In [None]:
# The two source tables are no longer needed once merged into `test`.
del test_id
del test_transaction
# gc.collect must be called; the bare name only references the function.
gc.collect()

# Memory Optimization

In [None]:
def reduce_mem_usage(df, use_float16=True):
    """Shrink ``df`` in place by downcasting its columns, and return it.

    Per column:
      * signed / unsigned integers are cast to the narrowest integer type of
        the same signedness whose range contains the column's min and max;
      * floats are cast to float16 (only when ``use_float16`` is true) or
        float32 when min and max fit;
      * object and pandas string columns become ``category``;
      * every other dtype (bool, datetime, category, nullable extension
        types, ...) is left untouched, so the function can safely be called
        again on a frame it has already processed.

    A column is never widened. Memory usage before and after is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimize; it is modified in place.
    use_float16 : bool, default True
        Allow float16 as a target type. float16 keeps only about three
        significant decimal digits, so pass ``False`` when that loss of
        precision matters.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # deep=True walks object columns, so measure once and derive both units.
    start_bytes = df.memory_usage(index=True, deep=True).sum()
    start_mem = start_bytes / 1024**2
    start_mem_GB = start_bytes / 1024**3
    print(f'Initial memory usage of dataframe is {start_mem:.2f} MB/{start_mem_GB:.2f} GB')

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue
        if not isinstance(col_type, np.dtype):
            # Category and other extension dtypes: nothing to downcast.
            continue

        kind = col_type.kind
        if kind == 'O':
            df[col] = df[col].astype('category')
            continue
        if kind == 'i':
            candidates, limits = signed_types, np.iinfo
        elif kind == 'u':
            candidates, limits = unsigned_types, np.iinfo
        elif kind == 'f':
            candidates, limits = float_types, np.finfo
        else:
            # bool, datetime, timedelta, complex, ...: leave as is.
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        for candidate in candidates:
            # Candidates are ordered by size; stop once they no longer shrink.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            bounds = limits(candidate)
            # Comparisons with NaN are False, so an all-NaN or empty column
            # keeps its dtype.
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    end_bytes = df.memory_usage(index=True, deep=True).sum()
    end_mem = end_bytes / 1024**2
    end_mem_GB = end_bytes / 1024**3
    reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print(f'Memory usage after optimization is: {end_mem:.2f} MB/{end_mem_GB:.2f} GB')
    print(f'Decreased by {reduction:.1f}%')

    return df

# Downcast the merged test frame; the helper modifies the frame it is given
# and returns that same frame.
test = reduce_mem_usage(test)

In [None]:
# Remove the identifier column in place, as was done for the training frame.
test.drop(['TransactionID'], axis=1, inplace=True)


In [None]:
# Map 'id-01' ... 'id-38' to 'id_01' ... 'id_38', the underscore spelling used
# in `cat_features`. Presumably the test identity file uses hyphens in these
# names; labels that are not present are left unchanged by rename.
# `inplace` is given a real bool, which is the type pandas expects.
test.rename(columns={f'id-{i:02d}': f'id_{i:02d}' for i in range(1, 39)}, inplace=True)

# Test Predictions

In [None]:
# X_test is a second name for the `test` DataFrame; no copy is made.
X_test = test

In [None]:
# Predicted class probabilities for the test rows; column 1 is used for the
# submission further down.
test_pred= final_model.predict_proba(X_test)
print(test_pred.shape)

# Submission

In [None]:
# Load the sample submission file to use as the output template.
submission = pd.read_csv('../input/ieee-fraud-detection/sample_submission.csv')
submission.head()

In [None]:
# Write the probability of class 1 into the isFraud column. Bracket
# assignment always targets a column, whereas attribute assignment would only
# set a Python attribute if the column were missing.
# NOTE(review): assumes the submission rows are in the same order as the rows
# of the test frame - confirm.
submission['isFraud'] = test_pred[:, 1]
submission.head()

In [None]:
# Write the submission to CSV with a header row and without the index.
submission.to_csv('ieee_lgbm_optuna_all.csv', index=False, header=True)