In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV,cross_validate
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor,VotingClassifier
from sklearn.preprocessing import MinMaxScaler,LabelEncoder, StandardScaler, RobustScaler
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression,LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error
import warnings
import gc
from sklearn.exceptions import ConvergenceWarning
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor,VotingRegressor
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score,roc_auc_score
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import classification_report

In [None]:
# Silence every warning for the rest of the session (this also hides deprecation notices).
warnings.simplefilter("ignore")
# Lift the display limits so frames are shown with all columns and all rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
# Read the five competition CSV files; the order of the paths matches the order
# of the names on the left-hand side.
(train_transaction,
 train_identity,
 test_transaction,
 test_identity,
 sample_submission) = [
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/ieee-fraud-detection/train_transaction.csv',
        '../input/ieee-fraud-detection/train_identity.csv',
        '../input/ieee-fraud-detection/test_transaction.csv',
        '../input/ieee-fraud-detection/test_identity.csv',
        '../input/ieee-fraud-detection/sample_submission.csv',
    )
]

In [None]:
# (rows, columns) of the identity table, displayed as a quick check after loading.
train_identity.shape

In [None]:
# Left-join the identity table onto the transactions so every transaction row is kept.
train_df = pd.merge(train_transaction, train_identity, on="TransactionID", how="left")
test_df = pd.merge(test_transaction, test_identity, on="TransactionID", how="left")

In [None]:
#train_df= train_df.rename(columns=lambda x:"".join(x.split("_")))
# Replace every '-' in the test column names with '_'.
test_df = test_df.rename(columns=lambda column_name: column_name.replace("-", "_"))
#test_df= test_df.rename(columns=lambda x:"".join(x.split("_")))


# Reduce Memory

In [None]:
def reduce_mem_usage(df, verbose=True, use_float16=True):
    """Downcast the numeric columns of ``df`` in place to the narrowest dtype that fits.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are considered. Integer columns are moved to the smallest of int8, int16,
    int32, int64 whose range contains the column's min and max (bounds are
    inclusive). Float columns are moved to float16, then float32, and otherwise
    float64. A float column whose min/max is NaN or infinite ends up as float64.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, optional
        When True, print the final memory usage and the percentage saved.
    use_float16 : bool, optional
        When True (default, the historical behaviour) float columns may be
        stored as float16. The range check does not protect precision, and
        float16 keeps only about three significant decimal digits, so pass
        False to stop at float32.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float_targets = (np.float16, np.float32) if use_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a column spanning exactly -128..127 fits in int8.
            for target in int_targets:
                limits = np.iinfo(target)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
        else:
            for target in float_targets:
                limits = np.finfo(target)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
            else:
                # NaN/inf extremes (or a range beyond float32) fall through to float64.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-sized starting frame.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Downcast both merged frames; reduce_mem_usage modifies the frame it receives and returns it.
train_df = reduce_mem_usage(train_df)
test_df  = reduce_mem_usage(test_df)

# Data Reduction

In [None]:
#train_df=train_df.sample(frac=0.2, random_state=3)

In [None]:
# The defaults cat_th=61 and car_th=74 were chosen after EDA.
def grab_col_names(dataframe, cat_th=61, car_th=74):
    """
    Split the columns of a frame into categorical, numerical and high-cardinality groups.

    Object-dtype columns are categorical unless they have more than ``car_th``
    distinct values, in which case they are "categorical but cardinal".
    Non-object columns with fewer than ``cat_th`` distinct values are treated
    as categorical too; the remaining non-object columns are numerical.

    Parameters
    ------
        dataframe: dataframe
                Frame whose column names are classified
        cat_th: int, optional
                A non-object column with fewer distinct values than this counts as categorical
        car_th: int, optional
                An object column with more distinct values than this counts as cardinal

    Returns
    ------
        cat_cols: list
                Object columns (minus the cardinal ones) followed by the numeric-looking categoricals
        num_cols: list
                Numerical columns
        cat_but_car: list
                Object columns with high cardinality

    Examples
    ------
        import seaborn as sns
        df = sns.load_dataset("iris")
        print(grab_col_names(df))

    Notes
    ------
        Every column lands in exactly one of the three returned lists, so
        len(cat_cols) + len(num_cols) + len(cat_but_car) equals the number of columns.
        A summary of the group sizes is printed as a side effect.

    """
    object_cols, num_but_cat, cat_but_car, num_cols = [], [], [], []

    # One pass over the columns; each list keeps the frame's column order.
    for col in dataframe.columns:
        series = dataframe[col]
        if series.dtypes == "O":
            object_cols.append(col)
            if series.nunique() > car_th:
                cat_but_car.append(col)
        elif series.nunique() < cat_th:
            num_but_cat.append(col)
        else:
            num_cols.append(col)

    cat_cols = [col for col in object_cols + num_but_cat if col not in cat_but_car]

    n_rows, n_cols = dataframe.shape[0], dataframe.shape[1]
    print(f"Observations: {n_rows}")
    print(f"Variables: {n_cols}")
    print(f"cat_cols: {len(cat_cols)}")
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')

    return cat_cols, num_cols, cat_but_car

In [None]:
# Classify the merged training columns using the default thresholds.
cat_cols, num_cols, cat_but_car =grab_col_names(train_df)

# Some Outliers

In [None]:
# droptamt = train_df[train_df['TransactionAmt']>10000].index
# train_df.drop(droptamt, inplace=True)

# dropdist1 = train_df[train_df['dist1']>6000].index
# train_df.drop(dropdist1, inplace=True)

# dropdist2 = train_df[train_df['dist2']>8000].index
# train_df.drop(dropdist2, inplace=True)

# dropc1 = train_df[train_df['C1'] > 2000].index
# train_df.drop(dropc1, inplace=True)

# dropc2 = train_df[train_df['C2'] > 2000].index
# train_df.drop(dropc2, inplace=True)

# #dropping the -ve values
# dropd4 = train_df[train_df['D4']<0].index
# train_df.drop(dropd4, inplace=True)

# dropd6 = train_df[train_df['D6']<0].index
# train_df.drop(dropd6, inplace=True)

# dropd11 = train_df[train_df['D11']<0].index
# train_df.drop(dropd11, inplace=True)

# dropd12 = train_df[train_df['D12']<0].index
# train_df.drop(dropd12, inplace=True)

# dropd14 = train_df[train_df['D14']<0].index
# train_df.drop(dropd14, inplace=True)

# dropd15 = train_df[train_df['D15']<0].index
# train_df.drop(dropd15, inplace=True)

In [None]:
def missing_values_table(dataframe, na_name=False):
    """Print a per-column summary of missing values.

    The printed table has one row per column that contains at least one
    missing value, with the number of missing entries (``n_miss``) and their
    share of all rows in percent rounded to two decimals (``ratio``), ordered
    from most to least missing.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect.
    na_name : bool, optional
        When True, also return the results instead of only printing them.

    Returns
    -------
    tuple or None
        ``(na_columns, missing_df)`` when ``na_name`` is True, where
        ``na_columns`` lists the affected columns in frame order; otherwise None.
    """
    na_columns = [col for col in dataframe.columns if dataframe[col].isna().any()]

    # Count once, then derive both the absolute and the relative view.
    null_totals = dataframe[na_columns].isna().sum()
    n_miss = null_totals.sort_values(ascending=False)
    ratio = (null_totals / dataframe.shape[0] * 100).sort_values(ascending=False)
    missing_df = pd.concat([n_miss, np.round(ratio, 2)], axis=1, keys=['n_miss', 'ratio'])
    print(missing_df)

    if not na_name:
        return None
    return na_columns, missing_df

In [None]:
# Collect the columns with missing values and their summary table, then move the
# column names out of the index into a regular column (named 'index').
na_cols, missing_df = missing_values_table(train_df, True)
missing_df = missing_df.reset_index()

# Feature Engineering

In [None]:
def feature_eng(dataframe,name=True):
    """Add the engineered features to ``dataframe`` in place and return it.

    Columns added (existing columns are kept; ``card6`` is overwritten):

    * ``card3Values``: 'T' when ``card3`` > 150, 'F' otherwise, missing stays missing.
    * ``card6``: 'debit or credit' and 'charge card' are merged into 'debit'.
    * ``<P|R>_emaildomain_bin`` / ``_suffix`` and ``new_domain*``: provider
      group and domain parts of the e-mail domains.
    * browser / OS / screen tokens split out of ``id_31``, ``id_30``, ``id_33``.
    * ``TransDecimalCount`` and ``LogTransactionAmt`` from ``TransactionAmt``.
    * ``device_version`` / ``device_name`` from ``DeviceInfo``; names seen fewer
      than 200 times in this frame become "Others".
    * ratios of ``TransactionAmt``, ``id_02`` and ``D15`` to their per-group
      mean and std (groups: ``card1``, ``card4``, ``addr1``).
    * presence flags for ``card1``, ``card2`` and ``D2``..``D15``.
    * concatenated card/address/e-mail key columns.
    * time features derived from ``TransactionDT`` (seconds offset).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the raw columns named above. It is modified in place.
    name : bool, optional
        Unused; kept so existing calls stay valid.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object.
    """

    # ---- card features ------------------------------------------------------
    def card3Values(val):
        # `val == np.nan` is always False, so missing values must be detected
        # with pd.isna; otherwise NaN would be labelled 'F'.
        if pd.isna(val):
            return val
        return 'T' if val > 150 else 'F'

    dataframe['card3Values'] = dataframe['card3'].apply(card3Values)

    dataframe['card6'] = dataframe['card6'].replace(
        {'debit or credit': 'debit', 'charge card': 'debit'})

    # ---- e-mail features ----------------------------------------------------
    emails = {'gmail': 'google', 'att.net': 'att', 'twc.com': 'spectrum',
    'scranton.edu': 'other', 'optonline.net': 'other', 'hotmail.co.uk': 'microsoft', 'comcast.net': 'other', 'yahoo.com.mx': 'yahoo',
    'yahoo.fr': 'yahoo', 'yahoo.es': 'yahoo', 'charter.net': 'spectrum', 'live.com': 'microsoft', 'aim.com': 'aol',
    'hotmail.de': 'microsoft', 'centurylink.net': 'centurylink', 'gmail.com': 'google', 'me.com': 'apple', 'earthlink.net': 'other',
    'gmx.de': 'other', 'web.de': 'other', 'cfl.rr.com': 'other', 'hotmail.com': 'microsoft', 'protonmail.com': 'other',
    'hotmail.fr': 'microsoft', 'windstream.net': 'other', 'outlook.es': 'microsoft', 'yahoo.co.jp': 'yahoo', 'yahoo.de': 'yahoo',
    'servicios-ta.com': 'other', 'netzero.net': 'other', 'suddenlink.net': 'other', 'roadrunner.com': 'other', 'sc.rr.com': 'other',
    'live.fr': 'microsoft', 'verizon.net': 'yahoo', 'msn.com': 'microsoft', 'q.com': 'centurylink', 'prodigy.net.mx': 'att',
    'frontier.com': 'yahoo', 'anonymous.com': 'other', 'rocketmail.com': 'yahoo', 'sbcglobal.net': 'att', 'frontiernet.net': 'yahoo',
    'ymail.com': 'yahoo', 'outlook.com': 'microsoft', 'mail.com': 'other', 'bellsouth.net': 'other', 'embarqmail.com': 'centurylink',
    'cableone.net': 'other', 'hotmail.es': 'microsoft', 'mac.com': 'apple', 'yahoo.co.uk': 'yahoo', 'netzero.com': 'other',
    'yahoo.com': 'yahoo', 'live.com.mx': 'microsoft', 'ptd.net': 'other', 'cox.net': 'other', 'aol.com': 'aol', 'juno.com': 'other',
    'icloud.com': 'apple'}
    us_emails = ['gmail', 'net', 'edu']

    for c in ['P_emaildomain', 'R_emaildomain']:
        dataframe[c + '_bin'] = dataframe[c].map(emails)
        # Last dot-separated token of the domain; a missing domain yields 'nan'.
        suffix = dataframe[c].map(lambda x: str(x).split('.')[-1])
        dataframe[c + '_suffix'] = suffix.map(lambda x: x if str(x) not in us_emails else 'us')

    # (The former temporary 'first' column was created and dropped again
    # without being used, so it is no longer built.)

    dataframe["new_domain"]=dataframe['P_emaildomain'].apply(lambda x:str(x).partition(".")[0])
    dataframe["new_domain1"]=dataframe['P_emaildomain'].apply(lambda x:str(x).partition(".")[-1])
    dataframe["new_domain2"]=dataframe['R_emaildomain'].apply(lambda x:str(x).partition(".")[-1])

    # ---- token splits -------------------------------------------------------
    def _split_token(column, sep, position):
        # .str.get returns NaN when a value has fewer parts than `position`,
        # whereas split(expand=True)[position] raises KeyError when no row
        # contains the separator at all.
        return dataframe[column].str.split(sep).str.get(position)

    dataframe['browser_id_31'] = _split_token('id_31', ' ', 0)
    dataframe['version_id_31'] = _split_token('id_31', ' ', 1)
    dataframe['OS_id_30'] = _split_token('id_30', ' ', 0)
    dataframe['version_id_30'] = _split_token('id_30', ' ', 1)
    dataframe['screen_width'] = _split_token('id_33', 'x', 0)
    dataframe['screen_height'] = _split_token('id_33', 'x', 1)

    # ---- amount decimals ----------------------------------------------------
    def afterDecimalCount(amt):
        # Number of characters after the decimal point in str(amt). Missing
        # amounts stay missing and values printed without a '.' count as 0
        # (both previously raised IndexError).
        if pd.isna(amt):
            return np.nan
        return len(str(amt).partition(".")[2])

    dataframe['TransDecimalCount'] = dataframe['TransactionAmt'].apply(afterDecimalCount)

    # ---- device features ----------------------------------------------------
    dataframe['device_version'] = _split_token('DeviceInfo', '/', 1)
    dataframe['device_name'] = _split_token('DeviceInfo', '/', 0)

    # Applied in order; later patterns see the names already rewritten by earlier ones.
    brand_rules = [
        ('SM', 'Samsung'), ('SAMSUNG', 'Samsung'), ('GT-', 'Samsung'),
        ('Moto G', 'Motorola'), ('Moto', 'Motorola'), ('moto', 'Motorola'),
        ('LG-', 'LG'), ('rv:', 'RV'),
        ('HUAWEI', 'Huawei'), ('ALE-', 'Huawei'), ('-L', 'Huawei'),
        ('Blade', 'ZTE'), ('BLADE', 'ZTE'),
        ('Linux', 'Linux'), ('XT', 'Sony'), ('HTC', 'HTC'), ('ASUS', 'Asus'),
    ]
    for pattern, brand in brand_rules:
        dataframe.loc[dataframe['device_name'].str.contains(pattern, na=False), 'device_name'] = brand

    # NOTE(review): the 200-occurrence cut-off is evaluated on the frame passed
    # in, so train and test can collapse different names into "Others".
    name_counts = dataframe['device_name'].value_counts()
    rare_names = name_counts[name_counts < 200].index
    dataframe.loc[dataframe['device_name'].isin(rare_names), 'device_name'] = "Others"
    dataframe['had_id'] = "1"

    # ---- value / group-statistic ratios ------------------------------------
    # The (stat, group) order per value column reproduces the original column order.
    default_order = [('mean', 'card1'), ('mean', 'card4'), ('std', 'card1'),
                     ('std', 'card4'), ('mean', 'addr1'), ('std', 'addr1')]
    ratio_plan = [
        ('TransactionAmt', [('mean', 'card1'), ('mean', 'card4'), ('std', 'card1'),
                            ('std', 'card4'), ('std', 'addr1'), ('mean', 'addr1')]),
        ('id_02', default_order),
        ('D15', default_order),
    ]
    for value_col, combos in ratio_plan:
        for stat, group_col in combos:
            group_stat = dataframe.groupby([group_col])[value_col].transform(stat)
            dataframe[f'{value_col}_to_{stat}_{group_col}'] = dataframe[value_col] / group_stat

    # 1 when the value is present, 0 when it is missing.
    dataframe['card1check'] = dataframe["card1"].notna().astype('int64')
    dataframe['card2check'] = dataframe["card2"].notna().astype('int64')

    # ---- concatenated key (UID) features -----------------------------------
    dataframe["addr1_addr2"] = dataframe["addr1"].astype(str) +'_'+ dataframe["addr2"].astype(str)
    dataframe["card1_card2"] = dataframe["card1"].astype(str) +'_'+ dataframe["card2"].astype(str)

    card_addr_pairs = [(f"card{number}", addr) for number in range(1, 7) for addr in ("addr1", "addr2")]
    for card, addr in card_addr_pairs:
        dataframe[f"{card}_{addr}"] = dataframe[card].astype(str) +'_'+ dataframe[addr].astype(str)
    for card, addr in card_addr_pairs:
        key = f"{card}_{addr}"
        dataframe[key + "_P_emaildomain"] = dataframe[key].astype(str) +'_'+ dataframe["P_emaildomain"].astype(str)

    # ---- time features ------------------------------------------------------
    # TransactionDT is treated as a number of seconds after 2017-11-30;
    # DT_M counts months, with January 2017 as 1.
    start_date = pd.Timestamp('2017-11-30')
    transaction_time = start_date + pd.to_timedelta(dataframe['TransactionDT'], unit='s')
    dataframe['DT_M'] = (transaction_time.dt.year-2017)*12 + transaction_time.dt.month

    dataframe['t_dt_minute']=dataframe["TransactionDT"]/60
    dataframe['t_dt_hour']=dataframe["t_dt_minute"]/60
    dataframe['t_dt_day']=dataframe["t_dt_hour"]/24
    dataframe['t_dt_week']=dataframe["t_dt_day"]/7

    dataframe['dayofweek'] = (dataframe['TransactionDT']//(60*60*24)-1)%7

    dataframe['hour'] = (dataframe['TransactionDT']/(3600))%24

    # NOTE(review): 'hour' is fractional, so values in (23, 24) fall into
    # "lowalert" and values in (3, 4) into "highalert" - confirm that is intended.
    hour = dataframe['hour']
    dataframe['alertFeature'] = np.select(
        [(hour > 3) & (hour < 12), (hour > 18) & (hour <= 23)],
        ["highalert", "mediumalert"],
        default="lowalert",
    )

    # Zero amounts give -inf here.
    dataframe['LogTransactionAmt'] = np.log(dataframe['TransactionAmt'])

    # Presence flags for D2..D15 (1 = value present, 0 = missing).
    for number in range(2, 16):
        dataframe[f'D{number}_check'] = dataframe[f"D{number}"].notna().astype('int64')

    gc.collect()

    return dataframe

In [None]:
# Add the engineered features to the training frame (feature_eng modifies and returns its argument).
train_df = feature_eng(train_df)

In [None]:
# Add the same engineered features to the test frame.
test_df = feature_eng(test_df)

# Convert Infinite Values to NaN

In [None]:
# Turn +inf / -inf into NaN in both frames.
train_df = train_df.replace([np.inf, -np.inf], np.nan)
test_df = test_df.replace([np.inf, -np.inf], np.nan)

# Drop variables with more than 60% missing values

In [None]:
# Names of the columns whose missing ratio exceeds 60 (percent). missing_df was built
# from train_df before feature engineering; 'index' is the column created by reset_index.
drop_variables=missing_df[missing_df["ratio"]>60]['index']

In [None]:
# Remove the mostly-missing columns from the training frame.
train_df = train_df.drop(columns=drop_variables)


In [None]:
# Remove the same columns (selected on the training data) from the test frame.
test_df = test_df.drop(columns=drop_variables)

In [None]:
# Plain list of the column names dropped for missingness.
drop_missings = list(drop_variables)

# Drop correlated and useless variables

In [None]:
# Hand-picked V/C/D columns to drop; per the section heading these were judged
# correlated or useless (the selection criteria are not shown in this notebook).
drop_corr = ['V11','V16','V18', 'V21', 'V22', 'V28', 'V29','V30', 'V31', 'V32', 'V33', 'V34', 'V40', 'V41', 'V42', 'V43', 
             'V50', 'V51', 'V52', 'V57', 'V59', 'V60', 'V63', 'V64', 'V65', 'V68', 'V69', 'V71', 'V72', 'V73', 'V79', 'V80', 
             'V81', 'V84', 'V85', 'V88', 'V89', 'V90', 'V92', 'V93', 'V97', 'V101', 'V103', 'V105', 'V106', 'V113', 'V119', 
             'V128', 'V132', 'V134', 'V137', 'V142', 'V143', 'V145', 'V150', 'V151', 'V153', 'V154', 'V155', 'V157', 'V159', 
             'V162', 'V163', 'V167', 'V168', 'V177', 'V178', 'V179', 'V181', 'V182', 'V183', 'V190', 'V191', 'V192', 'V193',
             'V195', 'V196', 'V197', 'V198', 'V199', 'V202', 'V203', 'V204', 'V211', 'V212', 'V216', 'V217', 'V218', 'V219', 
             'V222', 'V225', 'V228', 'V230','V231', 'V232', 'V233', 'V235', 'V236', 'V237', 'V239', 'V243', 'V244', 'V248', 
             'V249', 'V251', 'V253', 'V254', 'V256', 'V257', 'V259', 'V262', 'V263', 'V265', 'V268', 'V269', 'V271', 'V272', 
             'V273', 'V274', 'V275', 'V276', 'V277', 'V279', 'V280', 'V292', 'V293', 'V295', 'V297', 'V298', 'V299', 'V302', 
             'V304', 'V306', 'V316', 'V318', 'V319', 'V321', 'V322', 'V323', 'V324', 'V325', 'V326', 'V327', 'V328', 'V330',
             'V331', 'V332', 'V333', 'V334', 'V336', 'V338', 'V339', 'C2', 'C4', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12',
             'C14', 'D2', 'D6', 'D7', 'D12']

# All raw id_01..id_38 identity columns.
drop_ids = ['id_01','id_02','id_03','id_04','id_05','id_06','id_07','id_08','id_09','id_10','id_11','id_12','id_13','id_14','id_15',
            'id_16','id_17','id_18','id_19','id_20','id_21','id_22','id_23','id_24','id_25','id_26','id_27','id_28','id_29','id_30',
            'id_31','id_32','id_33','id_34','id_35','id_36','id_37','id_38']

# Identifier, timestamp and raw text columns; feature_eng has already derived features from most of them.
drop_useless = ["TransactionID", "TransactionDT", "P_emaildomain", "R_emaildomain", "DeviceInfo"]

# Further individual columns to drop.
drop_useless2 = ['M1', 'V1', 'V14', 'V305']

In [None]:
# Every column scheduled for removal, in one list.
drop_total = drop_corr + drop_ids + drop_useless + drop_useless2


In [None]:
# Keep only the scheduled drops that the missing-value filter has not already listed.
already_dropped = set(drop_missings)
added_drop = [col for col in drop_total if col not in already_dropped]
added_drop

In [None]:
# Preview the first rows of the training frame before the remaining columns are dropped.
train_df.head()

In [None]:
# Remove the remaining scheduled columns from both frames.
train_df = train_df.drop(columns=added_drop)
test_df = test_df.drop(columns=added_drop)

In [None]:
# Re-classify the columns now that features were added and columns dropped.
cat_cols, num_cols, cat_but_car =grab_col_names(train_df)

# Filling Missing Values


In [None]:
# def fill_na(dataframe,column,coltype):
#     dataframe[column] = dataframe[column].apply(lambda x: x.fillna(x.mode()) if (coltype == "cat") else x.fillna(x.median()))

In [None]:
# cat_cols, num_cols, cat_but_car = grab_col_names(train_df)
# for column in cat_cols:
#     fill_na(train_df,column,"cat")

In [None]:
# for column in num_cols:
#     fill_na(train_df,column,"num")

In [None]:
# for i in num_cols:
#     print(f"{i} :{type(i)}")

In [None]:
# for i in num_cols:
#     print(f"{i} :{train_df[i].dtype}")

In [None]:
# Mean-impute every numeric column of the training frame.
# - The dtype test accepts any numeric dtype: after reduce_mem_usage most columns
#   are int8..float32, which a check for only int64/float64 silently skips.
# - The mean is taken in float64 because a float16 accumulator can overflow.
# - The result is assigned back; a chained `train_df[i].fillna(..., inplace=True)`
#   does not modify the frame under pandas copy-on-write.
for i in train_df.columns:
    if pd.api.types.is_numeric_dtype(train_df[i]) and train_df[i].isna().any():
        train_df[i] = train_df[i].fillna(train_df[i].astype('float64').mean())

In [None]:
# Mean-impute every numeric column of the test frame.
# The fill value comes from the training column when one exists, so both frames
# are imputed with the same statistic; the test column's own mean is the fallback.
# Any numeric dtype is accepted (not only int64/float64), the mean is taken in
# float64, and the result is assigned back instead of using a chained inplace fillna.
for i in test_df.columns:
    if pd.api.types.is_numeric_dtype(test_df[i]) and test_df[i].isna().any():
        if i in train_df.columns and pd.api.types.is_numeric_dtype(train_df[i]):
            fill_source = train_df[i]
        else:
            fill_source = test_df[i]
        test_df[i] = test_df[i].fillna(fill_source.astype('float64').mean())

In [None]:
# Fill missing values of the object columns in the training frame with the most
# frequent value. The result is assigned back (a chained inplace fillna does not
# modify the frame under pandas copy-on-write) and an all-missing column, whose
# mode is empty, is left untouched instead of raising.
for i in train_df.columns:
    if train_df[i].dtypes=='object':
        most_frequent = train_df[i].mode()
        if not most_frequent.empty:
            train_df[i] = train_df[i].fillna(most_frequent.iloc[0])

In [None]:
# Fill missing values of the object columns in the test frame with the most
# frequent value of the matching training column (falling back to the test
# column's own mode), so both frames are imputed consistently. The result is
# assigned back rather than using a chained inplace fillna, and an empty mode
# (all values missing) leaves the column untouched instead of raising.
for i in test_df.columns:
    if test_df[i].dtypes=='object':
        fill_source = train_df[i] if i in train_df.columns else test_df[i]
        most_frequent = fill_source.mode()
        if most_frequent.empty:
            most_frequent = test_df[i].mode()
        if not most_frequent.empty:
            test_df[i] = test_df[i].fillna(most_frequent.iloc[0])

In [None]:
# train_df = train_df.apply(lambda x: x.fillna(x.mode()) if (x.dtype == "O") else x, axis=0)


In [None]:
# train_df = train_df.apply(lambda x: x.fillna(x.median()) if (x.dtype != "O")  else x, axis=0)

In [None]:
# train_df.isnull().sum()

In [None]:
# Classify the columns once more after imputation; these lists drive the encoding below.
cat_cols, num_cols, cat_but_car = grab_col_names(train_df)

In [None]:
# Remaining missing values per column (shown for every column, since display.max_rows is None).
train_df.isnull().sum()

In [None]:
# One-hot encode the training frame restricted to cat_cols + num_cols (cat_but_car
# columns are left out), dropping the first level of each encoded column.
# NOTE(review): get_dummies only encodes object/string/category columns by default, so
# the numeric members of cat_cols pass through unchanged - confirm that is intended.
train_df = pd.get_dummies(train_df[cat_cols + num_cols], drop_first=True)

In [None]:
# Take the target out of the column list used to select test columns below.
# Rebuilding the list (instead of list.remove) keeps this cell re-runnable:
# remove() raises ValueError once "isFraud" is already gone.
cat_cols = [col for col in cat_cols if col != "isFraud"]

In [None]:
# One-hot encode the test frame with the same column selection.
# NOTE(review): the encoding is computed independently of the training frame, so the set
# of dummy columns (and the level removed by drop_first) can differ between the two.
test_df = pd.get_dummies(test_df[cat_cols + num_cols], drop_first=True)

In [None]:
# Shape of the encoded training frame.
train_df.shape

In [None]:
# Shape of the encoded test frame.
test_df.shape

In [None]:
#def one_hot_encoder(dataframe, categorical_cols, drop_first=True):
 #   dataframe = pd.get_dummies(dataframe, columns=categorical_cols, drop_first=drop_first)
  #  return dataframe

In [None]:
#ohe_cols = [col for col in train_df.columns if 61 >= train_df[col].nunique() and col not in ["isFraud"] ]

In [None]:
# train_df = one_hot_encoder(train_df, ohe_cols)

In [None]:
#test_df = one_hot_encoder(test_df, ohe_cols)

In [None]:
# cat_cols, num_cols, cat_but_car = grab_col_names(train_df)

In [None]:
# num_cols = [col for col in num_cols if "TransactionID" not in col]

# Scale

In [None]:
# Scale the features to [0, 1] with a scaler fitted on the training data only and
# reuse that same scaler for the test frame. Fitting a second time on the test
# frame would map identical raw values to different scaled values in train and
# test, so a model trained on one would misread the other.
feature_cols = [col for col in train_df.columns if col != "isFraud"]
scaler = MinMaxScaler()
train_scaled = pd.DataFrame(scaler.fit_transform(train_df[feature_cols]), columns=feature_cols)
# The target is carried over unscaled (as float, which is what scaling a 0/1 column produced).
train_scaled["isFraud"] = train_df["isFraud"].to_numpy(dtype="float64")

# Align the test frame to the training features before transforming: columns
# missing from the test frame are added as all-zero columns and columns that
# exist only in the test frame are dropped.
test_aligned = test_df.reindex(columns=feature_cols, fill_value=0)
test_df = pd.DataFrame(scaler.transform(test_aligned), columns=feature_cols)
train_df = train_scaled

# train_df and test_df must have the same columns, so we keep their intersection

In [None]:
# Columns present in train_df but not in test_df.
train_df.columns.difference(test_df.columns)

In [None]:
# Columns present in test_df but not in train_df.
test_df.columns.difference(train_df.columns)

In [None]:
# Columns shared by both frames; displayed for inspection.
col_list = train_df.columns.intersection(test_df.columns)
col_list

In [None]:
# Set the target aside before train_df is restricted to the shared columns.
frauds = train_df["isFraud"]

In [None]:
# Restrict both frames to the shared columns and put the target back on the training frame.
train_df = train_df[col_list].assign(isFraud=frauds)
test_df = test_df[col_list]

# Reduce Memory Again

In [None]:
# Downcast the training frame again after encoding and scaling.
train_df = reduce_mem_usage(train_df)

In [None]:
# Downcast the test frame again after encoding and scaling.
test_df = reduce_mem_usage(test_df)

# PCA

In [None]:
# from sklearn.decomposition import PCA

In [None]:
# v_features = [x for x in train_df.columns if x.find("V")!=-1]
# v_features

In [None]:
# v_features = v_features[:-2]

In [None]:
# pca = PCA()

In [None]:
# pca.explained_variance_ratio_

In [None]:
# np.cumsum(pca.explained_variance_ratio_)

In [None]:
# pca = PCA(n_components = 9) # We kept 9 components. Explained variance ratio is 0.999

In [None]:
# pca.fit(train_df[v_features])


In [None]:
# pca.fit(test_df[v_features])


In [None]:
# pca = PCA().fit(train_df)
# plt.plot(np.cumsum(pca.explained_variance_ratio_))
# plt.xlabel("Bileşen Sayısını")
# plt.ylabel("Kümülatif Varyans Oranı")
# plt.show()

In [None]:
# pca_frame = pd.DataFrame(pca.transform(train_df[v_features]))


In [None]:
# pca_frame = pd.DataFrame(pca.transform(test_df[v_features]))

In [None]:
# pca_frame.rename(columns = lambda x: "PCA_" + str(x), inplace = True)


In [None]:
# pca_frame.head()

In [None]:
# train_df = pd.concat([train_df, pca_frame], axis = 1)

In [None]:
# train_df.drop(columns=v_features, inplace=True)

In [None]:
# test_df = pd.concat([test_df, pca_frame], axis = 1)

In [None]:
# test_df.drop(columns=v_features, inplace=True)

# Model

In [None]:
# Feature matrix (everything except the target) and target vector.
X = train_df.drop(columns="isFraud")
y = train_df["isFraud"]

In [None]:
# Hold out 30% of the rows for evaluation, stratified on the target; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)

# Random Under Sampling

In [None]:
print("Before UnderSampling, counts of label '1': {}".format(int((y_train == 1).sum())))
print("Before UnderSampling, counts of label '0': {} \n".format(int((y_train == 0).sum())))
from imblearn.under_sampling import RandomUnderSampler
undersample = RandomUnderSampler(sampling_strategy=0.2, random_state=3)

# Resample the training split only. Resampling the full X, y would put the
# held-out X_test rows into the training data, so any score computed on
# X_test would be measured on rows the model has already seen.
X_train_res, y_train_res = undersample.fit_resample(X_train, y_train)

print('After UnderSampling, the shape of train_X: {}'.format(X_train_res.shape))
print('After UnderSampling, the shape of train_y: {} \n'.format(y_train_res.shape))

print("After UnderSampling, counts of label '1': {}".format(int((y_train_res == 1).sum())))
print("After UnderSampling, counts of label '0': {}".format(int((y_train_res == 0).sum())))

In [None]:
from sklearn.metrics import classification_report
# Fit LightGBM with default hyper-parameters on the under-sampled data.
# np.ravel accepts both a Series and an array; Series.ravel is deprecated in pandas.
reg3 = LGBMClassifier()
reg3.fit(X_train_res, np.ravel(y_train_res))


In [None]:
# Hard-label report on the hold-out split.
pred3 = reg3.predict(X_test)
print(classification_report(y_test, pred3))
# AUC takes (y_true, y_score) in that order and needs a ranking score, so use the
# positive-class probability rather than the thresholded 0/1 predictions.
proba3 = reg3.predict_proba(X_test)[:, 1]
print(f"Auc: {round(roc_auc_score(y_test, proba3), 2)}")

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def plot_importance(model, features, num=50, save=False):
    """Draw a horizontal bar chart of the most important features of a fitted model.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``feature_importances_``.
    features : pandas.DataFrame
        Frame whose columns name the features, in the order the model was fitted on.
    num : int, optional
        Number of top features to plot. Defaults to 50, the number that was
        always plotted before this argument was honoured (the old default
        ``len(X)`` was never used and tied the definition to a global ``X``).
    save : bool, optional
        When True, write the chart to 'importances.png'.
    """
    feature_imp = pd.DataFrame({'Value': model.feature_importances_, 'Feature': features.columns})
    top_features = feature_imp.sort_values(by="Value", ascending=False).head(num)

    plt.figure(figsize=(10, 10))
    sns.set(font_scale=1)
    sns.barplot(x="Value", y="Feature", data=top_features)
    plt.title('Features')
    plt.tight_layout()
    # Save before show(): once the figure has been shown, savefig writes an empty image.
    if save:
        plt.savefig('importances.png')
    plt.show()

        
        
# Feature importances of the LightGBM model; X_train only supplies the column names.
plot_importance(reg3, X_train)

In [None]:
# Probability of the positive class (column 1 of predict_proba) for every test row.
predictions3 = reg3.predict_proba(test_df)[:,1]

In [None]:
# Rebuild the merged test frame from the raw tables; only its TransactionID column is used below.
test_df_son = test_transaction.merge(test_identity, how="left", on="TransactionID")

In [None]:
# Submission frame: integer TransactionID next to the predicted fraud probability.
submission = pd.DataFrame({
    'TransactionID': test_df_son['TransactionID'].astype(int),
    'isFraud': predictions3,
})
submission.head()

In [None]:
# Write the LightGBM submission without the index column and report the file name.
filename = 'CIS Fraud Detection over sampling lgbm.csv'
submission.to_csv(filename, index=False)
print(f'Saved file: {filename}')

In [None]:
# Score: 0.880898
# Public score: 0.903323

# xgb

In [None]:
from sklearn.metrics import classification_report
# Fit XGBoost with default hyper-parameters on the under-sampled data.
# np.ravel accepts both a Series and an array; Series.ravel is deprecated in pandas.
reg4 = XGBClassifier()
reg4.fit(X_train_res, np.ravel(y_train_res))

In [None]:
# Hard-label report on the hold-out split.
pred4 = reg4.predict(X_test)
print(classification_report(y_test, pred4))
# AUC takes (y_true, y_score) in that order and needs a ranking score, so use the
# positive-class probability rather than the thresholded 0/1 predictions.
proba4 = reg4.predict_proba(X_test)[:, 1]
print(f"Auc: {round(roc_auc_score(y_test, proba4), 2)}")

In [None]:
# Feature importances of the XGBoost model; X_train only supplies the column names.
plot_importance(reg4, X_train)

In [None]:
# Probability of the positive class (column 1 of predict_proba) for every test row.
predictions4 = reg4.predict_proba(test_df)[:,1]

In [None]:
# Rebuild the merged test frame from the raw tables; only its TransactionID column is used below.
test_df_son = test_transaction.merge(test_identity, how="left", on="TransactionID")

In [None]:
# Submission frame: integer TransactionID next to the predicted fraud probability.
submission = pd.DataFrame({
    'TransactionID': test_df_son['TransactionID'].astype(int),
    'isFraud': predictions4,
})
submission.head()

In [None]:
# Write the XGBoost submission without the index column and report the file name.
filename = 'CIS Fraud Detection sampling xgboost.csv'
submission.to_csv(filename, index=False)
print(f'Saved file: {filename}')

In [None]:
# Score: 0.863160
# Public score: 0.890700