In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error

import lightgbm as lgb
import xgboost as xgb
import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold, GroupKFold, GridSearchCV, train_test_split, TimeSeriesSplit
from sklearn import metrics

import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option("display.max_columns", 500)

#matplotlib inline

## Handy Functions

In [None]:
# To calculate missing values

def missing_vals(df):
    """Return the percentage of missing values for every column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``Column`` and ``Missing %``, sorted by ``Missing %`` in
        descending order.
    """
    # Divide by the row count of the frame that was passed in rather than a
    # notebook-level global, so the helper gives correct percentages for any
    # frame and does not depend on hidden kernel state.
    missing_per = df.isnull().sum() / len(df) * 100
    missing_per = missing_per.sort_values(ascending=False).reset_index()
    return missing_per.rename({'index': 'Column', 0: 'Missing %'}, axis=1)


# To draw data insights

def data_insights(df):
    """Print the shape of ``df`` and return a per-column summary table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns ``Column``,
        ``DataType``, ``Missing %`` (rounded to 2 decimals),
        ``No. of Unique Values``, ``First Value`` and ``Second Value``.
    """
    print(f'Dataset Shape : {df.shape}')

    summary = pd.DataFrame(df.dtypes).reset_index().rename({'index': "Column", 0: 'DataType'}, axis=1)
    summary['Missing %'] = (df.isnull().sum() / df.shape[0] * 100).round(2).values
    summary['No. of Unique Values'] = df.nunique().values

    # Use positional access: label-based ``df.loc[0]`` raises KeyError for
    # frames whose index does not contain 0/1 (filtered or re-indexed data).
    # Frames with fewer than two rows get ``None`` instead of failing.
    for position, label in enumerate(['First Value', 'Second Value']):
        if len(df) > position:
            summary[label] = df.iloc[position].values
        else:
            summary[label] = None

    return summary

#correlation matrix

def cormat(df):
    """Draw an annotated correlation heatmap for all columns of ``df``.

    A new 14x10 figure is created and the pairwise correlations are drawn on
    it with the colour scale capped at 0.8.

    Returns
    -------
    The object returned by ``sns.heatmap``.
    """
    correlations = df[df.columns].corr()
    _, axis = plt.subplots(figsize=(14, 10))
    # The freshly created axes is the current one, so passing it explicitly
    # draws on the same target as relying on the implicit current axes.
    return sns.heatmap(correlations, vmax=.8, square=True, annot=True, fmt='.2f', ax=axis)
    

# Outliers detection

def CalcOutliers(df_num):
    """Print a 3-sigma outlier report for a numeric column.

    Values further than three (population) standard deviations from the mean
    are counted as outliers. Missing values are ignored.

    Parameters
    ----------
    df_num : array-like of numbers
        Column to analyse, e.g. ``train['TransactionAmt']``.

    Returns
    -------
    None
        The report is printed; nothing is returned.
    """
    # Work on a NaN-free NumPy array so every comparison is vectorised.
    values = pd.Series(df_num).dropna().to_numpy()
    if values.size == 0:
        print("No non-missing observations to analyse")
        return

    # calculating mean and std of the array
    data_mean, data_std = values.mean(), values.std()

    # setting the cut line for both higher and lower values
    # You can change this value
    cut = data_std * 3

    # Calculating the higher and lower cut values
    lower, upper = data_mean - cut, data_mean + cut

    is_lower = values < lower
    is_upper = values > upper
    n_lower, n_upper = int(is_lower.sum()), int(is_upper.sum())
    n_outliers = n_lower + n_upper

    # Everything that is not an outlier, including values lying exactly on a
    # cut line, so outliers + non-outliers always equals the total.
    kept = values[~(is_lower | is_upper)]

    print(f"Lowest Value : {kept.min()}")  # lowest non-outlier value
    print(f"Highest Value : {kept.max()}")  # highest non-outlier value
    print('Identified lowest outliers: %d' % n_lower)
    print('Identified upper outliers: %d' % n_upper)
    print('Total outlier observations: %d' % n_outliers)
    print('Non-outlier observations: %d' % kept.size)
    # Share of outliers among ALL observations (not among the non-outliers).
    print("Total percentual of Outliers: ", round(n_outliers / values.size * 100, 4))

    return

## Data Glimpse

In [None]:
# Train dataset: the transaction table and the identity table.
train_trans, train_id = (
    pd.read_csv(f"/kaggle/input/ieee-fraud-detection/train_{table}.csv")
    for table in ("transaction", "identity")
)

In [None]:
# Test dataset: the transaction table and the identity table.
test_trans, test_id = (
    pd.read_csv(f"/kaggle/input/ieee-fraud-detection/test_{table}.csv")
    for table in ("transaction", "identity")
)

In [None]:
# Columns of each identity table whose name contains "id".
id_train = [col for col in train_id.columns if "id" in col]
id_test = [col for col in test_id.columns if "id" in col]

# Build the rename map from each test column's own name rather than zipping
# the two lists positionally, which would silently pair the wrong columns if
# the tables ever differ in column order or count.
# NOTE(review): assumes the test columns differ from train only by using "-"
# where train uses "_" (as the later rename of `test` also assumes) - confirm.
col_map = {col: col.replace("-", "_") for col in id_test}

In [None]:
# Align the test identity column names with the train naming.
test_id = test_id.rename(columns=col_map)

In [None]:
# Left join keeps every transaction, including those with no identity record.
train = pd.merge(train_trans, train_id, how='left', on="TransactionID")
train.head()

In [None]:
# Left join keeps every transaction, including those with no identity record.
test = pd.merge(test_trans, test_id, how='left', on="TransactionID")
test.head()

In [None]:
# Size of the merged train set and how many columns contain at least one null.
n_rows, n_cols = train.shape
n_null_cols = train.isnull().any().sum()
print(f"Train dataset has {n_rows} rows and {n_cols} columns")
print(f"There are {n_null_cols} columns which contains NULL values")

Most columns have missing data, which is normal for real-world data.

## Missing Values

In [None]:
# Missing-value percentage per column of the merged train set.
missing_table = missing_vals(df=train)
missing_table

In [None]:
# Count the columns whose missing share exceeds 50%.
missing_num = int((missing_table['Missing %'] > 50).sum())

print(f"There are {missing_num} columns where missing % is greater than 50%")

## Train and Test Distribution

In [None]:
# Overlay the TransactionDT histograms of train and test on one axes.
fig, ax = plt.subplots(figsize=(10, 5))

train_trans['TransactionDT'].plot(kind='hist', ax=ax, label='Train',
                                  title='Train vs Test TransactionDT distribution')
test_trans['TransactionDT'].plot(kind='hist', ax=ax, label='Test')

# Both series share this axis, so label it with the plotted quantity; the
# series themselves are distinguished by the legend.
ax.set_xlabel('TransactionDT')
ax.legend()
fig.tight_layout()

The TransactionDT feature is a timedelta from a given reference datetime (not an actual timestamp). It seems that train and test are split by time, with a slight gap in between.
The training set comes from an earlier period and the test set from a later one. This will be a key consideration when choosing the cross-validation technique later.

## Target Distribution

In [None]:
# Left: number of rows per class. Right: total transaction amount per class.
fig, (ax_count, ax_amount) = plt.subplots(1, 2, figsize=(14, 5))

sns.countplot(x="isFraud", data=train, ax=ax_count)
ax_count.set_title("Target Variable Count Distribution", fontsize=16, loc='center')

train.groupby('isFraud')['TransactionAmt'].sum().plot(kind='bar', ax=ax_amount)
ax_amount.set_title("Transaction Amount Sum Distribution by Target Variable", fontsize=16, loc='center')

fig.tight_layout()
plt.show()
train.isFraud.value_counts(normalize=True)*100 #Imbalance dataset

We have 3.5% of Fraud transactions in our dataset.

## TransactionAmt Distribution

In [None]:
# Dtype, missing %, unique count and first two values of TransactionAmt.
data_insights(train[['TransactionAmt']])

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Transaction Values Distribution', fontsize=22)

# Raw amounts in row order.
train["TransactionAmt"].plot(ax=axes[0, 0])
axes[0, 0].set_title("Transaction Amount Distribution")

# Box plot restricted to amounts below 1000.
sns.boxplot(y="TransactionAmt", data=train[train["TransactionAmt"] < 1000], ax=axes[0, 1])
axes[0, 1].set_title("Transaction Amount <1000 Outliers Check")

# Sorted amounts per class; each series now carries the label of its own
# class (the fraud panel was previously labelled 'NoFraud').
for ax, flag, label, title in [
    (axes[1, 0], 0, 'NoFraud', "Transaction Amount of Non-Fraud Entries"),
    (axes[1, 1], 1, 'Fraud', "Transaction Amount of Fraud Entries"),
]:
    amounts = np.sort(train.loc[train['isFraud'] == flag, 'TransactionAmt'].values)
    ax.scatter(range(len(amounts)), amounts, label=label)
    ax.set_title(title)

fig.tight_layout()

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(14, 5))
fig.suptitle('Train Transaction Amount Distribution', fontsize=16)

# One row per class: raw amounts on the left, log-transformed on the right.
for row, (flag, name) in enumerate([(1, 'Fraud'), (0, 'Non-Fraud')]):
    # Single .loc call instead of chained indexing.
    amounts = train.loc[train["isFraud"] == flag, "TransactionAmt"]
    amounts.plot(kind='hist', bins=100, ax=axes[row, 0], title=f"{name} Distribution")
    np.log(amounts).plot(kind='hist', bins=100, ax=axes[row, 1],
                         title=f"Log Transformed {name} Distribution")

fig.tight_layout()
plt.show()

## Outlier Detection 

In [None]:
# CalcOutliers only prints its report and returns None, so call it on its own
# line instead of bundling it into a tuple that would display as (None, ...).
CalcOutliers(train['TransactionAmt'])
train["TransactionAmt"].describe()

If we consider only values from 0 to 800, we avoid the outliers and can have more confidence in the distribution.
About 10k rows are outliers, which represents 1.74% of the total rows.

## Product Code Features

In [None]:
# Dtype, missing %, unique count and first two values of ProductCD.
data_insights(train[['ProductCD']])

In [None]:
# Fraud count per product code, and the row-normalised class shares (in %).
df_prod = train.groupby("ProductCD")['isFraud'].sum().reset_index()
df_prod1 = (pd.crosstab(index=train["ProductCD"], columns=train['isFraud'], normalize='index')*100).reset_index()

fig, ax1 = plt.subplots(figsize=(14,7))
plt.suptitle("Fraud Transactions by Product Code", fontsize=16)

# Left axis: absolute number of fraud transactions.
color = 'tab:red'
ax1.set_xlabel('Product Code')
ax1.set_ylabel('Number of Fraud Transactions', color=color)
ax1.plot(df_prod.ProductCD, df_prod['isFraud'], color=color)
ax1.tick_params(axis='y', labelcolor=color)

# Two extra y-axes sharing the same x-axis; the second is pushed outwards so
# its spine does not sit on top of the first.
ax2 = ax1.twinx()
ax3 = ax1.twinx()
ax3.spines.right.set_position(("axes", 1.2))

for twin_ax, target, color, name in [
    (ax2, 0, 'tab:blue', 'Non Fraud'),
    (ax3, 1, 'tab:green', 'Fraud'),
]:
    twin_ax.set_ylabel(f'% of {name} Transactions', color=color)
    twin_ax.plot(df_prod1.ProductCD, df_prod1[target], color=color, label=name)
    twin_ax.tick_params(axis='y', labelcolor=color)
    twin_ax.legend()

fig.tight_layout()  # otherwise the right y-label is slightly clipped
plt.show()

In [None]:
# Share of each product code within each class (columns sum to 100%).
df = (
    pd.crosstab(index=train['ProductCD'], columns=train['isFraud'], normalize='columns')
    .mul(100)
    .reset_index()
    .rename(columns={0: 'NoFraud', 1: 'Fraud'})
)

fig = go.Figure()
for trace_name, bar_color in (('NoFraud', 'indianred'), ('Fraud', 'lightsalmon')):
    fig.add_trace(go.Bar(
        x=df["ProductCD"],
        y=df[trace_name],
        name=trace_name,
        marker_color=bar_color,
        text=df[trace_name],
    ))

# Here we modify the tickangle of the xaxis, resulting in rotated labels.
fig.update_layout(barmode='group', xaxis_tickangle=-45, title='% of Fraud Transactions by Product')
fig.update_traces(texttemplate='%{text:.2s}%', textposition='outside')
fig.show()

> W has the most Fraud and Non Fraud transactions, followed by C and R.

> ProductCD C has the most fraud with >11%

> ProductCD W has the least with ~2%

In [None]:
# Amount distribution per product code, split by class, for amounts below 1000.
fig, ax = plt.subplots(figsize=(14, 7))
amounts_under_1000 = train[train['TransactionAmt'] < 1000]
sns.boxplot(x='ProductCD', y='TransactionAmt', hue='isFraud', data=amounts_under_1000, ax=ax)

fig.tight_layout()

## Card Features

In [None]:
# Every column whose name contains "card".
card_cols = train.columns[train.columns.str.contains('card')].tolist()

# Card dataset insights
data_insights(train[card_cols])

In [None]:
# Descriptive statistics of the card columns.
train[card_cols].describe()

### Non Numerical Features

In [None]:
# Class shares within each card4 value (each row sums to 1).
train_df = pd.crosstab(index=train['card4'], columns=train['isFraud'], normalize='index')

x = np.arange(len(train_df.index))  # the label locations
width = 0.35  # the width of the bars

fig, ax = plt.subplots(figsize=(12,5))

# One bar group per class, shifted half a bar width left/right of the tick.
for offset, target, label in ((-width/2, 0, 'Not Fraud'), (width/2, 1, 'Fraud')):
    bars = ax.bar(x + offset, round(train_df[target]*100,2), width, label=label)
    ax.bar_label(bars, padding=3)

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Percentage')
ax.set_title('Percentage by Card 4 and isFraud')
ax.set_xticks(x)
ax.set_xticklabels(train_df.index)
ax.legend()

fig.tight_layout()
plt.show()

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(18, 12))
fig.suptitle("Cards Distribution", fontsize=22)

# One row per card column: row counts on the left, summed amount on the right.
for row, col in enumerate(['card4', 'card6']):
    card_no = col[-1]

    # Name the column via x=/data= (as the other countplots in this notebook
    # do); how a bare positional Series is interpreted depends on the seaborn
    # version.
    sns.countplot(x=col, data=train, ax=axes[row, 0])
    axes[row, 0].set_title(f"Card {card_no} Distribution")

    train.groupby(col)['TransactionAmt'].sum().plot(kind='bar', ax=axes[row, 1])
    axes[row, 1].set_title(f"Card {card_no} Distribution by Transaction Amount")

fig.tight_layout()
plt.show()

### Numerical Features

In [None]:
# Fraud rate per card value, in percent so that it matches the "% of" axis
# label; keep the 30 values with the highest fraud rate.
top_fraud = {}
for col in ('card3', 'card5'):
    rates = (pd.crosstab(index=train[col], columns=train['isFraud'], normalize='index') * 100).reset_index()
    top_fraud[col] = rates.sort_values(by=1, ascending=False).head(30)
df_c3, df_c5 = top_fraud['card3'], top_fraud['card5']

fig, axes = plt.subplots(2, 1, figsize=(16, 7))

for ax, col, card_no in zip(axes, ('card3', 'card5'), ('3', '5')):
    sns.pointplot(x=col, y=1, data=top_fraud[col], ax=ax)
    ax.set_title(f'Top 30 Fraudulent Transactions by Card {card_no}', fontsize=16)
    ax.set_ylabel("% of Fraudulent Transactions", fontsize=14)
    ax.set_xlabel(f"card {card_no} Values", fontsize=14)

fig.tight_layout()
plt.show()

## C1-C14 Features

In [None]:
# Every column whose name starts with "C".
c_cols = [name for name in train.columns if name.startswith('C')]
data_insights(train[c_cols])

In [None]:
# Annotated correlation heatmap of the C columns.
cormat(train[c_cols])

You can observe that there is high correlation among the C features. For example:

> C1, C2, C4, C6, C7, C8, C10, C11, C12, C14 are highly correlated with each other. We can keep one of them and drop the rest.

In [None]:
# Create the grid once and draw into those same axes; creating a 3x5 grid and
# then addressing it as 5x3 leaves mismatched, overlapping panels.
fig, axes = plt.subplots(5, 3, figsize=(18, 14))
axes = axes.ravel()

for ax, c in zip(axes, c_cols):
    sns.kdeplot(train[c], ax=ax)
    ax.set_title(f"{c}'s Density Distribution")

# Hide any panel left over when there are fewer columns than grid cells.
for ax in axes[len(c_cols):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

Clearly there are outliers present in each of the C# columns.

In [None]:
train_C = train[c_cols]

# The 10 most frequent C1 values.
topC1_index = train_C['C1'].value_counts(normalize=True, sort=True).index[:10].tolist()

# Share of all rows (in %) per (C1 value, class), restricted to those values.
C1_df = (pd.crosstab(index=train['C1'], columns=train['isFraud'], normalize=True)*100).reset_index()
C1_df = C1_df[C1_df['C1'].isin(topC1_index)]

fig = go.Figure()
for target, trace_name, rgb in ((0, 'Not Fraud', '246, 78, 139'), (1, 'Fraud', '58, 71, 80')):
    fig.add_trace(go.Bar(
        y=C1_df['C1'],
        x=C1_df[target],
        name=trace_name,
        orientation='h',
        marker=dict(
            color=f'rgba({rgb}, 0.6)',
            line=dict(color=f'rgba({rgb}, 1.0)', width=3)
        )
    ))

fig.update_layout(barmode='stack', title='Top 10 most frequent values in C1 with their Fraud percentage')
fig.show()

## D1-D15 Features

In [None]:
# Columns starting with "D" whose name is shorter than 5 characters.
d_cols = [name for name in train.columns if name.startswith('D') and len(name) < 5]
data_insights(train[d_cols])

In [None]:
# Create the grid once and draw into those same axes; creating a 3x5 grid and
# then addressing it as 5x3 leaves mismatched, overlapping panels.
fig, axes = plt.subplots(5, 3, figsize=(18, 14))
axes = axes.ravel()

for ax, c in zip(axes, d_cols):
    sns.kdeplot(train[c], ax=ax)
    ax.set_title(f"{c}'s Density Distribution")

# Hide any panel left over when there are fewer columns than grid cells.
for ax in axes[len(d_cols):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

In [None]:
# Annotated correlation heatmap of the D columns.
cormat(train[d_cols])

- D1, D2 are highly correlated with each other. We can keep one of them and drop the other.
- Also, a lot of missing values, we will treat them later.

In [None]:
# Descriptive statistics of the D columns.
train[d_cols].describe()

## M1-M9 Columns

In [None]:
# Every column whose name starts with "M".
m_cols = [name for name in train.columns if name.startswith('M')]
train_M = train[m_cols]
data_insights(train_M)

In [None]:
# Distinct values found in each M column, one array per column.
[train_M[c].unique() for c in train_M.columns]

In [None]:
# Replace missing values by the string "None" so they get their own bar.
train_M_plot = train_M.fillna("None")

fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(18, 14))

for ax, c in zip(axes.flat, train_M_plot.columns):
    sns.countplot(x=c, data=train_M_plot, label=c, ax=ax)
    ax.set_title("Distinct Value Counts across "+ c + " Column")

fig.tight_layout()
plt.show()

## V1-V339 Columns

In [None]:
# Every column whose name starts with "V"; show the summary of the first five.
v_cols = [name for name in train.columns if name.startswith('V')]
train_V = train[v_cols]
data_insights(train_V).iloc[:5]

In [None]:
# Descriptive statistics of the V columns.
train_V.describe()

## id# Columns

In [None]:
# Every column whose name contains "id".
id_cols = train.columns[train.columns.str.contains('id')].tolist()
# Note: this re-binds `train_id` (previously the raw identity table) to the
# id_01..id_38 slice of the merged train set.
train_id = train.loc[:, "id_01":"id_38"]
data_insights(train_id)

There are a lot of missing values in the id# columns.

In [None]:
# Lookup table with one row per id column: its name and its dtype.
train_id = (
    train[id_cols].dtypes
    .rename('Dtype')
    .rename_axis('column')
    .reset_index()
)

# Number of id columns per dtype.
train_id.groupby('Dtype')['column'].count()

In [None]:
# Descriptive statistics of the columns from id_01 through id_38.
train.loc[:,"id_01":"id_38"].describe()

In [None]:
train_id_plot = train[id_cols]

# Names of the id columns recorded as float64 in the dtype lookup table.
float_id_cols = train_id.loc[train_id["Dtype"] == 'float64', 'column'].unique()

fig, axes = plt.subplots(6, 4, figsize=(18, 14))

for ax, c in zip(axes.ravel(), float_id_cols):
    ax.hist(train_id_plot[c])
    ax.set_title(f'Distribution of {c} variable')

fig.tight_layout()
plt.show()

In [None]:
# id columns plotted as category counts.
lst = ['id_12', 'id_15', 'id_16', 'id_23', 'id_27', 'id_28', 'id_29','id_34', 'id_35', 'id_36', 'id_37','id_38']

fig, axes = plt.subplots(4, 3, figsize=(18, 14))

for ax, c in zip(axes.ravel(), lst):
    sns.countplot(x=c, data=train_id_plot, ax=ax)
    ax.set_title(f'Distribution of {c} variable')

fig.tight_layout()
plt.show()

In [None]:
# Bar chart of how often each id_30 value occurs.
train['id_30'].value_counts().plot(kind='bar', figsize=(18,7))

### Identity info as a function of Transaction Date

In [None]:
# Replace "-" by "_" in every column of `test` whose name contains "id".
id_colss = [c for c in test.columns if 'id' in c]
id_colss_ = [c.replace('-','_') for c in id_colss]

dictionary = {old: new for old, new in zip(id_colss, id_colss_)}
test = test.rename(columns=dictionary)

In [None]:
# Summarise the identity columns themselves. `train_id` was re-bound earlier
# to a two-column dtype lookup table, so summarising it would describe that
# table (both of its columns are object-typed) and `lst` would always be empty.
p = data_insights(train.loc[:, "id_01":"id_38"])

# Names of the identity columns that are not object-typed.
lst = list(p[p['DataType']!='object'].Column)

## Device Features

In [None]:
# Number of rows per DeviceType value.
sns.countplot(x='DeviceType',data=train)

In [None]:
# Name both output columns explicitly: the default column names produced by
# value_counts().reset_index() changed in pandas 2.0, so relying on them
# ('index' / 'DeviceInfo') breaks on newer versions.
pie_frame = (
    train['DeviceInfo']
    .value_counts(normalize=True)
    .mul(100)
    .rename_axis('DeviceInfo')
    .reset_index(name='Share %')
)
fig = px.pie(pie_frame.head(10), values='Share %', names='DeviceInfo', title='Top 10 Device Infomation %')
fig.show()

In [None]:
# Name both output columns explicitly: the default column names produced by
# value_counts().reset_index() changed in pandas 2.0. Each frame keeps the
# five most frequent domains (the later .head(10) on a 5-row frame was a no-op).
R_frame = (
    train['R_emaildomain']
    .value_counts(normalize=True)
    .mul(100)
    .rename_axis('R_emaildomain')
    .reset_index(name='Share %')
    .head()
)
P_frame = (
    train['P_emaildomain']
    .value_counts(normalize=True)
    .mul(100)
    .rename_axis('P_emaildomain')
    .reset_index(name='Share %')
    .head()
)

fig1 = px.pie(R_frame, values='Share %', names='R_emaildomain', title='R_emaildomain Distribution %')
fig2 = px.pie(P_frame, values='Share %', names='P_emaildomain', title='P_emaildomain Distribution %')
fig1.show()
fig2.show()

In [None]:
# Missing-value percentage of the two e-mail domain columns.
missing_vals(train[['R_emaildomain','P_emaildomain']])

In [None]:
# Class shares (in %) within each R_emaildomain value.
R_fraud_pct = (
    pd.crosstab(index=train['R_emaildomain'], columns=train['isFraud'], normalize='index')
    .mul(100)
    .reset_index()
    .rename(columns={0: 'Not Fraud', 1: 'Fraud'})
)

fig = px.bar(R_fraud_pct, x="R_emaildomain", y=['Not Fraud','Fraud'], title="Fraud % by R_email domain")
fig.show()

In [None]:
# Class shares (in %) within each P_emaildomain value.
P_fraud_pct = (
    pd.crosstab(index=train['P_emaildomain'], columns=train['isFraud'], normalize='index')
    .mul(100)
    .reset_index()
    .rename(columns={0: 'Not Fraud', 1: 'Fraud'})
)

fig = px.bar(P_fraud_pct, x="P_emaildomain", y=['Not Fraud','Fraud'], title="Fraud % by P_email domain")
fig.show()

## Transaction Date

In [None]:
# Reference - https://www.kaggle.com/c/ieee-fraud-detection/discussion/100071#latest-577632

# TransactionDT is treated as a number of seconds elapsed since START_DATE.
START_DATE = '2017-12-01'
startdate = datetime.datetime.strptime(START_DATE, "%Y-%m-%d")

# Vectorised conversion: one timedelta operation on the whole column instead
# of a Python-level lambda executed once per row.
train["Date"] = startdate + pd.to_timedelta(train['TransactionDT'], unit='s')
train['_Weekdays'] = train['Date'].dt.dayofweek
train['_Hours'] = train['Date'].dt.hour
train['_Days'] = train['Date'].dt.day

## Feature Engineering

https://www.kaggle.com/artgor/eda-and-models#Data-Exploration
I have referenced this excellent kernel, which contains some very useful feature transformations.

In [None]:
# Ratio of a value to the mean / std of that value within a group, computed
# separately for train and test. Each entry is (value column, grouping columns).
ratio_specs = [
    ('TransactionAmt', ('card1', 'card4')),
    ('id_02', ('card1', 'card4')),
    ('D15', ('card1', 'card4')),
    ('D15', ('addr1', 'addr2')),
]

for value_col, group_cols in ratio_specs:
    for stat in ('mean', 'std'):
        for group_col in group_cols:
            feature = f'{value_col}_to_{stat}_{group_col}'
            for frame in (train, test):
                group_stat = frame.groupby([group_col])[value_col].transform(stat)
                frame[feature] = frame[value_col] / group_stat

In [None]:
# Split each e-mail domain on "." into exactly three parts.
# n=2 caps the split at three pieces and the reindex pads missing pieces, so
# the assignment to three columns works whatever the longest domain in the
# frame is (a plain split only fits when the longest domain has 3 parts).
for frame in (train, test):
    for prefix in ('P_emaildomain', 'R_emaildomain'):
        parts = frame[prefix].str.split('.', n=2, expand=True).reindex(columns=range(3))
        frame[[f'{prefix}_{i}' for i in (1, 2, 3)]] = parts

In [None]:
# train_miss = missing_vals(train)
# many_null_cols = train_miss[train_miss['Missing %']>90].Column.to_list()

# test_miss = missing_vals(test)
# many_null_cols_test =  test_miss[test_miss['Missing %']>90].Column.to_list()

# big_top_value_cols = [col for col in train.columns if train[col].value_counts(dropna=False, normalize=True).values[0] > 0.9]
# big_top_value_cols_test = [col for col in test.columns if test[col].value_counts(dropna=False, normalize=True).values[0] > 0.9]

# one_value_cols = [col for col in train.columns if train[col].nunique()<=1]
# one_value_cols_test = [col for col in test.columns if test[col].nunique()<=1]

# cols_to_drop = list(set(many_null_cols + many_null_cols_test + big_top_value_cols + big_top_value_cols_test + one_value_cols + one_value_cols_test))
# cols_to_drop.remove("isFraud")
# print(f"we will drop {len(cols_to_drop)} columns from our test and train data")

# train = train.drop(cols_to_drop, axis=1)
# test = test.drop(cols_to_drop, axis=1)

In [None]:
*Now, we can use this data to feed into various models.* 

## Thank You ##