In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.preprocessing import LabelEncoder

In [None]:
# Show every column of wide frames and up to 500 rows when displaying.
pd.options.display.max_columns = None
pd.options.display.max_rows = 500

In [None]:
# Read the two training tables; they are joined on TransactionID further down.
train_transaction, train_identity = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/ieee-fraud-detection/train_transaction.csv',
        '../input/ieee-fraud-detection/train_identity.csv',
    )
)

In [None]:
# Preview the first rows of the raw transaction table.
train_transaction.head()

In [None]:
# Preview the first rows of the raw identity table.
train_identity.head()

In [None]:
# Class balance of the target: raw counts first, then the same counts as a bar chart.
print(train_transaction['isFraud'].value_counts())
fig, ax = plt.subplots(figsize=(12, 7))
sns.countplot(data=train_transaction, x='isFraud', ax=ax)
ax.set_title('Fraud vs Normal Transactions', fontdict={'fontsize': 20})
plt.show()

In [None]:
# (rows, columns) of the identity table.
train_identity.shape

In [None]:
def print_row(row_num, df=None, chunk_size=51):
    """Print one row of a wide DataFrame in consecutive column chunks.

    Parameters
    ----------
    row_num : int
        Positional index of the row to print (passed to ``iloc``).
    df : pandas.DataFrame, optional
        Frame to read the row from. When omitted, the notebook-level
        ``train_transaction`` frame is used, as in the original helper.
    chunk_size : int, default 51
        Number of columns printed per chunk.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.

    Notes
    -----
    The number of columns is read from the frame instead of being
    hard-coded, and chunks no longer overlap, so every column is printed
    exactly once.
    """
    if df is None:
        df = train_transaction
    if chunk_size < 1:
        raise ValueError('chunk_size must be a positive integer')
    # Slicing past the last column is safe with iloc, so the final
    # (shorter) chunk needs no special case.
    for start in range(0, df.shape[1], chunk_size):
        print(df.iloc[row_num, start:start + chunk_size])

In [None]:
# (rows, columns) of the transaction table.
train_transaction.shape

In [None]:
# Number of transaction rows that contain at least one missing value.
train_transaction.isna().any(axis=1).sum()

In [None]:
# Column dtypes of the transaction table.
train_transaction.dtypes

In [None]:
# Distinct values of the selected transaction columns, one array per column.
for column in (
    'ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain',
    'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9',
):
    print(train_transaction[column].unique())

In [None]:
# Most frequent value(s) of each M1..M9 column.
for column in ('M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9'):
    print(train_transaction[column].mode())

In [None]:
# Fraud / non-fraud counts among purchaser e-mail domain 'anonymous.com'.
is_anonymous = train_transaction['P_emaildomain'] == 'anonymous.com'
train_transaction.loc[is_anonymous, 'isFraud'].value_counts()

In [None]:
# Left join: keep every transaction, attach identity columns where a match exists.
train_merged = train_transaction.merge(train_identity, how='left', on='TransactionID')

In [None]:
# Fraud-only subset of the merged frame. The mask is built from train_merged
# itself so it always aligns with the rows being selected; a mask taken from
# train_transaction only works while both frames share the same index.
fraud_df = train_merged.loc[train_merged['isFraud'] == 1]

In [None]:
# Distinct values of the same selected columns, restricted to the fraud-only rows.
for column in (
    'ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain',
    'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9',
):
    print(fraud_df[column].unique())

In [None]:
# Preview the transaction table again, for comparison with the merged frame.
train_transaction.head()

In [None]:
# Preview the identity table again, for comparison with the merged frame.
train_identity.head()

In [None]:
# Preview the merged (transaction + identity) frame.
train_merged.head()

In [None]:
# Column order used for modelling: drop four columns and move the target
# 'isFraud' to the end so it can be split off positionally later.
col = list(train_merged.columns)
for unwanted in ('P_emaildomain', 'R_emaildomain', 'DeviceType', 'DeviceInfo', 'isFraud'):
    col.remove(unwanted)
col.append('isFraud')
col

In [None]:
# Keep only the columns listed in `col`, in that order.
train_merged = train_merged[col]

In [None]:
# Names of the object-dtype columns; these are label-encoded before modelling.
cat_col = [name for name, dtype in train_merged.dtypes.items() if dtype == object]

In [None]:
# Distinct values per object-dtype column over all rows.
for name in cat_col:
    distinct = train_merged[name].unique()
    print(f'{name:{10}}: ', distinct)

In [None]:
# Distinct values per object-dtype column over the fraud-only rows.
for name in cat_col:
    distinct = fraud_df[name].unique()
    print(f'{name:{10}}: ', distinct)

In [None]:
# Rows whose id_33 value is the literal string '0x0'.
train_merged[train_merged['id_33'].eq('0x0')]

In [None]:
# Replace every missing value with 0; in object columns this 0 becomes the
# string '0' once the columns are cast to str for label encoding.
train_merged = train_merged.fillna(0)

In [None]:
# One LabelEncoder per object-dtype column, kept in `encoders` so the same
# mapping can be reused on the test set.
encoders = {name: LabelEncoder() for name in cat_col}
for name, encoder in encoders.items():
    train_merged[name] = encoder.fit_transform(train_merged[name].astype(str))

In [None]:
# Preview the merged frame after filling missing values and label encoding.
train_merged.head()

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

In [None]:
# 80/20 train/validation split, stratified on the target.
# The target is the last column (see `col`); everything before it is a feature.
x_train, x_val, y_train, y_val = train_test_split(
    train_merged.iloc[:, :-1],
    train_merged.iloc[:, -1],
    test_size=0.2,
    random_state=42,
    stratify=train_merged.iloc[:, -1],
)

In [None]:
# Drop the raw frames now that the merged frame has been built.
# NOTE: print_row reads the global train_transaction, and the earlier
# exploration cells use both frames, so none of them can be re-run after this cell.
del train_identity, train_transaction

In [None]:
# Baseline: Gaussian naive Bayes on the raw feature matrix.
nb = GaussianNB().fit(x_train.values, y_train.values)
nb

In [None]:
# Per-class training sample counts recorded by the fitted naive Bayes model.
nb.class_count_

In [None]:
# Accuracy of the naive Bayes baseline on the validation split.
# NOTE(review): if the target is heavily imbalanced, accuracy alone can be
# misleading — compare with the predicted-class counts shown further down.
nb.score(x_val.values, y_val.values)

In [None]:
# Hard class predictions of the naive Bayes baseline on the validation split.
# NOTE: the name `pred` is reused later for the random-forest test probabilities.
pred = nb.predict(x_val.values)

In [None]:
# Class labels and their counts in the training and validation targets.
for labels in (y_train, y_val):
    print(np.unique(labels, return_counts=True))


In [None]:
# 'balanced' class weights for the training and validation targets.
# `classes` is passed as an ndarray: recent scikit-learn releases validate
# this parameter and reject a plain Python list.
label_classes = np.array([0, 1])
print(compute_class_weight(class_weight='balanced', classes=label_classes, y=y_train))
print(compute_class_weight(class_weight='balanced', classes=label_classes, y=y_val))

In [None]:
# How many validation rows the naive Bayes baseline assigned to each class.
np.unique(pred, return_counts=True)

In [None]:
from sklearn.ensemble import RandomForestClassifier
import joblib

In [None]:
# Load the cached random forest if one exists; otherwise train it and cache it
# so the notebook also runs from a fresh kernel without a pre-existing rf.pkl.
# NOTE(security): joblib.load unpickles the file — only load a rf.pkl you trust.
try:
    rf = joblib.load('./rf.pkl')
except FileNotFoundError:
    # NOTE(review): the settings used for the original rf.pkl are not recorded
    # in this notebook; library defaults with a fixed seed are used — confirm.
    rf = RandomForestClassifier(n_jobs=-1, random_state=42)
    rf.fit(x_train.values, y_train.values)
    joblib.dump(rf, './rf.pkl')

In [None]:
# Accuracy of the random forest on the validation split.
# NOTE(review): if rf.pkl was fitted on data that includes these rows, this
# number is optimistic — confirm how the pickled model was trained.
rf.score(x_val.values, y_val.values)

In [None]:
# Hard class predictions of the random forest on the validation split.
pred_rf = rf.predict(x_val.values)

In [None]:
# How many validation rows the random forest assigned to each class.
np.unique(pred_rf, return_counts=True)

In [None]:
# Column names of the training frame, for comparison with the test frame.
train_merged.columns.tolist()

In [None]:
test_merged.columns.tolist()

In [None]:
col

In [None]:
cat_col

In [None]:
test_merged.head()

In [None]:
del x_train, x_val, y_train, y_val 

In [None]:
new_cols = [i.replace('_','-') for i in col]
new_cat_col = [i.replace('_','-') for i in cat_col]
test_transaction = pd.read_csv('../input/ieee-fraud-detection/test_transaction.csv')
test_identity = pd.read_csv('../input/ieee-fraud-detection/test_identity.csv')
test_merged = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')
test_merged = test_merged[new_cols[:-1]]
del test_transaction, test_identity
test_merged.fillna(0, inplace=True)

In [None]:
# Encode the test categoricals with the encoders fitted on the training data.
# A fitted LabelEncoder maps each label to its position in `classes_`, so the
# mapping is rebuilt as a dict and applied with `.map`. Unlike
# `encoder.transform`, this cannot raise on labels absent from training.
for c, o_c in zip(new_cat_col, cat_col):
    code_of = {label: code for code, label in enumerate(encoders[o_c].classes_)}
    # Unseen or missing values get the code of the '0' placeholder created by
    # fillna(0), or -1 when the training column never contained that placeholder.
    fallback = code_of.get('0', -1)
    test_merged[c] = (
        test_merged[c].astype(str).map(code_of).fillna(fallback).astype(int)
    )
# Probabilities are computed once, in the next cell.

In [None]:
# Class probabilities from the random forest for every test row.
# This overwrites the earlier `pred` that held the naive Bayes predictions.
pred = rf.predict_proba(test_merged.values)

In [None]:
# Second probability column; this is what gets written out as 'isFraud'.
pred[:,1]

In [None]:
# Write the submission file: one row per test transaction with the value of
# the second probability column.
submission = pd.DataFrame({
    'TransactionID': test_merged['TransactionID'],
    'isFraud': pred[:, 1],
})
submission.to_csv('submission.csv', index=False)