In [1]:
import numpy as np
import pandas as pd

In [2]:
from matplotlib import pyplot as plt
import seaborn as sns

In [3]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.exceptions import ConvergenceWarning

# Load transaction dataset

In [4]:
df1 = pd.read_csv('train_transaction.csv')

In [5]:
df1.shape

(590540, 394)

In [6]:
df1.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 590540 entries, 0 to 590539
Columns: 394 entries, TransactionID to V339
dtypes: float64(376), int64(4), object(14)
memory usage: 1.7+ GB


# Load identity dataset

In [8]:
df2 = pd.read_csv('train_identity.csv')

In [9]:
df2.shape

(144233, 41)

# Merge both dataset

In [10]:
df3 = pd.merge(df1, df2, how='left')

In [11]:
df3.shape

(590540, 434)

In [12]:
del df1,df2

In [13]:
df3.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


# Missing Values

In [14]:
def null_percentage(df):
    output = round(df.isnull().sum()/len(df.index)*100,2)
    return output

In [15]:
Na_col = null_percentage(df3)
Na_col

TransactionID      0.00
isFraud            0.00
TransactionDT      0.00
TransactionAmt     0.00
ProductCD          0.00
                  ...  
id_36             76.13
id_37             76.13
id_38             76.13
DeviceType        76.16
DeviceInfo        79.91
Length: 434, dtype: float64

In [16]:
Na_col40 = Na_col[Na_col>40]

Na_col40

dist1            59.65
dist2            93.63
R_emaildomain    76.75
D2               47.55
D3               44.51
                 ...  
id_36            76.13
id_37            76.13
id_38            76.13
DeviceType       76.16
DeviceInfo       79.91
Length: 232, dtype: float64

### drop columns with more than 40% null values

In [17]:
new_df = df3.drop(Na_col40.index,axis = 1)

In [18]:
new_df.shape

(590540, 202)

In [19]:
l = new_df.columns

In [20]:
cols = []
for i in l:
    cols.append(i)
cols

['TransactionID',
 'isFraud',
 'TransactionDT',
 'TransactionAmt',
 'ProductCD',
 'card1',
 'card2',
 'card3',
 'card4',
 'card5',
 'card6',
 'addr1',
 'addr2',
 'P_emaildomain',
 'C1',
 'C2',
 'C3',
 'C4',
 'C5',
 'C6',
 'C7',
 'C8',
 'C9',
 'C10',
 'C11',
 'C12',
 'C13',
 'C14',
 'D1',
 'D4',
 'D10',
 'D15',
 'M6',
 'V12',
 'V13',
 'V14',
 'V15',
 'V16',
 'V17',
 'V18',
 'V19',
 'V20',
 'V21',
 'V22',
 'V23',
 'V24',
 'V25',
 'V26',
 'V27',
 'V28',
 'V29',
 'V30',
 'V31',
 'V32',
 'V33',
 'V34',
 'V35',
 'V36',
 'V37',
 'V38',
 'V39',
 'V40',
 'V41',
 'V42',
 'V43',
 'V44',
 'V45',
 'V46',
 'V47',
 'V48',
 'V49',
 'V50',
 'V51',
 'V52',
 'V53',
 'V54',
 'V55',
 'V56',
 'V57',
 'V58',
 'V59',
 'V60',
 'V61',
 'V62',
 'V63',
 'V64',
 'V65',
 'V66',
 'V67',
 'V68',
 'V69',
 'V70',
 'V71',
 'V72',
 'V73',
 'V74',
 'V75',
 'V76',
 'V77',
 'V78',
 'V79',
 'V80',
 'V81',
 'V82',
 'V83',
 'V84',
 'V85',
 'V86',
 'V87',
 'V88',
 'V89',
 'V90',
 'V91',
 'V92',
 'V93',
 'V94',
 'V95',
 'V96',
 

### get the null values in all columns

In [21]:
nullvalues = []
for i in l:
    print(i ,'--->',new_df[i].isnull().sum())

TransactionID ---> 0
isFraud ---> 0
TransactionDT ---> 0
TransactionAmt ---> 0
ProductCD ---> 0
card1 ---> 0
card2 ---> 8933
card3 ---> 1565
card4 ---> 1577
card5 ---> 4259
card6 ---> 1571
addr1 ---> 65706
addr2 ---> 65706
P_emaildomain ---> 94456
C1 ---> 0
C2 ---> 0
C3 ---> 0
C4 ---> 0
C5 ---> 0
C6 ---> 0
C7 ---> 0
C8 ---> 0
C9 ---> 0
C10 ---> 0
C11 ---> 0
C12 ---> 0
C13 ---> 0
C14 ---> 0
D1 ---> 1269
D4 ---> 168922
D10 ---> 76022
D15 ---> 89113
M6 ---> 169360
V12 ---> 76073
V13 ---> 76073
V14 ---> 76073
V15 ---> 76073
V16 ---> 76073
V17 ---> 76073
V18 ---> 76073
V19 ---> 76073
V20 ---> 76073
V21 ---> 76073
V22 ---> 76073
V23 ---> 76073
V24 ---> 76073
V25 ---> 76073
V26 ---> 76073
V27 ---> 76073
V28 ---> 76073
V29 ---> 76073
V30 ---> 76073
V31 ---> 76073
V32 ---> 76073
V33 ---> 76073
V34 ---> 76073
V35 ---> 168969
V36 ---> 168969
V37 ---> 168969
V38 ---> 168969
V39 ---> 168969
V40 ---> 168969
V41 ---> 168969
V42 ---> 168969
V43 ---> 168969
V44 ---> 168969
V45 ---> 168969
V46 ---> 1689

In [22]:
new_df.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,135.0,0.0,0.0,0.0,50.0,1404.0,790.0,0.0,0.0,0.0
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Handle Missing Values 

In [23]:
from sklearn.impute import SimpleImputer

In [24]:
# for card2 column (mean)

In [25]:
imputer = SimpleImputer(strategy='mean')  # Other strategies: 'median', 'most_frequent', 'constant'
new_df['card2'] = imputer.fit_transform(new_df[['card2']])

In [26]:
# for card3 column (median)

In [27]:
imputer = SimpleImputer(strategy='median')  # Other strategies: 'median', 'most_frequent', 'constant'
new_df['card3'] = imputer.fit_transform(new_df[['card3']])

In [28]:
# for card4 column(mode)

In [29]:
new_df.card4.value_counts()

visa                384767
mastercard          189217
american express      8328
discover              6651
Name: card4, dtype: int64

In [30]:
mode = new_df['card4'].mode()[0]
new_df['card4'] = new_df['card4'].fillna(mode)

In [31]:
# for card5 column (median)

In [32]:
new_df.card5.value_counts()

226.0    296546
224.0     81513
166.0     57140
102.0     29105
117.0     25941
          ...  
221.0         1
167.0         1
115.0         1
165.0         1
234.0         1
Name: card5, Length: 119, dtype: int64

In [33]:
imputer = SimpleImputer(strategy='median')  # Other strategies: 'median', 'most_frequent', 'constant'
new_df['card5'] = imputer.fit_transform(new_df[['card5']])

In [34]:
# for card6 column (mode)

In [35]:
new_df.card6.value_counts()

debit              439938
credit             148986
debit or credit        30
charge card            15
Name: card6, dtype: int64

In [36]:
mode = new_df['card6'].mode()[0]
new_df['card6'] = new_df['card6'].fillna(mode)

In [37]:
# for addr1 and addr2 (mean)

In [38]:
columns_to_impute = ['addr1', 'addr2']

# Initialize the SimpleImputer with the desired strategy
imputer = SimpleImputer(strategy='mean')  # Other strategies: 'median', 'most_frequent', 'constant'

# Apply the imputer to the selected columns
new_df[columns_to_impute] = imputer.fit_transform(new_df[columns_to_impute])

In [39]:
# for P_emaildomain column

In [40]:
new_df.P_emaildomain.value_counts

<bound method IndexOpsMixin.value_counts of 0                 NaN
1           gmail.com
2         outlook.com
3           yahoo.com
4           gmail.com
             ...     
590535            NaN
590536      gmail.com
590537      gmail.com
590538        aol.com
590539      gmail.com
Name: P_emaildomain, Length: 590540, dtype: object>

In [41]:
mode = new_df['P_emaildomain'].mode()[0]
new_df['P_emaildomain'] = new_df['P_emaildomain'].fillna(mode)

In [42]:
# for column D1,D10,D15 (median)

In [43]:
columns_to_impute = ['D1', 'D10','D15']

# Initialize the SimpleImputer with the desired strategy
imputer = SimpleImputer(strategy='median')  # Other strategies: 'median', 'most_frequent', 'constant'

# Apply the imputer to the selected columns
new_df[columns_to_impute] = imputer.fit_transform(new_df[columns_to_impute])

In [44]:
# for V columns

In [45]:
v_columns = [col for col in new_df.columns if col.startswith("V")]
v_df = new_df[v_columns]

In [46]:
v_df.head()

Unnamed: 0,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,...,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321
0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,135.0,0.0,0.0,0.0,50.0,1404.0,790.0,0.0,0.0,0.0
4,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
columns_to_impute = v_df.columns

# Initialize the SimpleImputer with the desired strategy
imputer = SimpleImputer(strategy='median')  # Other strategies: 'median', 'most_frequent', 'constant'

# Apply the imputer to the selected columns
v_df[columns_to_impute] = imputer.fit_transform(v_df[columns_to_impute])

In [48]:
# drop columns starting from V from org df

In [49]:
v_columns = [col for col in new_df.columns if col.startswith("V")]

# Drop the columns starting with "V"
new_df.drop(columns=v_columns,axis=1,inplace=True)

In [50]:
# merge both df

In [51]:
new_df = pd.concat([new_df,v_df], axis=1)

In [52]:
# D4

In [53]:
imputer = SimpleImputer(strategy='mean')  # Other strategies: 'median', 'most_frequent', 'constant'
new_df['D4'] = imputer.fit_transform(new_df[['D4']])

In [54]:
# M6

In [55]:
new_df.M6.value_counts

<bound method IndexOpsMixin.value_counts of 0           T
1           T
2           F
3           F
4         NaN
         ... 
590535      F
590536      T
590537      T
590538      T
590539      T
Name: M6, Length: 590540, dtype: object>

In [56]:
mode = new_df['M6'].mode()[0]
new_df['M6'] = new_df['M6'].fillna(mode)

# Label Encoder

In [57]:
new_df.select_dtypes(include=['object'])

Unnamed: 0,ProductCD,card4,card6,P_emaildomain,M6
0,W,discover,credit,gmail.com,T
1,W,mastercard,credit,gmail.com,T
2,W,visa,debit,outlook.com,F
3,W,mastercard,debit,yahoo.com,F
4,H,mastercard,credit,gmail.com,F
...,...,...,...,...,...
590535,W,visa,debit,gmail.com,F
590536,W,mastercard,debit,gmail.com,T
590537,W,mastercard,debit,gmail.com,T
590538,W,mastercard,debit,aol.com,T


In [58]:
from sklearn.preprocessing import LabelEncoder

In [59]:
label_encoder = LabelEncoder()

# Specify columns to label encode
columns_to_encode = ['P_emaildomain']

# Apply label encoding to the specified columns
for column in columns_to_encode:
    new_df[column] = label_encoder.fit_transform(new_df[column])

In [60]:

# Specify columns to one-hot encode
columns_to_encode = ['ProductCD', 'card4', 'card6','M6']

# Apply one-hot encoding to the specified columns
new_df = pd.get_dummies(new_df, columns=columns_to_encode, prefix=columns_to_encode)

# Anova 

In [61]:
new_cpy = new_df

In [62]:
new_cpy.shape

(590540, 213)

In [63]:
fraud_group = new_df[new_df['isFraud'] == 1]
non_fraud_group = new_df[new_df['isFraud'] == 0]

In [64]:
from scipy.stats import f_oneway

selected_features = []

for feature in new_df.columns:
    if feature != 'isFraud':
        f_statistic, p_value = f_oneway(fraud_group[feature], non_fraud_group[feature])
        
        if p_value < 0.05:  # You can adjust the significance level
            selected_features.append(feature)

print("Selected Features:", selected_features)

Selected Features: ['TransactionID', 'TransactionDT', 'TransactionAmt', 'card1', 'card2', 'card3', 'card5', 'addr1', 'addr2', 'P_emaildomain', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'D4', 'D10', 'D15', 'V12', 'V13', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50', 'V51', 'V52', 'V53', 'V54', 'V55', 'V56', 'V57', 'V58', 'V59', 'V60', 'V61', 'V62', 'V63', 'V64', 'V65', 'V66', 'V67', 'V68', 'V69', 'V70', 'V71', 'V72', 'V73', 'V74', 'V75', 'V76', 'V77', 'V78', 'V79', 'V80', 'V81', 'V82', 'V83', 'V84', 'V85', 'V86', 'V87', 'V89', 'V90', 'V91', 'V92', 'V93', 'V94', 'V95', 'V96', 'V97', 'V98', 'V99', 'V100', 'V101', 'V102', 'V103', 'V104', 'V105', 'V107', 'V108', 'V109', 'V110', 'V111', 'V112', 'V113', 'V114', 'V115', 'V116', 'V117', 'V11

In [65]:
len(selected_features)

190

In [66]:
new_df = new_df[selected_features]

In [67]:
new_df.shape

(590540, 190)

In [68]:
new_df['isFraud'] = new_cpy['isFraud']

In [69]:
new_df.shape

(590540, 191)

In [70]:
new_df.drop('TransactionID',axis=1,inplace=True)

# SMOTE 

In [71]:
from imblearn.over_sampling import SMOTE

In [72]:
smote = SMOTE(sampling_strategy='auto', random_state=42)

In [73]:
new_df.isFraud.value_counts()

0    569877
1     20663
Name: isFraud, dtype: int64

In [74]:
x = new_df.drop('isFraud', axis=1)
y = new_df['isFraud']

In [75]:
x_sm, y_sm = smote.fit_resample(x, y)

In [76]:
y.value_counts()

0    569877
1     20663
Name: isFraud, dtype: int64

In [77]:
y_sm.value_counts()

0    569877
1    569877
Name: isFraud, dtype: int64

# Feature Scaling

In [78]:
from sklearn.preprocessing import StandardScaler

In [79]:
scaler = StandardScaler()

In [80]:
x_sm = scaler.fit_transform(x_sm)

In [81]:
x_sm

array([[-1.64370083, -0.32341659,  0.88965729, ..., -1.08850307,
        -1.54592863,  2.05044688],
       [-1.6437006 , -0.50854508, -1.47535936, ..., -1.08850307,
        -1.54592863,  2.05044688],
       [-1.64368558, -0.36794116, -1.07141595, ...,  0.91869286,
         0.64686039, -0.48769856],
       ...,
       [-0.28742106, -0.37596041,  1.6753145 , ..., -1.08850307,
         0.64686039, -0.48769856],
       [ 1.75514519,  4.5297619 ,  0.4575564 , ..., -1.08850307,
         0.64686039, -0.48769856],
       [-0.51737327, -0.41774037,  0.13131071, ..., -1.08850307,
        -1.54592863, -0.48769856]])

# Train test split 

In [82]:
from sklearn.model_selection import train_test_split

In [83]:
x_train, x_test, y_train, y_test = train_test_split(x_sm, y_sm, test_size=0.25)

# Logistic Reg 

In [84]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [85]:
model = LogisticRegression()

model.fit(x_train, y_train)

In [86]:
y_pred = model.predict(x_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8949354072275119
Confusion Matrix:
 [[135799   6926]
 [ 23011 119203]]
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.95      0.90    142725
           1       0.95      0.84      0.89    142214

    accuracy                           0.89    284939
   macro avg       0.90      0.89      0.89    284939
weighted avg       0.90      0.89      0.89    284939



# Decision tree

In [87]:
from sklearn.tree import DecisionTreeClassifier

In [88]:
model = DecisionTreeClassifier(random_state=42)

# Train the model on the training data
model.fit(x_train, y_train)

In [89]:
y_pred = model.predict(x_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9798377898427383
Confusion Matrix:
 [[139502   3223]
 [  2522 139692]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98    142725
           1       0.98      0.98      0.98    142214

    accuracy                           0.98    284939
   macro avg       0.98      0.98      0.98    284939
weighted avg       0.98      0.98      0.98    284939



# XGB 

In [90]:
import xgboost as xgb

In [91]:
xgb_classifier = xgb.XGBClassifier(
    objective='binary:logistic',  # For binary classification
    learning_rate=0.1,
    max_depth=3,
    n_estimators=100
)

In [92]:
xgb_classifier.fit(x_train, y_train)

In [93]:
y_pred = xgb_classifier.predict(x_test)

In [94]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9684142921818354
Confusion Matrix:
 [[140101   2624]
 [  6376 135838]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.98      0.97    142725
           1       0.98      0.96      0.97    142214

    accuracy                           0.97    284939
   macro avg       0.97      0.97      0.97    284939
weighted avg       0.97      0.97      0.97    284939

