In [65]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

In [66]:
import warnings
warnings.filterwarnings("ignore")

In [67]:
train_trans = pd.read_csv("train_transaction.csv")
train_ident = pd.read_csv("train_identity.csv")

test_trans = pd.read_csv("test_transaction.csv")
test_ident = pd.read_csv("test_identity.csv")

train = train_trans.merge(train_ident, on = "TransactionID", how = "left")
test = test_trans.merge(test_ident, on = "TransactionID", how = "left")

In [68]:
train.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


In [69]:
test.head()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id-31,id-32,id-33,id-34,id-35,id-36,id-37,id-38,DeviceType,DeviceInfo
0,3663549,18403224,31.95,W,10409,111.0,150.0,visa,226.0,debit,...,,,,,,,,,,
1,3663550,18403263,49.0,W,4272,111.0,150.0,visa,226.0,debit,...,,,,,,,,,,
2,3663551,18403310,171.0,W,4476,574.0,150.0,visa,226.0,debit,...,,,,,,,,,,
3,3663552,18403310,284.95,W,10989,360.0,150.0,visa,166.0,debit,...,,,,,,,,,,
4,3663553,18403317,67.95,W,18018,452.0,150.0,mastercard,117.0,debit,...,,,,,,,,,,


In [70]:
train["isFraud"].value_counts(normalize=True)

isFraud
0    0.96501
1    0.03499
Name: proportion, dtype: float64

In [71]:
train.isnull().sum()

TransactionID          0
isFraud                0
TransactionDT          0
TransactionAmt         0
ProductCD              0
                   ...  
id_36             449555
id_37             449555
id_38             449555
DeviceType        449730
DeviceInfo        471874
Length: 434, dtype: int64

In [72]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer # for nan

In [74]:
high_null_cols = train.columns[train.isnull().mean() > 0.90]
train.drop(columns = high_null_cols, inplace = True)
test.drop(columns = high_null_cols, inplace = True)

In [75]:
y = train["isFraud"].values
X = train.drop(columns = ["isFraud", "TransactionID"])

In [76]:
numeric_features = X.select_dtypes(include = ["int64", "float64"]).columns.tolist()
categorical_features = X.select_dtypes(include = ["object"]).columns.tolist()

In [77]:
low_cardinality_cats = [c for c in categorical_features if X[c].nunique() < 50]
high_cardinality_cats = [c for c in categorical_features if c not in low_card_cats]

X = X.drop(columns = high_cardinality_cats)

numeric_features = X.select_dtypes(include = ["int64", "float64"]).columns.tolist()
categorical_features = X.select_dtypes(include=["object"]).columns.tolist()

In [78]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [79]:
numeric_transformer = Pipeline(steps = [
    ("imputer", SimpleImputer(strategy = "median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps = [
    ("imputer", SimpleImputer(strategy = "most_frequent")),
    ("onehot", OneHotEncoder(drop = "first", handle_unknown = "ignore"))
])

preprocessor = ColumnTransformer(
    transformers = [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder = "drop"
)

In [80]:
pipeline = Pipeline(steps = [
    ("preprocessor", preprocessor),
    ("model", LogisticRegression(max_iter = 1000, class_weight="balanced"))
])

In [81]:
pipeline.fit(X_train, y_train)

In [82]:
y_predictions = pipeline.predict(X_test)

In [83]:
print(classification_report(y_test, y_predictions))

              precision    recall  f1-score   support

           0       0.99      0.84      0.91    113975
           1       0.14      0.73      0.23      4133

    accuracy                           0.83    118108
   macro avg       0.56      0.78      0.57    118108
weighted avg       0.96      0.83      0.88    118108



In [84]:
confusion_matrix(y_test, y_predictions)

array([[95222, 18753],
       [ 1118,  3015]])

In [85]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, y_predictions)

0.7824791157916792