## Import Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
from keras.models import Model, Sequential
from keras.layers import Input, Dense
from keras.optimizers import Adam
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import warnings
warnings.filterwarnings("ignore")

## Loading data

In [2]:
train = pd.read_csv(r"../data/train.csv")
test = pd.read_csv(r"../data/test.csv")

train.columns = [col.strip().lower() for col in train.columns]
test.columns = [col.strip().lower() for col in test.columns]

### Submission 1: Autoencoders

In [3]:
X_train = train.drop(['class'], axis=1)
y_train = train['class']

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(test)

X_train_non_fraud = X_train_scaled[y_train == 0]

In [4]:
input_dim = X_train_non_fraud.shape[1]
encoding_dim = 14

input_layer = Input(shape=(input_dim,))
encoder = Dense(encoding_dim, activation="tanh")(input_layer)
encoder = Dense(int(encoding_dim / 2), activation="relu")(encoder)
decoder = Dense(int(encoding_dim / 2), activation='tanh')(encoder)
decoder = Dense(input_dim, activation='relu')(decoder)

autoencoder = Model(inputs=input_layer, outputs=decoder)
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

autoencoder.fit(X_train_non_fraud, X_train_non_fraud,
                epochs=100,
                batch_size=256,
                shuffle=True,
                validation_split=0.2,
                verbose=1)

Epoch 1/100
[1m533/533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 10ms/step - loss: 0.8680 - val_loss: 1.0224
Epoch 2/100
[1m533/533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: 0.7445 - val_loss: 0.9818
Epoch 3/100
[1m533/533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - loss: 0.7136 - val_loss: 0.9593
Epoch 4/100
[1m533/533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: 0.6956 - val_loss: 0.9419
Epoch 5/100
[1m533/533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - loss: 0.6832 - val_loss: 0.9322
Epoch 6/100
[1m533/533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step - loss: 0.6992 - val_loss: 0.9280
Epoch 7/100
[1m533/533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.6774 - val_loss: 0.9244
Epoch 8/100
[1m533/533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - loss: 0.6701 - val_loss: 0.9194
Epoch 9/100
[1m533/533[0m [

<keras.src.callbacks.history.History at 0x2b3cda4a860>

In [5]:
test_pred = autoencoder.predict(X_test_scaled)
mse_test = np.mean(np.power(X_test_scaled - test_pred, 2), axis=1)

mse_min, mse_max = mse_test.min(), mse_test.max()
fraud_prob = (mse_test - mse_min) / (mse_max - mse_min)

[1m3561/3561[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 4ms/step


In [6]:
test['Class'] = fraud_prob.round(2)
submission = test[['id', 'Class']]
submission.to_csv(r"../data/submission_with_autoencoder.csv", index=False)

### Submission 2: Random Forest Classifier (with best parameters)

In [7]:
X = train.drop('class', axis=1)
y = train['class']

#### Submission 2.1. RFC {max_depth=10, n_estimators=100, random_state=42}

In [8]:
clf1 = RandomForestClassifier(max_depth=10, n_estimators=100, random_state=42)
clf1.fit(X, y)

sub1 = test.copy(deep=True)
submission_prob = clf1.predict_proba(sub1)[:, 1]
sub1['Class'] = submission_prob.round(2)
submission1 = sub1[['id','Class']]
submission1.to_csv(r"../data/second_iteration_submission1.csv",index=False)

#### Submission 2.2. RFC {class_weight='balanced', max_depth=10, n_estimators=100, random_state=42}

In [None]:
clf2 = RandomForestClassifier(class_weight='balanced', max_depth=10, n_estimators=100, random_state=42)
clf2.fit(X, y)

sub2 = test.copy(deep=True)
submission_prob = clf2.predict_proba(sub2)[:, 1]
sub2['Class'] = submission_prob.round(2)
submission2 = sub2[['id','Class']]
submission2.to_csv(r"../data/second_iteration_submission2.csv",index=False)

#### Submission 2.3. RFC {class_weight='balanced', max_depth=10, n_estimators=200, random_state=42}

In [None]:
clf3 = RandomForestClassifier(class_weight='balanced', max_depth=10, n_estimators=200, random_state=42)
clf3.fit(X, y)

sub3 = test.copy(deep=True)
submission_prob = clf3.predict_proba(sub3)[:, 1]
sub3['Class'] = submission_prob.round(2)
submission3 = sub3[['id','Class']]
submission3.to_csv(r"../data/second_iteration_submission3.csv",index=False)

### Submission 3: Random Forest Classifier (with oversampling the unbalanced data using SMOTE)

In [None]:
X = train.drop('class', axis=1)
y = train['class']

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

print(pd.Series(y_resampled).value_counts())

In [None]:
rf = RandomForestClassifier(random_state=42, 
                            max_depth=10,
                            min_samples_leaf=4,
                            min_samples_split=10,
                            n_estimators=100)

rf.fit(X_resampled, y_resampled)

In [None]:
test = pd.read_csv(r"../data/test.csv")
test.columns = [col.strip().lower() for col in test.columns]

submission_prob = rf.predict_proba(test)[:, 1]
test['Class'] = submission_prob.round(1)
submission = test[['id','Class']]
submission.to_csv(r"../data/submission_with_smote.csv",index=False)