# ML Project Station 1 -- Online Fraud Detection

Tatev Stepanyan

README file included

In [8]:
# importing the libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler


## First look at the data and cleaning

In [None]:
# importing the dataset
#
# The CSV is read from the working directory when it is present there.
# On Google Colab, if the file is not found locally, Google Drive is mounted
# and the file is read from Drive instead.
import os

DATA_FILE = "PS_20174392719_1491204439457_log.csv"
data_path = DATA_FILE

try:
    from google.colab import drive
except ImportError:
    # not running on Google Colab: there is nothing to mount
    drive = None

if drive is not None and not os.path.exists(DATA_FILE):
    # drive.mount() expects a mount-point directory, not the CSV file name
    drive.mount('/content/drive')
    # NOTE(review): assumes the CSV sits at the top level of "My Drive" --
    # adjust the path if it is stored in a sub-folder
    data_path = os.path.join('/content/drive', 'MyDrive', DATA_FILE)

df = pd.read_csv(data_path)


In [None]:
# preview the first rows of the loaded dataset
df.head()


In [None]:
# column names, dtypes and non-null counts
df.info()


In [None]:
# summary statistics of the dataset columns
df.describe()


### Dataset features

    step: represents a unit of time where 1 step equals 1 hour
    type: type of online transaction
    amount: the amount of the transaction
    nameOrig: customer starting the transaction
    oldbalanceOrg: balance before the transaction
    newbalanceOrig: balance after the transaction
    nameDest: recipient of the transaction
    oldbalanceDest: initial balance of recipient before the transaction
    newbalanceDest: the new balance of recipient after the transaction
    isFraud: whether the transaction is fraudulent (1 = fraud)

## Visualization

In [None]:
# pie chart of the fraudulent transactions only, sliced by transaction type
# and weighted by the transaction amount
fraud_rows = df[df['isFraud'] == 1]
fig1 = px.pie(
    fraud_rows,
    values="amount",
    names="type",
    title="Fraudulent transactions by type",
)
fig1.show()


In [None]:
# same pie chart for every transaction in the dataset
fig2 = px.pie(
    data_frame=df,
    values="amount",
    names="type",
    title="All transactions by type",
)
fig2.show()


In [None]:
# removing the columns that are not used as model inputs:
# the sender / recipient names and the isFlaggedFraud column
unused_columns = ['nameDest', 'nameOrig', 'isFlaggedFraud']
df.drop(columns=unused_columns, inplace=True)


In [None]:
# checking if there are null values. if yes, fill them with the column means

if not df.isnull().values.any():
    print("No missing values in the dataset")
else:
    # numeric_only=True: the frame still holds the text column 'type', and
    # DataFrame.mean() raises a TypeError on non-numeric columns in pandas >= 2.0.
    # Only numeric columns are filled; missing values in 'type' would remain.
    df.fillna(df.mean(numeric_only=True), inplace=True)
    print("Missing values filled with mean values")


Conclusion: no NA values, categorical and numerical features are present

## Data preprocessing

In [None]:
# converting categorical variables into numerical for the further processes

# x_cols = ['step', 'hourtype', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']
# x_col = ['type']

# X = df[x_cols]
# X_categorical = df[x_col]

# cat_encoder = OneHotEncoder()
# X_encoded = cat_encoder.fit_transform(X_categorical)
# X_encoded_arr = X_encoded.toarray()

# X_new = pd.concat([X.reset_index(drop=True), pd.DataFrame(X_encoded_arr)], axis=1)

# --------------------------

# numeric columns are taken as they are; 'type' is one-hot encoded into
# 'type_*' indicator columns and appended to them
x_cols = [
    'step',
    'amount',
    'oldbalanceOrg',
    'newbalanceOrig',
    'oldbalanceDest',
    'newbalanceDest',
]
X = df.loc[:, x_cols]

X_encoded = pd.get_dummies(df['type'], prefix='type')
X_new = pd.concat((X, X_encoded), axis='columns')


In [None]:
# feature scaling: standardise every column of X_new
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split -- consider fitting it on the training rows only

scaler = StandardScaler()
scaler.fit(X_new)
X_scaled = scaler.transform(X_new)


In [None]:
# the features are ready; the target variable is the isFraud column

y = df.loc[:, 'isFraud']

In [None]:
# hold out 20% of the rows for testing; the seed is fixed so the split is repeatable

split = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split
print(f"Training set size: {X_train.shape}, Testing set size: {X_test.shape}")


## Model testing and fitting

### 1. Decision tree model

In [None]:
# create a Decision tree with default settings (fixed seed) and train it
# on the training split

tree_settings = {'random_state': 42}
dec_tree_model = DecisionTreeClassifier(**tree_settings)
dec_tree_model.fit(X=X_train, y=y_train)


In [None]:
# accuracy of the untuned Decision tree on the test split

accuracy = dec_tree_model.score(X=X_test, y=y_test)
print(f'Accuracy for Decision tree model: {accuracy}')


In [None]:
# class labels predicted by the untuned Decision tree for the test split

dec_tree_pred = dec_tree_model.predict(X=X_test)
print(f'Decision tree predictions: {dec_tree_pred}')


#### Hyperparameter tuning for the Decision tree model

In [None]:
# exhaustive search over tree depth, split / leaf sizes and split criterion,
# scored by accuracy with 5-fold cross-validation on the training split
param_grid = dict(
    max_depth=[None, 10, 20, 30, 40, 50],
    min_samples_split=[2, 10, 20],
    min_samples_leaf=[1, 5, 10],
    criterion=['gini', 'entropy'],
)

base_tree = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=base_tree,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(X_train, y_train)


In [None]:
# report the winning parameter combination, then evaluate the refitted
# best estimator on the test split
best_model = grid_search.best_estimator_
print(f'Best parameters: {grid_search.best_params_}')

best_acc = best_model.score(X=X_test, y=y_test)
print(f'Best Decision tree accuracy: {best_acc}')

# test-split predictions of the tuned tree
y_pred = best_model.predict(X=X_test)


In [None]:
# MSE and RMSE of the model

# new_mse = mean_squared_error(y_test, y_pred)
# new_rmse = np.sqrt(new_mse)
# new_r2 = r2_score(y_test, y_pred)

# dec_tree_mse = mean_squared_error(y_test, dec_tree_pred)
# dec_tree_rmse = np.sqrt(dec_tree_mse)
# dec_tree_r2 = r2_score(y_test, dec_tree_pred)

# print("Decision tree model VS Tuned model")
# print(f"\tMSE: {dec_tree_mse} VS {new_mse}")
# print(f"\tRMSE: {dec_tree_rmse} VS {new_rmse}")
# print(f"\tR2 score: {dec_tree_r2} VS {new_r2}")


### 2. Logistic regression

In [None]:
# same steps as for the Decision tree, without the hyperparameter tuning;
# the remaining models below follow the same pattern

logistic_settings = {'max_iter': 1000}
logistic_reg = LogisticRegression(**logistic_settings)
logistic_reg.fit(X=X_train, y=y_train)


In [None]:
# accuracy of the Logistic regression model on the test split
logistic_reg_accuracy = logistic_reg.score(X=X_test, y=y_test)
print(f'Accuracy for Logistic regression model: {logistic_reg_accuracy}')


In [None]:
# class labels predicted by the Logistic regression model for the test split
# (the fitted estimator is named `logistic_reg`)
logistic_reg_pred = logistic_reg.predict(X_test)
print(f'Logistic regression predictions: {logistic_reg_pred}')


In [None]:
# logistic_reg_mse = mean_squared_error(y_test, logistic_reg_pred)
# logistic_reg_rmse = np.sqrt(logistic_reg_mse)
# logistic_reg_r2 = r2_score(y_test, logistic_reg_pred)

# print(f"Logistic regression MSE: {logistic_reg_mse}")
# print(f"Logistic regression RMSE: {logistic_reg_rmse}")
# print(f"Logistic regression R2 score: {logistic_reg_r2}")


### 3. Random forest

In [None]:
# Random forest with default settings (fixed seed), trained on the training split
forest_settings = {'random_state': 42}
random_forest = RandomForestClassifier(**forest_settings)
random_forest.fit(X=X_train, y=y_train)


In [None]:
# accuracy of the Random forest model on the test split
random_forest_accuracy = random_forest.score(X=X_test, y=y_test)
print(f'Accuracy for Random forest model: {random_forest_accuracy}')


In [None]:
# class labels predicted by the Random forest model for the test split
random_forest_pred = random_forest.predict(X_test)
# the stray double quote that used to follow the colon has been removed
print(f'Random Forest predictions: {random_forest_pred}')


In [None]:
# random_forest_mse = mean_squared_error(y_test, random_forest_pred)
# random_forest_rmse = np.sqrt(random_forest_mse)
# random_forest_r2 = r2_score(y_test, random_forest_pred)

# print(f"Random forest MSE: {random_forest_mse}")
# print(f"Random forest RMSE: {random_forest_rmse}")
# print(f"Random forest R2 score: {random_forest_r2}")


In [None]:
# horizontal bar chart of the Random forest feature importances, largest first
#
# The importance table is built in this cell, so it does not rely on a table
# created elsewhere. X_new holds the columns the model was trained on
# (before scaling), in training order.
importance_df_sorted = pd.DataFrame({
    'Feature': list(X_new.columns),
    'Random forest importance': random_forest.feature_importances_,
}).sort_values('Random forest importance', ascending=False)

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(importance_df_sorted['Feature'], importance_df_sorted['Random forest importance'], color='blue')
ax.set_xlabel('Random forest importance')
ax.set_ylabel('Feature')
ax.set_title('Feature importances from Random forest')

# barh draws the first row at the bottom; invert so the top feature is on top
ax.invert_yaxis()

plt.show()

In [None]:
# table of the Random forest feature importances, sorted from most to least
# important, followed by a bar chart of that table
features = x_cols + list(X_encoded.columns)
rf_feature_importances = random_forest.feature_importances_

importance_df = pd.DataFrame({
    'Feature': features,
    'Random forest importance': rf_feature_importances
})

importance_df_sorted = importance_df.sort_values(by='Random forest importance', ascending=False)

plt.figure(figsize=(10, 6))
# draw the bars: without this call the figure shows only a title on empty axes
plt.barh(importance_df_sorted['Feature'], importance_df_sorted['Random forest importance'])
# barh draws the first row at the bottom; invert so the top feature is on top
plt.gca().invert_yaxis()
plt.xlabel('Random forest importance')
plt.title('Feature importance - Random forest')
plt.show()


### 4. Gradient boosting classifier

In [None]:
# Gradient boosting with default settings (fixed seed), trained on the training split
boosting_settings = {'random_state': 42}
gradient_boosting = GradientBoostingClassifier(**boosting_settings)
gradient_boosting.fit(X=X_train, y=y_train)


In [None]:
# accuracy of the Gradient boosting model on the test split
gradient_boosting_accuracy = gradient_boosting.score(X=X_test, y=y_test)
print(f'Accuracy for Gradient boosting model: {gradient_boosting_accuracy}')


In [None]:
# class labels predicted by the Gradient boosting model for the test split
gradient_boosting_pred = gradient_boosting.predict(X=X_test)
print(f'Gradient boosting predictions: {gradient_boosting_pred}')


In [None]:
# gradient_boosting_mse = mean_squared_error(y_test, gradient_boosting_pred)
# gradient_boosting_rmse = np.sqrt(gradient_boosting_mse)
# gradient_boosting_r2 = r2_score(y_test, gradient_boosting_pred)

# print(f"Gradient boosting MSE: {gradient_boosting_mse}")
# print(f"Gradient boosting RMSE: {gradient_boosting_rmse}")
# print(f"Gradient boosting R2 score: {gradient_boosting_r2}")


### Final decision making

In [None]:
# by accuracy

# (label, test accuracy) for every fitted model. The untuned Decision tree
# score is stored in `accuracy` by its own cell; no `dec_tree_accuracy`
# variable exists.
model_accuracies = [
    ("Decision tree classifier", accuracy),
    ("Tuned Decision tree classifier", best_acc),
    ("Logistic regression", logistic_reg_accuracy),
    ("Random forest classifier", random_forest_accuracy),
    ("Gradient boosting classifier", gradient_boosting_accuracy),
]

# max() keeps the first entry on a tie, so the order of the list is the
# tie-breaking priority
best_name, best_accuracy = max(model_accuracies, key=lambda pair: pair[1])

print("\nBest model based on accuracy:")
print(best_name)


In [None]:
# CHAT GPT

# Final evaluation of the tuned Decision tree (`best_model`): confusion
# matrix, classification report, ROC AUC and ROC curve.

# these metric helpers are not part of the notebook's import cell
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
)

# NOTE(review): the model is scored on the whole dataset here, training rows
# included, so the numbers are optimistic compared with the X_test scores above
df['isFlaggedFraud'] = best_model.predict(X_scaled)

# predicted probability of the fraud class (label 1); ROC / AUC need a
# ranking score, hard 0/1 labels reduce the curve to a single operating point
fraud_column = list(best_model.classes_).index(1)
fraud_proba = best_model.predict_proba(X_scaled)[:, fraud_column]

# Comparing the predictions with the actual values
comparison = pd.DataFrame({
    'Actual': df['isFraud'],
    'Predicted': df['isFlaggedFraud']
})

# Confusion matrix
conf_matrix = confusion_matrix(df['isFraud'], df['isFlaggedFraud'])
print("Confusion Matrix:")
print(conf_matrix)

# Classification report
class_report = classification_report(df['isFraud'], df['isFlaggedFraud'])
print("Classification Report:")
print(class_report)

# ROC AUC Score
roc_auc = roc_auc_score(df['isFraud'], fraud_proba)
print(f"ROC AUC Score: {roc_auc}")

# Plotting the ROC Curve
fpr, tpr, thresholds = roc_curve(df['isFraud'], fraud_proba)
plt.figure()
plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
# diagonal = performance of a random classifier
plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()


# The end.