# Title: Project ML:  Fraud Detection for Online Transactions

##### The project follows these steps
1) Data Collection  
2) Data Cleaning and Preprocessing  
3) Visualize the Data  
4) Split the Data  
5) Label encoding  
6) Model Training  
7) Model Evaluation  
8) Optimize the Model  
9) Web Application Development  
10) Deployment and Group Presentation

## 1) Data Collection

## 4) Split the Data

In [1]:
# Import dependencies
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the cleaned train and test CSV files from the Resources folder
RESOURCES_DIR = Path("Resources")
clean_train_data = pd.read_csv(RESOURCES_DIR / "clean_train_data.csv")
clean_test_data = pd.read_csv(RESOURCES_DIR / "clean_test_data.csv")


In [4]:
# Pull the label column (isFraud) out of the cleaned training frame;
# every remaining column is used as a feature
y = clean_train_data["isFraud"]
X = clean_train_data.drop(columns=["isFraud"])

In [5]:
# Hold out 25% of the rows for testing (train_test_split's default size,
# spelled out here); the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

## 5) Label encoding 

In [6]:
# Create a function to apply label encoding to all categorical columns
def label_encode(df, reference=None):
    """Return a copy of ``df`` with every object-dtype column integer-encoded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is not modified.
    reference : pandas.DataFrame, optional
        Frame the category-to-integer mapping is learned from. Defaults to
        ``df`` itself. Pass the training features when encoding the test
        features so that both splits share one mapping.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which each object column holds int64 codes.
        Values that do not occur in ``reference`` are encoded as -1.
    """
    if reference is None:
        reference = df
    # Work on a copy so the caller's frame (a slice produced by
    # train_test_split) is never written to.
    encoded = df.copy()
    for column in df.columns:
        if df[column].dtype == 'object':
            # Learn the sorted class list from the reference frame only.
            classes = LabelEncoder().fit(reference[column]).classes_
            # Categorical codes follow the order of `classes`, matching
            # LabelEncoder's integers for known values, and yield -1 for
            # values that are not in `classes`.
            codes = pd.Categorical(df[column], categories=classes).codes
            encoded[column] = codes.astype("int64")
    return encoded

# Encode the test features with the mapping learned from the training
# features. This must run before X_train is replaced by its encoded copy.
X_test = label_encode(X_test, reference=X_train)
X_train = label_encode(X_train)

# Train and evaluate your model
# (your code for training and evaluating the AdaBoostClassifier)

In [7]:
# Standardize the features: fit the scaler on the training split only,
# then reuse the fitted scaler to transform the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)



## 6) Model Training using RandomForestClassifier

In [None]:
# Fit a 50-tree random forest (fixed seed) on the scaled training split
clf = RandomForestClassifier(
    n_estimators=50,
    random_state=1,
).fit(X_train_scaled, y_train)

## 7) Model Evaluation

In [None]:
# Score the random forest on the training split and on the held-out split
train_score = clf.score(X_train_scaled, y_train)
test_score = clf.score(X_test_scaled, y_test)
print(f'Training Score: {train_score}')
print(f'Testing Score: {test_score}')

## 8)   Model Optimization

In [None]:
# Train an AdaBoostClassifier whose weak learner is a depth-2 decision tree
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

# scikit-learn 1.2 renamed `base_estimator` to `estimator` and 1.4 removed the
# old keyword, so use whichever name the installed version accepts.
weak_learner_kwarg = (
    "estimator"
    if "estimator" in AdaBoostClassifier().get_params(deep=False)
    else "base_estimator"
)
clf_ada = AdaBoostClassifier(
    random_state=1,
    n_estimators=50,
    **{weak_learner_kwarg: DecisionTreeClassifier(max_depth=2)},
).fit(X_train_scaled, y_train)

# Check the performance of the AdaBoostClassifier
print(f'AdaBoost Training Score: {clf_ada.score(X_train_scaled, y_train)}')
print(f'AdaBoost Testing Score: {clf_ada.score(X_test_scaled, y_test)}')

## 9) Final Model Evaluation

In [8]:
# Import classifiers
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report


def _fit_and_report(model, heading):
    """Fit ``model`` on the scaled training split and print its results.

    Prints ``heading``, the training and testing scores, and the
    classification report for the test split.

    Returns the fitted model and its predictions for the test split.
    """
    fitted = model.fit(X_train_scaled, y_train)
    print(heading)
    print(f'Training Score: {fitted.score(X_train_scaled, y_train)}')
    print(f'Testing Score: {fitted.score(X_test_scaled, y_test)}')
    predictions = fitted.predict(X_test_scaled)
    print(classification_report(y_test, predictions))
    return fitted, predictions


# Random forest
rfc, y_pred_rfc = _fit_and_report(
    RandomForestClassifier(random_state=1, n_estimators=50),
    "Random Forest Classifier:",
)

# Extremely randomized trees
etc, y_pred_etc = _fit_and_report(
    ExtraTreesClassifier(random_state=1, n_estimators=50),
    "\nExtremely Random Trees Classifier:",
)

# AdaBoost with its default weak learner
abc, y_pred_abc = _fit_and_report(
    AdaBoostClassifier(random_state=1, n_estimators=50),
    "\nAdaBoost Classifier:",
)

Random Forest Classifier:
Training Score: 0.999778733588467
Testing Score: 0.9801469841162326
              precision    recall  f1-score   support

           0       0.98      1.00      0.99    142497
           1       0.93      0.47      0.62      5138

    accuracy                           0.98    147635
   macro avg       0.95      0.73      0.80    147635
weighted avg       0.98      0.98      0.98    147635


Extremely Random Trees Classifier:
Training Score: 1.0
Testing Score: 0.9809462525823822
              precision    recall  f1-score   support

           0       0.98      1.00      0.99    142497
           1       0.92      0.50      0.65      5138

    accuracy                           0.98    147635
   macro avg       0.95      0.75      0.82    147635
weighted avg       0.98      0.98      0.98    147635


AdaBoost Classifier:
Training Score: 0.9705106061119202
Testing Score: 0.9705760829071697
              precision    recall  f1-score   support

           0    

In [None]:
from sklearn.model_selection import GridSearchCV, cross_val_score

# Candidate hyperparameter values for the grid search; every combination
# of the lists below is one candidate
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 4],
    min_samples_leaf=[1, 2],
)



In [None]:
# Random forest with default hyperparameters and a fixed seed.
# Note: this rebinds the name `rfc`.
rfc = RandomForestClassifier(
    random_state=1,
)


In [None]:
# Search the parameter grid with 5-fold cross-validation, ranking the
# candidates by F1 score and running on all available cores (n_jobs=-1)
grid_search = GridSearchCV(
    estimator=rfc,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train_scaled, y_train)


In [None]:
# Report the winning hyperparameter combination and its mean
# cross-validated F1 score
best_params = grid_search.best_params_
best_cv_f1 = grid_search.best_score_
print(f"Best hyperparameters: {best_params}")
print(f"Validation F1 score: {best_cv_f1}")

In [None]:
# Evaluate the performance of the best model on the testing set.
# best_estimator_ is the tuned model refit on the training data, so score
# its predictions directly. cross_val_score would instead clone the
# estimator and re-fit it on folds of the test set, which does not measure
# the tuned model.
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test_scaled)
test_f1_score = f1_score(y_test, y_pred_best)
print(f"Testing F1 score: {test_f1_score}")