# Importing packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from scipy import stats
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV
import gc

# Reading data with pandas




In [None]:
# Load the transactions CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/predicting-fraudulent-transactions/Fraud.csv')

In [None]:
# Preview the first five rows of the raw data.
df.head(5)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


# Checking whether any null values are present in each column

In [None]:
# Count the missing values in each column.
df.isna().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

# Counting Unique Values in every column

In [None]:
# Print every column name next to its number of distinct values.
for col, n_unique in df.nunique().items():
    print(col, n_unique)

step 743
type 5
amount 5316900
nameOrig 6353307
oldbalanceOrg 1845844
newbalanceOrig 2682586
nameDest 2722362
oldbalanceDest 3614697
newbalanceDest 3555499
isFraud 2
isFlaggedFraud 2


# Counting unique classes in dependent column

In [None]:
# Show how many rows fall into each class of the target column.
df['isFraud'].value_counts()

0    6354407
1       8213
Name: isFraud, dtype: int64

In [None]:
# Show how many rows carry each value of the isFlaggedFraud column.
df['isFlaggedFraud'].value_counts()

0    6362604
1         16
Name: isFlaggedFraud, dtype: int64

# Removing unwanted columns

In [None]:
# Remove the two account-name columns from df in place.
drop_columns = ['nameOrig', 'nameDest']
df.drop(columns=drop_columns, inplace=True)

In [None]:
# Preview the frame after the name columns have been dropped.
df.head()

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,170136.0,160296.36,0.0,0.0,0,0
1,1,PAYMENT,1864.28,21249.0,19384.72,0.0,0.0,0,0
2,1,TRANSFER,181.0,181.0,0.0,0.0,0.0,1,0
3,1,CASH_OUT,181.0,181.0,0.0,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,41554.0,29885.86,0.0,0.0,0,0


# Getting dummies for the categorical (object) column

In [None]:
# One-hot encode the transaction type. pop() removes 'type' from df in place
# and hands the column to get_dummies in the same step.
df1 = pd.get_dummies(df.pop('type'))

In [None]:
# Append the indicator columns to the remaining features, then release the
# two intermediate frames and trigger a garbage-collection pass.
data = pd.concat((df, df1), axis='columns')
del df
del df1
gc.collect()

958

In [None]:
# Preview the combined frame with the one-hot encoded type columns.
data.head()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,CASH_IN,CASH_OUT,DEBIT,PAYMENT,TRANSFER
0,1,9839.64,170136.0,160296.36,0.0,0.0,0,0,0,0,0,1,0
1,1,1864.28,21249.0,19384.72,0.0,0.0,0,0,0,0,0,1,0
2,1,181.0,181.0,0.0,0.0,0.0,1,0,0,0,0,0,1
3,1,181.0,181.0,0.0,21182.0,0.0,1,0,0,1,0,0,0
4,1,11668.14,41554.0,29885.86,0.0,0.0,0,0,0,0,0,1,0


# Creating new column with existing 'amount' column

In [None]:
# Boolean flag: True where the transaction amount is strictly greater than 200.
# NOTE(review): the data dictionary describes flagged transfers as exceeding
# 200.000; confirm that 200 is the intended threshold here.
data['is>200'] = data['amount'].gt(200)

In [None]:
# Store the flag as integers instead of booleans.
data['is>200'] = data['is>200'].astype(int)

In [None]:
# Preview the frame with the new 'is>200' column.
data.head()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,CASH_IN,CASH_OUT,DEBIT,PAYMENT,TRANSFER,is>200
0,1,9839.64,170136.0,160296.36,0.0,0.0,0,0,0,0,0,1,0,1
1,1,1864.28,21249.0,19384.72,0.0,0.0,0,0,0,0,0,1,0,1
2,1,181.0,181.0,0.0,0.0,0.0,1,0,0,0,0,0,1,0
3,1,181.0,181.0,0.0,21182.0,0.0,1,0,0,1,0,0,0,0
4,1,11668.14,41554.0,29885.86,0.0,0.0,0,0,0,0,0,1,0,1


In [None]:
# Keep an independent snapshot of the features before `data` is normalised;
# the boosted-tree models further down are built from this copy.
data1 = data.copy(deep=True)

# Normalizing columns that have continuous values

In [None]:
# Standardise every continuous monetary column to zero mean and unit variance.
# 'newbalanceDest' is included with the other balance columns: left on its raw
# scale it would swamp the z-scored features in a distance-based model.
# Re-run the cells below after this change; their saved outputs will differ.
# NOTE(review): the statistics are computed over the whole dataset, i.e. before
# the train/test split, so test rows influence the scaling. Consider fitting
# the scaling on the training split only.
nor = ['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']
for col in nor:
    data[col] = stats.zscore(data[col], axis=None)

In [None]:
# Preview the frame after normalisation.
data.head()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,CASH_IN,CASH_OUT,DEBIT,PAYMENT,TRANSFER,is>200
0,1,-0.28156,-0.22981,-0.237622,-0.323814,0.0,0,0,0,0,0,1,0,1
1,1,-0.294767,-0.281359,-0.285812,-0.323814,0.0,0,0,0,0,0,1,0,1
2,1,-0.297555,-0.288654,-0.292442,-0.323814,0.0,1,0,0,0,0,0,1,0
3,1,-0.297555,-0.288654,-0.292442,-0.317582,0.0,1,0,0,1,0,0,0,0
4,1,-0.278532,-0.274329,-0.282221,-0.323814,0.0,0,0,0,0,0,1,0,1


# Model Creation Training and Testing - KNN

Here I have used the KNN model, which is a supervised machine learning algorithm. It classifies or predicts new data instances based on their similarity to the instances in the training set. The model calculates the distance between the new data point and all training instances, selects the K nearest neighbours, and determines the class label or predicted output value from the majority vote or the average of those neighbours.

In [None]:
# Target is the fraud label; features are everything except the two fraud columns.
y = data['isFraud']
x = data.drop(columns=['isFraud', 'isFlaggedFraud'])

In [None]:
# Seeded, shuffled 50/50 split that keeps the class ratio of y in both halves.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.5,
    random_state=10,
    shuffle=True,
    stratify=y,
)

In [None]:
# Fit a k-nearest-neighbours classifier that uses the 2 closest training rows.
# NOTE(review): an even k allows tied votes; consider an odd value.
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(x_train, y_train)

In [None]:
# Predict the fraud label for the held-out rows with the KNN model.
y_pred = knn.predict(x_test)

In [None]:
# F1 score of the KNN predictions on the held-out rows; the bare name displays it.
score = f1_score(y_test, y_pred)
score

0.5922182920667004

# Another Model Creation with Gradient Boosting

The second method I have used here is Gradient Boosting, a machine learning technique that belongs to the family of ensemble methods. It starts from an initial model that makes a simple prediction based on the average value of the target variable. In each further iteration, a new tree is added and its prediction is combined with the predictions of the previous trees. Each new tree is fitted to minimize the errors (residuals) left by the previous trees.

In [None]:
# Build features and target from the un-normalised snapshot, then make the
# same seeded, stratified 50/50 split.
y = data1['isFraud']
x = data1.drop(columns=['isFraud', 'isFlaggedFraud'])

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.5,
    random_state=10,
    shuffle=True,
    stratify=y,
)

In [None]:
# Fit a gradient-boosted tree ensemble on the training split.
# random_state is pinned to the same seed as the train/test split so that the
# fitted model, and therefore the reported scores, can be reproduced on re-run.
gb = GradientBoostingClassifier(random_state=10)
gb.fit(x_train, y_train)

In [None]:
# Predict the fraud label for the held-out rows with the gradient boosting model.
y_pred = gb.predict(x_test)

In [None]:
# F1 score of the gradient boosting predictions; the bare name displays it.
score = f1_score(y_test, y_pred)
score

0.7748022755654224

In [None]:
# Build the per-class precision/recall/F1 table first, then print it under a title.
gb_report = classification_report(y_test, y_pred)
print(f'Classification Report of Gradient Boost Classifier: \n\n{gb_report}')

Classification Report of Gradient Boost Classifier: 

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   3177203
           1       0.90      0.68      0.77      4107

    accuracy                           1.00   3181310
   macro avg       0.95      0.84      0.89   3181310
weighted avg       1.00      1.00      1.00   3181310



# Model Creation with Extreme Gradient Boosting (XGBoost)

The third model I have used here is Extreme Gradient Boosting (XGBoost). It is an optimized implementation of the gradient boosting algorithm that yields a high-performance model and includes regularization, which helps to reduce overfitting.

In [None]:
# Build features and target from the un-normalised snapshot, then make the
# same seeded, stratified 50/50 split.
y = data1['isFraud']
x = data1.drop(columns=['isFraud', 'isFlaggedFraud'])

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.5,
    random_state=10,
    shuffle=True,
    stratify=y,
)

In [None]:
# Learn the min-max scaling from the training rows only, then apply that same
# scaling to the test rows.
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# Fit an XGBoost classifier, constructed without arguments, on the scaled training split.
xgb = XGBClassifier()
xgb.fit(x_train, y_train)

In [None]:
# Predict the fraud label for the held-out rows with the XGBoost model.
y_pred = xgb.predict(x_test)

In [None]:
# F1 score of the XGBoost predictions; the bare name displays it.
score = f1_score(y_test, y_pred)
score

0.9172893584127803

In [None]:
# Build the per-class precision/recall/F1 table first, then print it under a title.
xgb_report = classification_report(y_test, y_pred)
print(f'Classification Report of XGB Classifier: \n\n{xgb_report}')

Classification Report of XGB Classifier: 

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   3177203
           1       0.97      0.87      0.92      4107

    accuracy                           1.00   3181310
   macro avg       0.99      0.93      0.96   3181310
weighted avg       1.00      1.00      1.00   3181310



# Conclusion :

Comparing all the above models, **XGB works best**.

## Fraud Detection Models: 1 - KNN, 2 - Gradient Boosting, 3 - Extreme Gradient Boosting (XGBoost)

## 1. KNN

Here I have used the KNN model, which is a supervised machine learning algorithm. It classifies or predicts new data instances based on their similarity to the instances in the training set. The model calculates the distance between the new data point and all training instances, selects the K nearest neighbours, and determines the class label or predicted output value from the majority vote or the average of those neighbours.

**F1 score** for this model: 0.5922182920667004

## 2. Gradient Boosting

The second model I have used here is Gradient Boosting, a machine learning technique that belongs to the family of ensemble methods. It starts from an initial model that makes a simple prediction based on the average value of the target variable. In each further iteration, a new tree is added and its prediction is combined with the predictions of the previous trees. Each new tree is fitted to minimize the errors (residuals) left by the previous trees.

**F1 score** for this model: 0.7748022755654224



## 3. ExtremeGB

The third model I have used here is Extreme Gradient Boosting (XGBoost). It is an optimized implementation
of the gradient boosting algorithm that yields a high-performance model and includes regularization, which helps to reduce overfitting.

**F1 score** for this model: 0.9172893584127803


## Variables to Be Included

According to the data dictionary of this problem statement:

**step** - maps a unit of time in the real world. In this case 1 step is 1 hour of time. Total steps 744 (30 days simulation).

**type** - CASH-IN, CASH-OUT, DEBIT, PAYMENT and TRANSFER.

**amount** - amount of the transaction in local currency.

**nameOrig** - customer who started the transaction

**oldbalanceOrg** - initial balance before the transaction

**newbalanceOrig** - new balance after the transaction

**nameDest** - customer who is the recipient of the transaction

**oldbalanceDest** - initial balance recipient before the transaction. Note that there is not information for customers that start with M (Merchants).

**newbalanceDest** - new balance recipient after the transaction. Note that there is not information for customers that start with M (Merchants).

**isFraud** - These are the transactions made by the fraudulent agents inside the simulation. In this specific dataset the fraudulent behavior of the agents aims to profit by taking control of customers' accounts and trying to empty the funds by transferring them to another account and then cashing out of the system.

**isFlaggedFraud** - The business model aims to control massive transfers from one account to another and flags illegal attempts. An illegal attempt in this dataset is an attempt to transfer more than 200.000 in a single transaction.

## Key Factors That Predict a Fraudulent Customer

The major key factors I have considered to predict a fraudulent customer are:

Transaction Type, Transaction Amount, Balance Differentials, Customer Behavior (typical pattern of transactions)


## Do these factors make sense?

Yes, these factors make sense. They align with the common patterns and indicators observed in the financial domain. So, by considering the relationships between these factors, we can differentiate fraudulent customers from legitimate ones.

## What kind of prevention should be adopted while the company updates its infrastructure?

The company should adopt several major preventive measures to protect itself from fraudulent activities. The main measures to consider are: robust authentication and authorization, real-time monitoring and alert systems,
anomaly detection and machine learning, transaction monitoring and anti-money-laundering (AML) procedures, customer verification and KYC, employee training and awareness, and regular security audits and assessments.


## Assuming these actions have been implemented, how would you determine if they work?

To determine whether the implemented actions are working, we should apply some evaluation methods. Some of them are: monitoring key performance indicators (KPIs), comparative analysis, benchmarking, continuous testing and simulation, internal and external audits, and compliance monitoring.