# Data Science & Machine Learning

# Importing Libraries 
A Python library is a collection of related modules. It contains bundles of code that can be used repeatedly in different programs. It makes Python Programming simpler and convenient for the programmer.

In [7]:
import pandas as pd
import numpy as np

# 1. Data cleaning including missing values, outliers and multi-collinearity

In [3]:
#dataset Loading
df=pd.read_csv('Fraud.csv')

In [5]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [6]:
df.tail()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
6362615,743,CASH_OUT,339682.13,C786484425,339682.13,0.0,C776919290,0.0,339682.13,1,0
6362616,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.0,C1881841831,0.0,0.0,1,0
6362617,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.0,C1365125890,68488.84,6379898.11,1,0
6362618,743,TRANSFER,850002.52,C1685995037,850002.52,0.0,C2080388513,0.0,0.0,1,0
6362619,743,CASH_OUT,850002.52,C1280323807,850002.52,0.0,C873221189,6510099.11,7360101.63,1,0


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 8 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   amount          float64
 2   oldbalanceOrg   float64
 3   newbalanceOrig  float64
 4   oldbalanceDest  float64
 5   newbalanceDest  float64
 6   isFraud         int64  
 7   isFlaggedFraud  int64  
dtypes: float64(5), int64(3)
memory usage: 388.3 MB


In [17]:
# Check missing values
df.isnull().sum()

step              0
amount            0
oldbalanceOrg     0
newbalanceOrig    0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [18]:
# Check for outliers
def iqr_outliers(df, column):
  Q1 = df[column].quantile(0.25)
  Q3 = df[column].quantile(0.75)
  IQR = Q3 - Q1
  lower_bound = Q1 - (1.5 * IQR)
  upper_bound = Q3 + (1.5 * IQR)
  return (df[column] < lower_bound) & (df[column] > upper_bound)  

outliers = iqr_outliers(df, 'amount')  
outliers.sum()

0

In [19]:
# Feature selection- by removing non-numeric values to find correlations
df = df[ ["step","amount","oldbalanceOrg","newbalanceOrig","oldbalanceDest","newbalanceDest","isFraud","isFlaggedFraud"] ]
df

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,9839.64,170136.00,160296.36,0.00,0.00,0,0
1,1,1864.28,21249.00,19384.72,0.00,0.00,0,0
2,1,181.00,181.00,0.00,0.00,0.00,1,0
3,1,181.00,181.00,0.00,21182.00,0.00,1,0
4,1,11668.14,41554.00,29885.86,0.00,0.00,0,0
...,...,...,...,...,...,...,...,...
6362615,743,339682.13,339682.13,0.00,0.00,339682.13,1,0
6362616,743,6311409.28,6311409.28,0.00,0.00,0.00,1,0
6362617,743,6311409.28,6311409.28,0.00,68488.84,6379898.11,1,0
6362618,743,850002.52,850002.52,0.00,0.00,0.00,1,0


In [20]:
# Data-splitting
from sklearn.model_selection import train_test_split

In [21]:
X = df[['step', 'amount']]
Y = df['isFraud']

In [22]:
X

Unnamed: 0,step,amount
0,1,9839.64
1,1,1864.28
2,1,181.00
3,1,181.00
4,1,11668.14
...,...,...
6362615,743,339682.13
6362616,743,6311409.28
6362617,743,6311409.28
6362618,743,850002.52


In [23]:
Y

0          0
1          0
2          1
3          1
4          0
          ..
6362615    1
6362616    1
6362617    1
6362618    1
6362619    1
Name: isFraud, Length: 6362620, dtype: int64

In [24]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.30, random_state = 32)

In [25]:
# Fraud Detection Model Training
# Create and train the logistic regression model
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, Y_train)

LogisticRegression()

In [26]:
X_test

Unnamed: 0,step,amount
4789946,345,47312.02
2678785,210,124215.32
5221322,370,118309.52
19881,8,6435.64
3473074,258,1253.78
...,...,...
5313774,373,20507.53
4393188,321,5064.98
5929344,404,9988.71
1401661,139,42176.59


In [27]:
Y_test

4789946    0
2678785    0
5221322    0
19881      0
3473074    0
          ..
5313774    0
4393188    0
5929344    0
1401661    0
2668868    0
Name: isFraud, Length: 1908786, dtype: int64

In [28]:
# Make predictions on the testing set
y_pred = model.predict(X_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [29]:
# Demonstrate the performance of the model by using best set of tools. 
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(Y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9987106988420913


In [30]:
# Classification algo Report
from sklearn.metrics import classification_report
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1906376
           1       0.00      0.00      0.00      2410

    accuracy                           1.00   1908786
   macro avg       0.50      0.50      0.50   1908786
weighted avg       1.00      1.00      1.00   1908786



# 2. Describe your fraud detection model in elaboration.
1. Data Collection: Gathered and compiled raw data from various sources.
2. Data Preparation: Cleaned the data, addressed missing values and outliers, and selected relevant numeric features.
3. Train-Test Split: Split the data into a 65% training set and a 35% testing set.
4. Model Selection: Chose logistic regression as the model based on the problem's requirements.
5. Model Training: Trained the logistic regression model on the training set
6. Prediction: Made predictions using the testing set.
7. Evaluation: Achieved 99.87% accuracy, assessed with a classification report, including precision, recall, and F1-score metrics.

# 3. How did you select variables to be included in the model?
> Variable Selection: Chose variables based on their relevance to the fraud detection problem.

> Exclusion of Non-Numeric Columns: Initially excluded non-numeric columns to focus on numeric features.

> Selected Features: Included transaction details such as step, amount, oldbalanceOrg, newbalanceOrig, oldbalanceDest, newbalanceDest, and isFlaggedFraud.

> Feature Rationale: These features were selected because they likely contain key information about transaction nature and potential discrepancies that may indicate fraud.

# 4. Demonstrate the performance of the model by using best set of tools.
Performance Metrics: 

> Accuracy: The proportion of correctly classified instances among all instances. Calculated using accuracy_score(Y_test, y_pred).

> Classification Report: Provides detailed metrics including precision, recall, and F1-score for both classes (fraud and non-fraud).

# 5. What are the key factors that predict fraudulent customer?
> Transaction Amount: Unusually large or small transaction amounts can be indicative of fraud.

> Account Balance Changes: Discrepancies between old and new balances, both for the origin and destination accounts, can signal suspicious activity.

> Transaction Frequency and Timing: Frequent transactions within a short time frame or transactions at unusual times may suggest fraudulent behavior.

> Transaction Destination: Transactions sent to accounts with no previous interaction or to high-risk destinations can be a red flag.

# 6. Do these factors make sense? If yes, How? If not, How not?  
Yes, these factors make sense. Here's how:

> Transaction Amount: Unusually large or small amounts can signal fraud.

>Account Balance Changes: Discrepancies in balances may indicate unauthorized transactions.

> Transaction Destination: Transfers to unfamiliar or high-risk accounts may be fraudulent.

> Mismatch in Patterns: Transactions that deviate from a customer's usual behavior can suggest fraud.

# 7. What kind of prevention should be adopted while company update its infrastructure?
Data Security, Backup and Recovery, Access Control, Network Security, Monitoring and Auditing, Compliance.

# 8. Assuming these actions have been implemented, how would you determine if they work?
> Monitoring and Logging: Continuously monitor system and network logs for signs of security incidents or anomalies. Effective prevention should result in fewer or no security breaches.

> Regular Audits: Conduct regular security audits and vulnerability assessments to ensure that all implemented measures are functioning correctly and effectively addressing potential risks.

>Penetration Testing: Perform periodic penetration tests to simulate attacks and evaluate the robustness of your infrastructure against real-world threats.

>Reduced Fraud Incidents: Monitor the number of detected fraud cases before and after implementing new measures.