## Uploading and Reading the data

In [2]:
import os
import warnings

# Silence every library warning so the rendered outputs stay readable.
# Kept ahead of the pandas import, exactly as in the original cell order.
# NOTE(review): this is a blanket filter, so deprecation notices are hidden too.
warnings.filterwarnings('ignore')

import pandas as pd

# Location of the transactions CSV. Set the FRAUD_CSV_PATH environment
# variable to run the notebook on another machine; when it is unset the
# original local download path is used, so existing runs behave as before.
DATA_PATH = os.environ.get(
    "FRAUD_CSV_PATH",
    "C:\\Users\\Ruchi Goyal\\Downloads\\Fraud.csv",
)

data = pd.read_csv(DATA_PATH)

# Preview the first rows to confirm the file parsed into the expected columns.
data.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


## Checking for the null or missing values


In [3]:
# Count of missing values in each column (isna is the same check as isnull).
data.isna().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [4]:
# Size of the loaded frame as (number of rows, number of columns).
data.shape

(6362620, 11)

In [5]:
# Distinct values of the `type` column, in order of first appearance.
pd.unique(data["type"])

array(['PAYMENT', 'TRANSFER', 'CASH_OUT', 'DEBIT', 'CASH_IN'],
      dtype=object)

In [6]:
# Storage dtype of every column; `object` columns hold strings here
# (type, nameOrig, nameDest) and need encoding or removal before modelling.
data.dtypes


step                int64
type               object
amount            float64
nameOrig           object
oldbalanceOrg     float64
newbalanceOrig    float64
nameDest           object
oldbalanceDest    float64
newbalanceDest    float64
isFraud             int64
isFlaggedFraud      int64
dtype: object

## Outliers were not removed because: Fraud transactions themselves are extreme. Removing them would reduce fraud signal. Scaling was applied using StandardScaler to ensure numerical stability


In [7]:
# Pairwise correlation between the numeric columns (pandas default: Pearson).
# The string columns (type, nameOrig, nameDest) are excluded explicitly:
# pandas >= 2.0 no longer drops them silently and raises on non-numeric data,
# while older versions dropped them implicitly. Selecting numeric dtypes first
# gives the same matrix on every pandas version.
data.select_dtypes(include="number").corr()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
step,1.0,0.022373,-0.010058,-0.010299,0.027665,0.025888,0.031578,0.003277
amount,0.022373,1.0,-0.002762,-0.007861,0.294137,0.459304,0.076688,0.012295
oldbalanceOrg,-0.010058,-0.002762,1.0,0.998803,0.066243,0.042029,0.010154,0.003835
newbalanceOrig,-0.010299,-0.007861,0.998803,1.0,0.067812,0.041837,-0.008148,0.003776
oldbalanceDest,0.027665,0.294137,0.066243,0.067812,1.0,0.976569,-0.005885,-0.000513
newbalanceDest,0.025888,0.459304,0.042029,0.041837,0.976569,1.0,0.000535,-0.000529
isFraud,0.031578,0.076688,0.010154,-0.008148,-0.005885,0.000535,1.0,0.044109
isFlaggedFraud,0.003277,0.012295,0.003835,0.003776,-0.000513,-0.000529,0.044109,1.0


## Multicollinearity: Strong correlation exists between the old and new balance variables (oldbalanceOrg/newbalanceOrig ≈ 0.999, oldbalanceDest/newbalanceDest ≈ 0.977). Multicollinearity does not affect prediction accuracy but impacts coefficient stability. Since the goal was prediction, the correlated variables were retained.



In [8]:
# Net balance movement per transaction, computed as balance after minus
# balance before: a negative value means the account balance went down.
data["balanceOrg"] = data["newbalanceOrig"].sub(data["oldbalanceOrg"])
data["balanceDest"] = data["newbalanceDest"].sub(data["oldbalanceDest"])

# Preview the frame with the two derived columns appended.
data.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,balanceOrg,balanceDest
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0,-9839.64,0.0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0,-1864.28,0.0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0,-181.0,0.0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0,-181.0,-21182.0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0,-11668.14,0.0


In [9]:
# Modelling frame: the time step, both account identifiers and the
# isFlaggedFraud column are removed; every other column is kept.
df = data.drop(columns=["step", "nameOrig", "nameDest", "isFlaggedFraud"])
df.head()

Unnamed: 0,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,balanceOrg,balanceDest
0,PAYMENT,9839.64,170136.0,160296.36,0.0,0.0,0,-9839.64,0.0
1,PAYMENT,1864.28,21249.0,19384.72,0.0,0.0,0,-1864.28,0.0
2,TRANSFER,181.0,181.0,0.0,0.0,0.0,1,-181.0,0.0
3,CASH_OUT,181.0,181.0,0.0,21182.0,0.0,1,-181.0,-21182.0
4,PAYMENT,11668.14,41554.0,29885.86,0.0,0.0,0,-11668.14,0.0


In [10]:
# Column groups used to route features through preprocessing.
# Text-valued column(s):
categorical = ["type"]

# Continuous column(s):
# NOTE(review): the derived balanceOrg / balanceDest columns are not listed
# here, so a preprocessor built with remainder="drop" discards them — confirm
# whether that is intended.
numeric = [
    "amount",
    "oldbalanceOrg",
    "newbalanceOrig",
    "oldbalanceDest",
    "newbalanceDest",
]

In [11]:
# Feature matrix: every column of the modelling frame except the label.
x = df.drop(columns="isFraud")

# Target vector: the isFraud label.
y = df["isFraud"]

## 2. Fraud Detection Model (Explanation): Fraud detection was treated as a binary classification problem. Logistic Regression was chosen because:
### Interpretable coefficients
### Fast and scalable
### Suitable baseline for fraud detection
### Pipeline Components
### Numerical features → StandardScaler
### Categorical features → OneHotEncoder
### Classifier → Logistic Regression with class_weight='balanced'
### Class imbalance was handled by assigning higher penalty to misclassified fraud cases.


## 3. How did you select variables to be included in the model? 
### Variables were selected using: domain understanding of transaction behavior, correlation analysis, and removal of identifiers or non-predictive fields.
### Included variables: amount, type, balance changes.


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder


# Hold out 30% of the rows for evaluation. stratify=y keeps the class
# proportions of the label the same in both splits, and the fixed
# random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Route each column group through its own transformer:
#   numeric     -> standardised with StandardScaler
#   categorical -> one-hot encoded, first level dropped as the reference
# Columns named in neither group are discarded (remainder="drop").
preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), numeric),
        ("cat", OneHotEncoder(drop="first"), categorical),
    ],
    remainder="drop",
)

# Preprocessing and classifier are chained so the scaler and encoder are
# fitted on the training rows only. class_weight="balanced" weights each
# class inversely to its frequency.
pipe = Pipeline(
    steps=[
        ("prep", preprocessor),
        ("clf", LogisticRegression(max_iter=1000, class_weight="balanced")),
    ]
)

pipe.fit(x_train, y_train)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['amount', 'oldbalanceOrg',
                                                   'newbalanceOrig',
                                                   'oldbalanceDest',
                                                   'newbalanceDest']),
                                                 ('cat',
                                                  OneHotEncoder(drop='first'),
                                                  ['type'])])),
                ('clf',
                 LogisticRegression(class_weight='balanced', max_iter=1000))])

In [14]:
# Class predictions (0 / 1) for the held-out rows; the fitted pipeline applies
# the same preprocessing to x_test before the classifier predicts.
y_pred=pipe.predict(x_test)

## 4. Demonstrate the performance of the model by using best set of tools.
### Tools: Accuracy alone was insufficient due to class imbalance.
### Used:
### Confusion Matrix
### Precision, Recall, F1-score

## 5. Key Factors Predicting Fraud: large transaction amounts, certain transaction types (e.g., transfers, cash-outs), and sudden balance changes.
## 6. Do these factors make sense? If yes, How? If not, How not?

### Yes. Fraudsters typically move large sums rapidly. Specific transaction types are known fraud channels. Balance inconsistencies indicate suspicious activity. These align with real-world financial fraud behavior.









In [15]:
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix

# Overall share of correct predictions. With a label this imbalanced it is
# dominated by the majority class, so read it together with the per-class
# figures below.
print("Accuracy score:", accuracy_score(y_test, y_pred))

# sep="\n" puts each label on its own line; printed on the same line, the
# label shifts the first matrix row and the report header out of alignment.
# Matrix layout (scikit-learn convention): rows are true classes, columns are
# predicted classes.
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

# Per-class precision, recall and F1, plus macro and weighted averages.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

Accuracy score: 0.9471407480985297
Confusion Matrix: [[1805576  100746]
 [    151    2313]]
Classification Report:               precision    recall  f1-score   support

           0       1.00      0.95      0.97   1906322
           1       0.02      0.94      0.04      2464

    accuracy                           0.95   1908786
   macro avg       0.51      0.94      0.51   1908786
weighted avg       1.00      0.95      0.97   1908786



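## Accuracy = 95%, which looks good, but accuracy alone is misleading on data this imbalanced: only 2,464 of the 1,908,786 test rows are fraud, so predicting "not fraud" for every row would already score about 99.9%. The model should be judged on the fraud-class recall (0.94) and precision (0.02) before deciding it is good to use.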

### Out of 1,908,786 test transactions, 95% of the predictions are correct.

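## There is room for improvement in precision: it is 1.00 for class 0 but only 0.02 for class 1 (fraud), i.e. most transactions flagged as fraud are false positives (100,746 false alarms versus 2,313 correctly flagged frauds).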

## 7. What kind of prevention should be adopted while the company updates its infrastructure?
### 7. Immediate Alerts: Systems should use automated alerts for irregular activities, such as sudden spikes in transaction volume or high-value transfers to unfamiliar accounts.
### Predictive Modeling: Leverage supervised learning (e.g., Random Forests, XGBoost, Neural Networks) trained on historical data to predict the likelihood of a transaction being fraudulent.
## 8. Assuming these actions have been implemented, how would you determine if they work?
### 8. Measuring Effectiveness of Prevention
### Effectiveness evaluated through:
### Reduction in fraud losses
### Improved fraud recall
### Stable false-positive rate
### Monitoring performance drift over time
