In [1]:
!pip install imbalanced-learn
!pip install matplotlib
!pip install seaborn




In [2]:
# Basic Libraries
import pandas as pd
import numpy as np

# Data Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Modeling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, RocCurveDisplay

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns


In [3]:
# Load the dataset
dataset = pd.read_csv('/content/fraudTest.csv')

# Optional: Remove any unwanted leading/trailing spaces
dataset.columns = dataset.columns.str.strip()


In [4]:
# Check dataset info and missing values
dataset.info()

# Check for any missing values
dataset.isnull().sum()

# Get a summary of the dataset
dataset.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555719 entries, 0 to 555718
Data columns (total 23 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             555719 non-null  int64  
 1   trans_date_trans_time  555719 non-null  object 
 2   cc_num                 555719 non-null  int64  
 3   merchant               555719 non-null  object 
 4   category               555719 non-null  object 
 5   amt                    555719 non-null  float64
 6   first                  555719 non-null  object 
 7   last                   555719 non-null  object 
 8   gender                 555719 non-null  object 
 9   street                 555719 non-null  object 
 10  city                   555719 non-null  object 
 11  state                  555719 non-null  object 
 12  zip                    555719 non-null  int64  
 13  lat                    555719 non-null  float64
 14  long                   555719 non-nu

Unnamed: 0.1,Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
count,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0
mean,277859.0,4.178387e+17,69.39281,48842.628015,38.543253,-90.231325,88221.89,1380679000.0,38.542798,-90.23138,0.00386
std,160422.401459,1.309837e+18,156.745941,26855.283328,5.061336,13.72178,300390.9,5201104.0,5.095829,13.733071,0.062008
min,0.0,60416210000.0,1.0,1257.0,20.0271,-165.6723,23.0,1371817000.0,19.027422,-166.671575,0.0
25%,138929.5,180042900000000.0,9.63,26292.0,34.6689,-96.798,741.0,1376029000.0,34.755302,-96.905129,0.0
50%,277859.0,3521417000000000.0,47.29,48174.0,39.3716,-87.4769,2408.0,1380762000.0,39.376593,-87.445204,0.0
75%,416788.5,4635331000000000.0,83.01,72011.0,41.8948,-80.1752,19685.0,1385867000.0,41.954163,-80.264637,0.0
max,555718.0,4.992346e+18,22768.11,99921.0,65.6899,-67.9503,2906700.0,1388534000.0,66.679297,-66.952026,1.0


In [5]:
print(dataset.columns)


Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')


In [6]:
# Define Features (X) and Target (y)
X = dataset.drop('is_fraud', axis=1)
y = dataset['is_fraud']


In [7]:
# Drop columns that are identifiers or leak info
X = X.drop(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'first', 'last', 'street', 'city',
             'state', 'zip', 'job', 'dob', 'trans_num'], axis=1)


In [8]:
# One-hot encode categorical features
X = pd.get_dummies(X, columns=['merchant', 'category', 'gender'], drop_first=True)


In [9]:
# Split into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [10]:
# Scale the numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [11]:
model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)


In [13]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the model
model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)

# Train the model
model.fit(X_train_scaled, y_train)


In [14]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score

# Predict on test data
y_pred = model.predict(X_test_scaled)

# Evaluation Metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.9972918016267185
ROC-AUC Score: 0.6689208952994348

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    110715
           1       0.90      0.34      0.49       429

    accuracy                           1.00    111144
   macro avg       0.95      0.67      0.74    111144
weighted avg       1.00      1.00      1.00    111144


Confusion Matrix:
[[110698     17]
 [   284    145]]


In [20]:
sample_transaction = {
    'Unnamed: 0': 0,   # This can be anything
    'trans_date_trans_time': 1.709e9,  # Example unix time
    'cc_num': 1234567890,
    'merchant': 1,     # Need to be encoded properly if categorical
    'category': 1,     # Same here
    'amt': 125.0,
    'first': 1,
    'last': 1,
    'gender': 1,
    'street': 1,
    'city': 1,
    'state': 1,
    'zip': 10001,
    'lat': 40.7128,
    'long': -74.0060,
    'city_pop': 500000,
    'job': 1,
    'dob': 1,
    'trans_num': 1,
    'unix_time': 1709000000,
    'merch_lat': 40.7128,
    'merch_long': -74.0060
}


In [21]:
import pandas as pd

def predict_fraud(new_transaction, model, scaler, X_train_columns):
    """
    Predict if a new transaction is fraud or not.
    new_transaction: A dictionary containing the transaction details
    model: The trained model
    scaler: The trained scaler
    X_train_columns: Columns used during training
    """
    # Create a DataFrame for the new transaction
    new_transaction_df = pd.DataFrame([new_transaction])

    # Ensure the new data has the same columns as the training data
    # This step is necessary to avoid mismatched columns
    new_transaction_df = new_transaction_df.reindex(columns=X_train_columns, fill_value=0)

    # Scale the features of the new transaction
    new_transaction_scaled = scaler.transform(new_transaction_df)

    # Predict
    prediction = model.predict(new_transaction_scaled)

    if prediction[0] == 1:
        print("⚠️ The transaction is predicted to be FRAUDULENT.")
    else:
        print("✅ The transaction is predicted to be LEGITIMATE.")

# When calling this function, make sure to pass the columns used during training.
predict_fraud(sample_transaction, model, scaler, X.columns)


✅ The transaction is predicted to be LEGITIMATE.
