<a href="https://colab.research.google.com/github/sauravdev/Concise-Machine-Learning/blob/main/Fraud_detection_Saurav.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import random
import pandas as pd

# Helper function to generate a random transaction amount
def generate_amount():
    return round(random.uniform(0, 10000), 2)

# Helper function to generate a random account number
def generate_account():
    return ''.join(random.choices('0123456789', k=12))

# Helper function to generate a random transaction date
def generate_date():
    year = random.randint(2010, 2020)
    month = random.randint(1, 12)
    day = random.randint(1, 28)
    return f"{year}-{month:02d}-{day:02d}"

# Helper function to generate a random transaction type
def generate_type():
    return random.choice(['debit', 'credit'])

# Helper function to generate a random fraud label
def generate_fraud():
    return random.choice([0, 1])

# Helper function to generate a sample dataset
def generate_data(n):
    data = []
    for i in range(n):
        amount = generate_amount()
        account = generate_account()
        date = generate_date()
        trans_type = generate_type()
        fraud = generate_fraud()
        data.append([amount, account, date, trans_type, fraud])
    return data

# Generate a sample dataset with 1000 transactions
data = generate_data(1000)

# Create a DataFrame from the data
df = pd.DataFrame(data, columns=['amount', 'account', 'date', 'type', 'fraud'])

# Print the first 5 rows of the DataFrame
print(df.head())


    amount       account        date    type  fraud
0  1076.09  075673664569  2018-06-12   debit      1
1  1248.39  067602501139  2020-05-28   debit      0
2  7294.45  236886055294  2011-09-08   debit      0
3   434.66  276993126043  2013-04-16   debit      0
4  3848.44  642164390937  2012-10-04  credit      1


In [None]:
# Check for missing values in the DataFrame
print(df.isnull().sum())

# Remove any rows with missing values
df = df.dropna()

# Check for duplicate rows in the DataFrame
print(df.duplicated().sum())

# Remove any duplicate rows
df = df.drop_duplicates()

# Check for invalid account numbers
print(df[~df['account'].str.contains('^\d{12}$')])

# Remove any rows with invalid account numbers
df = df[df['account'].str.contains('^\d{12}$')]

# Check for invalid date
print(df[~df['date'].str.match(r'\d{4}-\d{2}-\d{2}')])

# Remove any rows with invalid date
df = df[df['date'].str.match(r'\d{4}-\d{2}-\d{2}')]

#convert date to datetime object
df['date'] = pd.to_datetime(df['date'])

# Print the first 5 rows of the cleaned DataFrame
print(df.head())


amount     0
account    0
date       0
type       0
fraud      0
dtype: int64
0
Empty DataFrame
Columns: [amount, account, date, type, fraud]
Index: []
Empty DataFrame
Columns: [amount, account, date, type, fraud]
Index: []
    amount       account       date    type  fraud
0  1076.09  075673664569 2018-06-12   debit      1
1  1248.39  067602501139 2020-05-28   debit      0
2  7294.45  236886055294 2011-09-08   debit      0
3   434.66  276993126043 2013-04-16   debit      0
4  3848.44  642164390937 2012-10-04  credit      1


In [None]:
import numpy as np

# Create a new feature 'transaction_day'
df['transaction_day'] = df['date'].dt.day

# Create a new feature 'transaction_hour'
df['transaction_hour'] = df['date'].dt.hour

# Create a new feature 'transaction_weekday'
df['transaction_weekday'] = df['date'].dt.weekday

# Create a new feature 'transaction_weekend'
df['transaction_weekend'] = df['transaction_weekday'].apply(lambda x: 1 if x>4 else 0)

# Create a new feature 'transaction_month'
df['transaction_month'] = df['date'].dt.month

# Create a new feature 'transaction_year'
df['transaction_year'] = df['date'].dt.year

# Create a new feature 'log_amount'
df['log_amount'] = np.log(df['amount'] + 1)

# Create a new feature 'amount_bin'
df['amount_bin'] = pd.cut(df['amount'], bins=[0, 1000, 5000, 10000], labels=['low', 'medium', 'high'])

# Create a new feature 'amount_per_day'
df['amount_per_day'] = df.groupby(['transaction_day'])['amount'].transform('sum')

# Create a new feature 'amount_per_hour'
df['amount_per_hour'] = df.groupby(['transaction_hour'])['amount'].transform('sum')

# Create a new feature 'amount_per_weekday'
df['amount_per_weekday'] = df.groupby(['transaction_weekday'])['amount'].transform('sum')

# Create a new feature 'amount_per_month'
df['amount_per_month'] = df.groupby(['transaction_month'])['amount'].transform('sum')

# Create a new feature 'amount_per_year'
df['amount_per_year'] = df.groupby(['transaction_year'])['amount'].transform('sum')

# Create a new feature 'fraud_per_account'
df['fraud_per_account'] = df.groupby(['account'])['fraud'].transform('sum')

# Print the first 5 rows of the DataFrame
print(df.head())


    amount       account       date    type  fraud  transaction_day  \
0  1076.09  075673664569 2018-06-12   debit      1               12   
1  1248.39  067602501139 2020-05-28   debit      0               28   
2  7294.45  236886055294 2011-09-08   debit      0                8   
3   434.66  276993126043 2013-04-16   debit      0               16   
4  3848.44  642164390937 2012-10-04  credit      1                4   

   transaction_hour  transaction_weekday  transaction_weekend  \
0                 0                    1                    0   
1                 0                    3                    0   
2                 0                    3                    0   
3                 0                    1                    0   
4                 0                    3                    0   

   transaction_month  transaction_year  log_amount amount_bin  amount_per_day  \
0                  6              2018    6.982018     medium       140400.09   
1                  5

In [None]:
print(df)

categorical_columns = ['type','amount_bin']
df = pd.get_dummies(df, columns=categorical_columns)

      amount       account       date  fraud  transaction_day  \
0    1076.09  075673664569 2018-06-12      1               12   
1    1248.39  067602501139 2020-05-28      0               28   
2    7294.45  236886055294 2011-09-08      0                8   
3     434.66  276993126043 2013-04-16      0               16   
4    3848.44  642164390937 2012-10-04      1                4   
..       ...           ...        ...    ...              ...   
995  2099.18  115434251433 2013-08-15      1               15   
996  5014.09  148737745418 2010-05-01      0                1   
997  8057.10  986873100075 2018-02-06      0                6   
998  7687.20  136961482909 2011-10-13      1               13   
999  9719.25  308981220081 2014-12-10      0               10   

     transaction_hour  transaction_weekday  transaction_weekend  \
0                   0                    1                    0   
1                   0                    3                    0   
2                 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Split the data into training and test sets
X = df.drop(columns=['fraud','date'])
y = df['fraud']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the models
models = {'RandomForest': RandomForestClassifier(),
          'LogisticRegression': LogisticRegression(),
          'SVM': SVC()}

# Train and evaluate the models
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print(f'{name} - Accuracy: {accuracy:.3f}, Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}')


RandomForest - Accuracy: 1.000, Precision: 1.000, Recall: 1.000, F1: 1.000
LogisticRegression - Accuracy: 0.470, Precision: 0.000, Recall: 0.000, F1: 0.000
SVM - Accuracy: 0.470, Precision: 0.000, Recall: 0.000, F1: 0.000


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
print(df)

      amount       account       date  fraud  transaction_day  \
0    1076.09  075673664569 2018-06-12      1               12   
1    1248.39  067602501139 2020-05-28      0               28   
2    7294.45  236886055294 2011-09-08      0                8   
3     434.66  276993126043 2013-04-16      0               16   
4    3848.44  642164390937 2012-10-04      1                4   
..       ...           ...        ...    ...              ...   
995  2099.18  115434251433 2013-08-15      1               15   
996  5014.09  148737745418 2010-05-01      0                1   
997  8057.10  986873100075 2018-02-06      0                6   
998  7687.20  136961482909 2011-10-13      1               13   
999  9719.25  308981220081 2014-12-10      0               10   

     transaction_hour  transaction_weekday  transaction_weekend  \
0                   0                    1                    0   
1                   0                    3                    0   
2                 

In [None]:
# Select the features
selected_features = ['transaction_day', 'transaction_hour', 'transaction_weekday', 'log_amount','amount_bin_low','amount_bin_medium','amount_bin_high','amount_per_day','amount_per_hour','amount_per_weekday','fraud_per_account']
X_train_sel = X_train[selected_features]
X_test_sel = X_test[selected_features]

# Train the selected model on the dataset using the selected features
selected_model = RandomForestClassifier()
selected_model.fit(X_train_sel, y_train)


RandomForestClassifier()

In [None]:
# Make predictions on the test set
y_pred = selected_model.predict(X_test_sel)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.3f}, Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}')

# Optimize the model's performance
from sklearn.model_selection import GridSearchCV
param_grid = {'n_estimators': [10, 50, 100, 200],
              'max_depth': [2, 5, 10, 20]}
grid_search = GridSearchCV(selected_model, param_grid, cv=5, scoring='f1')
grid_search.fit(X_train_sel, y_train)

# Print the best parameters and the best score
print(f'Best parameters: {grid_search.best_params_}')
print(f'Best F1 score: {grid_search.best_score_:.3f}')


Accuracy: 1.000, Precision: 1.000, Recall: 1.000, F1: 1.000
Best parameters: {'max_depth': 2, 'n_estimators': 50}
Best F1 score: 1.000


In [None]:
import pickle

# Save the model to a file
with open('fraud_detection_model.pkl', 'wb') as f:
    pickle.dump(grid_search.best_estimator_, f)

# Load the model from a file
with open('fraud_detection_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

# Use the loaded model to make predictions on new data
new_data = X_test_sel.iloc[0:1,:]
print(new_data)
prediction = loaded_model.predict(new_data)
print(f'Prediction: {prediction}')


     transaction_day  transaction_hour  transaction_weekday  log_amount  \
521                4                 0                    2    9.020219   

     amount_bin_low  amount_bin_medium  amount_bin_high  amount_per_day  \
521               0                  0                1        153438.9   

     amount_per_hour  amount_per_weekday  fraud_per_account  
521       5047314.92            724188.3                  1  
Prediction: [1]


In [None]:

# Define a function to calculate the evaluation metrics
def evaluate_model(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    return accuracy, precision, recall, f1

# Get the performance metrics for the current model
y_pred = loaded_model.predict(X_test_sel)
accuracy, precision, recall, f1 = evaluate_model(y_test, y_pred)

# Define a threshold for the evaluation metrics
thresholds = {'accuracy': 0.95, 'precision': 0.95, 'recall': 0.95, 'f1': 0.95}

# Check if the current model's performance is below the threshold
if accuracy < thresholds['accuracy'] or precision < thresholds['precision'] or recall < thresholds['recall'] or f1 < thresholds['f1']:
    # Retrain the model using more data or different techniques
    new_data = pd.read_csv('new_data.csv')
    X_new = new_data.drop(columns=['fraud','date'])
    y_new = new_data['fraud']
    X_new_sel = X_new[selected_features]
    X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(X_new_sel, y_new, test_size=0.2, random_state=42)
    new_model = RandomForestClassifier()
    new_model.fit(X_train_new, y_train_new)
    y_pred_new = new_model.predict(X_test_new)
    accuracy_new, precision_new, recall_new, f1_new = evaluate_model(y_test_new, y_pred_new)
    if accuracy_new > accuracy and precision_new > precision and recall_new > recall and f1_new > f1:
        # Update the model
        loaded_model = new_model
        # Save the updated model to a file
        with open('fraud_detection_model.pkl', 'wb') as f:
            pickle.dump(loaded_model, f)
        print("Model updated")
    else:
        print("Model performance is still good, no update needed")
else:
    print("Model performance is still good, no update needed")


Model performance is still good, no update needed
