In [4]:
import pandas as pd
import numpy as np
from geopy.distance import geodesic
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
import os

# Step 1: Configuration for chunked loading
file_path = "fraudTrain.csv"  # input CSV; point this at your dataset
output_cleaned_data = "test.csv"  # file the cleaned data is written to
chunk_size = 50_000  # rows read per chunk; tune to available memory

# Accumulator that receives each processed chunk
cleaned_chunks = list()

# Row-wise distance helper (used to build the 'distance' column)
def calculate_distance(row):
    """Return the geodesic distance, in miles, between two points of ``row``.

    The first point is read from ``row['lat']`` / ``row['long']`` and the
    second from ``row['merch_lat']`` / ``row['merch_long']``.
    """
    origin = (row['lat'], row['long'])
    destination = (row['merch_lat'], row['merch_long'])
    separation = geodesic(origin, destination)
    return separation.miles

# Process the dataset in chunks so that at most `chunk_size` rows are parsed
# and transformed at a time.
for chunk in pd.read_csv(file_path, chunksize=chunk_size):
    # Step 2: Data Cleaning
    # Parse datetime columns. Explicit formats are given, so a value that does
    # not match raises instead of being guessed at.
    chunk['trans_date_trans_time'] = pd.to_datetime(chunk['trans_date_trans_time'], format='%Y-%m-%d %H:%M:%S')
    chunk['dob'] = pd.to_datetime(chunk['dob'], format='%Y-%m-%d')
    chunk['unix_time'] = pd.to_datetime(chunk['unix_time'], unit='s')

    # Age in whole years at the time of the transaction. It is measured against
    # the transaction timestamp rather than the wall clock, so the feature is
    # identical on every run and does not drift as the dataset gets older.
    # Dividing by 365 ignores leap days, so the value is approximate.
    chunk['age'] = (chunk['trans_date_trans_time'] - chunk['dob']).dt.days // 365

    # Geodesic distance (miles) between the row's two coordinate pairs;
    # computed row by row through calculate_distance.
    chunk['distance'] = chunk.apply(calculate_distance, axis=1)

    # Drop columns that are not needed downstream to reduce memory usage
    chunk = chunk.drop(columns=['street', 'trans_num'])

    # Keep the cleaned chunk for the final concatenation
    cleaned_chunks.append(chunk)

# Combine all chunks into a single dataframe with a fresh 0..n-1 index
cleaned_data = pd.concat(cleaned_chunks, ignore_index=True)

In [None]:
# Write the cleaned frame to disk for future use (row index not included)
cleaned_data.to_csv(output_cleaned_data, index=False)
print("Data cleaning completed and saved!")

# Step 3: Exploratory Data Analysis (EDA)
# Histogram of the 'amt' column with a KDE overlay
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(cleaned_data['amt'], bins=50, kde=True, color='blue', ax=ax)
ax.set_title('Transaction Amount Distribution')
ax.set_xlabel('Transaction Amount')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
# Bar chart with the number of rows for each 'is_fraud' value
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=cleaned_data, x='is_fraud', palette='Set2', ax=ax)
ax.set_title('Fraud vs Non-Fraud Transactions')
plt.show()



In [None]:
# Correlation heatmap
# Restrict the frame to numeric/bool columns before calling .corr():
# cleaned_data still holds datetime columns (and possibly text columns), and
# pandas >= 2.0 no longer drops non-numeric columns silently, so calling
# .corr() on the full frame can raise.
numeric_data = cleaned_data.select_dtypes(include=['number', 'bool'])
correlation = numeric_data.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(correlation, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap')
plt.show()


In [None]:
# Step 4: Prepare Data for Machine Learning
# Feature selection: the numeric inputs the model is trained on
features = ['amt', 'age', 'city_pop', 'distance']
X = cleaned_data[features]
y = cleaned_data['is_fraud']

# Split the data into training (70%) and testing (30%) sets.
# stratify=y keeps the is_fraud class ratio the same in both partitions, so
# the evaluation metrics are not distorted by an unlucky draw of the minority
# class (fraud labels are presumably heavily imbalanced -- confirm on the data).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Step 5: Train a Machine Learning Model
# Random forest with 100 trees and a fixed seed
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)

# Step 6: Evaluate the Model
# Hard class predictions, plus the probability column at index 1
y_pred = rf_model.predict(X_test)
class_probabilities = rf_model.predict_proba(X_test)
y_pred_proba = class_probabilities[:, 1]

# Per-class report and ROC-AUC on the held-out set
clf_report = classification_report(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
print("Classification Report:")
print(clf_report)
print(f"ROC-AUC Score: {roc_auc:.4f}")

# Step 7: Save the Model for Deployment
# Serialize the fitted random forest to disk with joblib
import joblib

model_file = "fraud_detection_model.pkl"
joblib.dump(rf_model, filename=model_file)
print(f"Model saved as {model_file}!")

# Step 8: Automate Pipeline for Large Dataset (if needed)
# Reusable version of the chunked cleaning steps
def fraud_detection_pipeline(file_path, chunk_size=50000):
    """Load a transactions CSV in chunks, clean each chunk, return one frame.

    Each chunk is processed as follows:
      * ``trans_date_trans_time`` and ``dob`` are parsed with explicit formats
        and ``unix_time`` is converted from epoch seconds to a datetime;
      * ``age`` is added as the approximate number of whole years between
        ``dob`` and ``trans_date_trans_time``;
      * ``distance`` is added by applying ``calculate_distance`` to every row;
      * the ``street`` and ``trans_num`` columns are dropped.

    Args:
        file_path: Path of the CSV file to read.
        chunk_size: Number of rows read per chunk.

    Returns:
        A single DataFrame with all cleaned chunks concatenated and a fresh
        0..n-1 index.

    Raises:
        ValueError: If a date column does not match its expected format, or
            if no chunk was read (``pd.concat`` rejects an empty list).
        KeyError: If one of the columns listed above is missing from the file.
    """
    cleaned_chunks = []
    for chunk in pd.read_csv(file_path, chunksize=chunk_size):
        # Parse the date/time columns with explicit formats
        chunk['trans_date_trans_time'] = pd.to_datetime(chunk['trans_date_trans_time'], format='%Y-%m-%d %H:%M:%S')
        chunk['dob'] = pd.to_datetime(chunk['dob'], format='%Y-%m-%d')
        chunk['unix_time'] = pd.to_datetime(chunk['unix_time'], unit='s')

        # Age at transaction time. It is measured against the transaction
        # timestamp, not the wall clock, so the same input always produces the
        # same feature values. Dividing by 365 ignores leap days.
        chunk['age'] = (chunk['trans_date_trans_time'] - chunk['dob']).dt.days // 365

        # Row-wise geodesic distance in miles
        chunk['distance'] = chunk.apply(calculate_distance, axis=1)

        # Drop columns that are not needed downstream
        chunk = chunk.drop(columns=['street', 'trans_num'])
        cleaned_chunks.append(chunk)

    cleaned_data = pd.concat(cleaned_chunks, ignore_index=True)
    return cleaned_data


In [None]:
import pickle

# Round-trip the trained model through pickle. The object written must be the
# fitted estimator (rf_model), not model_file, which is only the filename
# string and has no .predict method.
with open('model_pickle.obj', 'wb') as f:
    pickle.dump(rf_model, f)

# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this notebook or another trusted source.
with open('model_pickle.obj', 'rb') as f:
    mp = pickle.load(f)

# Despite its name, this variable holds the reloaded model's predicted
# is_fraud labels for X_test; the name is kept so existing references still work.
Age_prediction_lin2 = mp.predict(X_test)