In [1]:
!pip install geopy scikit-learn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from geopy.distance import geodesic
from sklearn.impute import SimpleImputer

# Load the dataset
file_path = '/content/fraudTest.csv'
data = pd.read_csv(file_path)

# Drop unnecessary columns
cols_to_drop = ['Unnamed: 0', 'cc_num', 'first', 'last', 'street', 'city', 'state', 'zip', 'trans_num', 'dob', 'trans_date_trans_time']
data.drop(columns=cols_to_drop, inplace=True)

# Encode categorical variables
label_encoder = LabelEncoder()
data['gender'] = label_encoder.fit_transform(data['gender'])
data['category'] = label_encoder.fit_transform(data['category'])
data['merchant'] = label_encoder.fit_transform(data['merchant'])

# Create distance feature (distance between transaction and merchant)
def calculate_distance(row):
   if pd.isna(row['lat']) or pd.isna(row['long']) or pd.isna(row['merch_lat']) or pd.isna(row['merch_long']):
        return 0
        user_location = (row['lat'], row['long'])
        merch_location = (row['merch_lat'], row['merch_long'])
        return geodesic(user_location, merch_location).kilometers

data['distance'] = data.apply(calculate_distance, axis=1)
data.drop(columns=['lat', 'long', 'merch_lat', 'merch_long'], inplace=True)

# Separate features and target
X = data.drop(columns=['is_fraud'])
y = data['is_fraud']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Define categorical and numerical features
categorical_features = ['gender', 'category', 'merchant']
numerical_features = ['amt', 'distance']

# Create preprocessing pipelines for categorical and numerical features
categorical_pipeline = Pipeline([
    ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore')), # Use OneHotEncoder to handle categorical features
    ('imputer', SimpleImputer(strategy='most_frequent'))
])

numerical_pipeline = Pipeline([
    ('scaler', StandardScaler())
])

# Combine pipelines using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ])

# Apply preprocessing to training and testing data
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# Impute any remaining NaNs after preprocessing
imputer = SimpleImputer(strategy='mean') # Use mean imputation for any remaining NaNs
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(max_depth=10, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
}

results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    auc = roc_auc_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    results[name] = {'AUC': auc, 'Report': report, 'Confusion Matrix': cm}
    print(f"\n{name} - AUC: {auc:.4f}")
    print(report)
    print(cm)




  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count



Logistic Regression - AUC: 0.4998
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    110715
           1       0.00      0.00      0.00       429

    accuracy                           1.00    111144
   macro avg       0.50      0.50      0.50    111144
weighted avg       0.99      1.00      0.99    111144

[[110681     34]
 [   429      0]]

Decision Tree - AUC: 0.7828
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    110715
           1       0.73      0.57      0.64       429

    accuracy                           1.00    111144
   macro avg       0.86      0.78      0.82    111144
weighted avg       1.00      1.00      1.00    111144

[[110624     91]
 [   186    243]]

Random Forest - AUC: 0.5000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    110715
           1       0.00      0.00      0.00       429

    accuracy       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
