In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/ieee-fraud-detection/sample_submission.csv
/kaggle/input/ieee-fraud-detection/test_identity.csv
/kaggle/input/ieee-fraud-detection/train_identity.csv
/kaggle/input/ieee-fraud-detection/test_transaction.csv
/kaggle/input/ieee-fraud-detection/train_transaction.csv


In [2]:
!pip install mlflow dagshub
import dagshub
import mlflow
dagshub.init(repo_owner='skara-21', repo_name='Assignment2_Fraud_Detection', mlflow=True)

Collecting mlflow
  Downloading mlflow-2.22.0-py3-none-any.whl.metadata (30 kB)
Collecting dagshub
  Downloading dagshub-0.5.9-py3-none-any.whl.metadata (12 kB)
Collecting mlflow-skinny==2.22.0 (from mlflow)
  Downloading mlflow_skinny-2.22.0-py3-none-any.whl.metadata (31 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.22.0->mlflow)
  Downloading databricks_sdk-0.50.0-py3-none-any.whl.metadata (38 kB)
Collecting fastapi<1 (from mlflow-skinny==2.22.0->mlflow)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn<1 (from mlflow-skinny==2.22.0->mlflow)
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting appdirs>=1.4.4 (from dagshub)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting dacite~=1



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=96a47721-4f27-4f59-b24e-3d67f146836c&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=f2d9709e9ecc777da908a60404cb82826eb79c20320195646205b31d8044cee1




Output()

In [3]:
train_id=pd.read_csv('/kaggle/input/ieee-fraud-detection/train_identity.csv')
train_trans=pd.read_csv('/kaggle/input/ieee-fraud-detection/train_transaction.csv')

In [4]:
train=pd.merge(train_trans,train_id,on='TransactionID',how='left')

In [5]:
del train_trans,train_id

In [6]:
cat_columns=['ProductCD','addr1','addr2','P_emaildomain',
             'R_emaildomain','DeviceType','DeviceInfo',
            'user_has_ever_been_fraud','userId']
cat_columns+=['card'+str(i) for i in range(1,7)]
cat_columns+=['M'+str(i) for i in range(1,10)]
cat_columns+=['id_'+str(i) for i in range(12,39)]

num_columns=[col for col in train.columns.tolist() if col not in cat_columns and col != 'isFraud']

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin

In [8]:
class DropMissing(BaseEstimator, TransformerMixin):
    def __init__(self,threshold=90):
        self.to_remove=None

    def fit(self,X, y=None):
        cols_with_missing = [col for col in X.columns
                         if X[col].isnull().any()]
        to_remove=[]
        for col in cols_with_missing:
            null_percentile=(X[col].isnull().mean())*100
            if null_percentile>90: to_remove.append(col)
        self.to_remove=to_remove
        print('dropped missing')
        return self

    def transform(self,X):
        if self.to_remove==None:
            raise Error('you must run fit method first to drop columns with missing values')
            return X
        else:
            return X.drop(self.to_remove,axis=1)

In [9]:
class FillNaN(BaseEstimator, TransformerMixin):
    def __init__(self,cat_columns,num_columns,threshold=90):
        self.to_remove=None
        self.num_columns=num_columns
        self.cat_columns=cat_columns
    def fit(self,X,y=None):
        X_cp=X.copy()
        to_remove={}
        for col in self.num_columns:
            if col in X_cp.columns:
                mode_val = X_cp[col].mode()[0]
                X_cp.fillna({col:mode_val}, inplace=True)
                to_remove[col]=mode_val
        self.to_remove = to_remove
        for col in self.cat_columns:
            X_cp.fillna({col:'missing'}, inplace=True)

        print('filled nan')
        return self

    def transform(self,X):
        if self.to_remove==None:
            raise Error('you must run fit method first to fill NaN values')
        else:
            X_cp=X.copy()
            for col,val in self.to_remove.items():
                X_cp.fillna({col:val}, inplace=True)
            return X_cp

In [10]:
from scipy.stats.mstats import winsorize
class RemoveOutliers(BaseEstimator, TransformerMixin):
    def __init__(self,num_columns,threshold=[0.01,0.01]):
        self.num_columns=num_columns
        self.threshold=threshold
    def fit(self,X,y=None):
        for col in self.num_columns:
            if col in X.columns:
                X[col]=winsorize(X[col], limits=self.threshold)

        print('removed outliers')
        return self
    def transform(self,X):
        for col in self.num_columns:
            if col in X.columns:
                X[col]=winsorize(X[col], limits=self.threshold)
        return X

In [11]:
from category_encoders import WOEEncoder

class BinaryNonBinaryEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, binary_threshold=2):
        self.binary_threshold = binary_threshold
        self.binary_cols_ = None
        self.non_binary_cols_ = None
        self.woe_encoder_ = None
        self.ohe_columns_ = None
        
    def fit(self, X, y):
        self.binary_cols_ = [col for col in X.columns 
                           if X[col].nunique() <= self.binary_threshold]
        self.non_binary_cols_ = [col for col in X.columns 
                               if X[col].nunique() > self.binary_threshold]
        
        if self.non_binary_cols_:
            self.woe_encoder_ = WOEEncoder(
                cols=self.non_binary_cols_,

            )
            self.woe_encoder_.fit(X[self.non_binary_cols_], y)
        
        if self.binary_cols_:
            dummy_df = pd.get_dummies(X[self.binary_cols_], drop_first=True, dtype=int)
            self.ohe_columns_ = dummy_df.columns.tolist()

        print('encoded')
        
        return self
        
    def transform(self, X):
        X = X.copy()
        if self.non_binary_cols_ and self.woe_encoder_:
            X[self.non_binary_cols_] = self.woe_encoder_.transform(X[self.non_binary_cols_])
        
        if self.binary_cols_:
            dummy_df = pd.get_dummies(X[self.binary_cols_], drop_first=True, dtype=int)
            
            missing_cols = set(self.ohe_columns_) - set(dummy_df.columns)
            for col in missing_cols:
                dummy_df[col] = 0
            
            dummy_df = dummy_df.reindex(columns=self.ohe_columns_, fill_value=0)
            
            X = pd.concat([
                X.drop(columns=self.binary_cols_),
                dummy_df
            ], axis=1)
        
        return X
    
    def fit_transform(self, X, y=None, **fit_params):
        if y is None:
            raise ValueError("y cannot be None for WOE encoding")
        return self.fit(X, y).transform(X)

In [12]:
from sklearn.feature_selection import RFE
from xgboost import XGBRegressor
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

class XGBRFE(BaseEstimator, TransformerMixin):
    def __init__(self, 
                 n_features_to_select=150,
                 n_estimators=100,  # Reduced from 200
                 max_depth=6,       # Reduced from 8
                 learning_rate=0.05, # Reduced from 0.1
                 subsample=0.8,
                 colsample_bytree=0.8,
                 n_jobs=-1,
                 random_state=42,
                 verbosity=0):
        self.n_features_to_select = n_features_to_select
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbosity = verbosity
        self.selected_features_ = None
        self.support_ = None
        self.ranking_ = None

    def fit(self, X, y):
        print('Starting optimized feature selection...')
        
        # Lightweight estimator configuration
        estimator = XGBRegressor(
            n_estimators=self.n_estimators,
            max_depth=self.max_depth,
            learning_rate=self.learning_rate,
            subsample=self.subsample,
            colsample_bytree=self.colsample_bytree,
            n_jobs=self.n_jobs,
            random_state=self.random_state,
            verbosity=self.verbosity,
            tree_method='hist',  # More memory-efficient
            booster='gbtree',    # Default, but explicit
            objective='reg:squarederror'
        )
        
        # Two-phase feature selection
        if X.shape[1] > 200:  # Only if we have many features
            print('Phase 1: Quick elimination of worst features')
            initial_selector = RFE(
                estimator=estimator,
                n_features_to_select=min(200, X.shape[1]),
                step=max(1, X.shape[1]//20)  # Aggressive step size
            )
            initial_selector.fit(X, y)
            X = X.loc[:, initial_selector.support_]
        
        print('Phase 2: Final feature selection')
        final_selector = RFE(
            estimator=estimator,
            n_features_to_select=self.n_features_to_select,
            step=1  # More precise for final selection
        )
        final_selector.fit(X, y)
        
        # Store results
        if hasattr(X, 'columns'):
            self.selected_features_ = X.columns[final_selector.support_].tolist()
        self.support_ = final_selector.support_
        self.ranking_ = final_selector.ranking_
        
        print(f'Completed. Selected {len(self.selected_features_)} features.')
        return self

    def transform(self, X):
        if self.selected_features_ is None:
            raise RuntimeError("Must fit transformer before transforming")
            
        if isinstance(X, pd.DataFrame):
            return X[self.selected_features_]
        return X[:, self.support_]

    def get_support(self, indices=False):
        if indices:
            return np.where(self.support_)[0]
        return self.support_

In [13]:
class CorrelationRemover(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=0.85):
        self.threshold = threshold
        self.cols_to_drop_ = None
        
    def fit(self, X, y=None):
        if isinstance(X, pd.DataFrame):
            corr_matrix = X.corr().abs()
            upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
            self.cols_to_drop_ = [col for col in X.columns 
                                if any(upper[col] > self.threshold)]

        print('removed correlations')
        return self
        
    def transform(self, X):
        if self.cols_to_drop_ is None:
            raise RuntimeError("Must fit transformer before transforming data")
        return X.drop(columns=self.cols_to_drop_)

In [14]:
print('done')

done


In [16]:
!pip install --upgrade scikit-learn==1.0.2 imbalanced-learn

Collecting scikit-learn==1.0.2
  Downloading scikit-learn-1.0.2.tar.gz (6.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m66.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mPreparing metadata [0m[1;32m([0m[32mpyproject.toml[0m[1;32m)[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (pyproject.toml) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See abo

In [19]:
# First ensure you have the right packages
!pip install scikit-learn==1.2.2 imbalanced-learn==0.10.1
!pip install mlflow dagshub



# Training

In [20]:
import mlflow
import mlflow.sklearn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import (roc_auc_score, average_precision_score, 
                           precision_score, recall_score, classification_report, 
                           f1_score, PrecisionRecallDisplay, RocCurveDisplay)
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import precision_recall_curve


mlflow.set_tracking_uri('https://dagshub.com/skara-21/Assignment2_Fraud_Detection.mlflow')
mlflow.set_experiment("Fraud_Detection_LogisticRegression")

X = train.drop('isFraud', axis=1)
y = train['isFraud']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

fraud_pipeline = ImbPipeline([
    ('drop_missing', DropMissing(threshold=90)),
    ('fill_na', FillNaN(cat_columns=cat_columns, num_columns=num_columns)),
    ('remove_outliers', RemoveOutliers(num_columns=num_columns, threshold=[0.01, 0.01])),
    ('binary_encoder', BinaryNonBinaryEncoder(binary_threshold=2)),
    ('correlation_remover', CorrelationRemover(threshold=0.85)),
    ('feature_selector', XGBRFE(n_features_to_select=int(len(X_train.columns)*0.8))),
    ('scaler', StandardScaler()),
    ('sampler', SMOTE(sampling_strategy=0.5, random_state=42)),  # Oversample minority to 50% of majority
    ('undersampler', RandomUnderSampler(sampling_strategy=0.8, random_state=42)),  # Reduce majority
    ('classifier', LogisticRegression(
        class_weight='balanced',
        penalty='l1',
        solver='liblinear',
        C=0.1,
        random_state=42,
        max_iter=1000
    ))
])

with mlflow.start_run(run_name="LogisticRegression_With_Imbalance_Handling"):
    mlflow.log_params({
        "sampling_strategy": "SMOTE(0.5) + RandomUnderSampler(0.8)",
        "class_weight": "balanced",
        "penalty": "l1",
        "C": 0.1,
        "solver": "liblinear"
    })
    
    fraud_pipeline.fit(X_train, y_train)
    
    pre_sampling_pipeline = fraud_pipeline[:-3]
    X_train_transformed = pre_sampling_pipeline.transform(X_train)
    
    if not isinstance(X_train_transformed, pd.DataFrame):
        try:
            selected_features = fraud_pipeline.named_steps['feature_selector'].selected_features_
            X_train_transformed = pd.DataFrame(X_train_transformed, columns=selected_features)
        except:
            X_train_transformed = pd.DataFrame(X_train_transformed, 
                                             columns=[f"feature_{i}" for i in range(X_train_transformed.shape[1])])
    
    y_pred_proba = fraud_pipeline.predict_proba(X_val)[:, 1]
    y_pred = fraud_pipeline.predict(X_val)
    
    precision, recall, thresholds = precision_recall_curve(y_val, y_pred_proba)
    f1_scores = 2*(precision*recall)/(precision+recall)
    optimal_idx = np.argmax(f1_scores)
    optimal_threshold = thresholds[optimal_idx]
    y_pred_optimal = (y_pred_proba >= optimal_threshold).astype(int)
    
    metrics = {
        "val_auc": roc_auc_score(y_val, y_pred_proba),
        "val_ap": average_precision_score(y_val, y_pred_proba),
        "val_f1": f1_score(y_val, y_pred_optimal),
        "val_f1_default_threshold": f1_score(y_val, y_pred),
        "optimal_threshold": optimal_threshold,
        "fraud_precision": precision_score(y_val, y_pred_optimal),
        "fraud_recall": recall_score(y_val, y_pred_optimal),
        "fraud_precision_default": precision_score(y_val, y_pred),
        "fraud_recall_default": recall_score(y_val, y_pred)
    }
    
    mlflow.log_metrics(metrics)
    
    mlflow.log_dict(classification_report(y_val, y_pred_optimal, output_dict=True), 
                   "optimal_threshold_classification_report.json")
    mlflow.log_dict(classification_report(y_val, y_pred, output_dict=True), 
                   "default_threshold_classification_report.json")
    
    if hasattr(fraud_pipeline.named_steps['classifier'], 'coef_'):
        coefs = fraud_pipeline.named_steps['classifier'].coef_[0]
        feature_importance = pd.DataFrame({
            'feature': X_train_transformed.columns,
            'importance': np.abs(coefs)
        }).sort_values('importance', ascending=False)
        
        mlflow.log_dict(feature_importance.head(20).to_dict(), "top_features.json")
        
    
    mlflow.sklearn.log_model(
        fraud_pipeline,
        "fraud_detection_pipeline",
        registered_model_name="Fraud_Logistic_Pipeline_With_Imbalance_Handling"
    )
    
    print(f"Training complete. Optimal F1 Score: {metrics['val_f1']:.4f}")
    print(f"Optimal threshold: {optimal_threshold:.4f}")

dropped missing
filled nan
removed outliers
encoded


  return op(a, b)


removed correlations
Starting optimized feature selection...
Phase 1: Quick elimination of worst features
Phase 2: Final feature selection
Completed. Selected 200 features.


Successfully registered model 'Fraud_Logistic_Pipeline_With_Imbalance_Handling'.
2025/04/29 08:42:18 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Fraud_Logistic_Pipeline_With_Imbalance_Handling, version 1
Created version '1' of model 'Fraud_Logistic_Pipeline_With_Imbalance_Handling'.


Training complete. Optimal F1 Score: 0.5227
Optimal threshold: 0.8968
🏃 View run LogisticRegression_With_Imbalance_Handling at: https://dagshub.com/skara-21/Assignment2_Fraud_Detection.mlflow/#/experiments/4/runs/c43c9ea1103045c28577044b117f88e3
🧪 View experiment at: https://dagshub.com/skara-21/Assignment2_Fraud_Detection.mlflow/#/experiments/4
