In [33]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/ieee-fraud-detection/sample_submission.csv
/kaggle/input/ieee-fraud-detection/test_identity.csv
/kaggle/input/ieee-fraud-detection/train_identity.csv
/kaggle/input/ieee-fraud-detection/test_transaction.csv
/kaggle/input/ieee-fraud-detection/train_transaction.csv


In [34]:
!pip install mlflow dagshub
import dagshub
import mlflow
dagshub.init(repo_owner='skara-21', repo_name='Assignment2_Fraud_Detection', mlflow=True)



In [35]:
train_id=pd.read_csv('/kaggle/input/ieee-fraud-detection/train_identity.csv')
train_trans=pd.read_csv('/kaggle/input/ieee-fraud-detection/train_transaction.csv')

In [36]:
train=pd.merge(train_trans,train_id,on='TransactionID',how='left')

In [37]:
del train_trans,train_id

In [38]:
cat_columns=['ProductCD','addr1','addr2','P_emaildomain',
             'R_emaildomain','DeviceType','DeviceInfo',
            'user_has_ever_been_fraud','userId']
cat_columns+=['card'+str(i) for i in range(1,7)]
cat_columns+=['M'+str(i) for i in range(1,10)]
cat_columns+=['id_'+str(i) for i in range(12,39)]

num_columns=[col for col in train.columns.tolist() if col not in cat_columns and col != 'isFraud']

In [39]:
from sklearn.base import BaseEstimator, TransformerMixin

In [40]:
class DropMissing(BaseEstimator, TransformerMixin):
    def __init__(self,threshold=90):
        self.to_remove=None

    def fit(self,X, y=None):
        cols_with_missing = [col for col in X.columns
                         if X[col].isnull().any()]
        to_remove=[]
        for col in cols_with_missing:
            null_percentile=(X[col].isnull().mean())*100
            if null_percentile>90: to_remove.append(col)
        self.to_remove=to_remove
        print('dropped missing')
        return self

    def transform(self,X):
        if self.to_remove==None:
            raise Error('you must run fit method first to drop columns with missing values')
            return X
        else:
            return X.drop(self.to_remove,axis=1)

In [41]:
class FillNaN(BaseEstimator, TransformerMixin):
    def __init__(self,cat_columns,num_columns,threshold=90):
        self.to_remove=None
        self.num_columns=num_columns
        self.cat_columns=cat_columns
    def fit(self,X,y=None):
        X_cp=X.copy()
        to_remove={}
        for col in self.num_columns:
            if col in X_cp.columns:
                mode_val = X_cp[col].mode()[0]
                X_cp.fillna({col:mode_val}, inplace=True)
                to_remove[col]=mode_val
        self.to_remove = to_remove
        for col in self.cat_columns:
            X_cp.fillna({col:'missing'}, inplace=True)

        print('filled nan')
        return self

    def transform(self,X):
        if self.to_remove==None:
            raise Error('you must run fit method first to fill NaN values')
        else:
            X_cp=X.copy()
            for col,val in self.to_remove.items():
                X_cp.fillna({col:val}, inplace=True)
            return X_cp

In [42]:
from scipy.stats.mstats import winsorize
class RemoveOutliers(BaseEstimator, TransformerMixin):
    def __init__(self,num_columns,threshold=[0.01,0.01]):
        self.num_columns=num_columns
        self.threshold=threshold
    def fit(self,X,y=None):
        for col in self.num_columns:
            if col in X.columns:
                X[col]=winsorize(X[col], limits=self.threshold)

        print('removed outliers')
        return self
    def transform(self,X):
        for col in self.num_columns:
            if col in X.columns:
                X[col]=winsorize(X[col], limits=self.threshold)
        return X

In [43]:
from category_encoders import WOEEncoder

class BinaryNonBinaryEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, binary_threshold=2):
        self.binary_threshold = binary_threshold
        self.binary_cols_ = None
        self.non_binary_cols_ = None
        self.woe_encoder_ = None
        self.ohe_columns_ = None
        
    def fit(self, X, y):
        self.binary_cols_ = [col for col in X.columns 
                           if X[col].nunique() <= self.binary_threshold]
        self.non_binary_cols_ = [col for col in X.columns 
                               if X[col].nunique() > self.binary_threshold]
        
        if self.non_binary_cols_:
            self.woe_encoder_ = WOEEncoder(
                cols=self.non_binary_cols_,

            )
            self.woe_encoder_.fit(X[self.non_binary_cols_], y)
        
        if self.binary_cols_:
            dummy_df = pd.get_dummies(X[self.binary_cols_], drop_first=True, dtype=int)
            self.ohe_columns_ = dummy_df.columns.tolist()

        print('encoded')
        
        return self
        
    def transform(self, X):
        X = X.copy()
        if self.non_binary_cols_ and self.woe_encoder_:
            X[self.non_binary_cols_] = self.woe_encoder_.transform(X[self.non_binary_cols_])
        
        if self.binary_cols_:
            dummy_df = pd.get_dummies(X[self.binary_cols_], drop_first=True, dtype=int)
            
            missing_cols = set(self.ohe_columns_) - set(dummy_df.columns)
            for col in missing_cols:
                dummy_df[col] = 0
            
            dummy_df = dummy_df.reindex(columns=self.ohe_columns_, fill_value=0)
            
            X = pd.concat([
                X.drop(columns=self.binary_cols_),
                dummy_df
            ], axis=1)
        
        return X
    
    def fit_transform(self, X, y=None, **fit_params):
        if y is None:
            raise ValueError("y cannot be None for WOE encoding")
        return self.fit(X, y).transform(X)

In [44]:
from sklearn.feature_selection import RFE
from xgboost import XGBRegressor
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

class XGBRFE(BaseEstimator, TransformerMixin):
    def __init__(self, 
                 n_features_to_select=150,
                 n_estimators=100,  # Reduced from 200
                 max_depth=6,       # Reduced from 8
                 learning_rate=0.05, # Reduced from 0.1
                 subsample=0.8,
                 colsample_bytree=0.8,
                 n_jobs=-1,
                 random_state=42,
                 verbosity=0):
        self.n_features_to_select = n_features_to_select
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbosity = verbosity
        self.selected_features_ = None
        self.support_ = None
        self.ranking_ = None

    def fit(self, X, y):
        print('Starting optimized feature selection...')
        
        # Lightweight estimator configuration
        estimator = XGBRegressor(
            n_estimators=self.n_estimators,
            max_depth=self.max_depth,
            learning_rate=self.learning_rate,
            subsample=self.subsample,
            colsample_bytree=self.colsample_bytree,
            n_jobs=self.n_jobs,
            random_state=self.random_state,
            verbosity=self.verbosity,
            tree_method='hist',  # More memory-efficient
            booster='gbtree',    # Default, but explicit
            objective='reg:squarederror'
        )
        
        # Two-phase feature selection
        if X.shape[1] > 200:  # Only if we have many features
            print('Phase 1: Quick elimination of worst features')
            initial_selector = RFE(
                estimator=estimator,
                n_features_to_select=min(200, X.shape[1]),
                step=max(1, X.shape[1]//20)  # Aggressive step size
            )
            initial_selector.fit(X, y)
            X = X.loc[:, initial_selector.support_]
        
        print('Phase 2: Final feature selection')
        final_selector = RFE(
            estimator=estimator,
            n_features_to_select=self.n_features_to_select,
            step=1  # More precise for final selection
        )
        final_selector.fit(X, y)
        
        # Store results
        if hasattr(X, 'columns'):
            self.selected_features_ = X.columns[final_selector.support_].tolist()
        self.support_ = final_selector.support_
        self.ranking_ = final_selector.ranking_
        
        print(f'Completed. Selected {len(self.selected_features_)} features.')
        return self

    def transform(self, X):
        if self.selected_features_ is None:
            raise RuntimeError("Must fit transformer before transforming")
            
        if isinstance(X, pd.DataFrame):
            return X[self.selected_features_]
        return X[:, self.support_]

    def get_support(self, indices=False):
        if indices:
            return np.where(self.support_)[0]
        return self.support_

In [45]:
class CorrelationRemover(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=0.85):
        self.threshold = threshold
        self.cols_to_drop_ = None
        
    def fit(self, X, y=None):
        if isinstance(X, pd.DataFrame):
            corr_matrix = X.corr().abs()
            upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
            self.cols_to_drop_ = [col for col in X.columns 
                                if any(upper[col] > self.threshold)]

        print('removed correlations')
        return self
        
    def transform(self, X):
        if self.cols_to_drop_ is None:
            raise RuntimeError("Must fit transformer before transforming data")
        return X.drop(columns=self.cols_to_drop_)

In [46]:
print('done')

done


# Training

In [47]:
import mlflow
import mlflow.xgboost
import mlflow.sklearn

import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import (roc_auc_score, average_precision_score, precision_score, recall_score, classification_report, f1_score)

from xgboost import XGBClassifier, plot_importance



mlflow.set_tracking_uri('https://dagshub.com/skara-21/Assignment2_Fraud_Detection.mlflow')
mlflow.set_experiment("XGBoost_Training")

X = train.drop('isFraud', axis=1)
y = train['isFraud']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

fraud_pipeline = Pipeline([
    ('drop_missing', DropMissing(threshold=90)),
    ('fill_na', FillNaN(cat_columns=cat_columns, num_columns=num_columns)),
    ('remove_outliers', RemoveOutliers(num_columns=num_columns,threshold=[0.01, 0.01])),
    ('binary_encoder', BinaryNonBinaryEncoder(binary_threshold=2)),
    ('correlation_remover', CorrelationRemover(threshold=0.85)),
    ('feature_selector', XGBRFE(n_features_to_select=int(len(X_train.columns)*0.8))),
    ('classifier', XGBClassifier(
        scale_pos_weight=(len(y_train)-sum(y_train))/sum(y_train),
        eval_metric='aucpr',
        use_label_encoder=False,
        random_state=42
    ))
])


with mlflow.start_run(run_name="XGBoost_Fraud_Detection"):
    mlflow.log_params({
        "preprocessing_steps": [step[0] for step in fraud_pipeline.steps],
        "scale_pos_weight": (len(y_train)-sum(y_train))/sum(y_train),
        "feature_selection": "XGBRFE"
    })
    
    fraud_pipeline.fit(X_train, y_train)
    
    y_pred_proba = fraud_pipeline.predict_proba(X_val)[:, 1]
    y_pred = fraud_pipeline.predict(X_val)
    
    f1 = f1_score(y_val, y_pred)
    
    mlflow.log_metrics({
        "val_auc": roc_auc_score(y_val, y_pred_proba),
        "val_ap": average_precision_score(y_val, y_pred_proba),
        "val_f1": f1,
        "fraud_precision": precision_score(y_val, y_pred),
        "fraud_recall": recall_score(y_val, y_pred, pos_label=1)
    })
    
    imbalance_report = classification_report(y_val, y_pred, output_dict=True)
    mlflow.log_dict(imbalance_report, "imbalance_classification_report.json")
    
    fig, ax = plt.subplots(figsize=(10, 8))
    plot_importance(fraud_pipeline.named_steps['classifier'], ax=ax, max_num_features=20)
    plt.tight_layout()
    mlflow.log_figure(fig, "feature_importance.png")
    plt.close()
    
    mlflow.sklearn.log_model(
        fraud_pipeline,
        "fraud_detection_pipeline",
        registered_model_name="Fraud_XGBoost_Pipeline"
    )
    
    print(f"Training complete. F1 Score: {f1:.4f}")

dropped missing
filled nan
removed outliers
encoded


  return op(a, b)


removed correlations
Starting optimized feature selection...
Phase 1: Quick elimination of worst features
Phase 2: Final feature selection
Completed. Selected 200 features.


Registered model 'Fraud_XGBoost_Pipeline' already exists. Creating a new version of this model...
2025/04/29 07:11:34 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Fraud_XGBoost_Pipeline, version 2
Created version '2' of model 'Fraud_XGBoost_Pipeline'.


Training complete. F1 Score: 0.4184
🏃 View run XGBoost_Fraud_Detection at: https://dagshub.com/skara-21/Assignment2_Fraud_Detection.mlflow/#/experiments/0/runs/2a993601adc34b838b8de9b15c6a4fe1
🧪 View experiment at: https://dagshub.com/skara-21/Assignment2_Fraud_Detection.mlflow/#/experiments/0
