In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import (
    auc,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm_notebook

In [None]:
pip install mlflow boto3

In [None]:
import boto3
# Show which boto3 version the kernel picked up.
print(boto3.__version__)

In [None]:
import os

# Set the AWS profile to use: exports AWS_PROFILE=ml_user for this process.
# NOTE(review): presumably read by boto3 / MLflow when they resolve AWS
# credentials -- confirm that an 'ml_user' profile exists on this machine.
os.environ['AWS_PROFILE'] = 'ml_user'

In [None]:
import mlflow
# Send runs to the MLflow tracking server at this address (localhost, port 5000).
mlflow.set_tracking_uri("http://127.0.0.1:5000")
# Select the experiment under which the runs started below are recorded.
mlflow.set_experiment("Fraud detection")

In [2]:
def load_data(training_data_path,test_data_path):
    """Read the training and test CSV files.

    Parameters
    ----------
    training_data_path : path-like
        Location of the training CSV.
    test_data_path : path-like
        Location of the test CSV.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_data, test_data)``, in that order.
    """
    # The training file is read first, then the test file.
    frames = [pd.read_csv(csv_path) for csv_path in (training_data_path, test_data_path)]
    return tuple(frames)

In [None]:
# TODO: point these at the training / test CSV files before running this cell.
training_data_path=''
test_data_path=''
# load_data returns a (train, test) tuple -- unpack it into the two frames.
df_train,df_test=load_data(training_data_path,test_data_path)

In [3]:
def preprocess_numerical_features(df,drop_col=None):
    """Derive calendar features from ``TransactionDT`` and log-scale the amount.

    ``TransactionDT`` is interpreted as a number of seconds elapsed since
    ``START_DATE`` (2017-12-01). The following columns are added:

    * ``Date``     -- the reconstructed timestamp
    * ``weekdays`` -- day of week (0 = Monday)
    * ``Hour``     -- hour of day
    * ``Day``      -- day of month

    ``TransactionAmt`` is replaced by its natural logarithm to reduce skew.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``TransactionDT`` and ``TransactionAmt`` columns.
    drop_col : label or list of labels, optional
        Columns to remove once the features have been extracted.

    Returns
    -------
    pandas.DataFrame
        A new frame. The input is not modified, so the calling cell can be
        re-run without log-transforming the amount twice.
    """
    START_DATE = '2017-12-01'

    # Work on a copy so the caller's frame keeps its raw values.
    df = df.copy()

    # Convert the elapsed-seconds offset into an actual timestamp (vectorised).
    df['Date'] = pd.Timestamp(START_DATE) + pd.to_timedelta(df['TransactionDT'], unit='s')

    # Calendar features extracted from the timestamp.
    df['weekdays'] = df['Date'].dt.dayofweek
    df['Hour'] = df['Date'].dt.hour
    df['Day'] = df['Date'].dt.day

    # Log transform of the transaction amount to reduce skew.
    # NOTE(review): np.log yields -inf for 0 and NaN for negative amounts --
    # this assumes amounts are strictly positive; confirm against the data.
    df['TransactionAmt'] = np.log(df['TransactionAmt'])

    # Drop the columns the caller no longer needs (e.g. the raw TransactionDT).
    if drop_col is not None:
        df = df.drop(columns=drop_col)

    return df

In [None]:
# TransactionDT is superseded by the derived weekdays / Hour / Day features.
# The intermediate 'Date' timestamp is dropped as well: it is a datetime64
# column, which XGBoost does not accept as a feature dtype.
drop_col=['TransactionDT','Date']
df_train=preprocess_numerical_features(df_train,drop_col=drop_col)
df_test=preprocess_numerical_features(df_test,drop_col=drop_col)

In [None]:
def encode_categorical_features(train_df,test_df):
    """Label-encode every object-dtype column consistently across both frames.

    For each column of ``train_df`` whose dtype is ``object``, missing values
    are replaced by the string ``'Unknown'`` and the values are mapped to
    integer codes. One encoder per column is fitted on the union of the train
    and test values, so a given category receives the same code in both
    frames and categories that only occur in the test set do not raise.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames to encode. Neither is modified.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_encoded, test_encoded)`` -- copies of the inputs with the
        categorical columns replaced by their integer codes.
    """
    # .copy() must be *called*; binding the method itself breaks everything below.
    train_df = train_df.copy()
    test_df = test_df.copy()

    for col in train_df.columns:
        if train_df[col].dtype != 'object':
            continue

        # Fill NaN and normalise to str so the encoder sees one homogeneous type.
        train_values = train_df[col].fillna('Unknown').astype(str)
        # A column may exist only in train; then only the train side is encoded.
        in_test = col in test_df.columns

        if in_test:
            test_values = test_df[col].fillna('Unknown').astype(str)
            known_values = pd.concat([train_values, test_values], ignore_index=True)
        else:
            known_values = train_values

        # Fit once on all known categories, then only *transform* each frame --
        # re-fitting on the test frame would assign unrelated codes.
        le = LabelEncoder()
        le.fit(known_values)

        train_df[col] = le.transform(train_values)
        if in_test:
            test_df[col] = le.transform(test_values)

    return train_df, test_df

In [None]:
# Encode the categorical columns of both frames; the results are bound to new
# names (train / test) rather than overwriting df_train / df_test.
train,test=encode_categorical_features(df_train,df_test)

In [None]:
# Separate the 'isFraud' target from the features, then hold out 20% of the
# rows for validation (fixed seed so the split is reproducible).
y = train['isFraud']
X = train.drop(columns=['isFraud'])
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


In [None]:
# Class weight for the imbalanced target: ratio of negative to positive samples.
# Computed on the training split only, so the validation labels do not
# influence a value that is fed into model training.
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

In [None]:
with mlflow.start_run():
    # Hyper-parameters; logged to the MLflow run exactly as passed to the model.
    # NOTE(review): 'gpu_hist' needs a CUDA-enabled XGBoost build and is
    # deprecated from XGBoost 2.0 in favour of tree_method='hist' with
    # device='cuda' -- confirm against the installed version.
    params=dict(n_estimators=500,
                max_depth=9,
                learning_rate=0.05,
                subsample=0.9,
                colsample_bytree=0.9,
                tree_method='gpu_hist')
    mlflow.log_params(params)

    model=xgb.XGBClassifier(
        **params,
        scale_pos_weight=scale_pos_weight,
        random_state=42,
        eval_metric='logloss',
        missing=np.nan)
    model.fit(X_train,y_train)

    # Hard labels feed the classification report; the positive-class
    # probability (column 1) feeds the ROC AUC.
    y_pred = model.predict(X_val)
    y_pred_proba = model.predict_proba(X_val)[:, 1]

    auc_score = roc_auc_score(y_val, y_pred_proba)
    # Record the validation AUC on the run so runs can be compared in MLflow.
    mlflow.log_metric("val_auc", auc_score)
    print(f" Model trained! AUC Score: {auc_score:.4f}")
    print("\nClassification Report:")
    # Predictions were made on the validation split, so score against y_val.
    print(classification_report(y_val, y_pred))

    # Log the model to MLflow
    mlflow.xgboost.log_model(model, artifact_path="model")
        