In [1]:
import findspark
findspark.init() 

In [2]:
from datetime import datetime
import subprocess

import pyspark
import mlflow
from mlflow.tracking import MlflowClient
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType
from pyspark.sql.functions import hour, minute, second, year, month, dayofmonth, dayofweek, count, to_timestamp, when, isnan
from pyspark.sql.functions import col, isnan, when, count
from pyspark.sql.functions import countDistinct, udf
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

In [3]:
# def run_cmd(args_list):
#     """
#     run linux commands
#     """
#     # import subprocess
#     print('Running system command: {0}'.format(' '.join(args_list)))
#     proc = subprocess.Popen(args_list, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
#     s_output, s_err = proc.communicate()
#     s_return =  proc.returncode
#     return s_return, s_output, s_err 

# def get_files_list():        
#     # Get file list from hdfs
#     (ret, out, err) = run_cmd(['hdfs', 'dfs', '-ls', '-C', '/user/fraud-data'])
#     hdfs_files = [line for line in out.decode().split('\n') if len(line)]
#     hdfs_files.sort()
#     return hdfs_files


# def read_files(file_list):
#     # Define the schema for the DataFrame
#     schema = StructType([
#         StructField("transaction_id", IntegerType(), True),
#         StructField("tx_datetime", StringType(), True),
#         StructField("customer_id", IntegerType(), True),
#         StructField("terminal_id", IntegerType(), True),
#         StructField("tx_amount", DoubleType(), True),
#         StructField("tx_time_seconds", IntegerType(), True),
#         StructField("tx_time_days", IntegerType(), True),
#         StructField("tx_fraud", IntegerType(), True),
#         StructField("tx_fraud_scenario", IntegerType(), True)
#     ])

#     # Load the CSV file into a DataFrame
#     df = (spark.read
#         .format("csv")
#         .schema(schema)
#         .option("header", False)
#         .option("sep", ',')
#         .option("comment", '#')
#         .load(file_list)
#     )
#     return df


# # Define a UDF helper that rewrites an hour field of '24' (chars 11-12 of the string) to '00'
# def convert_timestamp(s):
#     if s[11:13] == '24':
#         return s[:11] + '00' + s[13:]
#     return s


# def preprocess(df):
#     # Convert the tx_datetime column to a timestamp type
#     df = df.limit(df.count() // 10)
#     convert_timestamp_udf = udf(convert_timestamp)
#     df = df.withColumn("tx_datetime", convert_timestamp_udf(df["tx_datetime"]))
#     df = df.withColumn("ts", to_timestamp(df["tx_datetime"], "yyyy-MM-dd HH:mm:ss"))
    
#     df = df.fillna({'terminal_id': 0})
    
#     # Extract new features from the tx_datetime column
#     df = df.withColumn("is_weekend", dayofweek("ts").isin([1,7]).cast("int"))
#     #df = df.withColumn("year", year(df["ts"]))
#     #df = df.withColumn("month", month(df["ts"]))
#     df = df.withColumn("day_of_month", dayofmonth(df["ts"]))
#     df = df.withColumn("day_of_week", dayofweek(df["ts"]))
#     df = df.withColumn("hour", hour(df["ts"]))
#     df = df.withColumn("minute", minute(df["ts"]))
#     df = df.withColumn("second", second(df["ts"]))
    
#     return df

def calculate_accuracy(predictions):
    """Return the percentage of fraudulent transactions that the model flagged.

    Despite the name, this is recall on the fraud class
    (true positives / all actual frauds * 100), not overall accuracy.

    Args:
        predictions: Spark DataFrame containing a ``tx_fraud`` label column
            and a ``prediction`` column; the value 1 marks fraud in both.

    Returns:
        float: Share of rows with ``tx_fraud == 1`` that also have
        ``prediction == 1``, expressed in percent. Returns 0.0 when the frame
        contains no rows with ``tx_fraud == 1`` (recall is undefined there)
        or when none of the fraud rows were predicted as fraud.
    """
    is_fraud = col("tx_fraud") == 1

    # when() without otherwise() yields NULL for non-matching rows and count()
    # skips NULLs, so each expression counts matching rows. A global agg()
    # runs as a single Spark job and always yields exactly one row, even for
    # an empty frame, so there is no missing-group case to guard against.
    totals = predictions.agg(
        count(when(is_fraud & (col("prediction") == 1), 1)).alias("caught_fraud"),
        count(when(is_fraud, 1)).alias("total_fraud"),
    ).first()

    total_fraud = totals["total_fraud"]
    if total_fraud == 0:
        # No positive labels: avoid ZeroDivisionError and report 0%.
        return 0.0
    return (totals["caught_fraud"] / total_fraud) * 100

In [4]:
# Main

if __name__ == "__main__":
    # Compare the registered "latest" and "Staging" versions of the
    # fraud_classifier model on a fixed validation window and log the
    # resulting metrics to a single MLflow run.
    EXPERIMENT_NAME = "Fraud_Data_Validate"
    # (metric-name suffix, model registry version/stage) pairs. The suffix is
    # what appears in the logged metric keys, e.g. "ROC-staging".
    MODEL_VARIANTS = (("latest", "latest"), ("staging", "Staging"))

    spark = (
        pyspark.sql.SparkSession.builder
            #.config('spark.executor.instances', 8)
            .config("spark.executor.cores", 4)
            .appName("fraud_data_validate")
            .getOrCreate()
    )

    try:
        df = spark.read.parquet("/user/transformed_full/")
        # NOTE(review): the bounds are date-only strings; if `ts` is a
        # timestamp column the upper bound is presumably midnight at the start
        # of 2019-11-05, so most of that day is excluded — confirm the window.
        df_validate = df.filter(col('ts').between("2019-10-28", "2019-11-05"))

        # Prepare MLFlow experiment for logging. get_experiment_by_name()
        # returns None for an unknown name, so create the experiment on first
        # use instead of failing on `.experiment_id`.
        client = MlflowClient()
        experiment = client.get_experiment_by_name(EXPERIMENT_NAME)
        if experiment is None:
            experiment_id = client.create_experiment(EXPERIMENT_NAME)
        else:
            experiment_id = experiment.experiment_id

        run_name = 'Fraud_data_validate' + ' ' + str(datetime.now())

        with mlflow.start_run(run_name=run_name, experiment_id=experiment_id):
            # Load every model up front so a missing registry entry fails
            # before any inference job is started.
            models = {
                label: mlflow.spark.load_model(model_uri=f"models:/fraud_classifier/{stage}")
                for label, stage in MODEL_VARIANTS
            }
            # NOTE(review): the evaluator is fed the hard 0/1 `prediction`
            # column, so the reported ROC area is based on a single threshold.
            # If the models expose a score column (e.g. `rawPrediction`),
            # consider switching — this would change logged metric values.
            evaluator = BinaryClassificationEvaluator(labelCol='tx_fraud', rawPredictionCol='prediction')

            # Perform inference via model.transform() and score each model.
            results = {}
            for label, model in models.items():
                predictions = model.transform(df_validate)
                results[label] = (
                    evaluator.evaluate(predictions),
                    calculate_accuracy(predictions),
                )

            run_id = mlflow.active_run().info.run_id
            print(f"Logging metrics to MLflow run {run_id} ...")
            for label, (area_under_roc, accuracy) in results.items():
                mlflow.log_metric(f"ROC-{label}", area_under_roc)
                mlflow.log_metric(f"Acc-{label}", accuracy)
                print(f"Model ROC-{label}: {area_under_roc}")
                print(f"Model Acc-{label}: {accuracy}")
    finally:
        # Always release the Spark session, even if loading or scoring fails.
        spark.stop()

2023/06/19 16:35:20 INFO mlflow.spark: 'models:/fraud_classifier/latest' resolved as 's3://mlops-hw/2/3837f9ac5093472ebcc16699699354ad/artifacts/fraud_classifier'
2023/06/19 16:35:21 INFO mlflow.spark: URI 'models:/fraud_classifier/latest/sparkml' does not point to the current DFS.
2023/06/19 16:35:21 INFO mlflow.spark: File 'models:/fraud_classifier/latest/sparkml' not found on DFS. Will attempt to upload the file.
2023/06/19 16:35:21 INFO mlflow.spark: Copied SparkML model to /tmp/mlflow/da6569a6-20ff-44a0-842c-3e61dc67492a
2023/06/19 16:35:25 INFO mlflow.spark: 'models:/fraud_classifier/Staging' resolved as 's3://mlops-hw/2/349ee05a17c847ac829636098a4a0a9f/artifacts/fraud_classifier'
2023/06/19 16:35:26 INFO mlflow.spark: URI 'models:/fraud_classifier/Staging/sparkml' does not point to the current DFS.
2023/06/19 16:35:26 INFO mlflow.spark: File 'models:/fraud_classifier/Staging/sparkml' not found on DFS. Will attempt to upload the file.
2023/06/19 16:35:27 INFO mlflow.spark: Copied

Logging metrics to MLflow run 4f518306d7864a0d8038b571b6550089 ...
Model ROC-latest: 0.8913836677432935
Model Acc-latest: 78.73607373858214
Model ROC-staging: 0.889044246825055
Model Acc-staging: 78.23287480602342


In [5]:
# # Main

# spark = (
#     pyspark.sql.SparkSession.builder
#         .appName("fraud_data_validate")
#         .getOrCreate()
# )
# spark.conf.set('spark.sql.repl.eagerEval.enabled', True)  # to pretty print pyspark.DataFrame in jupyter

# # Read available files
# new_files = get_files_list()
# print(new_files)

# df = read_files(new_files[1])
# df = preprocess(df)

# # Prepare MLFlow experiment for logging
# client = MlflowClient()
# experiment = client.get_experiment_by_name("Fraud_Data_Validate")
# experiment_id = experiment.experiment_id

# run_name = 'Fraud_data_validate' + ' ' + str(datetime.now())

# with mlflow.start_run(run_name=run_name, experiment_id=experiment_id):
#     test = df.sample(0.1)

#     # Load models
#     model_latest = mlflow.spark.load_model(model_uri=f"models:/fraud_classifier/latest")
#     model_staging = mlflow.spark.load_model(model_uri=f"models:/fraud_classifier/Staging")
#     evaluator = BinaryClassificationEvaluator(labelCol='tx_fraud', rawPredictionCol='prediction')

#     # Perform inference via model.transform()
#     predictions_latest = model_latest.transform(test)
#     areaUnderROC_latest = evaluator.evaluate(predictions_latest)
#     predictions_staging = model_staging.transform(test)
#     areaUnderROC_staging = evaluator.evaluate(predictions_staging)

#     run_id = mlflow.active_run().info.run_id
#     print(f"Logging metrics to MLflow run {run_id} ...")
#     mlflow.log_metric("ROC-latest", areaUnderROC_latest)
#     print(f"Model ROC-latest: {areaUnderROC_latest}")
#     mlflow.log_metric("ROC-staging", areaUnderROC_staging)
#     print(f"Model ROC-staging: {areaUnderROC_staging}")