In [1]:
# Credit Card Fraud Detection   https://www.kaggle.com/mlg-ulb/creditcardfraud

# Goal of this model: detect fraudulent credit card (CC) transactions

# Models compared: LogisticRegression & Random Forest

In [2]:
# Read the CSV file (first row is the header, column types are inferred)
# and cache the resulting DataFrame.
from pyspark.sql import SQLContext

from pyspark import SparkContext

# Reuse the running SparkContext when the environment already provides one
# (e.g. a notebook-injected `sc`); otherwise create it. Without this line the
# cell raises NameError on a fresh kernel where `sc` was never defined.
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

df = sqlContext.read.csv('/FileStore/tables/creditcard.csv',header='true', inferSchema='true')
df.cache()

In [3]:
# Class balance check (normal vs. fraudulent transactions):
# first list the distinct values of "Class", then count the rows per value.
class_values = df.select("Class").distinct()
class_values.show()

class_counts = df.groupBy("Class").count()
class_counts.show()


In [4]:
# The Time column is not used as a feature, so it is removed.
# "Class" is renamed to "label" for BinaryClassificationEvaluator.
df = df.drop("Time").withColumnRenamed("Class", "label")

In [5]:
from pyspark.ml.feature import StandardScaler, VectorAssembler

# Assemble the Amount column into the vector column "vec_Amount", which is
# the input column of the StandardScaler configured in the next cell.
amt_assembler = VectorAssembler(
    inputCols=['Amount'],
    outputCol="vec_Amount",
)
df = amt_assembler.transform(df)




In [6]:
# Standardize Amount (center on the mean, scale by the standard deviation);
# the statistics come from fitting the scaler on df.
# NOTE(review): the scaler is fitted on the whole of df, i.e. before the
# train/test split performed in a later cell, so test rows contribute to the
# scaling statistics.
standardizer = StandardScaler(
    inputCol='vec_Amount',
    outputCol='std_Amount',
    withMean=True,
    withStd=True,
)
model = standardizer.fit(df)
df = model.transform(df)

In [7]:
# Build the "features" vector from columns V1..V28 plus the standardized
# amount column.
v_cols = ['V' + str(idx) for idx in range(1, 29)]
sel_cols = v_cols + ['std_Amount']
assembler = VectorAssembler(
    inputCols=sel_cols,
    outputCol="features",
)
df = assembler.transform(df)

In [8]:
# Split the dataset 70/30 into training/testing sets for the random forest
# classifier; the fixed seed makes the random split repeatable.
(trainingData, testData) = df.randomSplit([0.7, 0.3], seed = 100)
from pyspark.ml.classification import RandomForestClassifier
# Random forest training is itself randomized, so pass an explicit seed too;
# otherwise the fitted model depends on the library's default seed and a
# re-run is not guaranteed to reproduce the same metrics.
rf = RandomForestClassifier(labelCol="label", featuresCol="features", seed=100)
rfModel = rf.fit(trainingData)
# Score the held-out rows and preview the label, the prediction, its
# probability vector and the raw transaction amount.
predictions = rfModel.transform(testData)
selected = predictions.select("label", "prediction", "probability", "Amount")
selected.show()

In [9]:
# Evaluate the current `predictions`: area under the precision-recall curve.
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator()
pr_metric = {evaluator.metricName: "areaUnderPR"}
evaluator.evaluate(predictions, pr_metric)

In [10]:
# Area under the ROC curve for the current `predictions`.
roc_metric = {evaluator.metricName: "areaUnderROC"}
evaluator.evaluate(predictions, roc_metric)

In [11]:
# Train a logistic regression on the same training split and score it on the
# test split. `predictions` is rebound here to the logistic-regression output.
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(
    maxIter=10,
    featuresCol="features",
    labelCol="label",
)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)
roc_metric = {evaluator.metricName: "areaUnderROC"}
evaluator.evaluate(predictions, roc_metric)


In [12]:
# Area under the PR curve for the current `predictions`.
# Author's conclusion: area under PR is lower for logistic regression, hence
# RandomForest is taken as the best model.
pr_metric = {evaluator.metricName: "areaUnderPR"}
evaluator.evaluate(predictions, pr_metric)

In [13]:
# Cross-validate the RandomForest estimator over a small hyper-parameter grid
# to get the best model.
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# 3 maxDepth x 2 maxBins x 2 numTrees = 12 candidate parameter maps.
paramGrid = (ParamGridBuilder()
             .addGrid(rf.maxDepth, [2, 4, 6])
             .addGrid(rf.maxBins, [20, 60])
             .addGrid(rf.numTrees, [5, 20])
             .build())
# Rows are assigned to the 10 folds at random; an explicit seed keeps the fold
# assignment, and therefore the selected model, repeatable across re-runs.
# Model selection uses `evaluator` with its own metric setting (it was built
# with default arguments; the documented default metric is areaUnderROC).
cv = CrossValidator(estimator=rf, estimatorParamMaps=paramGrid,
                    evaluator=evaluator, numFolds=10, seed=100)
cvModel = cv.fit(trainingData)
# `predictions` is rebound to the cross-validated model's test-set output.
predictions = cvModel.transform(testData)
evaluator.evaluate(predictions)



In [14]:
# Area under the PR curve for the current `predictions`.
# Author's observation: area under PR is slightly improved, so the bestModel
# is picked next.
pr_metric = {evaluator.metricName: "areaUnderPR"}
evaluator.evaluate(predictions, pr_metric)


In [15]:
# Take the best model found by cross-validation and score every row of df.
# NOTE(review): df is the frame that trainingData was split from, so it
# contains the rows the model was fitted on; metrics computed on
# finalPredictions are therefore not a held-out estimate.
bestModel = cvModel.bestModel
finalPredictions = bestModel.transform(df)
evaluator.evaluate(dataset=finalPredictions)

In [16]:
# Area under the PR curve for finalPredictions.
pr_metric = {evaluator.metricName: "areaUnderPR"}
evaluator.evaluate(finalPredictions, pr_metric)

