In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.streaming import StreamingContext

In [2]:
spark = SparkSession \
        .builder \
        .appName("FraudDetection") \
        .getOrCreate()

In [3]:
data = spark.read.format("csv").load("mobilelog.csv",header=True, inferSchema=True)

In [4]:
data.printSchema()

root
 |-- step: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- nameOrig: string (nullable = true)
 |-- oldbalanceOrg: double (nullable = true)
 |-- newbalanceOrig: double (nullable = true)
 |-- nameDest: string (nullable = true)
 |-- oldbalanceDest: double (nullable = true)
 |-- newbalanceDest: double (nullable = true)
 |-- isFraud: integer (nullable = true)
 |-- isFlaggedFraud: integer (nullable = true)



In [5]:
data.show()

+----+--------+---------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|step|    type|   amount|   nameOrig|oldbalanceOrg|newbalanceOrig|   nameDest|oldbalanceDest|newbalanceDest|isFraud|isFlaggedFraud|
+----+--------+---------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|   1| PAYMENT|  9839.64|C1231006815|     170136.0|     160296.36|M1979787155|           0.0|           0.0|      0|             0|
|   1| PAYMENT|  1864.28|C1666544295|      21249.0|      19384.72|M2044282225|           0.0|           0.0|      0|             0|
|   1|TRANSFER|    181.0|C1305486145|        181.0|           0.0| C553264065|           0.0|           0.0|      1|             0|
|   1|CASH_OUT|    181.0| C840083671|        181.0|           0.0|  C38997010|       21182.0|           0.0|      1|             0|
|   1| PAYMENT| 11668.14|C2048537720|      41554.0|      29885.86|M123070170

In [6]:
indexer = StringIndexer(inputCol="type",outputCol="typeindex")
indexed = indexer.fit(data).transform(data)

In [7]:
indexed.select("type","typeindex").distinct().show()

+--------+---------+
|    type|typeindex|
+--------+---------+
|TRANSFER|      3.0|
|CASH_OUT|      0.0|
| CASH_IN|      2.0|
| PAYMENT|      1.0|
|   DEBIT|      4.0|
+--------+---------+



In [8]:
from pyspark.ml.feature import OneHotEncoder
encoder1 = OneHotEncoder(inputCol="typeindex",
                        outputCol="typeindexencoded")
encoded1 = encoder1.transform(indexed)
encoded1.show()

+----+--------+---------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+---------+----------------+
|step|    type|   amount|   nameOrig|oldbalanceOrg|newbalanceOrig|   nameDest|oldbalanceDest|newbalanceDest|isFraud|isFlaggedFraud|typeindex|typeindexencoded|
+----+--------+---------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+---------+----------------+
|   1| PAYMENT|  9839.64|C1231006815|     170136.0|     160296.36|M1979787155|           0.0|           0.0|      0|             0|      1.0|   (4,[1],[1.0])|
|   1| PAYMENT|  1864.28|C1666544295|      21249.0|      19384.72|M2044282225|           0.0|           0.0|      0|             0|      1.0|   (4,[1],[1.0])|
|   1|TRANSFER|    181.0|C1305486145|        181.0|           0.0| C553264065|           0.0|           0.0|      1|             0|      3.0|   (4,[3],[1.0])|
|   1|CASH_OUT|    181.0| C840083671|        1

In [9]:
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=["typeindexencoded","step","amount","oldbalanceOrg","newbalanceOrig","oldbalanceDest","newbalanceDest","isFlaggedFraud"],outputCol="features")
assembled = assembler.transform(encoded1)
assembled[['features']].show()

+--------------------+
|            features|
+--------------------+
|(11,[1,4,5,6,7],[...|
|(11,[1,4,5,6,7],[...|
|(11,[3,4,5,6],[1....|
|(11,[0,4,5,6,8],[...|
|(11,[1,4,5,6,7],[...|
|(11,[1,4,5,6,7],[...|
|(11,[1,4,5,6,7],[...|
|(11,[1,4,5,6,7],[...|
|(11,[1,4,5,6],[1....|
|(11,[4,5,6,7,8,9]...|
|(11,[4,5,6,8,9],[...|
|(11,[1,4,5,6,7],[...|
|(11,[1,4,5,6,7],[...|
|(11,[1,4,5,6],[1....|
|(11,[1,4,5,6,7],[...|
|(11,[0,4,5,6,8,9]...|
|(11,[1,4,5,6],[1....|
|(11,[1,4,5,6,7],[...|
|(11,[1,4,5,6,7],[...|
|(11,[3,4,5,6,8],[...|
+--------------------+
only showing top 20 rows



In [10]:
train_data , test_data = assembled.randomSplit([0.7,0.3])

In [11]:
lr = LogisticRegression(featuresCol='features',labelCol='isFraud')
model = lr.fit(train_data)

In [12]:
predict = model.transform(test_data)
predict[['type','isFraud','prediction','probability']].show()

+-------+-------+----------+-----------+
|   type|isFraud|prediction|probability|
+-------+-------+----------+-----------+
|CASH_IN|      0|       0.0|  [1.0,0.0]|
|CASH_IN|      0|       0.0|  [1.0,0.0]|
|CASH_IN|      0|       0.0|  [1.0,0.0]|
|CASH_IN|      0|       0.0|  [1.0,0.0]|
|CASH_IN|      0|       0.0|  [1.0,0.0]|
|CASH_IN|      0|       0.0|  [1.0,0.0]|
|CASH_IN|      0|       0.0|  [1.0,0.0]|
|CASH_IN|      0|       0.0|  [1.0,0.0]|
|CASH_IN|      0|       0.0|  [1.0,0.0]|
|CASH_IN|      0|       0.0|  [1.0,0.0]|
|CASH_IN|      0|       0.0|  [1.0,0.0]|
|CASH_IN|      0|       0.0|  [1.0,0.0]|
|CASH_IN|      0|       0.0|  [1.0,0.0]|
|CASH_IN|      0|       0.0|  [1.0,0.0]|
|CASH_IN|      0|       0.0|  [1.0,0.0]|
|CASH_IN|      0|       0.0|  [1.0,0.0]|
|CASH_IN|      0|       0.0|  [1.0,0.0]|
|CASH_IN|      0|       0.0|  [1.0,0.0]|
|CASH_IN|      0|       0.0|  [1.0,0.0]|
|CASH_IN|      0|       0.0|  [1.0,0.0]|
+-------+-------+----------+-----------+
only showing top

In [13]:
final = predict.select('type','isFraud','prediction').where("prediction = 1.0").show(100)

+--------+-------+----------+
|    type|isFraud|prediction|
+--------+-------+----------+
|TRANSFER|      1|       1.0|
|CASH_OUT|      1|       1.0|
|CASH_OUT|      1|       1.0|
|TRANSFER|      1|       1.0|
|TRANSFER|      1|       1.0|
|CASH_OUT|      1|       1.0|
|TRANSFER|      0|       1.0|
|CASH_OUT|      0|       1.0|
|TRANSFER|      0|       1.0|
|TRANSFER|      0|       1.0|
|TRANSFER|      0|       1.0|
|CASH_OUT|      0|       1.0|
|CASH_OUT|      0|       1.0|
|CASH_OUT|      0|       1.0|
|CASH_OUT|      0|       1.0|
|CASH_OUT|      0|       1.0|
|CASH_OUT|      0|       1.0|
|TRANSFER|      1|       1.0|
|TRANSFER|      0|       1.0|
|TRANSFER|      1|       1.0|
|TRANSFER|      1|       1.0|
|TRANSFER|      0|       1.0|
|CASH_OUT|      0|       1.0|
|CASH_OUT|      0|       1.0|
|CASH_OUT|      0|       1.0|
|TRANSFER|      1|       1.0|
|TRANSFER|      0|       1.0|
|TRANSFER|      0|       1.0|
|TRANSFER|      0|       1.0|
|TRANSFER|      0|       1.0|
|TRANSFER|

In [14]:
#predict.select('type','isFraud','prediction').write.save("hdfs://nameservice1/user/edureka_1118556/FraudDetection",format="csv")

AnalysisException: u'path hdfs://nameservice1/user/edureka_1118556/FraudDetection already exists.;'

In [15]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='isFraud')
AUC = my_eval.evaluate(predict)

In [16]:
AUC

0.7622510887884176

In [21]:
streaming = StreamingContext(spark.sparkContext,60)

In [22]:
model.save("hdfs://nameservice1/user/edureka_1118556/FraudDetectiontrainedmodel")