In [1]:
# AUDIT_RISK DATA SET MACHINE LEARNING APPLICATION

# In this notebook i performed Extract-Transform-Load and Exploratory Data Analysis on a real-world dataset, and then applying several different machine learning algorithms to solve a supervised regression problem on the dataset.

# This notebook covers:

# Part 1:  Understanding
# Part 2: Load Your Data
# Part 3: Explore Your Data
# Part 4: Visualize Your Data
# Part 5: Data Preparation
# Part 6: Data Modeling
# Part 7: Tuning and Evaluation
  
# Our goal is to accurately predict audit_risk output.

In [2]:
# Now that we understand what we are trying to do, the first step is to load our data into a format we can query and use. This is known as ETL or "Extract-Transform-Load". 

spark.read.text("/FileStore/tables/audit_risk.csv").show()

In [3]:
# importing required libraries

{
  "cluster_id": "analytics",
  "libraries": [
    {
      "jar": "dbfs:/mnt/libraries/library.jar"
    },
    {
      "egg": "dbfs:/mnt/libraries/library.egg"
    },
    {
      "whl": "dbfs:/mnt/libraries/mlflow-0.0.1.dev0-py2-none-any.whl"
    },
    {
      "whl": "dbfs:/mnt/libraries/wheel-libraries.wheelhouse.zip"
    },
    {
      "maven": {
        "coordinates": "org.jsoup:jsoup:1.7.2",
        "exclusions": ["slf4j:slf4j"]
      }
    },
    {
      "pypi": {
        "package": "simplejson",
        "repo": "http://my-pypi-mirror.com"
      }
    },
    {
      "cran": {
        "package": "ada",
        "repo": "http://cran.us.r-project.org"
      }
    }
  ]
}

In [4]:
# Displaying all files in dbfs file system

display(dbutils.fs.ls("/FileStore/tables"))


path,name,size
dbfs:/FileStore/tables/13b_Linear_Regression_exercises-0dba5.ipynb,13b_Linear_Regression_exercises-0dba5.ipynb,14670
dbfs:/FileStore/tables/Telco_Customer_Churn-19d28.csv,Telco_Customer_Churn-19d28.csv,977501
dbfs:/FileStore/tables/audit_risk.csv,audit_risk.csv,81197
dbfs:/FileStore/tables/data.csv,data.csv,9140113
dbfs:/FileStore/tables/diabete.csv,diabete.csv,23875
dbfs:/FileStore/tables/diabetes.csv,diabetes.csv,23873
dbfs:/FileStore/tables/heart.csv,heart.csv,11328
dbfs:/FileStore/tables/test.csv,test.csv,59054007
dbfs:/FileStore/tables/train.csv,train.csv,59362806


In [5]:
# Loading the data file into dataframe

dataDF = (sqlContext.read    
    .format("com.databricks.spark.csv") # use spark.csv package
    .option("header", "true") # Use first line of all files as header
    .option("inferSchema", "true") # Automatically infer data types
    .option("delimiter", ',') # Specify the delimiter as Tab or '\t'
    .load("/FileStore/tables/audit_risk.csv")) 



In [6]:
display(dbutils.fs)

In [7]:
# displaying data frame 

display(dataDF)

Sector_score,LOCATION_ID,PARA_A,Score_A,Risk_A,PARA_B,Score_B6,Risk_B,TOTAL,numbers,Score_B10,Risk_C,Money_Value,Score_MV,Risk_D,District_Loss,PROB16,RiSk_E,History,Prob19,Risk_F,Score,Inherent_Risk,CONTROL_RISK,Detection_Risk,Audit_Risk,Risk
3.89,23,4.18,0.6,2.508,2.5,0.2,0.5,6.68,5.0,0.2,1.0,3.38,0.2,0.676,2,0.2,0.4,0,0.2,0.0,2.4,8.574,0.4,0.5,1.7148,1
3.89,6,0.0,0.2,0.0,4.83,0.2,0.966,4.83,5.0,0.2,1.0,0.94,0.2,0.188,2,0.2,0.4,0,0.2,0.0,2.0,2.554,0.4,0.5,0.5108,0
3.89,6,0.51,0.2,0.102,0.23,0.2,0.046,0.74,5.0,0.2,1.0,0.0,0.2,0.0,2,0.2,0.4,0,0.2,0.0,2.0,1.548,0.4,0.5,0.3096,0
3.89,6,0.0,0.2,0.0,10.8,0.6,6.48,10.8,6.0,0.6,3.6,11.75,0.6,7.05,2,0.2,0.4,0,0.2,0.0,4.4,17.53,0.4,0.5,3.506,1
3.89,6,0.0,0.2,0.0,0.08,0.2,0.016,0.08,5.0,0.2,1.0,0.0,0.2,0.0,2,0.2,0.4,0,0.2,0.0,2.0,1.416,0.4,0.5,0.2832,0
3.89,6,0.0,0.2,0.0,0.83,0.2,0.166,0.83,5.0,0.2,1.0,2.95,0.2,0.59,2,0.2,0.4,0,0.2,0.0,2.0,2.156,0.4,0.5,0.4312,0
3.89,7,1.1,0.4,0.44,7.41,0.4,2.964,8.51,5.0,0.2,1.0,44.95,0.6,26.97,2,0.2,0.4,0,0.2,0.0,3.2,31.774,0.4,0.5,6.3548,1
3.89,8,8.5,0.6,5.1,12.03,0.6,7.218,20.53,5.5,0.4,2.2,7.79,0.4,3.116,2,0.2,0.4,0,0.2,0.0,4.2,18.034,0.4,0.5,3.6068,1
3.89,8,8.4,0.6,5.04,11.05,0.6,6.63,19.45,5.5,0.4,2.2,7.34,0.4,2.936,2,0.2,0.4,0,0.2,0.0,4.2,17.206,0.4,0.5,3.4412,1
3.89,8,3.98,0.6,2.388,0.99,0.2,0.198,4.97,5.0,0.2,1.0,1.93,0.2,0.386,2,0.2,0.4,0,0.2,0.0,2.4,4.372,0.4,0.5,0.8744,0


In [8]:
# printing data frame data types

print (dataDF.dtypes)



In [9]:
display(dataDF)

Sector_score,LOCATION_ID,PARA_A,Score_A,Risk_A,PARA_B,Score_B6,Risk_B,TOTAL,numbers,Score_B10,Risk_C,Money_Value,Score_MV,Risk_D,District_Loss,PROB16,RiSk_E,History,Prob19,Risk_F,Score,Inherent_Risk,CONTROL_RISK,Detection_Risk,Audit_Risk,Risk
3.89,23,4.18,0.6,2.508,2.5,0.2,0.5,6.68,5.0,0.2,1.0,3.38,0.2,0.676,2,0.2,0.4,0,0.2,0.0,2.4,8.574,0.4,0.5,1.7148,1
3.89,6,0.0,0.2,0.0,4.83,0.2,0.966,4.83,5.0,0.2,1.0,0.94,0.2,0.188,2,0.2,0.4,0,0.2,0.0,2.0,2.554,0.4,0.5,0.5108,0
3.89,6,0.51,0.2,0.102,0.23,0.2,0.046,0.74,5.0,0.2,1.0,0.0,0.2,0.0,2,0.2,0.4,0,0.2,0.0,2.0,1.548,0.4,0.5,0.3096,0
3.89,6,0.0,0.2,0.0,10.8,0.6,6.48,10.8,6.0,0.6,3.6,11.75,0.6,7.05,2,0.2,0.4,0,0.2,0.0,4.4,17.53,0.4,0.5,3.506,1
3.89,6,0.0,0.2,0.0,0.08,0.2,0.016,0.08,5.0,0.2,1.0,0.0,0.2,0.0,2,0.2,0.4,0,0.2,0.0,2.0,1.416,0.4,0.5,0.2832,0
3.89,6,0.0,0.2,0.0,0.83,0.2,0.166,0.83,5.0,0.2,1.0,2.95,0.2,0.59,2,0.2,0.4,0,0.2,0.0,2.0,2.156,0.4,0.5,0.4312,0
3.89,7,1.1,0.4,0.44,7.41,0.4,2.964,8.51,5.0,0.2,1.0,44.95,0.6,26.97,2,0.2,0.4,0,0.2,0.0,3.2,31.774,0.4,0.5,6.3548,1
3.89,8,8.5,0.6,5.1,12.03,0.6,7.218,20.53,5.5,0.4,2.2,7.79,0.4,3.116,2,0.2,0.4,0,0.2,0.0,4.2,18.034,0.4,0.5,3.6068,1
3.89,8,8.4,0.6,5.04,11.05,0.6,6.63,19.45,5.5,0.4,2.2,7.34,0.4,2.936,2,0.2,0.4,0,0.2,0.0,4.2,17.206,0.4,0.5,3.4412,1
3.89,8,3.98,0.6,2.388,0.99,0.2,0.198,4.97,5.0,0.2,1.0,1.93,0.2,0.386,2,0.2,0.4,0,0.2,0.0,2.4,4.372,0.4,0.5,0.8744,0


In [10]:
# Here,we are registering dataframe as table into dbfs

sqlContext.sql("DROP TABLE IF EXISTS audit_risk")
dbutils.fs.rm("dbfs:/FileStore/tables/audit_risk", True)
sqlContext.registerDataFrameAsTable(dataDF, "audit_risk")

In [11]:
# query to display all columns of a table

%sql
-- We can use %sql to query the rows
SELECT * FROM audit_risk

Sector_score,LOCATION_ID,PARA_A,Score_A,Risk_A,PARA_B,Score_B6,Risk_B,TOTAL,numbers,Score_B10,Risk_C,Money_Value,Score_MV,Risk_D,District_Loss,PROB16,RiSk_E,History,Prob19,Risk_F,Score,Inherent_Risk,CONTROL_RISK,Detection_Risk,Audit_Risk,Risk
3.89,23,4.18,0.6,2.508,2.5,0.2,0.5,6.68,5.0,0.2,1.0,3.38,0.2,0.676,2,0.2,0.4,0,0.2,0.0,2.4,8.574,0.4,0.5,1.7148,1
3.89,6,0.0,0.2,0.0,4.83,0.2,0.966,4.83,5.0,0.2,1.0,0.94,0.2,0.188,2,0.2,0.4,0,0.2,0.0,2.0,2.554,0.4,0.5,0.5108,0
3.89,6,0.51,0.2,0.102,0.23,0.2,0.046,0.74,5.0,0.2,1.0,0.0,0.2,0.0,2,0.2,0.4,0,0.2,0.0,2.0,1.548,0.4,0.5,0.3096,0
3.89,6,0.0,0.2,0.0,10.8,0.6,6.48,10.8,6.0,0.6,3.6,11.75,0.6,7.05,2,0.2,0.4,0,0.2,0.0,4.4,17.53,0.4,0.5,3.506,1
3.89,6,0.0,0.2,0.0,0.08,0.2,0.016,0.08,5.0,0.2,1.0,0.0,0.2,0.0,2,0.2,0.4,0,0.2,0.0,2.0,1.416,0.4,0.5,0.2832,0
3.89,6,0.0,0.2,0.0,0.83,0.2,0.166,0.83,5.0,0.2,1.0,2.95,0.2,0.59,2,0.2,0.4,0,0.2,0.0,2.0,2.156,0.4,0.5,0.4312,0
3.89,7,1.1,0.4,0.44,7.41,0.4,2.964,8.51,5.0,0.2,1.0,44.95,0.6,26.97,2,0.2,0.4,0,0.2,0.0,3.2,31.774,0.4,0.5,6.3548,1
3.89,8,8.5,0.6,5.1,12.03,0.6,7.218,20.53,5.5,0.4,2.2,7.79,0.4,3.116,2,0.2,0.4,0,0.2,0.0,4.2,18.034,0.4,0.5,3.6068,1
3.89,8,8.4,0.6,5.04,11.05,0.6,6.63,19.45,5.5,0.4,2.2,7.34,0.4,2.936,2,0.2,0.4,0,0.2,0.0,4.2,17.206,0.4,0.5,3.4412,1
3.89,8,3.98,0.6,2.388,0.99,0.2,0.198,4.97,5.0,0.2,1.0,1.93,0.2,0.386,2,0.2,0.4,0,0.2,0.0,2.4,4.372,0.4,0.5,0.8744,0


In [12]:
# This command displays all the columns data types
%sql
desc audit_risk

col_name,data_type,comment
Sector_score,double,
LOCATION_ID,string,
PARA_A,double,
Score_A,double,
Risk_A,double,
PARA_B,double,
Score_B6,double,
Risk_B,double,
TOTAL,double,
numbers,double,


In [13]:
# This displays all the statistics of each column in a table

df = sqlContext.table("audit_risk")
display(df.describe())

summary,Sector_score,LOCATION_ID,PARA_A,Score_A,Risk_A,PARA_B,Score_B6,Risk_B,TOTAL,numbers,Score_B10,Risk_C,Money_Value,Score_MV,Risk_D,District_Loss,PROB16,RiSk_E,History,Prob19,Risk_F,Score,Inherent_Risk,CONTROL_RISK,Detection_Risk,Audit_Risk,Risk
count,776.0,776,776.0,776.0,776.0,776.0,776.0,776.0,776.0,776.0,776.0,776.0,775.0,776.0,776.0,776.0,776.0,776.0,776.0,776.0,776.0,776.0,776.0,776.0,776.0,776.0,776.0
mean,20.184536082474136,14.856403622250971,2.450194072164947,0.3512886597938091,1.351028505154639,10.799988402061862,0.3131443298969018,6.334007989690723,13.218481443298984,5.067654639175258,0.2237113402061835,1.152963917525773,14.13763096774195,0.2909793814432933,8.265434020618548,2.5051546391752577,0.2061855670103085,0.5190721649484478,0.104381443298969,0.2167525773195862,0.0536082474226803,2.7025773195876344,17.680612268041227,0.5726804123711269,0.5,7.1681583402061895,0.393041237113402
stddev,24.319017128722187,9.891317488621631,5.678870370534314,0.1740549091715534,3.440446577753995,50.0836236086675,0.1698041674136246,30.072845015575087,51.31282925864575,0.2644486203973533,0.0803517375624093,0.5374169799227397,66.60651928510502,0.15974518285474,39.9708490031252,1.2286784830725832,0.0375080082348103,0.2903118055865447,0.5310307725641172,0.0679868724979942,0.3058354963833028,0.8589226900963522,54.74024380470148,0.4445814628240786,0.0,38.667493911584586,0.4887408742557493
min,1.85,1,0.0,0.2,0.0,0.0,0.2,0.0,0.0,5.0,0.2,1.0,0.0,0.2,0.0,2.0,0.2,0.4,0.0,0.2,0.0,2.0,1.4,0.4,0.5,0.28,0.0
max,59.85,SAFIDON,85.0,0.6,51.0,1264.63,0.6,758.778,1268.91,9.0,0.6,5.4,935.03,0.6,561.018,6.0,0.6,2.4,9.0,0.6,5.4,5.2,801.262,5.8,0.5,961.5144,1.0


In [14]:
display(df)

Sector_score,LOCATION_ID,PARA_A,Score_A,Risk_A,PARA_B,Score_B6,Risk_B,TOTAL,numbers,Score_B10,Risk_C,Money_Value,Score_MV,Risk_D,District_Loss,PROB16,RiSk_E,History,Prob19,Risk_F,Score,Inherent_Risk,CONTROL_RISK,Detection_Risk,Audit_Risk,Risk
3.89,23,4.18,0.6,2.508,2.5,0.2,0.5,6.68,5.0,0.2,1.0,3.38,0.2,0.676,2,0.2,0.4,0,0.2,0.0,2.4,8.574,0.4,0.5,1.7148,1
3.89,6,0.0,0.2,0.0,4.83,0.2,0.966,4.83,5.0,0.2,1.0,0.94,0.2,0.188,2,0.2,0.4,0,0.2,0.0,2.0,2.554,0.4,0.5,0.5108,0
3.89,6,0.51,0.2,0.102,0.23,0.2,0.046,0.74,5.0,0.2,1.0,0.0,0.2,0.0,2,0.2,0.4,0,0.2,0.0,2.0,1.548,0.4,0.5,0.3096,0
3.89,6,0.0,0.2,0.0,10.8,0.6,6.48,10.8,6.0,0.6,3.6,11.75,0.6,7.05,2,0.2,0.4,0,0.2,0.0,4.4,17.53,0.4,0.5,3.506,1
3.89,6,0.0,0.2,0.0,0.08,0.2,0.016,0.08,5.0,0.2,1.0,0.0,0.2,0.0,2,0.2,0.4,0,0.2,0.0,2.0,1.416,0.4,0.5,0.2832,0
3.89,6,0.0,0.2,0.0,0.83,0.2,0.166,0.83,5.0,0.2,1.0,2.95,0.2,0.59,2,0.2,0.4,0,0.2,0.0,2.0,2.156,0.4,0.5,0.4312,0
3.89,7,1.1,0.4,0.44,7.41,0.4,2.964,8.51,5.0,0.2,1.0,44.95,0.6,26.97,2,0.2,0.4,0,0.2,0.0,3.2,31.774,0.4,0.5,6.3548,1
3.89,8,8.5,0.6,5.1,12.03,0.6,7.218,20.53,5.5,0.4,2.2,7.79,0.4,3.116,2,0.2,0.4,0,0.2,0.0,4.2,18.034,0.4,0.5,3.6068,1
3.89,8,8.4,0.6,5.04,11.05,0.6,6.63,19.45,5.5,0.4,2.2,7.34,0.4,2.936,2,0.2,0.4,0,0.2,0.0,4.2,17.206,0.4,0.5,3.4412,1
3.89,8,3.98,0.6,2.388,0.99,0.2,0.198,4.97,5.0,0.2,1.0,1.93,0.2,0.386,2,0.2,0.4,0,0.2,0.0,2.4,4.372,0.4,0.5,0.8744,0


In [15]:
#Displaying one column which is total from table
%sql
-- TO DO: Replace <FILL_IN> with the appropriate SQL command.
select TOTAL as tt from audit_risk

tt
6.68
4.83
0.74
10.8
0.08
0.83
8.51
20.53
19.45
4.97


In [16]:
#creating the vector assembler and slecting required fileds as input columns and we create new output column features

# TODO: Replace <FILL_IN> with the appropriate code
from pyspark.ml.feature import VectorAssembler

#datasetDF = <FILL_IN>
datasetDF = sqlContext.table('audit_risk')

vectorizer = VectorAssembler()
#vectorizer = VectorAssembler(inputCols=[], outputCol="features")

#vectorizer.setInputCols(<FILL_IN>)
vectorizer.setInputCols(["Audit_Risk" , "Risk_A","Risk_B","Risk_C"])
#vectorizer.setOutputCol(<FILL_IN>)
vectorizer.setOutputCol("features")
#vectorizer = VectorAssembler(inputCols=[], outputCol="features")

In [17]:
#Now, we split our dataset as training and test sets with 80% and 20% for application of machine learning methods

# TODO: Replace <FILL_IN> with the appropriate code.
# We'll hold out 20% of our data for testing and leave 80% for training
seed = 42
#(split20DF, split80DF) = datasetDF.<FILL_IN>
(split20DF, split80DF) = datasetDF.randomSplit([0.2, 0.8], seed)

# Let's cache these datasets for performance
#testSetDF = <FILL_IN>
testSetDF = split20DF.cache()

#trainingSetDF = <FILL_IN>
trainingSetDF = split80DF.cache()

print(trainingSetDF)

In [18]:
# importing required libraries for linear regression

from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import LinearRegressionModel
from pyspark.ml import Pipeline

# Let's initialize our linear regression learner
lr = LinearRegression()

# We use explain params to dump the parameters we can use
print(lr.explainParams())

In [19]:
#we select the required column for prediction
lr.setPredictionCol("Predicted_auditRisk")\
  .setLabelCol("Audit_Risk")\
  .setMaxIter(100)\
  .setRegParam(0.1)


# We will use the new spark.ml pipeline API. If you have worked with scikit-learn this will be very familiar.
lrPipeline = Pipeline()

lrPipeline.setStages([vectorizer, lr])

# Let's first train on the entire dataset to see what we get
lrModel = lrPipeline.fit(trainingSetDF)

In [20]:
# The intercept is as follows:
intercept = lrModel.stages[1].intercept

# The coefficents (i.e., weights) are as follows:
weights = lrModel.stages[1].coefficients

# Create a list of the column names (without PE)
featuresNoLabel = [col for col in datasetDF.columns if col != "Audit_Risk"]

# Merge the weights and labels
coefficents = zip(weights, featuresNoLabel)

# Now let's sort the coefficients from greatest absolute weight most to the least absolute weight


equation = "y = {intercept}".format(intercept=intercept)
variables = []
for x in coefficents:
    weight = abs(x[0])
    name = x[1]
    symbol = "+" if (x[0] > 0) else "-"
    equation += (" {} ({} * {})".format(symbol, weight, name))

# Finally here is our equation
print("Linear Regression Equation: " + equation)

In [21]:
#applying test set on the required columns

predictionsAndLabelsDF = lrModel.transform(testSetDF).select("Risk_A", "Audit_Risk", "Risk_B","Risk_c", "Predicted_auditRisk")

display(predictionsAndLabelsDF)

Risk_A,Audit_Risk,Risk_B,Risk_c,Predicted_auditRisk
0.05,0.4636,0.72,1.0,0.4453971531037886
2.076,0.7012,0.018,1.0,0.6964139093326311
0.432,0.6816,0.792,1.0,0.666713521941456
1.32,0.5552,0.056,1.0,0.542808191447676
0.0,0.28,0.0,1.0,0.2527634564636524
0.0,0.28,0.0,1.0,0.2527634564636524
0.176,0.4428,0.6,1.0,0.4247557408457872
4.32,1.1616,0.086,1.0,1.1817405034981303
2.424,2.8692,0.0,1.0,2.840356242501308
3.54,8.5056,0.0,1.0,8.417730423027514


In [22]:
# Now let's compute an evaluation metric for our test dataset
from pyspark.ml.evaluation import RegressionEvaluator

# Create an RMSE evaluator using the label and predicted columns
regEval = RegressionEvaluator(predictionCol="Predicted_auditRisk", labelCol="Audit_Risk", metricName="rmse")

# Run the evaluator on the DataFrame
rmse = regEval.evaluate(predictionsAndLabelsDF)

print("Root Mean Squared Error: %.2f" % rmse)


In [23]:
#we perform regression evaluation 

r2 = regEval.evaluate(predictionsAndLabelsDF, {regEval.metricName: "r2"})

print("r2: {0:.2f}".format(r2))

In [24]:
# First we remove the table if it already exists
sqlContext.sql("DROP TABLE IF EXISTS auditRisk_RMSE_Evaluation")
dbutils.fs.rm("/FileStore/tables/auditRisk_RMSE_Evaluation", True)

# Next we calculate the residual error and divide it by the RMSE
predictionsAndLabelsDF.selectExpr("Audit_Risk", "Predicted_auditRisk", "Audit_Risk - Predicted_auditRisk Residual_Error", "(Audit_Risk - Predicted_auditRisk) / {} Within_RSME".format(rmse)).registerTempTable("auditRisk_RMSE_Evaluation")

In [25]:
%sql
SELECT * from auditRisk_RMSE_Evaluation

Audit_Risk,Predicted_auditRisk,Residual_Error,Within_RSME
0.4636,0.4453971531037886,0.0182028468962114,0.0835446883811026
0.7012,0.6964139093326311,0.0047860906673689,0.0219664789606212
0.6816,0.666713521941456,0.0148864780585439,0.0683237175802441
0.5552,0.542808191447676,0.012391808552324,0.0568740587604274
0.28,0.2527634564636524,0.0272365435363475,0.1250061902567607
0.28,0.2527634564636524,0.0272365435363475,0.1250061902567607
0.4428,0.4247557408457872,0.0180442591542128,0.0828168262196571
1.1616,1.1817405034981303,-0.0201405034981305,-0.0924378531657063
2.8692,2.840356242501308,0.0288437574986919,0.1323827391236176
8.5056,8.417730423027514,0.0878695769724853,0.4032905659319417


In [26]:
%sql
-- Now we can display the RMSE as a Histogram
SELECT Within_RSME  from auditRisk_RMSE_Evaluation

Within_RSME
0.0835446883811026
0.0219664789606212
0.0683237175802441
0.0568740587604274
0.1250061902567607
0.1250061902567607
0.0828168262196571
-0.0924378531657063
0.1323827391236176
0.4032905659319417


In [27]:
%sql
SELECT case when Within_RSME <= 1.0 AND Within_RSME >= -1.0 then 1
            when  Within_RSME <= 2.0 AND Within_RSME >= -2.0 then 2 else 3
       end RSME_Multiple, COUNT(*) AS count
FROM auditRisk_RMSE_Evaluation
GROUP BY case when Within_RSME <= 1.0 AND Within_RSME >= -1.0 then 1  when  Within_RSME <= 2.0 AND Within_RSME >= -2.0 then 2 else 3 end


RSME_Multiple,count
1,138
3,8
2,11


In [28]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# We can reuse the RegressionEvaluator, regEval, to judge the model based on the best Root Mean Squared Error
# Let's create our CrossValidator with 3 fold cross validation
crossval = CrossValidator(estimator=lrPipeline, evaluator=regEval, numFolds=3)

# Let's tune over our regularization parameter from 0.01 to 0.10
regParam = [x / 100.0 for x in range(1, 11)]

# We'll create a paramter grid using the ParamGridBuilder, and add the grid to the CrossValidator
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, regParam)
             .build())
crossval.setEstimatorParamMaps(paramGrid)

# Now let's find and return the best model
cvModel = crossval.fit(trainingSetDF).bestModel

In [29]:
# TODO: Replace <FILL_IN> with the appropriate code.
# Now let's use cvModel to compute an evaluation metric for our test dataset: testSetDF
#predictionsAndLabelsDF = <FILL_IN>
predictionsAndLabelsDF = cvModel.transform(testSetDF).select("Risk_A", "Audit_Risk", "Risk_B", 
                                                             "Risk_c","Predicted_auditRisk")

# Run the previously created RMSE evaluator, regEval, on the predictionsAndLabelsDF DataFrame
#rmseNew = <FILL_IN>
rmseNew = regEval.evaluate(predictionsAndLabelsDF)

# Now let's compute the r2 evaluation metric for our test dataset
#r2New = <FILL_IN>
r2New = regEval.evaluate(predictionsAndLabelsDF, {regEval.metricName: "r2"})

print("Original Root Mean Squared Error: {0:2.2f}".format(rmse))
print("New Root Mean Squared Error: {0:2.2f}".format(rmseNew))
print("Old r2: {0:2.2f}".format(r2))
print("New r2: {0:2.2f}".format(r2New))

In [30]:
print("Regularization parameter of the best model: {0:.2f}".format(cvModel.stages[-1]._java_obj.parent().getRegParam()))

In [31]:
# TODO: Replace <FILL_IN> with the appropriate code.
from pyspark.ml.regression import DecisionTreeRegressor

# Create a DecisionTreeRegressor
#dt = <FILL_IN>
#dt = DecisionTreeRegressor(maxDepth=2)
dt = DecisionTreeRegressor()

dt.setLabelCol("Audit_Risk")\
  .setPredictionCol("Predicted_auditRisk")\
  .setFeaturesCol("features")\
  .setMaxBins(100)\
  

# Create a Pipeline
#dtPipeline = <FILL_IN>
dtPipeline = Pipeline()

# Set the stages of the Pipeline
#dtPipeline.<FILL_IN>
dtPipeline.setStages([vectorizer, dt])

In [32]:
# TODO: Replace <FILL_IN> with the appropriate code.
# Let's just reuse our CrossValidator with the new dtPipeline,  RegressionEvaluator regEval, and 3 fold cross validation
crossval.setEstimator(dtPipeline)\
        .setNumFolds(3)\
        .setEvaluator(regEval) 

# Let's tune over our dt.maxDepth parameter on the values 2 and 3, create a paramter grid using the ParamGridBuilder
#paramGrid = <FILL_IN>
paramGrid = (ParamGridBuilder().addGrid(dt.maxDepth, [2, 3, 4, 5]).build())

# Add the grid to the CrossValidator
#crossval.<FILL_IN>
crossval.setEstimatorParamMaps(paramGrid)

# Now let's find and return the best model
#dtModel = crossval.<FILL_IN>
dtModel = crossval.fit(trainingSetDF).bestModel

In [33]:
# TODO: Replace <FILL_IN> with the appropriate code.

# Now let's use dtModel to compute an evaluation metric for our test dataset: testSetDF
#predictionsAndLabelsDF = <FILL_IN>
predictionsAndLabelsDF = dtModel.transform(testSetDF).select("Risk_A", "Audit_Risk", "Risk_B",
                                                             "Risk_C","Predicted_auditRisk")
# Run the previously created RMSE evaluator, regEval, on the predictionsAndLabelsDF DataFrame
#rmseDT = <FILL_IN>
rmseDT = regEval.evaluate(predictionsAndLabelsDF)

# Now let's compute the r2 evaluation metric for our test dataset
#r2DT = <FILL_IN>
r2DT = regEval.evaluate(predictionsAndLabelsDF, {regEval.metricName: "r2"})


print("LR Root Mean Squared Error: {0:.2f}".format(rmseNew))
print("DT Root Mean Squared Error: {0:.2f}".format(rmseDT))
print("LR r2: {0:.2f}".format(r2New))
print("DT r2: {0:.2f}".format(r2DT))

In [34]:
print (dtModel.stages[-1]._java_obj.toDebugString())

In [35]:
from pyspark.ml.regression import RandomForestRegressor

# Create a RandomForestRegressor
#rf = <FILL_IN>
rf = RandomForestRegressor(numTrees = 30, maxDepth = 8, seed = 42)

rf.setLabelCol("Audit_Risk")\
  .setPredictionCol("Predicted_auditRisk")\
  .setFeaturesCol("features")\
  .setSeed(42)\
  .setMaxDepth(8)\
  .setNumTrees(30)

# Create a Pipeline
#rfPipeline = <FILL_IN>
rfPipeline = Pipeline()


# Set the stages of the Pipeline
#rfPipeline.<FILL_IN>
rfPipeline.setStages([vectorizer, rf])

In [36]:
# TODO: Replace <FILL_IN> with the appropriate code.
# Let's just reuse our CrossValidator with the new rfPipeline,  RegressionEvaluator regEval, and 3 fold cross validation
#crossval.setEstimator(rfPipeline)
crossval.setNumFolds(3)\
        .setEstimator(rfPipeline)\
        .setEvaluator(regEval)

# Let's tune over our rf.maxBins parameter on the values 50 and 100, create a parameter grid using the ParamGridBuilder
#paramGrid = <FILL_IN>
paramGrid = (ParamGridBuilder().addGrid(rf.maxBins, [50, 75, 100, 250]).build())

# Add the grid to the CrossValidator
#crossval.<FILL_IN>
crossval.setEstimatorParamMaps(paramGrid)

# Now let's find and return the best model
#rfModel = <FILL_IN>
rfModel = crossval.fit(trainingSetDF).bestModel

In [37]:
# TODO: Replace <FILL_IN> with the appropriate code.

# Now let's use rfModel to compute an evaluation metric for our test dataset: testSetDF
#predictionsAndLabelsDF = <FILL_IN>
predictionsAndLabelsDF = rfModel.transform(testSetDF)

# Run the previously created RMSE evaluator, regEval, on the predictionsAndLabelsDF DataFrame
#rmseRF = <FILL_IN>
rmseRF = regEval.evaluate(predictionsAndLabelsDF)

# Now let's compute the r2 evaluation metric for our test dataset
#r2RF = <FILL_IN>
r2RF = regEval.evaluate(predictionsAndLabelsDF, {regEval.metricName: "r2"})

print("LR Root Mean Squared Error: {0:.2f}".format(rmseNew))
print("DT Root Mean Squared Error: {0:.2f}".format(rmseDT))
print("RF Root Mean Squared Error: {0:.2f}".format(rmseRF))
print("LR r2: {0:.2f}".format(r2New))
print("DT r2: {0:.2f}".format(r2DT))
print("RF r2: {0:.2f}".format(r2RF))

In [38]:

display(predictionsAndLabelsDF)

Sector_score,LOCATION_ID,PARA_A,Score_A,Risk_A,PARA_B,Score_B6,Risk_B,TOTAL,numbers,Score_B10,Risk_C,Money_Value,Score_MV,Risk_D,District_Loss,PROB16,RiSk_E,History,Prob19,Risk_F,Score,Inherent_Risk,CONTROL_RISK,Detection_Risk,Audit_Risk,Risk,features,Predicted_auditRisk
1.85,13,0.25,0.2,0.05,1.8,0.4,0.72,2.05,5.0,0.2,1.0,0.74,0.2,0.148,2,0.2,0.4,0,0.2,0.0,2.6,2.318,0.4,0.5,0.4636,0,"List(1, 4, List(), List(0.4636, 0.05, 0.72, 1.0))",0.6353713348729897
1.85,13,3.46,0.6,2.076,0.09,0.2,0.018,3.55,5.0,0.2,1.0,0.06,0.2,0.012,2,0.2,0.4,0,0.2,0.0,2.4,3.506,0.4,0.5,0.7012,0,"List(1, 4, List(), List(0.7012, 2.076, 0.018, 1.0))",0.7632882392482895
1.85,14,1.08,0.4,0.432,1.98,0.4,0.792,3.06,5.0,0.2,1.0,3.92,0.2,0.784,2,0.2,0.4,0,0.2,0.0,2.8,3.408,0.4,0.5,0.6816,0,"List(1, 4, List(), List(0.6816, 0.432, 0.792, 1.0))",0.7253156247966659
1.85,14,2.2,0.6,1.32,0.28,0.2,0.056,2.48,5.0,0.2,1.0,0.0,0.2,0.0,2,0.2,0.4,0,0.2,0.0,2.4,2.776,0.4,0.5,0.5552,0,"List(1, 4, List(), List(0.5552, 1.32, 0.056, 1.0))",0.6049499057568755
1.85,16,0.0,0.2,0.0,0.0,0.2,0.0,0.0,5.0,0.2,1.0,0.0,0.2,0.0,2,0.2,0.4,0,0.2,0.0,2.0,1.4,0.4,0.5,0.28,0,"List(1, 4, List(), List(0.28, 0.0, 0.0, 1.0))",0.3606773468947132
1.85,16,0.0,0.2,0.0,0.0,0.2,0.0,0.0,5.0,0.2,1.0,0.0,0.2,0.0,2,0.2,0.4,0,0.2,0.0,2.0,1.4,0.4,0.5,0.28,0,"List(1, 4, List(), List(0.28, 0.0, 0.0, 1.0))",0.3606773468947132
1.85,16,0.88,0.2,0.176,1.5,0.4,0.6,2.38,5.0,0.2,1.0,0.19,0.2,0.038,2,0.2,0.4,0,0.2,0.0,2.6,2.214,0.4,0.5,0.4428,0,"List(1, 4, List(), List(0.4428, 0.176, 0.6, 1.0))",0.4703320252740693
1.85,18,7.2,0.6,4.32,0.43,0.2,0.086,7.63,5.0,0.2,1.0,0.01,0.2,0.002,2,0.2,0.4,0,0.2,0.0,2.4,5.808,0.4,0.5,1.1616,1,"List(1, 4, List(), List(1.1616, 4.32, 0.086, 1.0))",1.3230026369668515
1.85,19,4.04,0.6,2.424,0.0,0.2,0.0,4.04,5.0,0.2,1.0,0.79,0.2,0.158,6,0.2,1.2,0,0.2,0.0,2.8,4.782,1.2,0.5,2.8692,1,"List(1, 4, List(), List(2.8692, 2.424, 0.0, 1.0))",2.696456254118124
1.85,19,5.9,0.6,3.54,0.0,0.2,0.0,5.9,5.0,0.2,1.0,0.74,0.2,0.148,6,0.4,2.4,0,0.2,0.0,3.0,7.088,2.4,0.5,8.5056,1,"List(1, 4, List(), List(8.5056, 3.54, 0.0, 1.0))",7.733783410743042


In [39]:
print (rfModel.stages[-1]._java_obj.toDebugString())