1. Reading the data from the Sqlite file

In [1]:
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.ml.classification.{ RandomForestClassifier, LogisticRegression}
var reader=spark.read
val schema = new StructType(Array(new StructField("score", StringType, true),new StructField("summary", StringType, true)))
reader.option("inferSchema",true).option("header",true).option("delimiter",",")
var data=reader.csv("./reviews_500datast.csv")
var reviews=data.select("score","summary").na.drop()
var original = reviews.na.drop()
original.show()

+-----+--------------------+
|score|             summary|
+-----+--------------------+
|    5|Good Quality Dog ...|
|    1|   Not as Advertised|
|    4|"""Delight"" says...|
|    2|      Cough Medicine|
|    5|         Great taffy|
|    4|          Nice Taffy|
|    5|Great!  Just as g...|
|    5|Wonderful, tasty ...|
|    5|          Yay Barley|
|    5|    Healthy Dog Food|
|    5|The Best Hot Sauc...|
|    5|"My cats LOVE thi...|
|    1|My Cats Are Not F...|
|    4|   fresh and greasy!|
|    5|Strawberry Twizzl...|
|    5|Lots of twizzlers...|
|    2|          poor taste|
|    5|            Love it!|
|    5|  GREAT SWEET CANDY!|
|    5|Home delivered tw...|
+-----+--------------------+
only showing top 20 rows



2. Encoding score to Positive or negative based on value of each sample

In [2]:
import org.apache.spark.sql.functions._
var scores = original.select("score")
val reviewsConverted = original.withColumn("score", when(col("score") >=3, "Positive").otherwise("Negative"))
reviewsConverted.show()
reviewsConverted.schema

+--------+--------------------+
|   score|             summary|
+--------+--------------------+
|Positive|Good Quality Dog ...|
|Negative|   Not as Advertised|
|Positive|"""Delight"" says...|
|Negative|      Cough Medicine|
|Positive|         Great taffy|
|Positive|          Nice Taffy|
|Positive|Great!  Just as g...|
|Positive|Wonderful, tasty ...|
|Positive|          Yay Barley|
|Positive|    Healthy Dog Food|
|Positive|The Best Hot Sauc...|
|Positive|"My cats LOVE thi...|
|Negative|My Cats Are Not F...|
|Positive|   fresh and greasy!|
|Positive|Strawberry Twizzl...|
|Positive|Lots of twizzlers...|
|Negative|          poor taste|
|Positive|            Love it!|
|Positive|  GREAT SWEET CANDY!|
|Positive|Home delivered tw...|
+--------+--------------------+
only showing top 20 rows



StructType(StructField(score,StringType,false), StructField(summary,StringType,true))

In [13]:
import org.apache.spark.ml.feature.{RegexTokenizer, Tokenizer}
val regexTokenizer = new RegexTokenizer().setInputCol("summary").setOutputCol("words").setPattern("\\w+").setGaps(false)
val countTokens = udf { (words: Seq[String]) => words.length }
val regexTokenized = regexTokenizer.transform(reviewsConverted)
regexTokenized.select("summary", "words").withColumn("tokens", countTokens(col("words"))).show(5)


+--------------------+--------------------+------+
|             summary|               words|tokens|
+--------------------+--------------------+------+
|Good Quality Dog ...|[good, quality, d...|     4|
|   Not as Advertised|[not, as, adverti...|     3|
|"""Delight"" says...|[delight, says, i...|     4|
|      Cough Medicine|   [cough, medicine]|     2|
|         Great taffy|      [great, taffy]|     2|
+--------------------+--------------------+------+
only showing top 5 rows



In [14]:
import org.apache.spark.ml.feature.StopWordsRemover
val remover = new StopWordsRemover().setInputCol("words").setOutputCol("filtered")
val filteredWords = remover.transform(regexTokenized)
filteredWords.show(5)

+--------+--------------------+--------------------+--------------------+
|   score|             summary|               words|            filtered|
+--------+--------------------+--------------------+--------------------+
|Positive|Good Quality Dog ...|[good, quality, d...|[good, quality, d...|
|Negative|   Not as Advertised|[not, as, adverti...|        [advertised]|
|Positive|"""Delight"" says...|[delight, says, i...|     [delight, says]|
|Negative|      Cough Medicine|   [cough, medicine]|   [cough, medicine]|
|Positive|         Great taffy|      [great, taffy]|      [great, taffy]|
+--------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [15]:
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}

//val cvModel: CountVectorizerModel = new CountVectorizer().setInputCol("words").setOutputCol("features").setVocabSize(3).setMinDF(2).fit(tokenized)
val cvModel: CountVectorizerModel = new CountVectorizer().setInputCol("filtered").setOutputCol("vectors").fit(filteredWords)

val vectorizedWords = cvModel.transform(filteredWords)
vectorizedWords.show(10)

+--------+--------------------+--------------------+--------------------+--------------------+
|   score|             summary|               words|            filtered|             vectors|
+--------+--------------------+--------------------+--------------------+--------------------+
|Positive|Good Quality Dog ...|[good, quality, d...|[good, quality, d...|(641,[1,5,11,46],...|
|Negative|   Not as Advertised|[not, as, adverti...|        [advertised]|   (641,[582],[1.0])|
|Positive|"""Delight"" says...|[delight, says, i...|     [delight, says]|(641,[348,410],[1...|
|Negative|      Cough Medicine|   [cough, medicine]|   [cough, medicine]|(641,[411,535],[1...|
|Positive|         Great taffy|      [great, taffy]|      [great, taffy]|(641,[0,93],[1.0,...|
|Positive|          Nice Taffy|       [nice, taffy]|       [nice, taffy]|(641,[34,93],[1.0...|
|Positive|Great!  Just as g...|[great, just, as,...|[great, good, exp...|(641,[0,1,65,179]...|
|Positive|Wonderful, tasty ...|[wonderful, tasty..

In [16]:
import org.apache.spark.ml.feature.PCA
import org.apache.spark.ml.linalg.Vectors

val pca = new PCA().setInputCol("vectors").setOutputCol("pcaFeatures").setK(300).fit(vectorizedWords)

val pcData = pca.transform(vectorizedWords)
pcData.show(false)

+--------+----------------------------------------------------------------+----------------------------------------------------------------------+-----------------------------------------------+---------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

|Positive|"My cats LOVE this ""diet"" food better than their regular food"|[my, cats, love, this, diet, food, better, than, their, regular, food]|[cats, love, diet, food, better, regular, food]|(641,[3,5,23,29,114,606],[1.0,2.0,1.0,1.0,1.0,1.0])|[-0.28945054738750253,-0.11873946086641048,-1.2179167494607146,0.8624709808081447,1.6861601589243076,0.2141406158385581,-0.23967380828401436,0.3847337918718802,0.06337324648798419,0.04200323261342457,-0.020203457867557933,0.22607502959847892,-0.01902955730991124,-0.08297528789691577,0.18794040900237846,0.566939398240652,0.7478841745874838,0.30688304570744523,0.1665204149825885,-0.4793911513898155,0.0937093364842057,0.5123994805236736,0.3348496733171733,0.14143332954453355,-0.03936855030065368,-0.04768933293587316,0.444281226916415,-0.13347246841942473,0.004481524933540452,-0.1790620865457547,0.06427838055635232,-0.11879994639536344,-0.08100931840804662,0.12116270287724265,0.04955198217167228,0.23505403077950207,-0.030342520536685055,-0.02736242

|Positive|Love it!                                                        |[love, it]                                                            |[love]                                         |(641,[3],[1.0])                                    |[0.005538681074257154,-0.17535469066469808,-0.792551146649803,0.4776798729661286,-0.1967232949803962,-0.1782561060976896,-0.09415035532288167,-0.021658212141355465,0.03353515160449616,-0.004212862839939227,0.025240855135139184,0.027719590918712497,0.021630248080164005,-0.032234866408234426,0.0033779243794753444,-0.03845498866857419,0.014351670131380279,-0.01847049316963567,0.019170319546860548,0.0041071163654670395,-0.01480312809225176,0.002660911197977598,-0.001815015320652244,-0.03860095493220228,0.025179383483693953,-0.023216602688502203,-0.01293221191728372,0.004788454962764026,-0.009745454090746642,0.015785288243749928,0.0013687615327194202,0.010140483866920798,-0.005517715798697823,-0.007914171665907337,0.0021007358970043175,0.00588514360

|Positive|GREAT SWEET CANDY!                                              |[great, sweet, candy]                                                 |[great, sweet, candy]                          |(641,[0,30,494],[1.0,1.0,1.0])                     |[-0.9522079380552179,-0.03204233896653603,0.1452468695425291,0.11967445675835166,-0.15602493175464238,0.027383946299913444,0.05494487312748556,-0.03766062025889535,0.005078315627868076,-0.02717891069872546,-0.03187960704681657,-0.05140075313708286,0.025124009361600537,-0.18826202745044185,-0.0404530431266253,-0.06021551235366354,0.031584025510232976,-0.29089437813866936,0.19264194170066282,0.08330315245887296,0.01924005857649653,0.4869186115073804,-0.18742576750794307,-0.1440128433735772,-0.2868132342805066,0.06372155847472785,-0.06008904306768879,-0.08792164487396764,-0.03166953050458277,-0.010743905281458717,2.93123809474289E-4,0.04228238502235374,-0.02602356103795296,0.09755984703308138,-0.012129328104611798,0.04495707175318347,-0.0709242948

In [17]:
import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer}
import org.apache.spark.ml.feature.RFormula
import org.apache.spark.ml.classification.LogisticRegression
//val indexer1 = new StringIndexer().setInputCol("summary").setOutputCol("summaryIndex").fit(reviewsConverted).transform(reviewsConverted)
//val indexer2 = new StringIndexer().setInputCol("score").setOutputCol("scoreIndex").fit(pcData).transform(pcData)
val indexer2 = new StringIndexer().setInputCol("score").setOutputCol("scoreIndex").fit(vectorizedWords).transform(vectorizedWords)
val formula = new RFormula().setFormula("scoreIndex ~ vectors") 
val fittedRF = formula.fit(indexer2)
val preparedDF = fittedRF.transform(indexer2)

In [18]:
val Array(train, test) = preparedDF.randomSplit(Array(0.8, 0.2)) 

In [19]:
val logisticRegression = new LogisticRegression()
val logisticRegressionModel = logisticRegression.fit(train)
logisticRegressionModel.evaluate(test)
val predictions = logisticRegressionModel.transform(test)

In [20]:
print(predictions.count)
predictions.show(10)

99+--------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+-----+--------------------+--------------------+----------+
|   score|             summary|               words|            filtered|             vectors|scoreIndex|            features|label|       rawPrediction|         probability|prediction|
+--------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+-----+--------------------+--------------------+----------+
|Negative|               AWFUL|             [awful]|             [awful]|    (641,[74],[1.0])|       1.0|    (641,[74],[1.0])|  1.0|[-45.581613650186...|[1.60013533444095...|       1.0|
|Negative|      Altoids Smalls|   [altoids, smalls]|   [altoids, smalls]|(641,[59,119],[1....|       1.0|(641,[59,119],[1....|  1.0|[29.1562968290404...|[0.99999999999978...|       0.0|
|Negative|Cat won't go near...|[cat, won, t, go,...|[cat, won, go, n

In [21]:
import org.apache.spark.ml.regression.DecisionTreeRegressionModel
import org.apache.spark.ml.regression.DecisionTreeRegressor

val dt = new DecisionTreeRegressor().setLabelCol("scoreIndex").setFeaturesCol("vectors")

val model = dt.fit(train)

val predictionsDecision = model.transform(test)

predictionsDecision.show

+--------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+-----+-------------------+
|   score|             summary|               words|            filtered|             vectors|scoreIndex|            features|label|         prediction|
+--------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+-----+-------------------+
|Negative|               AWFUL|             [awful]|             [awful]|    (641,[74],[1.0])|       1.0|    (641,[74],[1.0])|  1.0|                1.0|
|Negative|      Altoids Smalls|   [altoids, smalls]|   [altoids, smalls]|(641,[59,119],[1....|       1.0|(641,[59,119],[1....|  1.0|0.07969151670951156|
|Negative|Cat won't go near...|[cat, won, t, go,...|[cat, won, go, near]|(641,[40,97,117,3...|       1.0|(641,[40,97,117,3...|  1.0|0.07969151670951156|
|Negative|        Disappointed|      [disappointed]|      [disappointed]|    (641,

In [22]:
val labelPositives = predictions.where(expr("label == 1.0"))
val labelNegatives = predictions.where(expr("label == 0.0"))


val falseNegatives = labelPositives.where(expr("label != prediction")).count()
val falsePositives = labelNegatives.where(expr("label != prediction")).count()
val trueNegatives  = labelNegatives.where(expr("label == prediction")).count()
val truePositives  = labelPositives.where(expr("label == prediction")).count()

val falsePositivePercentage = (falsePositives.toFloat/(falsePositives + trueNegatives))
val falseNegativePercentage = (falseNegatives.toFloat/(falseNegatives + truePositives))

println(" False Positive Percentage = " + falsePositivePercentage)
println(" False Negative Percentage = " + falseNegativePercentage)

println(labelPositives.count())
println(labelNegatives.count())
println(labelNegatives.where(expr("prediction == 0.0")).count())
println(labelPositives.where(expr("prediction == 0.0")).count())
val n=(truePositives+trueNegatives)
val d=(falsePositives + trueNegatives+falseNegatives + truePositives)
println(n.toFloat/ d)
println((truePositives+trueNegatives).toFloat/(falsePositives + trueNegatives+falseNegatives + truePositives))
println(trueNegatives)
    println(truePositives)

 False Positive Percentage = 0.08139535
 False Negative Percentage = 0.6923077
13
86
79
9
0.83838385
0.83838385
79
4
