<a href="https://colab.research.google.com/github/sasansharifipour/Spark_Class/blob/main/NaiveBayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.0.1/spark-3.0.1-bin-hadoop2.7.tgz
!tar xf spark-3.0.1-bin-hadoop2.7.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.1-bin-hadoop2.7"

import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

In [3]:
from pyspark.ml.classification import LogisticRegression

In [8]:
training = spark.read.format("libsvm").load("spark-3.0.1-bin-hadoop2.7/data/mllib/sample_libsvm_data.txt")

In [9]:
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

In [10]:
lrModel = lr.fit(training)

In [12]:
print("Coefficients : " + str(lrModel.coefficients))
print("Intercept : " + str(lrModel.intercept))

Coefficients : (692,[244,263,272,300,301,328,350,351,378,379,405,406,407,428,433,434,455,456,461,462,483,484,489,490,496,511,512,517,539,540,568],[-7.353983524188197e-05,-9.102738505589466e-05,-0.00019467430546904298,-0.00020300642473486668,-3.1476183314863995e-05,-6.842977602660743e-05,1.5883626898239883e-05,1.4023497091372047e-05,0.00035432047524968605,0.00011443272898171087,0.00010016712383666666,0.0006014109303795481,0.0002840248179122762,-0.00011541084736508837,0.000385996886312906,0.000635019557424107,-0.00011506412384575676,-0.00015271865864986808,0.0002804933808994214,0.0006070117471191634,-0.0002008459663247437,-0.0001421075579290126,0.0002739010341160883,0.00027730456244968115,-9.838027027269332e-05,-0.0003808522443517704,-0.00025315198008555033,0.00027747714770754307,-0.0002443619763919199,-0.0015394744687597765,-0.00023073328411331293])
Intercept : 0.22456315961250325


In [13]:
mlr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, family="multinomial")

In [14]:
mlrModel = mlr.fit(training)

In [16]:
print("Multinomial Coefficients : " + str(mlrModel.coefficientMatrix))
print("MultinomialIntercept : " + str(mlrModel.interceptVector))

Multinomial Coefficients : 2 X 692 CSRMatrix
(0,244) 0.0
(0,263) 0.0001
(0,272) 0.0001
(0,300) 0.0001
(0,350) -0.0
(0,351) -0.0
(0,378) -0.0
(0,379) -0.0
(0,405) -0.0
(0,406) -0.0006
(0,407) -0.0001
(0,428) 0.0001
(0,433) -0.0
(0,434) -0.0007
(0,455) 0.0001
(0,456) 0.0001
..
..
MultinomialIntercept : [-0.12065879445860686,0.12065879445860686]


Naive Bayes

In [17]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [18]:
data = spark.read.format("libsvm").load("spark-3.0.1-bin-hadoop2.7/data/mllib/sample_libsvm_data.txt")

In [19]:
splits = data.randomSplit([0.6, 0.4], 1234)
train = splits[0]
test = splits[1]

In [20]:
train.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[100,101,102...|
|  0.0|(692,[121,122,123...|
|  0.0|(692,[122,123,148...|
|  0.0|(692,[123,124,125...|
|  0.0|(692,[123,124,125...|
|  0.0|(692,[123,124,125...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[126,127,128...|
|  0.0|(692,[126,127,128...|
|  0.0|(692,[126,127,128...|
|  0.0|(692,[126,127,128...|
|  0.0|(692,[126,127,128...|
|  0.0|(692,[126,127,128...|
|  0.0|(692,[126,127,128...|
|  0.0|(692,[129,130,131...|
|  0.0|(692,[151,152,153...|
|  0.0|(692,[153,154,155...|
+-----+--------------------+
only showing top 20 rows



In [21]:
test.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[95,96,97,12...|
|  0.0|(692,[98,99,100,1...|
|  0.0|(692,[122,123,124...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[125,126,127...|
|  0.0|(692,[126,127,128...|
|  0.0|(692,[127,128,129...|
|  0.0|(692,[127,128,129...|
|  0.0|(692,[127,128,129...|
|  0.0|(692,[128,129,130...|
|  0.0|(692,[150,151,152...|
|  0.0|(692,[152,153,154...|
|  0.0|(692,[152,153,154...|
|  0.0|(692,[152,153,154...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[181,182,183...|
|  1.0|(692,[100,101,102...|
+-----+--------------------+
only showing top 20 rows



In [22]:
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

In [23]:
model = nb.fit(training)

In [24]:
predictions = model.transform(test)
predictions.show()

+-----+--------------------+--------------------+-----------+----------+
|label|            features|       rawPrediction|probability|prediction|
+-----+--------------------+--------------------+-----------+----------+
|  0.0|(692,[95,96,97,12...|[-170882.07872604...|  [1.0,0.0]|       0.0|
|  0.0|(692,[98,99,100,1...|[-174137.32798149...|  [1.0,0.0]|       0.0|
|  0.0|(692,[122,123,124...|[-189356.05000199...|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|[-268952.75343475...|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|[-182687.22825681...|  [1.0,0.0]|       0.0|
|  0.0|(692,[125,126,127...|[-256997.16388524...|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|[-209239.48362082...|  [1.0,0.0]|       0.0|
|  0.0|(692,[127,128,129...|[-169859.06591059...|  [1.0,0.0]|       0.0|
|  0.0|(692,[127,128,129...|[-211583.90465150...|  [1.0,0.0]|       0.0|
|  0.0|(692,[127,128,129...|[-182558.98726303...|  [1.0,0.0]|       0.0|
|  0.0|(692,[128,129,130...|[-245007.81758450...|  

In [25]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")

In [26]:
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

Test set accuracy = 1.0


In [27]:
spark.stop()