# **Machine Learning Implentation:**

**Installing Spark Libraries**

In [None]:
pip install pyspark
pip install bloom-filter2

** Import required libraries for the project**

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, udf, concat_ws, concat, to_date, collect_list, translate, regexp_replace, when
from pyspark.sql.types import BooleanType, StringType
from bloom_filter2 import BloomFilter
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import NaiveBayes, LogisticRegression, RandomForestClassifier, DecisionTreeClassifier, LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
import matplotlib.pyplot as plt
import numpy as np

**Instantiate a Spark Session**

In [None]:
spark = SparkSession.builder.appName('SentimentAnalyzer').getOrCreate()

22/12/22 17:04:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/12/22 17:04:08 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/12/22 17:04:08 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


**Loading Reddit Data**

In [None]:
# Load data and rename column
df = spark.read.format("csv")\
    .option("header", "true")\
    .option("inferSchema", "true")\
    .option("multiLine", "true")\
    .option("delimiter", "¥")\
    .load("data/reddit-data-cleaned.csv")\
    .coalesce(5)

                                                                                

**Preprocessing data**

In [None]:
df = df.withColumn('comment', lower(col('comment')))

In [None]:
# filter to see if title column contains any keyword from keywords
keywords = ["SP500" , "S&P500"]
def my_filter(col):
    for keyword in keywords:
        if keyword.lower() in col.lower():
            return True
    return False

filterUDF = udf(my_filter, BooleanType())
ids = df.filter(col("title").isNotNull()).filter(filterUDF('title')).select("ID")

In [None]:
# create and populate bloom filter
bloomFilterIDS = BloomFilter(ids.count(), 0.000000001)
collected_ids = ids.collect()
for row in collected_ids:
    bloomFilterIDS.add(row["ID"])

                                                                                

In [None]:
broadcastFilterIds = spark.sparkContext.broadcast(bloomFilterIDS)

In [None]:
def my_filter_by_ids(col):
    return col in broadcastFilterIds.value
        
filterIdUDF = udf(my_filter_by_ids, BooleanType())
bloomedFilteredData = df.filter(col("SP500").isNotNull()).filter(filterIdUDF('ID'))

In [None]:
bloomedFilteredData = bloomedFilteredData.withColumn("date_stock",to_date("timestamp"))

In [None]:
bloomedFilteredData = bloomedFilteredData.na.drop(subset=["comment"])


In [None]:
bloomedFilteredData= bloomedFilteredData.drop("_c0","id","title", "timestamp", "time_key", "TESLA")


In [None]:
df1 = bloomedFilteredData.groupby('date_stock', 'SP500').agg(collect_list('comment').alias("comment"))


In [None]:
df2 = df1.withColumn("comment",
   concat_ws(",",col("comment")))



In [None]:
df2 = df2.withColumn('comment', translate('comment', '!"#$%&\'()*+,-./:;<=>?@[\\]^_{|}~', '" '))
df2 = df2.withColumn('comment', regexp_replace('comment', '"', ' '))
df2 = df2.withColumn('comment', regexp_replace('comment', "'", ' '))


In [None]:
df2.filter(df2.date_stock == "2022-05-04") \
    .show(truncate=False)

                                                                                

+----------+-------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|date_stock|SP500              |comment                                                                                                                                                                                                                                                                                           |
+----------+-------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|2022-05-04|0.02986242108440

In [None]:
df2= df2.withColumn("SP500", when(df2["SP500"]>0,1).otherwise(0))

In [None]:
df2= df2.withColumnRenamed("SP500","label")

**Spark ML pipeline setup**

In [None]:
stages = []

regexTokenizer = RegexTokenizer(inputCol="comment", outputCol="tokens", pattern="\\W+")
stages += [regexTokenizer]

swr = StopWordsRemover(inputCol="tokens", outputCol="Comments")
stages += [swr]

cv = CountVectorizer(inputCol="Comments", outputCol="token_features", minDF=2.0)#, vocabSize=3, minDF=2.0
stages += [cv]


vecAssembler = VectorAssembler(inputCols=['token_features'], outputCol="features")
stages += [vecAssembler]

[print('\n', stage) for stage in stages]




 RegexTokenizer_88344dd24e65

 StopWordsRemover_aed35d0842b3

 CountVectorizer_8fb9d450a06a

 VectorAssembler_de7bb12b9159


[None, None, None, None]

**Training and testing models**

##Pipeline Fitting:

In [None]:
pipeline = Pipeline(stages=stages)
data = pipeline.fit(df2).transform(df2)

                                                                                

In [None]:
train, test = data.randomSplit([0.7, 0.3])

## Naive Bayes Implementation

In [None]:
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
model = nb.fit(train)

                                                                                

In [None]:
predictions = model.transform(test)
# Select results to view
predictions.limit(20).select("label", "prediction", "probability").show(truncate=False)



+-----+----------+------------------------------------------+
|label|prediction|probability                               |
+-----+----------+------------------------------------------+
|1    |1.0       |[2.3070935099073077E-9,0.9999999976929066]|
|0    |1.0       |[0.41053334654497303,0.589466653455027]   |
|0    |0.0       |[0.9997232192953159,2.7678070468406627E-4]|
|0    |0.0       |[0.9999215716457134,7.842835428658029E-5] |
|0    |0.0       |[0.9999992813957496,7.186042504598525E-7] |
|0    |1.0       |[9.60944879558698E-10,0.9999999990390551] |
|0    |1.0       |[0.41053334654497303,0.589466653455027]   |
+-----+----------+------------------------------------------+



                                                                                

In [None]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
nbaccuracy = evaluator.evaluate(predictions)
print ("Test Area Under ROC: ", nbaccuracy)

                                                                                

Test Area Under ROC:  0.75


**Cross Validation Evaluation for Naive Bayes Model:**

In [None]:


# Create ParamGrid and Evaluator for Cross Validation
paramGrid = ParamGridBuilder().addGrid(nb.smoothing, [0.0, 0.2, 0.4, 0.6, 0.8, 1.0, 1.5, 2.0]).build()
cvEvaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
# Run Cross-validation
cv = CrossValidator(estimator=nb, estimatorParamMaps=paramGrid, evaluator=cvEvaluator)
cvModel = cv.fit(train)
# Make predictions on testData. cvModel uses the bestModel.
cvPredictions = cvModel.transform(test)
# Evaluate bestModel found from Cross Validation
evaluator.evaluate(cvPredictions)


22/12/22 17:20:05 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/12/22 17:20:05 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
                                                                                

0.8333333333333334

In [None]:

# Make predictions on testData. cvModel uses the bestModel.
cvPredictions = cvModel.transform(test)
# Evaluate bestModel found from Cross Validation
print ("Test Area Under ROC: ", evaluator.evaluate(cvPredictions))

[Stage 492:>                                                        (0 + 1) / 1]

Test Area Under ROC:  0.8333333333333334


                                                                                

## Logistic regression Model:

In [None]:
log_reg = LogisticRegression().setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)
model2 = log_reg.fit(train)
predictions = model2.transform(test)

evaluator = BinaryClassificationEvaluator().setLabelCol('label').setRawPredictionCol('prediction').setMetricName('areaUnderROC')
lgaccuracy = evaluator.evaluate(predictions)
print(lgaccuracy)


                                                                                

0.5


**Cross Validation Evaluation for logistic Rergression Model**

In [None]:
# Create ParamGrid and Evaluator for Cross Validation
paramGrid = ParamGridBuilder().addGrid(nb.smoothing, [0.0, 0.2, 0.4, 0.6, 0.8, 1.0, 1.5, 2.0]).build()
cvEvaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
# Run Cross-validation
cv = CrossValidator(estimator=log_reg, estimatorParamMaps=paramGrid, evaluator=cvEvaluator)
cvModel = cv.fit(train)
# Make predictions on testData. cvModel uses the bestModel.
cvPredictions = cvModel.transform(test)
# Evaluate bestModel found from Cross Validation
evaluator.evaluate(cvPredictions)

                                                                                

0.5

## Random Forest Classifier Model:

In [None]:
rf = RandomForestClassifier().setLabelCol('label').setFeaturesCol('features').setNumTrees(10)
model = rf.fit(train)
predictions = model.transform(test)

evaluator = BinaryClassificationEvaluator().setLabelCol('label').setRawPredictionCol('prediction').setMetricName("areaUnderROC")
rfaccuracy = evaluator.evaluate(predictions)
print(rfaccuracy)

22/12/22 17:25:03 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 22 (= number of training instances)
[Stage 1972:>                                                       (0 + 1) / 1]

0.5833333333333333


                                                                                

**Cross Validation Evaluation for Randon Forest Classifier Model**

In [None]:
# Create ParamGrid and Evaluator for Cross Validation
paramGrid = ParamGridBuilder().addGrid(nb.smoothing, [0.0, 0.2, 0.4, 0.6, 0.8, 1.0, 1.5, 2.0]).build()
cvEvaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
# Run Cross-validation
cv = CrossValidator(estimator=rf, estimatorParamMaps=paramGrid, evaluator=cvEvaluator)
cvModel = cv.fit(train)
# Make predictions on testData. cvModel uses the bestModel.
cvPredictions = cvModel.transform(test)
# Evaluate bestModel found from Cross Validation
evaluator.evaluate(cvPredictions)

22/12/22 17:25:25 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 13 (= number of training instances)
22/12/22 17:25:37 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 13 (= number of training instances)
22/12/22 17:25:42 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 13 (= number of training instances)
22/12/22 17:25:46 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 13 (= number of training instances)
22/12/22 17:25:51 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 13 (= number of training instances)
22/12/22 17:25:55 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 13 (= number of training instances)
22/12/22 17:25:59 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 13 (= number of training instances)
22/12/22 17:26:03 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 13 (= number of training instances)
22/12/22 17:26:15 WARN D

0.5833333333333333

## Decision Tree Classifier Model:

In [None]:
dt = DecisionTreeClassifier(featuresCol = 'features', labelCol = 'label', maxDepth = 3)
dtModel = dt.fit(train)
predictions = dtModel.transform(test)

evaluator = BinaryClassificationEvaluator().setRawPredictionCol('prediction')
#evaluator = BinaryClassificationEvaluator(labelCol="label", featuresCol="features", maxDepth=2)
dtAccuracy = evaluator.evaluate(predictions)
print(dtAccuracy) 

22/12/22 17:28:39 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 22 (= number of training instances)
                                                                                

1.0


**Cross Validation Evaluation for Decision Tree Clasifier**

In [None]:
# Create ParamGrid and Evaluator for Cross Validation
paramGrid = ParamGridBuilder().addGrid(nb.smoothing, [0.0, 0.2, 0.4, 0.6, 0.8, 1.0, 1.5, 2.0]).build()
cvEvaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
# Run Cross-validation
cv = CrossValidator(estimator=dt, estimatorParamMaps=paramGrid, evaluator=cvEvaluator)
cvModel = cv.fit(train)
# Make predictions on testData. cvModel uses the bestModel.
cvPredictions = cvModel.transform(test)
# Evaluate bestModel found from Cross Validation
evaluator.evaluate(cvPredictions)

22/12/22 17:29:00 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 13 (= number of training instances)
22/12/22 17:29:13 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 13 (= number of training instances)
22/12/22 17:29:18 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 13 (= number of training instances)
22/12/22 17:29:22 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 13 (= number of training instances)
22/12/22 17:29:26 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 13 (= number of training instances)
22/12/22 17:29:31 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 13 (= number of training instances)
22/12/22 17:29:37 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 13 (= number of training instances)
22/12/22 17:29:39 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 13 (= number of training instances)
22/12/22 17:29:50 WARN D

1.0

## Support Vector Classifier Model:

In [None]:
# Define your classifier
lsvc = LinearSVC(maxIter=10, regParam=0.1)

# Fit the model
lsvcModel = lsvc.fit(train)

# Compute predictions for test data
predictions = lsvcModel.transform(test)

# Define the evaluator method with the corresponding metric and compute the classification error on test data
evaluator = BinaryClassificationEvaluator().setRawPredictionCol('prediction')
svmaccuracy = evaluator.evaluate(predictions) 

# Show the accuracy
print("Test accuracy = %g" % (svmaccuracy))

[Stage 4252:>                                                       (0 + 1) / 1]

Test accuracy = 0.75


                                                                                

**Cross Validation Evaluation of Support Vector Classifier**

In [None]:
# Create ParamGrid and Evaluator for Cross Validation
paramGrid = ParamGridBuilder().addGrid(nb.smoothing, [0.0, 0.2, 0.4, 0.6, 0.8, 1.0, 1.5, 2.0]).build()
cvEvaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
# Run Cross-validation
cv = CrossValidator(estimator=lsvc, estimatorParamMaps=paramGrid, evaluator=cvEvaluator)
cvModel = cv.fit(train)
# Make predictions on testData. cvModel uses the bestModel.
cvPredictions = cvModel.transform(test)
# Evaluate bestModel found from Cross Validation
evaluator.evaluate(cvPredictions)

                                                                                

0.75