# Detection of Cyberbullying in Reddit Comments

## Running prerequisite code for connecting to Google Drive, setting up environment variables and initializing Spark

In [None]:
# Mount Google Drive so the input CSV files under /content/gdrive are readable.
from google.colab import drive
drive.mount('/content/gdrive')
# Install Java 8 and unpack a local Spark 3.2.0 distribution into the working directory.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz
!tar xf spark-3.2.0-bin-hadoop3.2.tgz
!pip install -q findspark  
import os
# Point findspark / PySpark at the Java and Spark installations set up above.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "spark-3.2.0-bin-hadoop3.2"
import findspark
findspark.init()
from pyspark.sql import SparkSession
# Local-mode session using all available cores; `sc` is the underlying SparkContext.
spark = SparkSession.builder.master("local[*]").getOrCreate()
sc = spark.sparkContext
sc

Mounted at /content/gdrive


Specifying the paths of the Reddit comments dataset and the abusive-word list

In [None]:
# Google Drive locations of the two input files.
path = "/content/gdrive/MyDrive/Set_50K.csv"  # Reddit comments (CSV with a header row)
words = "/content/gdrive/MyDrive/list.csv"  # abusive-word list, read line by line later

Storing the Reddit comments in a PySpark DataFrame and the abusive words in an RDD

In [None]:
# Read the comments CSV into a DataFrame, one reader option per line.
# NOTE(review): the schema printed below types every column (including `score`)
# as string, and the first displayed row has subreddit "*I am a bot"; this
# suggests quoted multi-line comment bodies are being split across rows.
# Consider the reader's multiLine / quote / escape options - confirm against the raw file.
df = (
    spark.read
    .option("header", "True")
    .option("inferSchema", "True")
    .option("delimiter", ",")
    .csv(path)
)

# Each line of the abusive-word file becomes one RDD element.
wordsRDD = sc.textFile(words)

In [None]:
# Print the column names, types and nullability of the loaded comments DataFrame.
df.printSchema()

root
 |-- subreddit: string (nullable = true)
 |-- body: string (nullable = true)
 |-- controversiality: string (nullable = true)
 |-- score: string (nullable = true)



Adding row numbers to the dataframe

In [None]:
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.types import LongType, StructField, StructType

# Remove incomplete rows BEFORE numbering so the ids contain no gaps.
# Dropping any existing "id" column first keeps this cell safe to re-run.
df = df.drop("id").na.drop()

# monotonically_increasing_id() only guarantees unique, increasing values: the
# ids are not consecutive (the partition id is encoded in the upper bits).
# Later cells key the processed RDDs with zipWithIndex() (consecutive, 0-based)
# and join on this column, so `id` must use exactly the same numbering.
# Otherwise each comment is paired with the tokens of a different comment.
df = spark.createDataFrame(
    df.rdd.zipWithIndex().map(lambda pair: tuple(pair[0]) + (pair[1],)),
    StructType(df.schema.fields + [StructField("id", LongType(), False)]),
)


In [None]:
# Preview the first five comments together with their row ids.
df.show(n=5)

+-----------+--------------------+----------------+-----+---+
|  subreddit|                body|controversiality|score| id|
+-----------+--------------------+----------------+-----+---+
|*I am a bot| and this action ...|               0|    1|  1|
|        aww|Dont squeeze her ...|               0|   19|  2|
|     gaming|It's pretty well ...|               0|    3|  3|
|       news|You know we have ...|               0|   10|  4|
|   politics|Yes, there is a d...|               0|    1|  5|
+-----------+--------------------+----------------+-----+---+
only showing top 5 rows



In [None]:
# Peek at the first five entries of the abusive-word list.
wordsRDD.take(num=5)

['69', '@55', '@ssfcker', '@ssfucker', '@ssfvcker']

In [None]:
# Bring the whole abusive-word list to the driver as a Python list.
# Note: this rebinds `words`, which held the file path until now.
words = wordsRDD.collect()

# Data Preprocessing
## I. Tokenization
1) Converting comments to lower case

In [None]:
from pyspark.sql.functions import lower, col

# Keep the subreddit and the raw body, add a lower-cased copy of the body,
# and discard any row that has a null in one of those columns.
df1 = (
    df.select("subreddit", "body", lower(col("body")).alias("lower_body"))
    .na.drop()
)
df1.show()

+--------------------+--------------------+--------------------+
|           subreddit|                body|          lower_body|
+--------------------+--------------------+--------------------+
|         *I am a bot| and this action ...| and this action ...|
|                 aww|Dont squeeze her ...|dont squeeze her ...|
|              gaming|It's pretty well ...|it's pretty well ...|
|                news|You know we have ...|you know we have ...|
|            politics|Yes, there is a d...|yes, there is a d...|
|           dankmemes|Please let this b...|please let this b...|
| relationship_advice|I would be less w...|i would be less w...|
|                 nba|REPORT: Water is ...|report: water is ...|
|           worldnews|How many millions...|how many millions...|
|                 aww|What an amazing t...|what an amazing t...|
|           AskReddit|Like a giant turd...|like a giant turd...|
|              gaming|Why would we want...|why would we want...|
|                 nba|*mi

In [None]:
import nltk
# Fetch the WordNet corpus; the lemmatization step below uses WordNetLemmatizer.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

2) Splitting the comments into a list of words

In [None]:
from pyspark.sql.functions import split

# Tokenise the lower-cased body on single spaces. Consecutive or leading
# spaces therefore produce empty-string tokens, as the first row of the output shows.
df1 = df1.select(
    "subreddit",
    "body",
    "lower_body",
    split("lower_body", " ").alias("split"),
).na.drop()
df1.show(n=5)

+-----------+--------------------+--------------------+--------------------+
|  subreddit|                body|          lower_body|               split|
+-----------+--------------------+--------------------+--------------------+
|*I am a bot| and this action ...| and this action ...|[, and, this, act...|
|        aww|Dont squeeze her ...|dont squeeze her ...|[dont, squeeze, h...|
|     gaming|It's pretty well ...|it's pretty well ...|[it's, pretty, we...|
|       news|You know we have ...|you know we have ...|[you, know, we, h...|
|   politics|Yes, there is a d...|yes, there is a d...|[yes,, there, is,...|
+-----------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [None]:
# One RDD element per comment: the list of tokens from the single "split" column.
body_none = df1.select('split').rdd.map(lambda row: row[0])
# Replace a missing token list with an empty string so later steps can iterate over it.
body = body_none.map(lambda tokens: '' if tokens is None else tokens)

## II. Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
# Lemmatize every token of every comment. lemmatize() is called without a
# part-of-speech tag; the sample output further down shows "wa" where the
# comment presumably contained "was".
body_lem = body.map(lambda tokens: list(map(lemmatizer.lemmatize, tokens)))


## III. Stop-Words Removal

In [None]:
import nltk
# Fetch NLTK's stop-word corpus, which the next cell reads via nltk.corpus.stopwords.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.corpus import stopwords

# English stop words as a set for constant-time membership checks.
stop = set(stopwords.words('english'))
# Keep only the lemmatized tokens that are not stop words.
body_sr = body_lem.map(
    lambda tokens: list(filter(lambda tok: tok not in stop, tokens))
)
body_sr.take(2)

[['',
  'action',
  'wa',
  'performed',
  'automatically.',
  'please',
  '[contact',
  'moderator',
  'subreddit](/message/compose/?to=/r/gameofthrones)',
  'question',
  'concerns.*"'],
 ['dont', 'squeeze', 'massive', 'hand,', 'mean', 'giant.']]

Converting RDDs to dataframe and merging them with the original dataframe

In [None]:
from pyspark.sql.types import *

# Pair every cleaned token list with its 0-based position (zipWithIndex) and
# expose the result as a DataFrame with columns (words, id_words).
body_sr_collect = spark.createDataFrame(
    body_sr.zipWithIndex(),
    schema=StructType()
    .add("words", ArrayType(StringType()))
    .add("id_words", LongType()),
)

In [None]:
# Preview the cleaned token lists and their positional ids.
body_sr_collect.show(n=5)

+--------------------+--------+
|               words|id_words|
+--------------------+--------+
|[, action, wa, pe...|       0|
|[dont, squeeze, m...|       1|
|[pretty, well, kn...|       2|
|[know, law, curre...|       3|
|[yes,, difference...|       4|
+--------------------+--------+
only showing top 5 rows



In [None]:
# Attach each comment's cleaned tokens by matching df.id to the positional id.
# NOTE(review): this is only correct if df.id uses the same consecutive, 0-based
# numbering as zipWithIndex(). The preview below pairs the first comment with
# the second comment's tokens - verify how `id` is generated.
xx = df.join(
    body_sr_collect,
    on=body_sr_collect.id_words == df.id,
    how="inner",
).drop("id")

In [None]:
# Preview the comments joined with their cleaned tokens.
xx.show(n=5)

+-----------+--------------------+----------------+-----+--------------------+--------+
|  subreddit|                body|controversiality|score|               words|id_words|
+-----------+--------------------+----------------+-----+--------------------+--------+
|*I am a bot| and this action ...|               0|    1|[dont, squeeze, m...|       1|
|        aww|Dont squeeze her ...|               0|   19|[pretty, well, kn...|       2|
|     gaming|It's pretty well ...|               0|    3|[know, law, curre...|       3|
|       news|You know we have ...|               0|   10|[yes,, difference...|       4|
|   politics|Yes, there is a d...|               0|    1|[please, let, bec...|       5|
+-----------+--------------------+----------------+-----+--------------------+--------+
only showing top 5 rows



Counting the number of abusive words in each comment and the total number of words remaining in each comment after stop-word removal

In [None]:
# Freeze the abusive-word lexicon into a set once. Membership tests inside the
# per-token filter are then O(1) instead of scanning the whole collected list
# for every token of every comment. The matched words are unchanged.
abusive_words = frozenset(words)

# Defensive: treat a missing token list as empty.
bully = body_sr.map(lambda y: y if y is not None else '')
# Number of tokens left in each comment after stop-word removal.
tot_count = bully.map(len)
# Tokens of each comment that appear verbatim in the abusive-word lexicon.
# NOTE(review): tokens keep their punctuation (e.g. 'hand,' in the sample
# output), so an abusive word followed by punctuation will not match.
bully2 = bully.map(lambda tokens: [tok for tok in tokens if tok in abusive_words])

In [None]:
# Abusive tokens found in the first five comments.
bully2.take(num=5)

[[], [], [], [], []]

In [None]:
from pyspark.sql.types import *

# DataFrame of (BullyWords, id_bull): the abusive tokens of each comment keyed
# by the comment's 0-based position.
bully_word_collect = spark.createDataFrame(
    bully2.zipWithIndex(),
    schema=StructType()
    .add("BullyWords", ArrayType(StringType()))
    .add("id_bull", LongType()),
)
bully_word_collect.show(n=5)

+----------+-------+
|BullyWords|id_bull|
+----------+-------+
|        []|      0|
|        []|      1|
|        []|      2|
|        []|      3|
|        []|      4|
+----------+-------+
only showing top 5 rows



In [None]:
# Number of abusive tokens per comment.
bully_count = bully2.map(len)

In [None]:
# Abusive-token counts of the first five comments.
bully_count.take(num=5)

[0, 0, 0, 0, 0]

In [None]:
from pyspark.sql.types import *

# DataFrame of (Count, id_cnt): abusive-token count keyed by comment position.
bully_count_collect = spark.createDataFrame(
    bully_count.zipWithIndex(),
    schema=StructType()
    .add("Count", LongType())
    .add("id_cnt", LongType()),
)

In [None]:
# Total token counts (after stop-word removal) of the first five comments.
tot_count.take(num=5)

[11, 6, 43, 9, 10]

In [None]:
from pyspark.sql.types import *

# DataFrame of (Total_Count, id_tot): total token count keyed by comment position.
tot_count_collect = spark.createDataFrame(
    tot_count.zipWithIndex(),
    schema=StructType()
    .add("Total_Count", LongType())
    .add("id_tot", LongType()),
)

In [None]:
# Combine the abusive-token counts with the abusive tokens themselves,
# matching rows on their positional ids.
jj = bully_count_collect.join(
    bully_word_collect,
    on=bully_count_collect.id_cnt == bully_word_collect.id_bull,
    how="inner",
)
jj.show(n=5)

+-----+------+----------+-------+
|Count|id_cnt|BullyWords|id_bull|
+-----+------+----------+-------+
|    0|    26|        []|     26|
|    0|    29|        []|     29|
|    0|   474|        []|    474|
|    0|   964|        []|    964|
|    0|  1677|        []|   1677|
+-----+------+----------+-------+
only showing top 5 rows



In [None]:
# Add the total token count of each comment, again matched by positional id.
va = jj.join(
    tot_count_collect,
    on=tot_count_collect.id_tot == jj.id_cnt,
    how="inner",
)
va.show(n=5)

+-----+------+----------+-------+-----------+------+
|Count|id_cnt|BullyWords|id_bull|Total_Count|id_tot|
+-----+------+----------+-------+-----------+------+
|    0|    26|        []|     26|         10|    26|
|    0|    29|        []|     29|          3|    29|
|    0|   474|        []|    474|          8|   474|
|    0|   964|        []|    964|          8|   964|
|    0|  1677|        []|   1677|          6|  1677|
+-----+------+----------+-------+-----------+------+
only showing top 5 rows



In [None]:
# Bring the per-comment counts back onto the comment rows.
yy = xx.join(
    va,
    on=xx.id_words == va.id_tot,
    how="inner",
)
yy.show(n=5)

+-------------+--------------------+----------------+-----+--------------------+--------+-----+------+----------+-------+-----------+------+
|    subreddit|                body|controversiality|score|               words|id_words|Count|id_cnt|BullyWords|id_bull|Total_Count|id_tot|
+-------------+--------------------+----------------+-----+--------------------+--------+-----+------+----------+-------+-----------+------+
|          nba|[the only reason ...|               0|   -5|[invest, lot, arc...|      29|    0|    29|        []|     29|          3|    29|
|       gaming|i don’t know who ...|               0|    3|[worry, it., ever...|     474|    0|   474|        []|    474|          8|   474|
|marvelstudios|She's saying that...|               0|    3|[arya, wa, hand, ...|     964|    0|   964|        []|    964|          8|   964|
|SquaredCircle|OH MYNGOD MICHAEL...|               0|    2|[ally, knew, secr...|    1677|    0|  1677|        []|   1677|          6|  1677|
| MortalKomba

In [None]:
# The join keys id_tot and id_cnt duplicate id_words, so remove them.
yy = yy.drop("id_tot").drop("id_cnt")

In [None]:
# Discard rows that contain a null in any column.
yy = yy.dropna()

Dataset with total count and count of abusive words for each comment

In [None]:
# Preview the comments with their abusive-token and total-token counts.
yy.show(n=5)

+-------------+--------------------+----------------+-----+--------------------+--------+-----+----------+-------+-----------+
|    subreddit|                body|controversiality|score|               words|id_words|Count|BullyWords|id_bull|Total_Count|
+-------------+--------------------+----------------+-----+--------------------+--------+-----+----------+-------+-----------+
|          nba|[the only reason ...|               0|   -5|[invest, lot, arc...|      29|    0|        []|     29|          3|
|       gaming|i don’t know who ...|               0|    3|[worry, it., ever...|     474|    0|        []|    474|          8|
|marvelstudios|She's saying that...|               0|    3|[arya, wa, hand, ...|     964|    0|        []|    964|          8|
|SquaredCircle|OH MYNGOD MICHAEL...|               0|    2|[ally, knew, secr...|    1677|    0|        []|   1677|          6|
| MortalKombat|Yeah I think mine...|               0|    5|[lol, okkk, learn...|    1697|    0|        []|   16

## Calculating the offensiveness proportion (abusive words / total words) for each comment

In [None]:
from pyspark.sql.functions import row_number,lit
import pyspark.sql.functions as F
from pyspark.sql.window import Window

# Proportion = abusive tokens / total tokens for each comment.
# Rows whose Proportion (or any other column) is null are removed afterwards;
# presumably this covers comments with Total_Count == 0 - TODO confirm.
offensive_share = F.col("Count") / F.col("Total_Count")
df_qq = yy.withColumn("Proportion", offensive_share).na.drop()
df_qq.show(n=5)

+-------------+--------------------+----------------+-----+--------------------+--------+-----+----------+-------+-----------+----------+
|    subreddit|                body|controversiality|score|               words|id_words|Count|BullyWords|id_bull|Total_Count|Proportion|
+-------------+--------------------+----------------+-----+--------------------+--------+-----+----------+-------+-----------+----------+
|          nba|[the only reason ...|               0|   -5|[invest, lot, arc...|      29|    0|        []|     29|          3|       0.0|
|       gaming|i don’t know who ...|               0|    3|[worry, it., ever...|     474|    0|        []|    474|          8|       0.0|
|marvelstudios|She's saying that...|               0|    3|[arya, wa, hand, ...|     964|    0|        []|    964|          8|       0.0|
|SquaredCircle|OH MYNGOD MICHAEL...|               0|    2|[ally, knew, secr...|    1677|    0|        []|   1677|          6|       0.0|
| MortalKombat|Yeah I think mine..

## Sentiment Analysis for each comment

In [None]:
pip install vaderSentiment


Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[?25l[K     |██▋                             | 10 kB 21.7 MB/s eta 0:00:01[K     |█████▏                          | 20 kB 23.7 MB/s eta 0:00:01[K     |███████▉                        | 30 kB 28.6 MB/s eta 0:00:01[K     |██████████▍                     | 40 kB 23.2 MB/s eta 0:00:01[K     |█████████████                   | 51 kB 19.5 MB/s eta 0:00:01[K     |███████████████▋                | 61 kB 22.0 MB/s eta 0:00:01[K     |██████████████████▏             | 71 kB 21.0 MB/s eta 0:00:01[K     |████████████████████▉           | 81 kB 22.2 MB/s eta 0:00:01[K     |███████████████████████▍        | 92 kB 24.0 MB/s eta 0:00:01[K     |██████████████████████████      | 102 kB 24.0 MB/s eta 0:00:01[K     |████████████████████████████▋   | 112 kB 24.0 MB/s eta 0:00:01[K     |███████████████████████████████▏| 122 kB 24.0 MB/s eta 0:00:01[K     |████████████████████████████████| 125 

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType

# Build the analyzer ONCE and let the UDF closure reuse it. The original
# constructed a new SentimentIntensityAnalyzer for every row.
sentiment_analyzer = SentimentIntensityAnalyzer()

# The UDF keeps Spark's default string return type on purpose: the next cell
# extracts the compound score from the "{neg=..., pos=..., ...}" string form.
# A null body yields a null result instead of raising inside the worker.
my_udf_sent = udf(
    lambda x: sentiment_analyzer.polarity_scores(x) if x is not None else None
)

df_sent = df_qq.withColumn("Sent",my_udf_sent(col("body")))
df_sent.show(5)

+-------------+--------------------+----------------+-----+--------------------+--------+-----+----------+-------+-----------+----------+--------------------+
|    subreddit|                body|controversiality|score|               words|id_words|Count|BullyWords|id_bull|Total_Count|Proportion|                Sent|
+-------------+--------------------+----------------+-----+--------------------+--------+-----+----------+-------+-----------+----------+--------------------+
|          nba|[the only reason ...|               0|   -5|[invest, lot, arc...|      29|    0|        []|     29|          3|       0.0|{neg=0.053, pos=0...|
|       gaming|i don’t know who ...|               0|    3|[worry, it., ever...|     474|    0|        []|    474|          8|       0.0|{neg=0.0, pos=0.1...|
|marvelstudios|She's saying that...|               0|    3|[arya, wa, hand, ...|     964|    0|        []|    964|          8|       0.0|{neg=0.036, pos=0...|
|SquaredCircle|OH MYNGOD MICHAEL...|          

Extracting the compound score through regular expressions

In [None]:
from pyspark.sql.functions import split,regexp_extract

# Capture everything after "compound=" up to the next ',' or '}'.
# The previous greedy pattern "compound=(.*)," ran to the LAST comma in the
# string, so it only worked when `compound` was the second-to-last key of the
# stringified map. It returned extra text if the key came earlier, and nothing
# at all if it came last.
df_sent=df_sent.withColumn("compound", regexp_extract("Sent", "compound=([^,}]+)", 1))
df_sent.show(5)

+-------------+--------------------+----------------+-----+--------------------+--------+-----+----------+-------+-----------+----------+--------------------+--------+
|    subreddit|                body|controversiality|score|               words|id_words|Count|BullyWords|id_bull|Total_Count|Proportion|                Sent|compound|
+-------------+--------------------+----------------+-----+--------------------+--------+-----+----------+-------+-----------+----------+--------------------+--------+
|          nba|[the only reason ...|               0|   -5|[invest, lot, arc...|      29|    0|        []|     29|          3|       0.0|{neg=0.053, pos=0...|  0.0258|
|       gaming|i don’t know who ...|               0|    3|[worry, it., ever...|     474|    0|        []|    474|          8|       0.0|{neg=0.0, pos=0.1...|  0.5023|
|marvelstudios|She's saying that...|               0|    3|[arya, wa, hand, ...|     964|    0|        []|    964|          8|       0.0|{neg=0.036, pos=0...|  

In [None]:
from pyspark.sql.types import DecimalType

# Turn the extracted compound score (a string) into a decimal with 4 fractional digits.
df_sent = df_sent.withColumn(
    "compound",
    df_sent["compound"].cast(DecimalType(precision=20, scale=4)),
)
df_sent.show(n=5)

+-------------+--------------------+----------------+-----+--------------------+--------+-----+----------+-------+-----------+----------+--------------------+--------+
|    subreddit|                body|controversiality|score|               words|id_words|Count|BullyWords|id_bull|Total_Count|Proportion|                Sent|compound|
+-------------+--------------------+----------------+-----+--------------------+--------+-----+----------+-------+-----------+----------+--------------------+--------+
|          nba|[the only reason ...|               0|   -5|[invest, lot, arc...|      29|    0|        []|     29|          3|       0.0|{neg=0.053, pos=0...|  0.0258|
|       gaming|i don’t know who ...|               0|    3|[worry, it., ever...|     474|    0|        []|    474|          8|       0.0|{neg=0.0, pos=0.1...|  0.5023|
|marvelstudios|She's saying that...|               0|    3|[arya, wa, hand, ...|     964|    0|        []|    964|          8|       0.0|{neg=0.036, pos=0...|  

## Labeling Algorithm

In [None]:
from pyspark.sql.functions import udf, col, when

# Assign the binary label SentLab (1 = flagged, 0 = not flagged) from the VADER
# compound score and the abusive-word Proportion. between() is inclusive on
# both ends and the FIRST matching when() wins, so rule order matters where the
# ranges overlap:
#   1. compound in [ 0.05, 1   ] and Proportion in [0.25, 1   ] -> 0
#   2. compound in [-0.05, 1   ] and Proportion in [0,    0.25] -> 0
#   3. compound in [-1,   -0.05] and Proportion in [0.25, 1   ] -> 1
#   4. compound in [-0.05, 0.05] and Proportion in [0.5,  1   ] -> 1
#   otherwise                                                  -> 0
# Rows matching none of the rules (including rows where a comparison is null)
# fall through to otherwise(0).
df_sent = df_sent.withColumn(
    'SentLab',
     when((col("compound").between(0.05, 1)) & col("Proportion").between(0.25, 1), 0)\
    .when((col("compound").between(-0.05, 1)) & col('Proportion').between(0,0.25), 0)\
    .when((col("compound").between(-1, -0.05)) & col('Proportion').between(0.25,1), 1)\
    .when((col("compound").between(-0.05, 0.05)) & col('Proportion').between(0.5,1), 1)\
    .otherwise(0)
)
df_sent.show(10)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------+-----+----------+-------+-----------+----------+--------------------+--------+-------+
|           subreddit|                body|    controversiality|               score|               words|id_words|Count|BullyWords|id_bull|Total_Count|Proportion|                Sent|compound|SentLab|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------+-----+----------+-------+-----------+----------+--------------------+--------+-------+
|                 nba|[the only reason ...|                   0|                  -5|[invest, lot, arc...|      29|    0|        []|     29|          3|       0.0|{neg=0.053, pos=0...|  0.0258|      0|
|              gaming|i don’t know who ...|                   0|                   3|[worry, it., ever...|     474|    0|        []|    474|          8|       0.0|{neg=0.0, pos=0.1...|  0.5023

## Performing undersampling to tackle the class imbalance problem

In [None]:
from pyspark.sql.functions import col, explode, array, lit


# Split by label: 0 is the majority class, 1 the minority class.
major_df = df_sent.filter(col("SentLab") == 0)
minor_df = df_sent.filter(col("SentLab") == 1)

major_count = major_df.count()
minor_count = minor_df.count()
if minor_count == 0:
    # The original failed here with an opaque ZeroDivisionError.
    raise ValueError("No rows with SentLab == 1; cannot undersample the majority class.")

# Integer majority/minority ratio. Clamp to 1 so that 1/ratio stays a valid
# sampling fraction even if class 0 is smaller than class 1. Without the clamp
# the ratio truncates to 0 and the division below raises.
ratio = max(1, int(major_count / minor_count))
print("ratio: {}".format(ratio))

# Keep roughly 1/ratio of the majority rows (fixed seed for repeatability)
# and append every minority row.
sampled_majority_df = major_df.sample(withReplacement=False, fraction=1/ratio, seed=1)
combined_df_2 = sampled_majority_df.union(minor_df)

ratio: 404


In [None]:
# Re-derive the minority (label 1) rows from the balanced dataset.
minor_df = combined_df_2.where(col("SentLab") == 1)


In [None]:
# Sampling fraction that was applied to the majority class above.
1/ratio

0.0024752475247524753

## Machine Learning

In [None]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel,LogisticRegressionWithSGD
from sklearn.model_selection import train_test_split


from sklearn.feature_extraction.text import TfidfTransformer
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.classification import LogisticRegression, NaiveBayes, GBTClassifier
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel,LogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint
from pyspark.ml import Pipeline

Train-test split

In [None]:
# Keep only the raw comment text and its label under the column names that
# the feature pipeline and the classifiers expect.
dfFin = combined_df_2.selectExpr("body as text", "SentLab as label")
# 80/20 split. The seed (same value as the undersampling step) makes the split
# repeatable; without it every run trained and evaluated on different rows.
(trainingData, testData) = dfFin.randomSplit([0.8, 0.2], seed=1)

Feature Engineering

In [None]:
from pyspark.sql.types import IntegerType
def FeatureEng(data, num_features=25, idf_model=None, return_model=False):
  """Turn a (text, label) DataFrame into TF-IDF features for the classifiers.

  Steps: tokenize `text` into `words`, hash the tokens into a term-frequency
  vector `rawFeatures`, rescale it with IDF into `features`, and cast `label`
  to an integer column.

  Args:
    data: DataFrame with a string column `text` and a column `label`.
    num_features: Size of the hashing space. Defaults to 25, the value that
      was previously hard-coded.
    idf_model: Optional already-fitted IDF model. When given it is reused
      instead of fitting a new one on `data`, so a test set can be weighted
      with the statistics learned from the training set.
    return_model: When True, return `(features_df, idf_model)` so the fitted
      model can be passed to a later call.

  Returns:
    The transformed DataFrame, or a `(DataFrame, IDFModel)` tuple when
    `return_model` is True.
  """
  # Tokenizing
  tokenizer = Tokenizer(inputCol="text", outputCol="words")
  wordsData = tokenizer.transform(data)
  # Hashing
  hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=num_features)
  featData = hashingTF.transform(wordsData)
  # IDF: fit only when the caller did not supply a fitted model.
  if idf_model is None:
    idf_model = IDF(inputCol="rawFeatures", outputCol="features").fit(featData)
  scaledData = idf_model.transform(featData)

  scaledData = scaledData.withColumn("label", scaledData["label"].cast(IntegerType()))
  if return_model:
    return scaledData, idf_model
  return scaledData

In [None]:
# Run the same feature pipeline over both splits.
# NOTE(review): FeatureEng fits its IDF on whatever data it receives, so the
# test split is weighted with its own statistics rather than the training
# split's - consider fitting on the training data only.
train, test = (FeatureEng(part) for part in (trainingData, testData))

Logistic Regression

In [None]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression on the TF-IDF features, capped at 100 iterations.
lr = LogisticRegression().setMaxIter(100)
lrModel = lr.fit(train)
# Score the held-out split.
predictions = lrModel.transform(test)

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
Mevaluator = MulticlassClassificationEvaluator()
Mevaluator.setPredictionCol("prediction")

# accuracy and f1 are computed over all classes.
acc = Mevaluator.evaluate(predictions, {Mevaluator.metricName: "accuracy"})
f1 = Mevaluator.evaluate(predictions, {Mevaluator.metricName: "f1"})
# precisionByLabel / recallByLabel score ONE class, selected by metricLabel,
# whose default is 0.0. Label 1 is the class flagged by the labeling step, so
# point the evaluator at it explicitly; the previous numbers described class 0.
pre = Mevaluator.evaluate(
    predictions,
    {Mevaluator.metricName: "precisionByLabel", Mevaluator.metricLabel: 1.0},
)
rec = Mevaluator.evaluate(
    predictions,
    {Mevaluator.metricName: "recallByLabel", Mevaluator.metricLabel: 1.0},
)

print('Precision: %0.3f' % pre)
print('Recall: %0.3f' % rec)
print('Accuracy: %0.3f' % acc)
print('F1 score: %0.3f' % f1)

Precision: 0.333
Recall: 0.500
Accuracy: 0.438
F1 score: 0.444


SVM Classifier

In [None]:
from pyspark.ml.classification import LinearSVC

# Linear SVM on the TF-IDF features: 10 iterations, regularization 0.1.
lsvc = LinearSVC(
    labelCol="label",
    featuresCol="features",
    regParam=0.1,
    maxIter=10,
)

lsvcModel = lsvc.fit(train)
# Score the held-out split.
predictions_lsvc = lsvcModel.transform(test)

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
Mevaluator_lsvc = MulticlassClassificationEvaluator()
Mevaluator_lsvc.setPredictionCol("prediction")

# accuracy and f1 are computed over all classes.
acc_lsvc = Mevaluator_lsvc.evaluate(predictions_lsvc, {Mevaluator_lsvc.metricName: "accuracy"})
f1_lsvc = Mevaluator_lsvc.evaluate(predictions_lsvc, {Mevaluator_lsvc.metricName: "f1"})
# precisionByLabel / recallByLabel score ONE class, selected by metricLabel,
# whose default is 0.0. Label 1 is the class flagged by the labeling step, so
# point the evaluator at it explicitly; the previous numbers described class 0.
pre_lsvc = Mevaluator_lsvc.evaluate(
    predictions_lsvc,
    {Mevaluator_lsvc.metricName: "precisionByLabel", Mevaluator_lsvc.metricLabel: 1.0},
)
rec_lsvc = Mevaluator_lsvc.evaluate(
    predictions_lsvc,
    {Mevaluator_lsvc.metricName: "recallByLabel", Mevaluator_lsvc.metricLabel: 1.0},
)

print('Precision: %0.3f' % pre_lsvc)
print('Recall: %0.3f' % rec_lsvc)
print('Accuracy: %0.3f' % acc_lsvc)
print('F1 score: %0.3f' % f1_lsvc)

Precision: 0.375
Recall: 0.500
Accuracy: 0.500
F1 score: 0.508


Random Forest Classifier

In [None]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest on the TF-IDF features, with library defaults otherwise.
rf = RandomForestClassifier(labelCol='label', featuresCol='features')
rfModel = rf.fit(train)
# Score the held-out split.
predictions_rf = rfModel.transform(test)

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
Mevaluator_rf = MulticlassClassificationEvaluator()
Mevaluator_rf.setPredictionCol("prediction")

# accuracy and f1 are computed over all classes.
acc_rf = Mevaluator_rf.evaluate(predictions_rf, {Mevaluator_rf.metricName: "accuracy"})
f1_rf = Mevaluator_rf.evaluate(predictions_rf, {Mevaluator_rf.metricName: "f1"})
# precisionByLabel / recallByLabel score ONE class, selected by metricLabel,
# whose default is 0.0. Label 1 is the class flagged by the labeling step, so
# point the evaluator at it explicitly; the previous numbers described class 0.
pre_rf = Mevaluator_rf.evaluate(
    predictions_rf,
    {Mevaluator_rf.metricName: "precisionByLabel", Mevaluator_rf.metricLabel: 1.0},
)
rec_rf = Mevaluator_rf.evaluate(
    predictions_rf,
    {Mevaluator_rf.metricName: "recallByLabel", Mevaluator_rf.metricLabel: 1.0},
)

print('Precision: %0.3f' % pre_rf)
print('Recall: %0.3f' % rec_rf)
print('Accuracy: %0.3f' % acc_rf)
print('F1 score: %0.3f' % f1_rf)

Precision: 0.333
Recall: 0.500
Accuracy: 0.438
F1 score: 0.444
