In [None]:
! pip install -q pyspark==3.1.2 spark-nlp

[K     |████████████████████████████████| 212.4 MB 68 kB/s 
[K     |████████████████████████████████| 133 kB 49.5 MB/s 
[K     |████████████████████████████████| 198 kB 49.7 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [None]:
import sparknlp

spark = sparknlp.start(gpu = True) # for GPU training >> sparknlp.start(gpu = True) # for Spark 2.3 =>> sparknlp.start(spark23 = True)

from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
import pandas as pd


print("Spark NLP version", sparknlp.version())
print("Apache Spark version:", spark.version)

spark

Spark NLP version 3.3.4
Apache Spark version: 3.1.2


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
trainDataset = spark.read \
      .option("header", True) \
      .csv("/content/drive/MyDrive/bert.csv")

trainDataset.show(truncate=50)

+--------------------------------------------------+---------+
|                                     clean_comment| category|
+--------------------------------------------------+---------+
|                                    sick this shit|depressed|
|                                        will pass |depressed|
|                                    need some help|depressed|
|                                       need reason|depressed|
|                                  hit rock bottom |depressed|
|                     alone depressed real friends |    happy|
|                             should consider meds |depressed|
|                     want die but won  kill myself|depressed|
|                           heart physically hurts |depressed|
|                            should happy but   not|    happy|
|                            feel absolutely awful |depressed|
| feel empty even though should happy and going ...|depressed|
|               feeling really depressed and lonely|   

In [None]:
trainDataset.count()

7139

In [None]:
from pyspark.sql.functions import col

trainDataset.groupBy("category") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+---------+-----+
| category|count|
+---------+-----+
|depressed| 2365|
|     calm| 2017|
|    happy| 1635|
|    angry| 1122|
+---------+-----+



In [None]:
(trainingData, testData) = trainDataset.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 4999
Test Dataset Count: 2140


In [None]:
document_assembler = DocumentAssembler() \
    .setInputCol("clean_comment") \
    .setOutputCol("document")
    
tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")
      
normalizer = Normalizer() \
    .setInputCols(["token"]) \
    .setOutputCol("normalized")

stopwords_cleaner = StopWordsCleaner()\
    .setInputCols("normalized")\
    .setOutputCol("cleanTokens")\
    .setCaseSensitive(False)

lemma = LemmatizerModel.pretrained('lemma_antbnc') \
    .setInputCols(["cleanTokens"]) \
    .setOutputCol("lemma")

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


In [None]:
glove_embeddings = WordEmbeddingsModel().pretrained() \
      .setInputCols(["document",'lemma'])\
      .setOutputCol("embeddings")\
      .setCaseSensitive(False)

embeddingsSentence = SentenceEmbeddings() \
      .setInputCols(["document", "embeddings"]) \
      .setOutputCol("sentence_embeddings") \
      .setPoolingStrategy("AVERAGE")

classsifierdl = ClassifierDLApproach()\
      .setInputCols(["sentence_embeddings"])\
      .setOutputCol("class")\
      .setLabelColumn("category")\
      .setMaxEpochs(3)\
      .setEnableOutputLogs(True)
      #.setOutputLogsPath('logs')

clf_pipeline = Pipeline(
    stages=[document_assembler, 
            tokenizer,
            normalizer,
            stopwords_cleaner, 
            lemma, 
            glove_embeddings,
            embeddingsSentence,
            classsifierdl])

glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


In [None]:
clf_pipelineModel = clf_pipeline.fit(trainDataset)

In [None]:
preds = clf_pipelineModel.transform(testData)

In [None]:
preds_df.sample(10)

Unnamed: 0,category,clean_comment,result
850,happy,don’ forget smile even life seems like swing w...,angry
133,angry,assassination millennial,angry
563,depreesed,how you know you’ unhappy with your relationsh...,angry
196,angry,google searches lately,angry
411,depreesed,got kicked out around months ago…,angry
779,happy,happy for the first time long time,angry
98,angry,stuff makes mad instead smashing furniture il...,angry
12,angry,bit jealous,angry
34,angry,found out “ short hike” the name video game a...,angry
140,angry,biggest wtf job search,angry


In [None]:
preds.select('category','clean_comment',"class.result").show(10, truncate=80)

+---------+-------------+------+
| category|clean_comment|result|
+---------+-------------+------+
|     calm|         null|[calm]|
|depressed|         null|[calm]|
|depressed|         null|[calm]|
|depressed|         null|[calm]|
|depressed|         null|[calm]|
|depressed|         null|[calm]|
|depressed|         null|[calm]|
|depressed|         null|[calm]|
|     calm|             |[calm]|
|     calm|             |[calm]|
+---------+-------------+------+
only showing top 10 rows



In [None]:
preds_df = preds.select('category','clean_comment',"class.result").toPandas()

In [None]:
preds_df['result'] = preds_df.result.apply(lambda x: ', '.join([str(i) for i in x]))

In [None]:
from sklearn.metrics import classification_report

print (classification_report(preds_df['result'], preds_df['category']))


              precision    recall  f1-score   support

       angry       0.72      0.64      0.68       376
        calm       0.66      0.75      0.70       515
   depressed       0.68      0.71      0.70       674
       happy       0.80      0.72      0.76       575

    accuracy                           0.71      2140
   macro avg       0.71      0.71      0.71      2140
weighted avg       0.71      0.71      0.71      2140



In [None]:
document_assembler = DocumentAssembler() \
    .setInputCol("clean_comment") \
    .setOutputCol("document")
    
tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")
      
bert_embeddings = BertEmbeddings().pretrained(name='small_bert_L4_256', lang='en') \
    .setInputCols(["document",'token'])\
    .setOutputCol("embeddings")

embeddingsSentence = SentenceEmbeddings() \
    .setInputCols(["document", "embeddings"]) \
    .setOutputCol("sentence_embeddings") \
    .setPoolingStrategy("AVERAGE")

classsifierdl = ClassifierDLApproach()\
    .setInputCols(["sentence_embeddings"])\
    .setOutputCol("class")\
    .setLabelColumn("category")\
    .setMaxEpochs(10)\
    .setLr(0.001)\
    .setBatchSize(8)\
    .setEnableOutputLogs(True)
    #.setOutputLogsPath('logs')

bert_clf_pipeline = Pipeline(stages=[
    document_assembler, 
    tokenizer,
    bert_embeddings,
    embeddingsSentence,
    classsifierdl
])

small_bert_L4_256 download started this may take some time.
Approximate size to download 40.5 MB
[OK!]


In [None]:
bert_clf_pipelineModel = bert_clf_pipeline.fit(trainDataset)

In [None]:
from sklearn.metrics import classification_report

preds = bert_clf_pipelineModel.transform(testData)

preds_df = preds.select('category','clean_comment',"class.result").toPandas()

preds_df['result'] = preds_df['result'].apply(lambda x : x[0])

print (classification_report(preds_df['result'], preds_df['category']))

              precision    recall  f1-score   support

       angry       0.00      0.00      0.00         0
        calm       0.08      0.52      0.15        95
   depressed       0.98      0.34      0.50      2045
       happy       0.00      0.00      0.00         0

    accuracy                           0.35      2140
   macro avg       0.27      0.21      0.16      2140
weighted avg       0.94      0.35      0.49      2140



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:

# actual content is inside description column
document = DocumentAssembler()\
      .setInputCol("clean_comment")\
      .setOutputCol("document")
    
# we can also use sentece detector here if we want to train on and get predictions for each sentence

bert_sent = BertSentenceEmbeddings.pretrained('sent_small_bert_L8_512')\
      .setInputCols(["document"])\
      .setOutputCol("sentence_embeddings")

# the classes/labels/categories are in category column
classsifierdl = ClassifierDLApproach()\
      .setInputCols(["sentence_embeddings"])\
      .setOutputCol("class")\
      .setLabelColumn("category")\
      .setMaxEpochs(10)\
      .setEnableOutputLogs(True)\
      .setLr(0.001)

bert_sent_clf_pipeline = Pipeline(
    stages = [
        document,
        bert_sent,
        classsifierdl
    ])



sent_small_bert_L8_512 download started this may take some time.
Approximate size to download 149.1 MB
[OK!]


In [None]:
%%time
bert_sent_pipelineModel = bert_sent_clf_pipeline.fit(trainDataset)

CPU times: user 1.84 s, sys: 227 ms, total: 2.06 s
Wall time: 6min 21s


In [None]:
from sklearn.metrics import classification_report

preds = bert_sent_pipelineModel.transform(testData)

preds_df = preds.select('category','clean_comment',"class.result").toPandas()

preds_df['result'] = preds_df['result'].apply(lambda x : x[0])

print (classification_report(preds_df['result'], preds_df['category']))

              precision    recall  f1-score   support

       angry       0.69      0.71      0.70       329
        calm       0.62      0.73      0.67       490
   depressed       0.79      0.67      0.73       820
       happy       0.74      0.77      0.75       501

    accuracy                           0.71      2140
   macro avg       0.71      0.72      0.71      2140
weighted avg       0.72      0.71      0.72      2140



In [None]:
document = DocumentAssembler()\
      .setInputCol("clean_comment")\
      .setOutputCol("document")
    
bert_cmlm = BertSentenceEmbeddings.pretrained('sent_bert_use_cmlm_en_base', 'en')\
      .setInputCols(["document"])\
      .setOutputCol("sentence_embeddings")

classsifierdl = ClassifierDLApproach()\
      .setInputCols(["sentence_embeddings"])\
      .setOutputCol("class")\
      .setLabelColumn("category")\
      .setMaxEpochs(10)\
      .setEnableOutputLogs(True)\
      .setLr(0.001)

bert_cmlm_clf_pipeline = Pipeline(
    stages = [
        document,
        bert_cmlm,
        classsifierdl
    ])

sent_bert_use_cmlm_en_base download started this may take some time.
Approximate size to download 391.6 MB
[OK!]


In [None]:
bert_cmlm_pipelineModel = bert_cmlm_clf_pipeline.fit(trainDataset)

In [None]:
from sklearn.metrics import classification_report

preds = bert_cmlm_pipelineModel.transform(testData)

preds_df = preds.select('category','clean_comment',"class.result").toPandas()

preds_df['result'] = preds_df['result'].apply(lambda x : x[0])

print (classification_report(preds_df['result'], preds_df['category']))

              precision    recall  f1-score   support

       angry       0.12      0.91      0.21        45
        calm       0.79      0.65      0.71       708
   depressed       0.83      0.65      0.73       895
       happy       0.77      0.82      0.79       492

    accuracy                           0.69      2140
   macro avg       0.63      0.76      0.61      2140
weighted avg       0.79      0.69      0.73      2140

