In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


In [2]:
import os

# Install java
! apt-get update -qq
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

# Install pyspark
! pip install --ignore-installed -q pyspark==2.4.4
! pip install --ignore-installed -q spark-nlp==2.7.1

debconf: delaying package configuration, since apt-utils is not installed
openjdk version "1.8.0_282"
OpenJDK Runtime Environment (build 1.8.0_282-8u282-b08-0ubuntu1~18.04-b08)
OpenJDK 64-Bit Server VM (build 25.282-b08, mixed mode)


In [3]:
import sparknlp

spark = sparknlp.start(gpu = True) # for GPU training >> sparknlp.start(gpu = True) # for Spark 2.3 =>> sparknlp.start(spark23 = True)

from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
import pandas as pd

print("Spark NLP version", sparknlp.version())

print("Apache Spark version:", spark.version)

spark

Spark NLP version 2.7.1
Apache Spark version: 2.4.4


## Read the training data 

In [4]:
trainDataset = spark.read \
      .option("header", True) \
      .csv("/kaggle/input/nlp-getting-started/train.csv")

trainDataset.show(truncate=50)

+---+-------+--------+--------------------------------------------------+------+
| id|keyword|location|                                              text|target|
+---+-------+--------+--------------------------------------------------+------+
|  1|   null|    null|Our Deeds are the Reason of this #earthquake Ma...|     1|
|  4|   null|    null|            Forest fire near La Ronge Sask. Canada|     1|
|  5|   null|    null|All residents asked to 'shelter in place' are b...|     1|
|  6|   null|    null|13,000 people receive #wildfires evacuation ord...|     1|
|  7|   null|    null|Just got sent this photo from Ruby #Alaska as s...|     1|
|  8|   null|    null|#RockyFire Update => California Hwy. 20 closed ...|     1|
| 10|   null|    null|#flood #disaster Heavy rain causes flash floodi...|     1|
| 13|   null|    null|I'm on top of the hill and I can see a fire in ...|     1|
| 14|   null|    null|There's an emergency evacuation happening now i...|     1|
| 15|   null|    null|I'm af

In [None]:
trainDataset.count()

## identify cases of missing target 

In [5]:
from pyspark.sql.functions import col

trainDataset.groupBy("target") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+------+-----+
|target|count|
+------+-----+
|     0| 4095|
|     1| 3081|
|  null| 1211|
+------+-----+



## check for text > 512 in length

In [None]:
from pyspark.sql.functions import length
trainDataset.where(length(col("text")) > 512).show()

## drop missing values from the text and target columns

In [6]:
train = trainDataset.dropna(subset=['text', 'target'])

In [7]:
train.groupBy("target") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+------+-----+
|target|count|
+------+-----+
|     0| 4095|
|     1| 3081|
+------+-----+



## preprocessing pipeline

In [None]:
document_assembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document") \
.setCleanupMode("shrink")
    
tokenizer = Tokenizer() \
.setInputCols(["document"]) \
.setOutputCol("token") \
.setSplitChars(['-']) \
.setContextChars(['(', ')', '?', '!', '#', '@']) 

normalizer = Normalizer() \
.setInputCols(["token"]) \
.setOutputCol("normalized")\
.setCleanupPatterns(["[^\w\d\s]"]) 

stopwords_cleaner = StopWordsCleaner()\
.setInputCols("normalized")\
.setOutputCol("cleanTokens")\
.setCaseSensitive(False)

lemma = LemmatizerModel.pretrained('lemma_antbnc') \
.setInputCols(["cleanTokens"]) \
.setOutputCol("lemma")

preproc_pipeline = Pipeline(
  stages=[document_assembler, 
            tokenizer,
            normalizer,
            stopwords_cleaner, 
            lemma])

## visualize results of the steps in pipeline

In [None]:
pipelineModel = preproc_pipeline.fit(train)
%time result = pipelineModel.transform(train).collect()

In [None]:
from pyspark.sql import Row

text = "As she sat watching the world go by, something caught her eye. It wasn't so much its color or shape, but the way it was moving."
df = spark.createDataFrame(list(map(lambda x: Row(text=x), [text])), ["text"])
%time result = pipelineModel.transform(df).collect()

In [None]:
empty_df = spark.createDataFrame([['']]).toDF("text")

pipelineModel = preproc_pipeline.fit(empty_df)


In [None]:
from sparknlp.base import LightPipeline

light_model = LightPipeline(pipelineModel)

%time light_result = light_model.annotate("As she sat watching the world go by, something caught her eye. It wasn't so much its color or shape, but the way it was moving.")

In [None]:
light_result.keys()

In [None]:
list(zip(light_result['token'], light_result['normalized'], light_result['cleanTokens'],  light_result['lemma']))

## Classification using GloVe

In [None]:
document_assembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document") \
.setCleanupMode("shrink")
    
tokenizer = Tokenizer() \
.setInputCols(["document"]) \
.setOutputCol("token") \
.setSplitChars(['-']) \
.setContextChars(['(', ')', '?', '!', '#', '@']) 

normalizer = Normalizer() \
.setInputCols(["token"]) \
.setOutputCol("normalized")\
.setCleanupPatterns(["[^\w\d\s]"]) 

stopwords_cleaner = StopWordsCleaner()\
.setInputCols("normalized")\
.setOutputCol("cleanTokens")\
.setCaseSensitive(False)

lemma = LemmatizerModel.pretrained('lemma_antbnc') \
.setInputCols(["cleanTokens"]) \
.setOutputCol("lemma")

glove_embeddings = WordEmbeddingsModel().pretrained() \
.setInputCols(["document",'lemma'])\
.setOutputCol("embeddings")\
.setCaseSensitive(False)

embeddingsSentence = SentenceEmbeddings() \
.setInputCols(["document", "embeddings"]) \
.setOutputCol("sentence_embeddings") \
.setPoolingStrategy("AVERAGE")

classsifierdl = ClassifierDLApproach()\
.setInputCols(["sentence_embeddings"])\
.setOutputCol("class")\
.setLabelColumn("target")\
.setMaxEpochs(5)\
.setLr(0.001)\
.setBatchSize(8)\
.setEnableOutputLogs(True)
#.setOutputLogsPath('logs')

glove_clf_pipeline = Pipeline(
    stages=[document_assembler, 
            tokenizer,
            normalizer,
            stopwords_cleaner, 
            lemma, 
            glove_embeddings,
            embeddingsSentence,
            classsifierdl])

## split training data into train/validation sets

In [8]:
(train_df, val_df) = train.randomSplit([0.7, 0.3], seed = 8)
print("Training Dataset Count: " + str(train_df.count()))
print("Validation Dataset Count: " + str(val_df.count()))

Training Dataset Count: 5036
Validation Dataset Count: 2140


In [None]:
glove_clf_pipelineModel = glove_clf_pipeline.fit(train_df)

In [None]:
!cd ~/annotator_logs && ls -l

In [None]:
!cat ~/annotator_logs/ClassifierDLApproach_b5fa4fd51592.log

In [None]:
# get the predictions on validation Set

preds = glove_clf_pipelineModel.transform(val_df)

In [None]:
preds.select('text','target',"class.result").show(10, truncate=80)

In [None]:
preds_df = preds.select('text','target',"class.result").toPandas()

# The result is an array since in Spark NLP you can have multiple sentences.
# Let's explode the array and get the item(s) inside of result column out
preds_df['result'] = preds_df['result'].apply(lambda x : x[0])

In [None]:
# We are going to use sklearn to evalute the results on test dataset
from sklearn.metrics import classification_report

print (classification_report(preds_df['result'], preds_df['target']))

## Classification using Elmo


In [9]:
document_assembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document") \
.setCleanupMode("shrink")
    
tokenizer = Tokenizer() \
.setInputCols(["document"]) \
.setOutputCol("token") \
.setSplitChars(['-']) \
.setContextChars(['(', ')', '?', '!', '#', '@']) 

normalizer = Normalizer() \
.setInputCols(["token"]) \
.setOutputCol("normalized")\
.setCleanupPatterns(["[^\w\d\s]"]) 

stopwords_cleaner = StopWordsCleaner()\
.setInputCols("normalized")\
.setOutputCol("cleanTokens")\
.setCaseSensitive(False)

lemma = LemmatizerModel.pretrained('lemma_antbnc') \
.setInputCols(["cleanTokens"]) \
.setOutputCol("lemma")
    
elmo_embeddings = ElmoEmbeddings.pretrained('elmo')\
.setInputCols(["document", "token"])\
.setOutputCol("embeddings")\
.setPoolingLayer('elmo')# default --> elmo

embeddingsSentence = SentenceEmbeddings() \
.setInputCols(["document", "embeddings"]) \
.setOutputCol("sentence_embeddings") \
.setPoolingStrategy("AVERAGE")

classsifierdl = ClassifierDLApproach()\
.setInputCols(["sentence_embeddings"])\
.setOutputCol("class")\
.setLabelColumn("target")\
.setMaxEpochs(5)\
.setLr(0.001)\
.setBatchSize(8)\
.setEnableOutputLogs(True)
#.setOutputLogsPath('logs')

elmo_clf_pipeline = Pipeline(
    stages=[document_assembler, 
            tokenizer,
            normalizer,
            stopwords_cleaner,
            lemma,
            elmo_embeddings,
            embeddingsSentence,
            classsifierdl])

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
elmo download started this may take some time.
Approximate size to download 334.1 MB
[OK!]


In [10]:
elmo_clf_pipelineModel = elmo_clf_pipeline.fit(train_df)

KeyboardInterrupt: 

In [None]:
!cd /root/annotator_logs && ls -lt

In [None]:
!cat /root/annotator_logs/ClassifierDLApproach_72aceb5a2cde.log

In [None]:
preds = elmo_clf_pipelineModel.transform(val_df)

In [None]:
preds.select('text','target',"class.result").show(10, truncate=80)

In [None]:
# We are going to use sklearn to evalute the results on test dataset
from sklearn.metrics import classification_report



preds_df = preds.select('text','target',"class.result").toPandas()

preds_df['result'] = preds_df['result'].apply(lambda x : x[0])

print (classification_report(preds_df['result'], preds_df['target']))

## Classification using BERT


In [18]:
document_assembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document") \
.setCleanupMode("shrink")
    
tokenizer = Tokenizer() \
.setInputCols(["document"]) \
.setOutputCol("token") \
.setSplitChars(['-']) \
.setContextChars(['(', ')', '?', '!', '#', '@']) 

normalizer = Normalizer() \
.setInputCols(["token"]) \
.setOutputCol("normalized")\
.setCleanupPatterns(["[^\w\d\s]"]) 

stopwords_cleaner = StopWordsCleaner()\
.setInputCols("normalized")\
.setOutputCol("cleanTokens")\
.setCaseSensitive(False)

lemma = LemmatizerModel.pretrained('lemma_antbnc') \
.setInputCols(["cleanTokens"]) \
.setOutputCol("lemma")

bert_embeddings = BertEmbeddings().pretrained(name='bert_base_cased', lang='en') \
.setInputCols(["document",'token'])\
.setOutputCol("embeddings")

embeddingsSentence = SentenceEmbeddings() \
.setInputCols(["document", "embeddings"]) \
.setOutputCol("sentence_embeddings") \
.setPoolingStrategy("AVERAGE")

classsifierdl = ClassifierDLApproach()\
.setInputCols(["sentence_embeddings"])\
.setOutputCol("class")\
.setLabelColumn("target")\
.setMaxEpochs(5)\
.setLr(0.001)\
.setBatchSize(8) \
.setEnableOutputLogs(True)
#.setOutputLogsPath('logs')

bert_clf_pipeline = Pipeline(
    stages=[document_assembler, 
            tokenizer,
            normalizer,
            stopwords_cleaner,
            lemma,
            bert_embeddings,
            embeddingsSentence,
            classsifierdl])

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
bert_base_cased download started this may take some time.
Approximate size to download 389.1 MB
[OK!]


In [14]:
bert_clf_pipelineModel = bert_clf_pipeline.fit(train_df)

In [None]:
!cd /root/annotator_logs && ls -lt

In [None]:
!cat /root/annotator_logs/ClassifierDLApproach_5571ae93049a.log

In [15]:
preds = bert_clf_pipelineModel.transform(val_df)

In [16]:
preds.select('text','target',"class.result").show(10, truncate=80)

+--------------------------------------------------------------------------------+------+------+
|                                                                            text|target|result|
+--------------------------------------------------------------------------------+------+------+
|http://t.co/GKYe6gjTk5 Had a #personalinjury accident this summer? Read our a...|     0|   [0]|
|                                                    I want some tsunami take out|     0|   [0]|
|Just stop fucking saying ÛÏa whole Û÷notherÛ. It just sounds fucking stup...|     0|   [0]|
|Brain twister homefolks are opinionated over against proposal modernized cana...|     0|   [0]|
|Crazy Mom Threw Teen Daughter a NUDE Twister Sex Party According To Her Frien...|     0|   [0]|
|     The Sharper Image Viper 24' Hardside Twister (Black) http://t.co/FXk3zsj2PE|     0|   [0]|
|                         Brain twister let drop up telly structuring cast: EDcXO|     0|   [0]|
|                            @

In [17]:
# We are going to use sklearn to evalute the results on test dataset
from sklearn.metrics import classification_report



preds_df = preds.select('text','target',"class.result").toPandas()

preds_df['result'] = preds_df['result'].apply(lambda x : x[0])

print (classification_report(preds_df['result'], preds_df['target']))

              precision    recall  f1-score   support

           0       1.00      0.57      0.72      2140
           1       0.00      0.00      0.00         0

    accuracy                           0.57      2140
   macro avg       0.50      0.28      0.36      2140
weighted avg       1.00      0.57      0.72      2140



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## ClassifierDL with universal sentence embeddings

In [46]:
# actual content is inside text column
document = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")
    
# we can also use sentece detector here if we want to train on and get predictions for each sentence

use = UniversalSentenceEncoder.pretrained("tfhub_use", "en") \
      .setInputCols("document") \
      .setOutputCol("sentence_embeddings")

# the classes/labels/categories are in category column
classsifierdl = ClassifierDLApproach()\
  .setInputCols(["sentence_embeddings"])\
  .setOutputCol("class")\
  .setLabelColumn("target")\
  .setMaxEpochs(5)\
  .setLr(0.001)\
  .setBatchSize(8)\
  .setEnableOutputLogs(True)

use_clf_pipeline = Pipeline(
    stages = [
        document,
        use,
        classsifierdl
    ])

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]


In [47]:
use_pipelineModel = use_clf_pipeline.fit(train_df)

In [None]:
!cd /root/annotator_logs && ls -lt

In [None]:
!cat ~/annotator_logs/ClassifierDLApproach_6b6f3f75388e.log

In [48]:
# We are going to use sklearn to evalute the results on test dataset
from sklearn.metrics import classification_report

preds = use_pipelineModel.transform(val_df)

preds_df = preds.select('text','target',"class.result").toPandas()

preds_df['result'] = preds_df['result'].apply(lambda x : x[0])

print (classification_report(preds_df['result'], preds_df['target']))

              precision    recall  f1-score   support

           0       0.88      0.81      0.85      1322
           1       0.73      0.83      0.78       818

    accuracy                           0.82      2140
   macro avg       0.81      0.82      0.81      2140
weighted avg       0.83      0.82      0.82      2140



## With BERT sentence embeddings

In [49]:
# actual content is inside description column
document = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")
    
# we can also use sentece detector here if we want to train on and get predictions for each sentence
#bert_sent = BertSentenceEmbeddings.pretrained('sent_small_bert_L8_512')\
bert_sent = BertSentenceEmbeddings.pretrained('sent_bert_base_cased')\
 .setInputCols(["document"])\
 .setOutputCol("sentence_embeddings")

# the classes/labels/categories are in category column
classsifierdl = ClassifierDLApproach()\
  .setInputCols(["sentence_embeddings"])\
  .setOutputCol("class")\
  .setLabelColumn("target")\
  .setMaxEpochs(5)\
  .setLr(0.001)\
  .setBatchSize(8)\
  .setEnableOutputLogs(True)

bert_sent_clf_pipeline = Pipeline(
    stages = [
        document,
        bert_sent,
        classsifierdl
    ])

sent_bert_base_cased download started this may take some time.
Approximate size to download 389.1 MB
[OK!]


In [51]:
bert_sent_pipelineModel = bert_sent_clf_pipeline.fit(train_df)

In [None]:
!cd /root/annotator_logs && ls -lt

In [None]:
!cat ~/annotator_logs/ClassifierDLApproach_29562c9c6757.log

In [52]:
# We are going to use sklearn to evalute the results on test dataset
from sklearn.metrics import classification_report

preds = bert_sent_pipelineModel.transform(val_df)

preds_df = preds.select('text','target',"class.result").toPandas()

preds_df['result'] = preds_df['result'].apply(lambda x : x[0])

print (classification_report(preds_df['result'], preds_df['target']))

              precision    recall  f1-score   support

           0       0.87      0.82      0.84      1303
           1       0.74      0.82      0.78       837

    accuracy                           0.82      2140
   macro avg       0.81      0.82      0.81      2140
weighted avg       0.82      0.82      0.82      2140

