In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the input directory,
# then print them one per line in walk order.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ag-news-classification-dataset/train.csv
/kaggle/input/ag-news-classification-dataset/test.csv


In [2]:
import os

# Install a headless Java 8 JDK with apt; -qq and the /dev/null redirect keep the cell output short
! apt-get update -qq
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null

# Point JAVA_HOME at the Java 8 OpenJDK directory and prepend its bin/ to PATH
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
# Sanity check: show which Java version the shell resolves
! java -version

# Install pinned versions of PySpark and Spark NLP
! pip install --ignore-installed -q pyspark==2.4.4
! pip install --ignore-installed -q spark-nlp==2.7.1

debconf: delaying package configuration, since apt-utils is not installed
openjdk version "1.8.0_282"
OpenJDK Runtime Environment (build 1.8.0_282-8u282-b08-0ubuntu1~18.04-b08)
OpenJDK 64-Bit Server VM (build 25.282-b08, mixed mode)


In [3]:
import sparknlp

# Start the Spark session through Spark NLP.
# gpu = True is for GPU training; for Spark 2.3 use sparknlp.start(spark23 = True) instead.
spark = sparknlp.start(gpu = True)

from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
import pandas as pd

# Report the library versions in use for this run.
print(f"Spark NLP version {sparknlp.version()}")
print(f"Apache Spark version: {spark.version}")

# Leave the session object as the cell's last expression so it is displayed.
spark

Spark NLP version 2.7.1
Apache Spark version: 2.4.4


In [4]:
# Read the training CSV, treating the first line as the column header.
df = (
    spark.read
    .option("header", True)
    .csv("/kaggle/input/ag-news-classification-dataset/train.csv")
)

# Preview the first rows, cutting long cell values at 50 characters.
df.show(truncate=50)

+-----------+--------------------------------------------------+--------------------------------------------------+
|Class Index|                                             Title|                                       Description|
+-----------+--------------------------------------------------+--------------------------------------------------+
|          3| Wall St. Bears Claw Back Into the Black (Reuters)|Reuters - Short-sellers, Wall Street's dwindlin...|
|          3|Carlyle Looks Toward Commercial Aerospace (Reut...|Reuters - Private investment firm Carlyle Group...|
|          3|   Oil and Economy Cloud Stocks' Outlook (Reuters)|Reuters - Soaring crude prices plus worries\abo...|
|          3|Iraq Halts Oil Exports from Main Southern Pipel...|Reuters - Authorities have halted oil export\fl...|
|          3|Oil prices soar to all-time record, posing new ...|AFP - Tearaway world oil prices, toppling recor...|
|          3|       Stocks End Up, But Near Year Lows (Reuters)|Reuters 

In [5]:
# Number of rows in the DataFrame read from train.csv
df.count()

120000

In [7]:
from pyspark.sql.functions import col

# Count rows per "Class Index" value and show the largest classes first.
class_counts = df.groupBy("Class Index").count()
class_counts.orderBy(col("count").desc()).show()

+-----------+-----+
|Class Index|count|
+-----------+-----+
|          1|30000|
|          4|30000|
|          3|30000|
|          2|30000|
+-----------+-----+



In [8]:
# Random 70/30 split of the data into training and validation parts, using a fixed seed.
train_df, val_df = df.randomSplit([0.7, 0.3], seed=8)

print(f"Training Dataset Count: {train_df.count()}")
print(f"Validation Dataset Count: {val_df.count()}")

Training Dataset Count: 84042
Validation Dataset Count: 35958


In [9]:
# Stage 1: turn the raw text of the "Description" column (the actual article
# content) into a "document" annotation, using the "shrink" cleanup mode.
document = (
    DocumentAssembler()
    .setInputCol("Description")
    .setOutputCol("document")
    .setCleanupMode("shrink")
)

# Stage 2: pretrained BERT sentence embeddings computed from the document.
bert = (
    BertSentenceEmbeddings.pretrained('sent_bert_base_cased')
    .setInputCols("document")
    .setOutputCol("bert_sentence_embeddings")
    .setLazyAnnotator(False)
)

# Stage 3: trainable classifier fed by the embeddings. The labels are read
# from the "Class Index" column and predictions are written to "class".
classifierdl = (
    ClassifierDLApproach()
    .setInputCols(["bert_sentence_embeddings"])
    .setOutputCol("class")
    .setLabelColumn("Class Index")
    .setMaxEpochs(5)
    .setLr(0.001)
    .setBatchSize(8)
    .setEnableOutputLogs(True)
    # .setOutputLogsPath('logs')
)

# Chain the three stages in order: document -> embeddings -> classifier.
pipeline = Pipeline(stages=[document, bert, classifierdl])

sent_bert_base_cased download started this may take some time.
Approximate size to download 389.1 MB
[OK!]


In [None]:
%%time
# Fit every pipeline stage on the training split; the %%time magic above reports how long this cell takes.
pipelineModel = pipeline.fit(train_df)

In [None]:
# Apply the fitted pipeline to the validation split to obtain predictions

preds = pipelineModel.transform(val_df)

In [None]:
# Preview 10 rows: the input text, the label column, and the classifier output in class.result
preds.select('Description','Class Index',"class.result").show(10, truncate=80)

In [None]:
# Pull the text, the label column and the classifier output into pandas.
preds_df = preds.select('Description', 'Class Index', "class.result").toPandas()

# Each "result" cell is an array, because Spark NLP can return one entry per
# sentence. Keep only the first entry of each array as the predicted label.
preds_df['result'] = preds_df['result'].map(lambda labels: labels[0])

In [None]:
# Evaluate the predictions on the validation split with scikit-learn.
from sklearn.metrics import classification_report

# classification_report expects the ground-truth labels first (y_true) and the
# predictions second (y_pred). Passing them the other way round exchanges
# precision and recall for every class, so the arguments are named explicitly.
print(classification_report(y_true=preds_df['Class Index'], y_pred=preds_df['result']))