In [1]:
# Optional setup: uncomment the line below to install the pinned pyspark / spark-nlp versions.
#! pip install -q pyspark==3.3.0 spark-nlp==4.2.8


In [2]:
# import libraries

import json
import pandas as pd
import numpy as np

import sparknlp
import pyspark.sql.functions as F

from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from sparknlp.annotator import *
from sparknlp.base import *
from sparknlp.pretrained import PretrainedPipeline
from pyspark.sql.types import StringType, IntegerType

In [3]:
# Start a Spark session through Spark NLP's helper and report the library version.

spark = sparknlp.start()
nlp_version = sparknlp.version()
print("Spark NLP Version :", nlp_version)


23/04/27 20:20:04 WARN Utils: Your hostname, samyuktha.local resolves to a loopback address: 127.0.0.1; using 192.168.0.101 instead (on interface en0)
23/04/27 20:20:04 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/Users/samyuktha/opt/anaconda3/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/samyuktha/.ivy2/cache
The jars for the packages stored in: /Users/samyuktha/.ivy2/jars
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-8b85d76f-aae9-4ec9-8d14-3171370b3e05;1.0
	confs: [default]
	found com.johnsnowlabs.nlp#spark-nlp_2.12;4.2.8 in central
	found com.typesafe#config;1.4.2 in central
	found org.rocksdb#rocksdbjni;6.29.5 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.828 in central
	found com.github.universal-automata#liblevenshtein;3.0.0 in central
	found com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central
	found com.google.protobuf#protobuf-java;3.0.0-beta-3 in central
	found com.google.code.gson#gson;2.3 in central
	found it.unimi.dsi#fastutil;7.0.12 in central
	found org.projectlombok#lombok;1.16.8 in central
	found com.google.cloud#google-cloud-storage;2.15.0 in central
	found com.google.guava#guava;31.1-jre in central
	found com.google.guav

23/04/27 20:20:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


Spark NLP Version : 4.2.8


In [4]:
# Load the dataset

file_path = "../data/"
file_type = "csv"

# CSV reader settings; each one is forwarded to the matching reader option below.
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","
multiLine = True
escape = '"'

# Configure the reader and load train.csv from the data directory.
df = (
    spark.read.format(file_type)
    .option("inferSchema", infer_schema)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
    .option("multiLine", multiLine)
    .option("escape", escape)
    .load(file_path + "train.csv")
)

# Row count of the loaded DataFrame (shown as the cell output).
df.count()



50000

In [5]:
# Spark NLP annotators work on "document" annotations, so the news_title
# column is first wrapped into a document column.
document_assembler = (
    DocumentAssembler()
    .setInputCol("news_title")
    .setOutputCol("document")
    .setCleanupMode("shrink")
)

# Break each document into an array of tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Lemmatization maps a word to its base form, e.g. "was" -> "be" and
# "rats" -> "rat". Uses the pretrained "lemma_ttb" model for language "ta".
lemmatizer = (
    LemmatizerModel.pretrained("lemma_ttb", "ta")
    .setInputCols(["token"])
    .setOutputCol("lemma")
)

# Drop stop words from the lemmas using the pretrained "stopwords_iso" list for "ta".
stopwords_cleaner = (
    StopWordsCleaner.pretrained("stopwords_iso", "ta")
    .setInputCols("lemma")
    .setOutputCol("cleanTokens")
)

# The Finisher turns Spark NLP's annotation structure back into a plain
# column: here an array of token strings named "tokens". Intermediate
# annotation columns are kept (setCleanAnnotations(False)).
finisher = (
    Finisher()
    .setInputCols(["cleanTokens"])
    .setOutputCols(["tokens"])
    .setOutputAsArray(True)
    .setCleanAnnotations(False)
)

# Chain the stages into one ML pipeline so they execute in sequence; the
# same pipeline object can be reused on other data.
pipeline_stages = [
    document_assembler,
    tokenizer,
    lemmatizer,
    stopwords_cleaner,
    finisher,
]
nlp_pipeline = Pipeline(stages=pipeline_stages)


lemma_ttb download started this may take some time.
Approximate size to download 96.5 KB
[ / ]lemma_ttb download started this may take some time.
Approximate size to download 96.5 KB
Download done! Loading the resource.
[OK!]
stopwords_iso download started this may take some time.
Approximate size to download 1.9 KB
[ | ]stopwords_iso download started this may take some time.
Approximate size to download 1.9 KB
Download done! Loading the resource.
[OK!]


In [6]:
# Fit the pipeline on the loaded DataFrame to obtain a pipeline model.
nlp_model = nlp_pipeline.fit(df)

# Run the fitted model over the same DataFrame to produce the processed columns.
processed_df = nlp_model.transform(df)



In [7]:


# The NLP pipeline creates intermediate columns that we don't need;
# keep only the news date and the finished tokens.
wanted_columns = ["news_date", "tokens"]
tokens_df = processed_df.select(*wanted_columns)
tokens_df.show()



+-------------------+--------------------+
|          news_date|              tokens|
+-------------------+--------------------+
|1/6/2011 2:45:49 PM|[தூக்கில், தொங்கு...|
|1/6/2011 2:56:51 PM|[பவுர்ணமி, ஜாமத்த...|
|1/6/2011 3:08:15 PM|[மச்சுபிச்சு, மலை...|
|1/6/2011 3:09:20 PM|[ரத்த, பலி, வாங்க...|
|1/6/2011 3:11:00 PM|[உலகப், பேரழகியின...|
|1/6/2011 3:17:03 PM|[அமாவாசை, இருட்டி...|
|1/6/2011 3:17:57 PM|[நடுக்கடலில், மிர...|
|1/6/2011 3:19:43 PM|             [பூதம்]|
|1/6/2011 3:23:57 PM|[காவிரி, கரையில்,...|
|1/6/2011 3:25:59 PM|[இருட்டில், துரத்...|
|1/6/2011 3:26:40 PM|[கதிகலங்க, வைக்கு...|
|1/6/2011 3:32:36 PM|[கல்லறையை, காவல்,...|
|1/6/2011 3:40:26 PM|[காக்க, காக்க, கன...|
|1/6/2011 3:41:21 PM|   [வைகுண்ட, ஏகாதசி]|
|1/6/2011 3:42:08 PM|[கிரகங்கள், அருளு...|
|1/6/2011 3:42:20 PM|[நியூசிபாக், டெஸ்...|
|1/6/2011 3:42:58 PM|[துர்க்கா, தேவியை...|
|1/6/2011 3:43:31 PM|[நவராத்திரி, சுபர...|
|1/6/2011 3:44:14 PM|[ஆறு, பலங்கள், தர...|
|1/6/2011 3:44:45 PM|[டெஸ்ட், போட்டிக்...|
+----------

In [8]:
# Collapse the token array into a single comma-separated string column.
joined_tokens = F.array_join(F.col("tokens"), ",")
df1 = tokens_df.withColumn("tokens", joined_tokens)


In [9]:

# Write the processed data to CSV: convert to a pandas DataFrame, then save without the index.
output_csv = file_path + "processed_data.csv"
df1.toPandas().to_csv(output_csv, index=False)



                                                                                