<a href="https://colab.research.google.com/github/pythonuzgit/elmurodov/blob/master/Natural%20Language%20Processing/Spark_NLP_%26_ML_for_Text_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install libraries

In [None]:
# Install pyspark
! pip install pyspark --quiet

# Install Spark NLP
! pip install spark-nlp --quiet


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


Import modules

In [None]:

import os
import re
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pyspark.sql.types import *
from pyspark.sql.functions import expr
from pyspark.sql import functions as fun
from pyspark.sql import Row
from pyspark.ml import Pipeline
from pyspark.ml.feature import *
import sparknlp
from sparknlp import DocumentAssembler, Finisher
from sparknlp.annotator import *
from pyspark.ml.classification import *
from pyspark.ml.tuning import *
from pyspark.ml.evaluation import *

In [None]:
# Start the Spark NLP session; `spark` is the session handle used for the
# CSV read and the DataFrame creation further down.
spark = sparknlp.start()

Comprehensive News Articles Dataset



In [None]:
# Read the CSV with a header row and inferred column types.
# The free-text columns (title / description / content) may contain quoted
# commas and embedded line breaks. Without multiLine parsing and RFC 4180
# style quote escaping ("" inside a quoted field), Spark splits such records
# across rows and shifts values into the wrong columns, which is how
# timestamps and article text end up in `category`.
news_data = spark.read.csv(
    '/content/News Dataset.csv',
    header=True,
    inferSchema=True,
    multiLine=True,
    quote='"',
    escape='"',
)

In [None]:
# Preview the first rows of the raw dataset to check how the CSV was parsed.
news_data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|              source|              author|               title|         description|                 url|          urlToImage|         publishedAt|             content|  category|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|{'id': None, 'nam...|         Test Yessis|        test-setcion|test-setcionThis ...|https://consent.y...|                NULL|2024-05-14T06:19:29Z|click accept part...|technology|
|{'id': 'wired', '...|         Steven Levy|Don’t Let Mistrus...|It’s OK to be dou...|https://www.wired...|https://media.wir...|2024-06-07T13:00:00Z|seems evident alm...|technology|
|{'id': 'wired', '...|         Will Knight|OpenAI Offers a P...|Days after former...|https://ww

In [None]:
# Total number of rows parsed from the CSV.
news_data.count()

799

Select the columns needed for analysis

In [None]:
# Keep only the article text and its label. Despite the variable name, the
# text column selected here is `content`, not `title`.
title_category = news_data.select("content","category")

In [None]:
# Preview the two selected columns.
title_category.show()

+--------------------+----------+
|             content|  category|
+--------------------+----------+
|click accept part...|technology|
|seems evident alm...|technology|
|chatgpt developer...|technology|
|one night past fe...|technology|
|             removed|technology|
|lot overlap forme...|technology|
|                NULL|      NULL|
|                NULL|      NULL|
|                NULL|      NULL|
|openai unveiled g...|technology|
|microsoft getting...|technology|
|amazon reportedly...|technology|
|             removed|technology|
|             removed|technology|
|enlarge better wa...|technology|
|ted supported ads...|technology|
|click accept part...|technology|
|                NULL|      NULL|
|                NULL|      NULL|
|cnet money missio...|technology|
+--------------------+----------+
only showing top 20 rows



Let's check for null values in the CONTENT and CATEGORY columns

In [None]:
from pyspark.sql.functions import col,isnull
def null_value_count(df):
  """Return the columns of ``df`` that contain nulls, with their null counts.

  Args:
    df: Spark DataFrame to inspect.

  Returns:
    A list of ``(column_name, null_count)`` tuples in ``df.columns`` order,
    containing only the columns that have at least one null value. The list
    is empty when no column contains nulls (or ``df`` has no columns).
  """
  column_names = df.columns
  if not column_names:
    return []
  # One aggregation job covers every column, instead of a separate
  # filter + count job per column.
  # `when` without `otherwise` yields null for non-matching rows and `count`
  # skips nulls, so each aggregate is exactly that column's null count
  # (and 0, not null, for an empty DataFrame).
  null_counts = df.select(
    [fun.count(fun.when(col(name).isNull(), 1)) for name in column_names]
  ).first()
  return [
    (name, null_count)
    for name, null_count in zip(column_names, null_counts)
    if null_count > 0
  ]

In [None]:
# Collect (column, null_count) pairs for the two selected columns.
null_columns_count_list = null_value_count(title_category)

In [None]:
# Display the null counts as a small table. The schema is given explicitly so
# this cell still works when no column contains nulls: Spark cannot infer a
# schema from an empty list.
spark.createDataFrame(
    null_columns_count_list,
    'Column_With_Null_Value string, Null_Values_Count long',
).show()

+----------------------+-----------------+
|Column_With_Null_Value|Null_Values_Count|
+----------------------+-----------------+
|               content|              175|
|              category|              179|
+----------------------+-----------------+



Drop the null values

In [None]:
# Drop the rows with a null in `content` or `category`. Note that this
# rebinds `title_category`, so the frame with nulls is no longer reachable
# under that name.
title_category = title_category.dropna()

In [None]:
# Rows remaining after the null rows were dropped.
title_category.count()

620

In [None]:
# Show the first rows again, this time without truncating cell values.
title_category.show(truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+
|content                                                                                                                                                                       |category  |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+
|click accept partners including part iab transparency amp consent framework also store access information device words use chars                                              |technology|
|seems evident almost years first conference artificial intelligencewhere nascent fields leaders suggested task would completed within decadethe field chars                   |technology|
|chatgpt developer openais approach building artificial inte

Top 20 news categories

In [None]:
# Category frequencies, most common first, with untruncated values.
(
    title_category
    .groupBy("category")
    .count()
    .orderBy(fun.desc("count"))
    .show(truncate=False)
)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|category                                                                                                                                                           |count|
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|health                                                                                                                                                             |95   |
|politics                                                                                                                                                           |90   |
|education                                                                                                                                  

Top 20 most frequent news content values

In [None]:
# Frequency of each distinct `content` value, most common first; repeated
# values show up at the top with a count above 1.
(
    title_category
    .groupBy("content")
    .count()
    .orderBy(fun.desc("count"))
    .show(truncate=False)
)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|content                                                                                                                                                                         |count|
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|click accept partners including part iab transparency amp consent framework also store access information device words use chars                                                |32   |
|removed                                                                                                                                                                         |23   |
|edition atlantic daily newsletter guides biggest stories day helps discove

In [None]:
# Inspect the column names and types of the cleaned Spark DataFrame
title_category.printSchema()

root
 |-- content: string (nullable = true)
 |-- category: string (nullable = true)



Randomly split the Spark DataFrame into train and test sets

In [None]:
# 80/20 random split of the cleaned data; the seed is fixed so the split can
# be reproduced.
train, test = title_category.randomSplit([0.8, 0.2], seed = 123)

Define the assembler

In [None]:
# Wrap the raw `content` string in a `document` annotation column, then
# preview a few annotated rows as a pandas frame.
assembler = (
    DocumentAssembler()
    .setInputCol('content')
    .setOutputCol('document')
)
docs = assembler.transform(title_category)
docs.limit(5).toPandas()

Unnamed: 0,content,category,document
0,click accept partners including part iab trans...,technology,"[(document, 0, 127, click accept partners incl..."
1,seems evident almost years first conference ar...,technology,"[(document, 0, 154, seems evident almost years..."
2,chatgpt developer openais approach building ar...,technology,"[(document, 0, 161, chatgpt developer openais ..."
3,one night past february drinks moody bar light...,technology,"[(document, 0, 152, one night past february dr..."
4,removed,technology,"[(document, 0, 6, removed, {'sentence': '0'}, ..."


# Define the sentence detector

In [None]:
# Split each `document` into sentences (stored in `sentences`) and preview
# the result. `docs` is rebound to the frame that carries the new column.
sentence = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentences")
)
docs = sentence.transform(docs)
docs.limit(5).toPandas()

Unnamed: 0,content,category,document,sentences
0,click accept partners including part iab trans...,technology,"[(document, 0, 127, click accept partners incl...","[(document, 0, 127, click accept partners incl..."
1,seems evident almost years first conference ar...,technology,"[(document, 0, 154, seems evident almost years...","[(document, 0, 154, seems evident almost years..."
2,chatgpt developer openais approach building ar...,technology,"[(document, 0, 161, chatgpt developer openais ...","[(document, 0, 161, chatgpt developer openais ..."
3,one night past february drinks moody bar light...,technology,"[(document, 0, 152, one night past february dr...","[(document, 0, 152, one night past february dr..."
4,removed,technology,"[(document, 0, 6, removed, {'sentence': '0'}, ...","[(document, 0, 6, removed, {'sentence': '0'}, ..."



# Define the word tokenizer

In [None]:
# Tokenize the detected sentences into a `tokens` annotation column.
# NOTE(review): both pyspark.ml.feature and sparknlp.annotator are
# star-imported; `Tokenizer` is expected to resolve to the Spark NLP annotator
# because that import comes later. Confirm if the import order changes.
tokenizer = (
    Tokenizer()
    .setInputCols(['sentences'])
    .setOutputCol('tokens')
)

# Define the lemmatizer

In [None]:
# Load the default pretrained lemmatizer model (downloaded on first use) and
# map each token in `tokens` to its lemma in `lemma`.
lemmatizer = (
    LemmatizerModel.pretrained()
    .setInputCols(['tokens'])
    .setOutputCol('lemma')
)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


# Define the normalizer

In [None]:
# Normalize and lowercase the lemmas into the `normalized` column.
# NOTE(review): `Normalizer` also exists in pyspark.ml.feature; the later
# sparknlp star-import is expected to shadow it. Confirm if imports change.
normalizer = (
    Normalizer()
    .setInputCols(['lemma'])
    .setOutputCol('normalized')
    .setLowercase(True)
)

# Define the finisher

In [None]:
# Convert the `normalized` annotations into an array column of the same name,
# which the Spark ML stages that follow take as input.
finisher = (
    Finisher()
    .setInputCols(['normalized'])
    .setOutputCols(['normalized'])
    .setOutputAsArray(True)
)

# Define the stop words

In [None]:
# Load Spark ML's default English stop words, de-duplicated into a set.
stopwords = set(StopWordsRemover.loadDefaultStopWords('english'))
print(stopwords)

# Stop word remover: reads the `normalized` token arrays and writes the
# tokens that are not stop words to `filtered`.
sw_remover = StopWordsRemover(
    inputCol='normalized',
    outputCol='filtered',
    stopWords=list(stopwords),
)

{"he's", "there's", 'off', 'your', 'is', 'an', 'through', 'of', 'under', "she'll", "weren't", "i've", 'could', 'again', "hasn't", "they'll", "shan't", 'a', 'my', 'how', 'i', 'in', 'the', 'as', 'about', 'nor', "you've", "won't", "hadn't", 'here', 'our', 'they', 'these', "they'd", 'are', 'yourself', 'then', 'than', 'has', "wouldn't", 'any', 'while', "can't", 'will', "wasn't", 'same', "couldn't", "mustn't", 'would', "didn't", 'up', 'after', 'his', 'ours', "aren't", 'hers', 'and', 'into', 'each', 'by', 'be', 'down', 'am', 't', "you'd", 'ourselves', 'above', 'there', "i'd", 'too', 'their', 'should', 'she', 'out', 'being', 'during', 'few', "it's", 'at', "don't", "you're", 'to', "why's", 'do', 'himself', "doesn't", 'him', "that's", 'other', "he'll", 'or', 'further', "where's", "let's", "isn't", "who's", 'those', 'we', 'yours', 'that', 'theirs', 'had', 'with', "she's", 'why', 'doing', 'from', "when's", 'more', 'did', 'but', 'such', 'not', "they're", 'once', "we'll", 'was', 'it', 'only', 'have'

# Define count vectorizer

In [None]:
# Term-frequency vectors over the filtered tokens; minDF = 10 keeps only
# terms that appear in at least 10 documents.
count_vectorizer = (
    CountVectorizer()
    .setInputCol('filtered')
    .setOutputCol('tf')
    .setMinDF(10)
)

# Define TF-IDF

In [None]:
# Rescale the term frequencies in `tf` by inverse document frequency, using
# the same minimum document frequency (10) as the count vectorizer.
tfidf = (
    IDF()
    .setInputCol('tf')
    .setOutputCol('tfidf')
    .setMinDocFreq(10)
)

# Define the pipeline

In [None]:
# Chain every text-processing stage, in execution order:
# raw text -> document -> sentences -> tokens -> lemmas -> normalized tokens
# -> plain arrays -> stop words removed -> term counts -> TF-IDF features.
text_processing_pipeline = Pipeline().setStages([
    assembler,
    sentence,
    tokenizer,
    lemmatizer,
    normalizer,
    finisher,
    sw_remover,
    count_vectorizer,
    tfidf,
])

# Define a label indexer that converts category labels to indices, and fit it to the data

In [None]:
# Map each `category` string to a numeric `label`.
# The indexer is fitted on the full cleaned dataset (train and test rows
# together), so every category seen in either split receives an index.
label_indexer = (
    StringIndexer()
    .setInputCol('category')
    .setOutputCol('label')
    .fit(title_category)
)

# Take a look at the indexed labels

In [None]:
# List the category strings learned by the indexer; position in the list is
# the numeric label. Any entry that is not a real category name points to
# rows whose columns were mis-parsed upstream.
label_indexer.labels

['health',
 'politics',
 'education',
 'technology',
 'finance',
 'entertainment',
 'sports',
 '2024-05-15T22:58:23Z',
 '2024-05-17T19:53:33Z',
 '2024-05-19T14:34:00Z',
 '2024-05-26T03:34:00Z',
 '2024-05-28T17:01:14Z',
 '2024-06-03T14:49:07Z',
 'apple set release new immersive video vision pro friday parkour labeled episode two adventure series episode one highlining video shows highliner chars',
 'bill gates may sean gallup getty images ul li bill gates unveiled annual summer read watch list li li gates said four books one tv show touch idea service chars',
 'citadel ceo ken griffin apu gomes via getty images ul li anti israel protesters engaging form performance art says citadel ceo ken griffin li li freedom speech gi chars',
 'click accept partners including part iab transparency amp consent framework also store access information device words use chars',
 'dylan cope gp referred hospital suspected appendicitis doctors nurses discharged flu missing note nine year old boy died sepsis

# Define a transformer that converts predicted indices back to labels

In [None]:
# Translate numeric predictions back to category strings, using the same
# label list that the indexer learned.
prediction_deindexer = (
    IndexToString()
    .setInputCol('prediction')
    .setOutputCol('pred_label')
    .setLabels(label_indexer.labels)
)

# Define a model

In [None]:
# Baseline classifier: Naive Bayes trained on the TF-IDF feature column.
naive_bayes = NaiveBayes(featuresCol = 'tfidf')

# Define the overall pipeline

In [None]:
# End-to-end pipeline: text features -> numeric label -> Naive Bayes
# -> predicted index mapped back to a category string.
pipeline = Pipeline().setStages([
    text_processing_pipeline,
    label_indexer,
    naive_bayes,
    prediction_deindexer,
])

# Train the model

In [None]:
# Fit every stage of the pipeline on the training split only.
model = pipeline.fit(train)

# Make predictions on the train and test sets

In [None]:
# Score both splits with the fitted pipeline.
train_predicted, test_predicted = (
    model.transform(split) for split in (train, test)
)

# Initialize the F1 evaluator

In [None]:
# F1 evaluator. No column names are passed, so the evaluator's default label
# and prediction columns must match the ones the pipeline produces.
evaluator = MulticlassClassificationEvaluator(metricName = 'f1')

# Report the F1 score on the train set

In [None]:
# F1 on the data the model was fitted on; an optimistic reference point.
print('f1 on train set', evaluator.evaluate(train_predicted))

f1 on train set 0.59974129823327


# Report the F1 score on the test set

In [None]:
# F1 on the held-out split; compare with the train score to gauge overfitting.
print('f1 on test set', evaluator.evaluate(test_predicted))

f1 on test set 0.4213841150774357


# Define a function to try out different ML models

In [None]:
def fit_model(model):
  """Train one classifier on top of the shared text pipeline and print its F1.

  The classifier is placed between the notebook-level `label_indexer` and
  `prediction_deindexer` stages, fitted on `train`, and scored on both
  `train` and `test`.

  Args:
    model: Classifier class (not an instance), e.g. ``LogisticRegression``.
      It is instantiated with ``featuresCol = 'tfidf'``.

  Returns:
    None. The two F1 scores are printed, prefixed with the classifier name.
  """
  # Label the output with the classifier's name. The fitted pipeline's own
  # string form is an opaque `PipelineModel_<id>` that does not say which
  # classifier produced the scores.
  model_name = getattr(model, '__name__', str(model))

  # Define the type of model
  classifier = model(featuresCol = 'tfidf')

  # Construct the overall pipeline
  candidate_pipeline = Pipeline(
  stages = [
    text_processing_pipeline,
    label_indexer,
    classifier,
    prediction_deindexer
  ])

  # Train the model (kept under its own name so the `model` argument is not
  # shadowed by the fitted pipeline)
  fitted_pipeline = candidate_pipeline.fit(train)

  # Make predictions on the train and test sets
  train_predicted = fitted_pipeline.transform(train)
  test_predicted = fitted_pipeline.transform(test)

  # Initialize the F1 evaluator
  evaluator = MulticlassClassificationEvaluator(metricName = 'f1')

  # Report the F1 score on the train set
  print(f'{model_name}: f1 on train set', evaluator.evaluate(train_predicted))

  # Report the F1 score on the test set
  print(f'{model_name}: f1 on test set', evaluator.evaluate(test_predicted))


In [None]:
# Fit a logistic regression on the TF-IDF features and report train/test F1
fit_model(LogisticRegression)

PipelineModel_d600b5790224: f1 on train set 0.8540545747080798
PipelineModel_d600b5790224: f1 on test set 0.3409348230424027


In [None]:
# Fit a random forest classifier on the TF-IDF features and report train/test F1
fit_model(RandomForestClassifier)

PipelineModel_bb5a748961df: f1 on train set 0.5188905644433935
PipelineModel_bb5a748961df: f1 on test set 0.3542693021152274
