In [6]:
%%writefile ./nlp_process.py

# Standard library imports
import sys
import subprocess
# Install runtime dependencies one at a time, in this order, using the
# interpreter that runs this script. check_call raises CalledProcessError
# on the first failing install, which stops the script.
for package_name in ("fsspec", "s3fs", "vaderSentiment", "textblob"):
    subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])

import os
import argparse
import logging
from itertools import chain

# External library imports
import pandas as pd
import numpy as np
import json
import fsspec
from nltk.corpus import stopwords

# PySpark imports
import pyspark.sql.functions as f
from pyspark.sql import SparkSession
from pyspark.sql.functions import concat_ws, col
from pyspark.sql.types import StringType, ArrayType, FloatType
from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    RegexTokenizer,
    HashingTF,

)
from pyspark.ml.feature import CountVectorizer , IDF, StopWordsRemover, Tokenizer
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.clustering import LDA

import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline

# Configure root logging at INFO level and create this module's logger.
_LOG_FORMAT = "[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s"
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT)
logger = logging.getLogger(__name__)

def clean_text(df):
    """Return ``df`` with its ``body`` column normalised.

    The ``body`` text is lower-cased, every character that is not an ASCII
    letter, a digit or whitespace is removed, and leading/trailing spaces
    are trimmed. The cleaned text replaces the existing ``body`` column.
    """
    lowered = f.lower(f.col("body"))
    alphanumeric_only = f.regexp_replace(lowered, "[^a-zA-Z0-9\\s]", "")
    return df.withColumn("body", f.trim(alphanumeric_only))

#get word from index of term 
def indices_to_terms(indices, terms):
    """Return the entries of ``terms`` found at each position in ``indices``.

    The result is a new list in the same order as ``indices``.
    """
    selected = []
    for position in indices:
        selected.append(terms[position])
    return selected

def main():
    """Start a Spark NLP session, load submissions and comments, and flag comments.

    Reads the two Parquet datasets from the hard-coded S3 locations below,
    normalises the comment ``body`` text with ``clean_text`` and adds a
    boolean ``misinfo_class`` column that is True when the cleaned body
    matches one of the misinformation-related keywords.
    """
    # parser = argparse.ArgumentParser(description="app inputs and outputs")
    # parser.add_argument("--s3_bucket", type=str, help="s3 bucket")
    # # parser.add_argument("--s3_output_key_prefix", type=str, help="s3 output key prefix")
    # parser.add_argument("--s3_data_submissions", type=str, help="s3 submissions prefix")
    # parser.add_argument("--s3_data_comments", type=str, help="s3 comments prefix")
    # args = parser.parse_args()
    spark = (
        SparkSession.builder
        .appName("Spark NLP")
        .config("spark.driver.memory", "16G")
        .config("spark.driver.maxResultSize", "0")
        .config("spark.kryoserializer.buffer.max", "2000M")
        .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.3")
        .getOrCreate()
    )

    logger.info(f"Spark version: {spark.version}")
    logger.info(f"sparknlp version: {sparknlp.version()}")

    # This is needed to save RDDs which is the only way to write nested Dataframes into CSV format
    sc = spark.sparkContext
    sc._jsc.hadoopConfiguration().set(
        "mapred.output.committer.class", "org.apache.hadoop.mapred.FileOutputCommitter"
    )

    # Load the submissions dataset.
    submissions_path = "s3a://sagemaker-us-east-1-165729782536/project/submissions/yyyy=*"
    logger.info(f"{submissions_path}")

    # `header` is a CSV reader option and has no meaning for Parquet, so it is not passed.
    posts = spark.read.parquet(submissions_path)
    # printSchema() prints the tree to stdout and returns None, so it must not
    # be interpolated into the log message; log the schema string instead.
    posts.printSchema()
    logger.info(f"posts schema={posts.schema.simpleString()}")

    # Load the comments dataset.
    s3_data_comments = "s3a://sagemaker-us-east-1-165729782536/project/comments/yyyy=*"
    comments = spark.read.parquet(s3_data_comments)
    comments.printSchema()
    logger.info(f"comments schema={comments.schema.simpleString()}")

    # Preprocess the comments: normalise the text, then flag keyword matches.
    comments = clean_text(comments)
    # Pattern is matched against the cleaned (lower-cased) body text.
    misinfo_pattern = r'fake news|bullshit|misinfo|clickbait|unreliable|propoganda|propaganda|fraud|deceptive|fabricated|deep state|wake up|truth about'
    comments = comments.withColumn(
        'misinfo_class',
        f.when(comments.body.rlike(misinfo_pattern), True).otherwise(False),
    )
    logger.info("Finish data preprocess")
    
# #     # start LDA
#     logger.info(f"LDA Start!!!")
#     small_df = posts.select('title', 'id')
#     tokenizer = Tokenizer(inputCol="title", outputCol="words")
    
# #     #remove stop words 
#     StopWords = stopwords.words("english")
#     #removing stop words in other languages and other common words
#     additional = ['@reuters:', '–' '&amp;', '@ap:', 'rt', ':', 'از', 'آهنگ', 'دانلود', 'در', 'به', 'جدید', '@apentertainment:',
#                  '|', 'के', 'में', 'و', 'في', 'من', '@bbcworld:', 'de', 'la', 'di', 'की', 'से', 'bio', 'many','know', 'age', 'says', 'one',
#                  'net', 'user]', '[deleted', 'look', '–']
#     StopWords = StopWords + additional
#     remover = StopWordsRemover(inputCol="words", outputCol="filtered", stopWords=StopWords)
#     logger.info(f"StopWords setting done!!!")
    
#     #count vectorizer
#     cv = CountVectorizer(inputCol="filtered", outputCol="raw_features", vocabSize=5000, minDF=25)
#     idf = IDF(inputCol="raw_features", outputCol="features")
#     lda = LDA(k=8, maxIter=10, seed=2024)
#     logger.info(f"lda setting done!!!")

    
#     pipeline = Pipeline(stages=[tokenizer, remover, cv, idf, lda])
#     model = pipeline.fit(small_df)
#     topics = model.stages[-1].describeTopics()
#     terms = model.stages[-3].vocabulary
    
#     # Defining Spark UDF from above function
#     udf_indices_to_terms = f.udf(indices_to_terms, ArrayType(StringType()))
#     topics = (
#         topics
#            .withColumn("terms", udf_indices_to_terms(f.col("termIndices"), f.lit(terms)))
#         )
#     #naming topics 
#     topic_dict = {0: 'economics/russia&ukraine', 1: 'presidental news', 2: 'supreme court/law', 3: 'global politics', 4: 'us politics', 
#                   5: 'covid/russia&ukraine', 6: 'crime/protest', 7: 'tv shows'}
#     small_transform = model.transform(small_df)
#     small_df.unpersist()
    
#     #map to topics
#     mapping_expr = f.create_map([f.lit(x) for x in chain(*topic_dict.items())])
#     #udf to get the top topic 
#     max_topic = f.udf(lambda v:float(v.argmax()),FloatType())
#     #using map and udf to create a topic column
#     topic = small_transform.withColumn('topic_num', max_topic("topicDistribution"))\
#             .withColumn("topic", mapping_expr[f.col("topic_num")]).select('id','topic')
#     mini_posts = posts.select('created_utc', 'title', 'id')
#     merged_df = mini_posts.join(topic, 'id')
#     logger.info(f"LDA END!!!")
    
#     #renaming columns and removing the t3_ from the link id to get the post id on the comment
#     mini_comments = comments.select('created_utc','body','misinfo_class', 'link_id', 'id')\
#                     .withColumn('comment_created', f.col('created_utc')).withColumn('comment_id', f.col('id'))\
#                     .withColumn('id', f.regexp_extract('link_id', 't3_(.*)$', 1))
#     total_df = merged_df.join(mini_comments, 'id')
#     logger.info(f"total df schema={total_df.printSchema()}")
    
    
# Run the job only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()

Overwriting nlp_process.py


In [None]:
%%time
import sagemaker
from sagemaker.spark.processing import PySparkProcessor
# Default bucket of the current SageMaker session; the Spark event logs go there.
session = sagemaker.Session()
bucket = session.default_bucket()

# IAM role the processing job runs under.
role = sagemaker.get_execution_role()

# Set up the PySpark processor that runs the job. SageMaker creates
# `instance_count` instances of `instance_type` for the Spark job.
spark_processor = PySparkProcessor(
    role=role,
    base_job_name="sm-spark-ml",
    framework_version="3.2",
    instance_type="ml.m5.xlarge",
    instance_count=4,
    max_runtime_in_seconds=3600,
)

# s3 paths
# output_prefix_data_submissions = "project/submissions/yyyy=*"
# s3_data_submissions = f"s3a://{bucket}/{output_prefix_data_submissions}"
# print(f"s3_data_submissions={s3_data_submissions}")

# output_prefix_data_comments = "project/comments/yyyy=*"
# s3_data_comments = f"s3a://{bucket}/{output_prefix_data_comments}"
# print(f"s3_data_comments={s3_data_comments}")

output_prefix_data = "sm-spark-ml-data"
output_prefix_logs = "sm-spark-ml-spark_logs"

# Run the job now. No command-line arguments are passed to the script here.
spark_event_logs_uri = f"s3://{bucket}/{output_prefix_logs}/spark_event_logs"
spark_processor.run(
    submit_app="./nlp_process.py",
    spark_event_logs_s3_uri=spark_event_logs_uri,
    logs=False,
)

In [3]:
# s3 paths
# Look up the default bucket of the current SageMaker session and echo it
# as the cell output; later cells reuse `bucket`.
session = sagemaker.Session()
bucket = session.default_bucket()
bucket

'sagemaker-us-east-1-165729782536'

In [4]:


# Stream the Spark NLP 5.1.3 assembly jar from its public download URL straight
# into this project's S3 prefix (wget writes to stdout, `aws s3 cp -` reads
# stdin, so no local copy is kept), then list the object to confirm the upload.
!wget -qO- https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/jars/spark-nlp-assembly-5.1.3.jar | aws s3 cp - s3://{bucket}/project/spark-nlp-assembly-5.1.3.jar
!aws s3 ls s3://{bucket}/project/spark-nlp-assembly-5.1.3.jar

2024-04-27 04:32:55  708534094 spark-nlp-assembly-5.1.3.jar


In [5]:
%%writefile ./test_process.py

import os
import sys
import logging
import argparse

# Import pyspark and build Spark session
from pyspark.sql.functions import *
from pyspark.sql.types import (
    DoubleType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

import json
import sparknlp
import numpy as np
import pandas as pd
from sparknlp.base import *
from pyspark.ml import Pipeline
from sparknlp.annotator import *
import pyspark.sql.functions as f
from pyspark.sql import SparkSession
from sparknlp.pretrained import PretrainedPipeline

# Send formatted log records to stdout through a single root handler.
# Adding a second, bare StreamHandler on top of the one basicConfig installs
# made every message appear twice: formatted on stderr and unformatted on
# stdout. Pointing basicConfig at stdout keeps one formatted copy.
logging.basicConfig(
    format='%(asctime)s,%(levelname)s,%(module)s,%(filename)s,%(lineno)d,%(message)s',
    level=logging.DEBUG,
    stream=sys.stdout,
)
logger = logging.getLogger()
# Set the level explicitly: basicConfig does nothing when the root logger
# already has handlers.
logger.setLevel(logging.DEBUG)

def clean_text(df):
    """Normalise the ``body`` column of ``df`` and return the new DataFrame.

    Steps applied to ``body``, in order: lower-case the text, drop every
    character other than ASCII letters, digits and whitespace, then trim
    surrounding spaces. The result overwrites the ``body`` column.
    """
    body_lower = f.lower(f.col("body"))
    body_plain = f.regexp_replace(body_lower, "[^a-zA-Z0-9\\s]", "")
    body_trimmed = f.trim(body_plain)
    return df.withColumn("body", body_trimmed)

#get word from index of term 
def indices_to_terms(indices, terms):
    """Look up each index in ``terms`` and return the matching entries.

    The returned list has one entry per element of ``indices``, in order.
    """
    looked_up = []
    for idx in indices:
        looked_up.append(terms[idx])
    return looked_up

def main():
    """Parse the job arguments, load submissions and comments, and flag comments.

    Command-line arguments:
        --s3_dataset_path: S3 prefix that contains the ``submissions`` and
            ``comments`` Parquet datasets (required).
        --s3_output_bucket, --s3_output_key_prefix: parsed and logged, but
            not otherwise used by this function.

    The comment ``body`` text is normalised with ``clean_text`` and a boolean
    ``misinfo_class`` column is added that is True when the cleaned body
    matches one of the misinformation-related keywords.
    """
    parser = argparse.ArgumentParser(description="app inputs and outputs")
    # Required: without it the paths below would be built from the string "None".
    parser.add_argument("--s3_dataset_path", type=str, required=True, help="Path of dataset in S3")
    parser.add_argument("--s3_output_bucket", type=str, help="s3 output bucket")
    parser.add_argument("--s3_output_key_prefix", type=str, help="s3 output key prefix")
    args = parser.parse_args()
    logger.info(f"args={args}")

    spark = (
        SparkSession.builder
        .appName("Spark NLP")
        .config("spark.driver.memory", "16G")
        .config("spark.driver.maxResultSize", "0")
        .config("spark.kryoserializer.buffer.max", "2000M")
        .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.3")
        .getOrCreate()
    )

    logger.info(f"Spark version: {spark.version}")
    logger.info(f"sparknlp version: {sparknlp.version()}")

    # This is needed to save RDDs which is the only way to write nested Dataframes into CSV format
    sc = spark.sparkContext
    sc._jsc.hadoopConfiguration().set(
        "mapred.output.committer.class", "org.apache.hadoop.mapred.FileOutputCommitter"
    )

    # Strip any trailing slash so a prefix such as "s3://bucket/project/"
    # does not produce "project//submissions" in the paths below.
    dataset_root = args.s3_dataset_path.rstrip("/")

    # Load the submissions dataset.
    submissions_path = f"{dataset_root}/submissions/yyyy=*"
    logger.info(f"submissions_path={submissions_path}")
    # `header` is a CSV reader option and has no meaning for Parquet, so it is not passed.
    posts = spark.read.parquet(submissions_path)
    logger.info(f"posts count={posts.count()}")
    posts.printSchema()

    # Load the comments dataset.
    comments_path = f"{dataset_root}/comments/yyyy=*"
    logger.info(f"comments path={comments_path}")
    comments = spark.read.parquet(comments_path)
    logger.info(f"comments count={comments.count()}")

    # Preprocess the comments: normalise the text, then flag keyword matches.
    comments = clean_text(comments)
    # Pattern is matched against the cleaned (lower-cased) body text.
    misinfo_pattern = r'fake news|bullshit|misinfo|clickbait|unreliable|propoganda|propaganda|fraud|deceptive|fabricated|deep state|wake up|truth about'
    comments = comments.withColumn(
        'misinfo_class',
        f.when(comments.body.rlike(misinfo_pattern), True).otherwise(False),
    )
    logger.info("Finish data preprocess")
    
    
# Run the job only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()

Overwriting ./test_process.py


In [6]:
%%time
import boto3
import sagemaker
from sagemaker.spark.processing import PySparkProcessor

# AWS account id of the caller; used to build the ECR image URI below.
account_id = boto3.client('sts').get_caller_identity()['Account']

# Set up the PySpark processor that runs the job. SageMaker creates
# `instance_count` instances of `instance_type` for the Spark job.
role = sagemaker.get_execution_role()
spark_processor = PySparkProcessor(
    role=role,
    base_job_name="sm-spark-project",
    image_uri=f"{account_id}.dkr.ecr.us-east-1.amazonaws.com/sagemaker-spark:latest",
    # instance_type="ml.m5.large",
    instance_type="ml.t3.large",
    instance_count=2,
    max_runtime_in_seconds=3600,
)

# `bucket` comes from an earlier cell.
# s3_dataset_path = f"s3://{bucket}/lab8/news/data.parquet"
s3_dataset_path = f"s3://{bucket}/project/"
print(f"account_id={account_id}, s3_dataset_path={s3_dataset_path}")
output_prefix_data = "project/data"
output_prefix_logs = "project/spark_logs"

# Run the job now. The arguments list is passed to the Python script
# (the Spark code) as its command line.
spark_nlp_jar = f"s3://{bucket}/project/spark-nlp-assembly-5.1.3.jar"
spark_event_logs_uri = f"s3://{bucket}/{output_prefix_logs}/spark_event_logs"
job_arguments = [
    "--s3_dataset_path", s3_dataset_path,
    "--s3_output_bucket", bucket,
    "--s3_output_key_prefix", output_prefix_data,
]
spark_processor.run(
    submit_app="./test_process.py",
    submit_jars=[spark_nlp_jar],
    arguments=job_arguments,
    spark_event_logs_s3_uri=spark_event_logs_uri,
    logs=False,
)

account_id=165729782536, s3_dataset_path=s3://sagemaker-us-east-1-165729782536/project/


INFO:sagemaker:Creating processing-job with name sm-spark-project-2024-04-27-04-33-20-019


..........................................................................................................................................................................................................................................................................!CPU times: user 1.52 s, sys: 209 ms, total: 1.73 s
Wall time: 22min 28s
