Ryan Leeson, Keith Jennings

In [2]:
#! scancel -u ryan.leeson -n sparkcluster 

In [2]:
import os, atexit, sys, findspark, sparkhpc, pyspark
from pyspark.context import SparkContext
from pyspark.sql import SQLContext

# Locate the Spark installation through findspark before Spark objects are created.
findspark.init()

# Slurm partition used for the Spark cluster job (omit to accept the default).
os.environ['SBATCH_PARTITION'] = 'cpu24'

# Resources requested through sparkhpc.
job_resources = dict(
    ncores=10,               # total number of cores
    cores_per_executor=5,    # cores per executor (10 / 5 = two executors)
    memory_per_core=10240,   # memory per core in MB
    walltime="4:0",          # hh:mm format
)
sj = sparkhpc.sparkjob.sparkjob(**job_resources)

# Wait for the job to start, then open a Spark context on it and wrap it in a SQLContext.
sj.wait_to_start()
sc = sj.start_spark()
scq = SQLContext(sc)

def exitHandler(sj,sc):
    """Best-effort shutdown of the Spark context and the Spark job.

    Registered with ``atexit``. Stops ``sc`` first and then ``sj``. An
    ordinary failure in one step is reported on stderr and does not prevent
    the other step from running. ``sj.stop()`` is attempted even when
    stopping ``sc`` is interrupted (e.g. by ``KeyboardInterrupt``); the
    interruption then propagates instead of being silently swallowed.

    Parameters
    ----------
    sj : object exposing ``stop()`` (the sparkhpc job)
    sc : object exposing ``stop()`` (the Spark context)
    """
    try:
        try:
            print('Trapped Exit cleaning up Spark Context')
            sc.stop()
        except Exception as err:
            print(f'Could not stop Spark Context: {err!r}', file=sys.stderr)
    finally:
        # Always try to release the cluster job, whatever happened above.
        try:
            print('Trapped Exit cleaning up Spark Job')
            sj.stop()
        except Exception as err:
            print(f'Could not stop Spark Job: {err!r}', file=sys.stderr)
atexit.register(exitHandler,sj,sc);
sc

INFO:sparkhpc.sparkjob:Submitted batch job 8835

INFO:sparkhpc.sparkjob:Submitted cluster 2


In [3]:
from textblob import TextBlob

from pyspark.ml.feature import StopWordsRemover

from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf, concat
from pyspark.sql import functions as f

from pyspark.ml import Pipeline

from pyspark.sql.functions import input_file_name
from pyspark.sql.types import *

from pyspark.sql.functions import split, lower

from pyspark.ml.feature import NGram, FeatureHasher, CountVectorizer, RegexTokenizer

from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.feature import VectorAssembler


from itertools import chain
import numpy as np
import pandas as pd


from pyspark import keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param, Params, TypeConverters
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable  
from pyspark.sql.functions import udf, create_map, lit
from pyspark.sql.types import ArrayType, StringType
from pyspark.sql.window import Window

from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [4]:
df = scq.read.parquet ('Files/amazon_reviews.parquet').select ('overall', 'reviewText')

In [5]:
df.show (3)

+-------+--------------------+
|overall|          reviewText|
+-------+--------------------+
|    5.0|The King, the Mic...|
|    5.0|  The kids loved it!|
|    5.0|My students (3 & ...|
+-------+--------------------+
only showing top 3 rows



In [6]:
print (f'There are {df.count ()} entries in the dataset.')

There are 75257650 entries in the dataset.


In [6]:
#   Share of each star rating in the raw data (lazy; append .show () to display it)
total_reviews = f.sum ('count').over (Window.partitionBy ())

(
    df.groupBy ('overall')
      .count ()
      .withColumn ('percent', f.col ('count') / total_reviews)
      .orderBy (f.col ('overall').desc ())
)

DataFrame[overall: double, count: bigint, percent: double]

In [7]:
#   Cleaning: keep only rows with overall > 0, drop duplicate rows, and drop rows whose 'reviewText' is null

df_cleaned = (
    df.filter (f.col ('overall') > 0)
      .dropDuplicates ()
      .dropna (subset = ['reviewText'])
)

print (f'After cleaning there are {df_cleaned.count ()} entries remaining.')

After cleaning there are 62454271 entries remaining.


In [8]:
#   Number of reviews per rating, plus each rating's share of the cleaned data
cleaned_total = f.sum ('count').over (Window.partitionBy ())

df_counts = (
    df_cleaned.groupBy ('overall')
              .count ()
              .withColumn ('percent', f.col ('count') / cleaned_total)
              .orderBy (f.col ('overall').desc ())
)

#display (df_counts.toPandas ())

In [9]:
#   Per-rating sampling fractions that shrink every class to the size of the smallest one

df_frac = df_counts.toPandas ()

#   Vectorised: smallest class count divided by each class count (the smallest class gets 1.0)
df_frac['fraction'] = df_frac['count'].min () / df_frac['count']

#   {rating: fraction} built directly with plain Python floats; avoids DataFrame.to_dict ('record'),
#   whose short orient alias is no longer accepted by recent pandas releases
fractions = {float (rating): float (frac) for rating, frac in zip (df_frac['overall'], df_frac['fraction'])}
#fractions

In [10]:

#   Fixed seed so the balanced sample and the train/test split are reproducible between runs
SEED = 42

#   Down-sample each rating class by its fraction and keep the result cached for the later stages
df_equal = df_cleaned.sampleBy ('overall', fractions = fractions, seed = SEED).cache ()

#   80 % training / 20 % testing
(training, testing) = df_equal.randomSplit ([0.8, 0.2], seed = SEED)

In [11]:
#   Custom transformers


class PolarityTransformer (Transformer, HasInputCol, HasOutputCol, DefaultParamsReadable, DefaultParamsWritable):
    """Add the TextBlob polarity of a string column as a double column.

    The PolarityTransformer uses the package 'TextBlob' to compute the polarity
    of a string: how negative or how positive the string might be, based on the
    words used. The value written to ``outputCol`` is
    ``TextBlob (text).sentiment.polarity`` of ``inputCol``; a null input gives
    a null output.
    """

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None):
        """Create the transformer; both column names are keyword-only."""
        super (PolarityTransformer, self).__init__()
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None):
        """Set ``inputCol`` and/or ``outputCol`` from keyword arguments."""
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def _transform(self, dataset):
        """Return ``dataset`` with the polarity column added under ``outputCol``."""
        # Local import so the class does not depend on a wildcard import elsewhere.
        from pyspark.sql.types import DoubleType

        def polarity_of (text):
            # TextBlob only accepts strings, so nulls are passed through as nulls.
            if text is None:
                return None
            return float (TextBlob (text).sentiment.polarity)

        # Typed as double directly, so the score is not round-tripped through a string.
        polarity_udf = udf (polarity_of, DoubleType ())

        out_col = self.getOutputCol ()
        in_col = dataset[self.getInputCol ()]
        return dataset.withColumn (out_col, polarity_udf (in_col))

    

    
class WordSentimentTransformer (Transformer, HasInputCol, HasOutputCol, DefaultParamsReadable, DefaultParamsWritable):
    """Count positive, negative and neutral words in an array-of-strings column.

    Similar to PolarityTransformer, WordSentimentTransformer uses 'TextBlob' to
    evaluate the polarity of each word in the input array. The output column
    holds a dense vector ``[positive, negative, neutral]`` with the number of
    words whose polarity is > 0, < 0 and == 0 respectively. A null input array
    gives a null output.
    """

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None):
        """Create the transformer; both column names are keyword-only."""
        super (WordSentimentTransformer, self).__init__()
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None):
        """Set ``inputCol`` and/or ``outputCol`` from keyword arguments."""
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def _transform(self, dataset):
        """Return ``dataset`` with the word-sentiment vector added under ``outputCol``."""

        def count_sentiments (words):
            positive = 0
            negative = 0
            neutral = 0

            for word in words:
                # Score each word once; the polarity lookup is the expensive part.
                score = TextBlob (word).sentiment.polarity

                if score > 0:
                    positive += 1
                elif score < 0:
                    negative += 1
                else:
                    neutral += 1

            return positive, negative, neutral

        def to_vector (words):
            # Pass nulls through instead of failing while iterating over None.
            if words is None:
                return None
            return Vectors.dense (count_sentiments (words))

        word_freq = udf (to_vector, VectorUDT ())

        out_col = self.getOutputCol ()
        in_col = dataset[self.getInputCol ()]
        return dataset.withColumn (out_col, word_freq (in_col))

#### RandomForest classifier

In [12]:
#   Polarity of the whole review, computed from the raw text
polarity = PolarityTransformer (inputCol = 'reviewText', outputCol = 'polarity')

#   Lower-case the review and split it on non-word characters
tokenizer = RegexTokenizer (
    inputCol = "reviewText",
    outputCol = "words",
    pattern = "\\W",
    toLowercase = True,
)

#   Drop stop words from the token list
remover = StopWordsRemover (
    inputCol = tokenizer.getOutputCol (),
    outputCol = "filtered",
)

#   Positive / negative / neutral word counts of the remaining tokens
wordsentiment = WordSentimentTransformer (
    inputCol = remover.getOutputCol (),
    outputCol = 'word_sentiment',
)

#   Combine both feature columns into a single vector column
assembler = VectorAssembler (
    inputCols = ['polarity', 'word_sentiment'],
    outputCol = 'features',
)

#   model
randforest = RandomForestClassifier (
    labelCol = 'overall',
    featuresCol = assembler.getOutputCol (),
    numTrees = 10,
    maxMemoryInMB = 1024,
)

In [13]:
pipeline = Pipeline (stages = [polarity, tokenizer, remover, wordsentiment, assembler, randforest])

In [14]:
#   Fit the random-forest pipeline on the training split
rf_model = pipeline.fit (training)

#   Overwrite a model left by an earlier run; a plain save () raises when the path already exists,
#   which would throw away the (long) fit above
rf_model.write ().overwrite ().save ('models/rf_equalsize_model_amazon')

In [17]:
from pyspark.ml.classification import RandomForestClassificationModel
from pyspark.ml import PipelineModel

In [21]:
rf_model2 = PipelineModel.load ('models/rf_equalsize_model_amazon')

In [None]:

evaluatorAcc = MulticlassClassificationEvaluator (predictionCol = 'prediction', labelCol = 'overall', metricName = "accuracy")
evaluatorf1 = MulticlassClassificationEvaluator (predictionCol = 'prediction', labelCol = 'overall', metricName = 'f1')

#   Run the pipeline over the test split once and keep only the two columns the evaluators read,
#   so the TextBlob UDFs are not re-executed for every metric
rf_predictions = rf_model2.transform (testing).select ('overall', 'prediction').cache ()

accuracy = evaluatorAcc.evaluate (rf_predictions)
print("Test Error of random forest classifier with balanced dataset: %g" % (1.0 - accuracy))

fscore = evaluatorf1.evaluate (rf_predictions)
print (f'F1-score of random forest classifier with balanced dataset: {fscore}.')

#   Release the cached predictions
rf_predictions.unpersist ();


#### Logistic Regression

In [16]:
logreg = LogisticRegression (labelCol = 'overall', featuresCol = assembler.getOutputCol (), maxIter = 10, family = 'multinomial')

In [17]:
pipeline2 = Pipeline (stages = [polarity, tokenizer, remover, wordsentiment, assembler, logreg])

In [18]:
#   Fit the logistic-regression pipeline (pipeline2); fitting `pipeline` here would train
#   a second random forest instead
lr_model = pipeline2.fit (training)

#   Overwrite a model left by an earlier run; a plain save () raises when the path already exists
lr_model.write ().overwrite ().save ('models/lr_equalsize_model')

In [None]:
#   Run the pipeline over the test split once and keep only the two columns the evaluators read,
#   so the TextBlob UDFs are not re-executed for every metric
lr_predictions = lr_model.transform (testing).select ('overall', 'prediction').cache ()

accuracy = evaluatorAcc.evaluate (lr_predictions)
print("Test Error of logistic regression with balanced dataset: %g" % (1.0 - accuracy))

fscore = evaluatorf1.evaluate (lr_predictions)
print (f'F1-score of logistic regression with balanced dataset: {fscore}.')

#   Release the cached predictions
lr_predictions.unpersist ();


#### This code was run on the cpu24 and cpu32-bigmem systems. Unfortunately, neither run could be completed. The run on cpu24 finished fitting the random forest classifier and evaluating its accuracy before the program was stopped. It appears the program was interrupted because there was not enough memory to run the rest of the program. 

#### The random forest classification model had a test error of about 65 % (i.e. an accuracy of about 35 %). The output is in the file 'amazon_model_fitting_spark_results2.txt'. The program was run with 20 cores on four nodes with 10240 MB of memory per core. The program ran for about seven hours before it was cut off.

In [3]:
#   Output from the run on cpu24

! cat amazon_model_fitting_spark_results2.txt

2020-04-12 01:32:51 WARN  NativeCodeLoader:62 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2020-04-12 01:34:57 WARN  WindowExec:66 - No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
There are 75257650 entries in the dataset.
After cleaning there are 62454271 entries remaining.
Test Error of random forest classifier with balanced dataset: 0.654913
Trapped Exit cleaning up Spark Context
Trapped Exit cleaning up Spark Job


## Slurm job

Script file to submit Slurm job

In [None]:
%%file amazon_model_fitting_spark.script
#!/bin/bash
# Slurm batch script: runs amazon_model_fitting_spark.py as a single task and
# redirects its standard output to amazon_model_fitting_spark_results.txt.
#SBATCH -J slurm-spark
#SBATCH -t 1440 # runtime to request !!! in minutes !!!
#SBATCH -o slurm-spark-%J.log # output extra o means overwrite
#SBATCH -n 1 # requesting n tasks

# Load the spark/jupyterhub module and source the conda shell setup script.
module load spark/jupyterhub
. /global/software/jupyterhub-spark/anaconda3/etc/profile.d/conda.sh

# Run the model-fitting script; its stdout goes to the results file.
python amazon_model_fitting_spark.py > amazon_model_fitting_spark_results.txt


#### .py file for fitting of balanced dataset

This code was run on the TALC cpu32-bigmem.

In [None]:
%%file amazon_model_fitting_spark.py

#   Spark 

import os, atexit, sys, findspark, sparkhpc, pyspark
from pyspark.context import SparkContext
from pyspark.sql import SQLContext

# Locate the Spark installation through findspark before Spark objects are created.
findspark.init()

# Slurm partition used for the Spark cluster job (omit to accept the default).
os.environ['SBATCH_PARTITION'] = 'cpu32-bigmem'

# Resources requested through sparkhpc.
job_resources = dict(
    ncores=24,                            # total number of cores
    cores_per_executor=24,                # all cores in a single executor
    memory_per_core=1100 * 1024 // 24,    # memory per core in MB
    walltime="21:0",                      # hh:mm format
)
sj = sparkhpc.sparkjob.sparkjob(**job_resources)

# Wait for the job to start, then open a Spark context on it and wrap it in a SQLContext.
sj.wait_to_start()
sc = sj.start_spark()
scq = SQLContext(sc)

def exitHandler(sj,sc):
    """Best-effort shutdown of the Spark context and the Spark job.

    Registered with ``atexit``. Stops ``sc`` first and then ``sj``. An
    ordinary failure in one step is reported on stderr and does not prevent
    the other step from running. ``sj.stop()`` is attempted even when
    stopping ``sc`` is interrupted (e.g. by ``KeyboardInterrupt``); the
    interruption then propagates instead of being silently swallowed.

    Parameters
    ----------
    sj : object exposing ``stop()`` (the sparkhpc job)
    sc : object exposing ``stop()`` (the Spark context)
    """
    try:
        try:
            print('Trapped Exit cleaning up Spark Context')
            sc.stop()
        except Exception as err:
            print(f'Could not stop Spark Context: {err!r}', file=sys.stderr)
    finally:
        # Always try to release the cluster job, whatever happened above.
        try:
            print('Trapped Exit cleaning up Spark Job')
            sj.stop()
        except Exception as err:
            print(f'Could not stop Spark Job: {err!r}', file=sys.stderr)
atexit.register(exitHandler,sj,sc);


###########################################################################################################################

from textblob import TextBlob

from pyspark.ml.feature import StopWordsRemover

from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf, concat
from pyspark.sql import functions as f

from pyspark.ml import Pipeline

from pyspark.sql.functions import input_file_name
from pyspark.sql.types import *

from pyspark.sql.functions import split, lower

from pyspark.ml.feature import NGram, FeatureHasher, CountVectorizer, RegexTokenizer

from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.feature import VectorAssembler

from itertools import chain
import numpy as np
import pandas as pd


from pyspark import keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param, Params, TypeConverters
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable  
from pyspark.sql.functions import udf, create_map, lit
from pyspark.sql.types import ArrayType, StringType
from pyspark.sql.window import Window

from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

import time

###########################################################################################################################

start = time.time ()


#   Load the Amazon reviews, keeping only the rating and the review text
df = scq.read.parquet ('Files/amazon_reviews.parquet').select ('overall', 'reviewText')

#   Size of the raw dataset
print (f'There are {df.count ()} entries in the dataset.')


#   Cleaning: keep only rows with overall > 0, drop duplicate rows, and drop rows whose 'reviewText' is null

df_cleaned = (
    df.filter (f.col ('overall') > 0)
      .dropDuplicates ()
      .dropna (subset = ['reviewText'])
)

#   Size of the cleaned dataset
print (f'After cleaning there are {df_cleaned.count ()} entries remaining.')


#   Determining the fractions to create classes of equal size
df_counts = (df_cleaned.groupBy ('overall')
 .count ()
 .withColumn ('percent', f.col ('count') / f.sum ('count').over (Window.partitionBy ()))
 .orderBy ('overall', ascending = False)
)


df_frac = df_counts.toPandas ()

#   Vectorised: smallest class count divided by each class count (the smallest class gets 1.0)
df_frac['fraction'] = df_frac['count'].min () / df_frac['count']

#   {rating: fraction} built directly with plain Python floats; avoids DataFrame.to_dict ('record'),
#   whose short orient alias is no longer accepted by recent pandas releases
fractions = {float (rating): float (frac) for rating, frac in zip (df_frac['overall'], df_frac['fraction'])}

#   Fixed seed so the balanced sample and the train/test split are reproducible between runs
SEED = 42

#   Create a dataset with equal sized classes
df_equal = df_cleaned.sampleBy ('overall', fractions = fractions, seed = SEED).cache ()


#   Getting training and testing splits (80 % / 20 %)
(training, testing) = df_equal.randomSplit ([0.8, 0.2], seed = SEED)



##################################################################################################################
#   Custom transformers

class PolarityTransformer (Transformer, HasInputCol, HasOutputCol, DefaultParamsReadable, DefaultParamsWritable):
    """Add the TextBlob polarity of a string column as a double column.

    The value written to ``outputCol`` is ``TextBlob (text).sentiment.polarity``
    of ``inputCol``; a null input gives a null output.
    """

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None):
        """Create the transformer; both column names are keyword-only."""
        super (PolarityTransformer, self).__init__()
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None):
        """Set ``inputCol`` and/or ``outputCol`` from keyword arguments."""
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def _transform(self, dataset):
        """Return ``dataset`` with the polarity column added under ``outputCol``."""
        # Local import so the class does not depend on a wildcard import elsewhere.
        from pyspark.sql.types import DoubleType

        def polarity_of (text):
            # TextBlob only accepts strings, so nulls are passed through as nulls.
            if text is None:
                return None
            return float (TextBlob (text).sentiment.polarity)

        # Typed as double directly, so the score is not round-tripped through a string.
        polarity_udf = udf (polarity_of, DoubleType ())

        out_col = self.getOutputCol ()
        in_col = dataset[self.getInputCol ()]
        return dataset.withColumn (out_col, polarity_udf (in_col))


class WordSentimentTransformer (Transformer, HasInputCol, HasOutputCol, DefaultParamsReadable, DefaultParamsWritable):
    """Count positive, negative and neutral words in an array-of-strings column.

    Each word of the input array is scored with
    ``TextBlob (word).sentiment.polarity``. The output column holds a dense
    vector ``[positive, negative, neutral]`` with the number of words whose
    polarity is > 0, < 0 and == 0 respectively. A null input array gives a
    null output.
    """

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None):
        """Create the transformer; both column names are keyword-only."""
        super (WordSentimentTransformer, self).__init__()
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None):
        """Set ``inputCol`` and/or ``outputCol`` from keyword arguments."""
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def _transform(self, dataset):
        """Return ``dataset`` with the word-sentiment vector added under ``outputCol``."""

        def count_sentiments (words):
            positive = 0
            negative = 0
            neutral = 0

            for word in words:
                # Score each word once; the polarity lookup is the expensive part.
                score = TextBlob (word).sentiment.polarity

                if score > 0:
                    positive += 1
                elif score < 0:
                    negative += 1
                else:
                    neutral += 1

            return positive, negative, neutral

        def to_vector (words):
            # Pass nulls through instead of failing while iterating over None.
            if words is None:
                return None
            return Vectors.dense (count_sentiments (words))

        word_freq = udf (to_vector, VectorUDT ())

        out_col = self.getOutputCol ()
        in_col = dataset[self.getInputCol ()]
        return dataset.withColumn (out_col, word_freq (in_col))

##################################################################################################################


#   Feature transformers

#   Polarity of the whole review, computed from the raw text
polarity = PolarityTransformer (inputCol = 'reviewText', outputCol = 'polarity')

#   Lower-case the review and split it on non-word characters
tokenizer = RegexTokenizer (
    inputCol = "reviewText",
    outputCol = "words",
    pattern = "\\W",
    toLowercase = True,
)

#   Drop stop words from the token list
remover = StopWordsRemover (
    inputCol = tokenizer.getOutputCol (),
    outputCol = "filtered",
)

#   Positive / negative / neutral word counts of the remaining tokens
wordsentiment = WordSentimentTransformer (
    inputCol = remover.getOutputCol (),
    outputCol = 'word_sentiment',
)

#   Combine both feature columns into a single vector column
assembler = VectorAssembler (
    inputCols = ['polarity', 'word_sentiment'],
    outputCol = 'features',
)


#   Set Random Forest model
randforest = RandomForestClassifier (
    labelCol = 'overall',
    featuresCol = assembler.getOutputCol (),
    numTrees = 10,
    maxMemoryInMB = 1024,
)

####################################################################################################################

#   Random Forest model

pipeline = Pipeline (stages = [polarity, tokenizer, remover, wordsentiment, assembler, randforest])

#   Fitting model
rf_model = pipeline.fit (training)

#   Saving model; overwrite a model left by an earlier run, because a plain save () raises
#   when the path already exists and would abort the job after the fit
rf_model.write ().overwrite ().save ('models/rf_equalsize_model_amazon')

#########

#   Evaluating model

evaluatorAcc = MulticlassClassificationEvaluator (predictionCol = 'prediction', labelCol = 'overall', metricName = "accuracy")
evaluatorf1 = MulticlassClassificationEvaluator (predictionCol = 'prediction', labelCol = 'overall', metricName = 'f1')

#   Run the pipeline over the test split once and keep only the two columns the evaluators read,
#   so the TextBlob UDFs are not re-executed for every metric
rf_predictions = rf_model.transform (testing).select ('overall', 'prediction').cache ()

accuracy = evaluatorAcc.evaluate (rf_predictions)
print("Test Error of random forest classifier with balanced dataset: %g" % (1.0 - accuracy))

fscore = evaluatorf1.evaluate (rf_predictions)
print (f'F1-score of random forest classifier with balanced dataset: {fscore}.')

#   Release the cached predictions before the next model is fitted
rf_predictions.unpersist ()

####################################################################################################################

#   Logistic Regression model

logreg = LogisticRegression (labelCol = 'overall', featuresCol = assembler.getOutputCol (), maxIter = 10, family = 'multinomial')

pipeline2 = Pipeline (stages = [polarity, tokenizer, remover, wordsentiment, assembler, logreg])

#   Fitting model; must be pipeline2, since fitting `pipeline` would train a second random forest
lr_model = pipeline2.fit (training)

#   Saving model; overwrite a model left by an earlier run, because a plain save () raises
#   when the path already exists
lr_model.write ().overwrite ().save ('models/lr_equalsize_model_amazon')

##########

#   Evaluating model

#   Run the pipeline over the test split once and keep only the two columns the evaluators read,
#   so the TextBlob UDFs are not re-executed for every metric
lr_predictions = lr_model.transform (testing).select ('overall', 'prediction').cache ()

accuracy = evaluatorAcc.evaluate (lr_predictions)
print("Test Error of logistic regression with balanced dataset: %g" % (1.0 - accuracy))

fscore = evaluatorf1.evaluate (lr_predictions)
print (f'F1-score of logistic regression with balanced dataset: {fscore}.')

#   Release the cached predictions
lr_predictions.unpersist ()

print (f'Program took {time.time () - start} seconds.')


