# DATA PROCESSING

In [2]:
# Initialise findspark first so that the `pyspark` import in the following cell
# can be resolved (that cell must only run after findspark.init()).
import findspark
findspark.init()

# Leftover, non-executed JVM-style note about pointing hadoop.home.dir at a local
# winutils.exe on Windows. NOTE(review): dead code — presumably safe to delete; confirm.
#System.setProperty("hadoop.home.dir", "E:\software\spark-3.0.0-preview2-bin-hadoop2.7\spark-3.0.0-preview2-bin-hadoop2.7\winutils.exe");


In [3]:
import pyspark # only run after findspark.init()
from pyspark.sql import SparkSession

# Memory budgets handed to Spark for the executors and for the driver.
EXE_MEMORY = "2g"
DRIVER_MEMORY = "8g"

# Every Spark setting for this notebook, listed in the order it is applied.
SPARK_SETTINGS = {
    "spark.executor.memory": EXE_MEMORY,
    "spark.executor.cores": "2",
    "spark.driver.memory": DRIVER_MEMORY,
    "spark.cores.max": "10",
}

# Build (or reuse) the session for the "AWSNLP" application, applying the
# settings one by one instead of through a single long method chain.
session_builder = SparkSession.builder.appName("AWSNLP")
for setting_name, setting_value in SPARK_SETTINGS.items():
    session_builder = session_builder.config(setting_name, setting_value)
spark = session_builder.getOrCreate()

In [4]:
# Machine-specific absolute location of the raw review dump.
REVIEWS_JSON_PATH = 'C:\\Users\\salon\\Downloads\\Magazine_Subscriptions.json'

# Load the JSON records into a Spark DataFrame.
df = spark.read.json(REVIEWS_JSON_PATH)

In [5]:
# Keep only the review text and its `overall` score; every other field is dropped.
df=df.select('reviewText', "overall")

In [6]:
from pyspark.sql.window import Window as W
from pyspark.sql import functions as F
# Tag every row with an id from monotonically_increasing_id(); this is the
# value that stays on `df` for the rest of the notebook.
df = df.withColumn("idx", F.monotonically_increasing_id())

# Preview only: number the rows 1, 2, ... in idx order and display two of them.
# The renumbered frame is not assigned, so `df` itself is left unchanged.
windowSpec = W.orderBy("idx")
row_number_by_idx = F.row_number().over(windowSpec)
renumbered_preview = df.withColumn("idx", row_number_by_idx)
renumbered_preview.show(2)

+--------------------+-------+---+
|          reviewText|overall|idx|
+--------------------+-------+---+
|for computer enth...|    5.0|  1|
|Thank god this is...|    5.0|  2|
+--------------------+-------+---+
only showing top 2 rows



## PUNCTUATION REMOVAL

In [7]:
from pyspark.sql.functions import regexp_replace, trim, col, lower
def removePunctuation(column):
    """Remove punctuation, lower-case the text, and strip surrounding spaces.

    Note:
        Only whitespace, ASCII letters, and digits are retained. Every other
        character is deleted rather than replaced by a space (e.g. "it's"
        becomes "its"). Leading and trailing spaces are trimmed after the
        punctuation has been removed.

    Args:
        column (Column): A Column containing a sentence.

    Returns:
        Column: A Column named 'reviewText' with the clean-up operations applied.
    """
    # Raw string literal: '\s' inside a plain string is an invalid Python escape
    # sequence (it triggers a warning on modern interpreters). The regular
    # expression handed to Spark is exactly the same as before.
    without_punctuation = regexp_replace(column, r'[^\sa-zA-Z0-9]', '')
    return trim(lower(without_punctuation)).alias('reviewText')

# Keep the row id and the score, and replace reviewText with its cleaned version.
# NOTE(review): the id column was created as "idx"; selecting it as "IDX" presumably
# relies on Spark's default case-insensitive column resolution, and the column is
# displayed as "IDX" from here on — confirm this is intended.
df=df.select("IDX", "overall", (removePunctuation(col('reviewText'))))
         


## REMOVING NULL VALUES

Rows whose reviewText is null (i.e. reviews that contain no text) are removed from the dataset.

In [8]:
# Drop rows whose reviewText is null. A review whose text is merely an empty
# string is not null, so this filter keeps it.
df=df.where(df.reviewText.isNotNull())

## DATA PROCESSING

The dataset is processed here by tokenizing, removing stop words (the list of stop words removed is given at the end), lemmatizing, stemming, and removing words of three or fewer characters.

In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, lower, regexp_replace
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from nltk.stem.snowball import SnowballStemmer
from pyspark.sql.types import *
from nltk import WordNetLemmatizer


# Return type shared by every token-list UDF below.
TOKEN_LIST_TYPE = ArrayType(StringType())

# --- Step 1: tokenize ---------------------------------------------------
# Turn each cleaned review into a list of tokens.
tokenizer = Tokenizer(inputCol='reviewText', outputCol='words_token')
tokenized = tokenizer.transform(df)
df_words_token = tokenized.select('IDX', "overall", 'words_token')

# --- Step 2: remove stop words -------------------------------------------
# Uses the remover's default stop-word list.
remover = StopWordsRemover(inputCol='words_token', outputCol='words_clean')
without_stopwords = remover.transform(df_words_token)
df_words_no_stopw = without_stopwords.select('IDX', "overall", 'words_clean')

# --- Step 3: lemmatize ---------------------------------------------------
lemm = WordNetLemmatizer()


def _lemmatize_tokens(tokens):
    """Return the tokens after passing each one through the WordNet lemmatizer."""
    return [lemm.lemmatize(token) for token in tokens]


lemm_udf = udf(_lemmatize_tokens, TOKEN_LIST_TYPE)
lemmatized = df_words_no_stopw.withColumn("lemmi", lemm_udf("words_clean"))
df_lemm = lemmatized.select('IDX', "overall", 'lemmi')

# --- Step 4: stem --------------------------------------------------------
stemmer = SnowballStemmer(language='english')


def _stem_tokens(tokens):
    """Return the tokens after passing each one through the Snowball stemmer."""
    return [stemmer.stem(token) for token in tokens]


stemmer_udf = udf(_stem_tokens, TOKEN_LIST_TYPE)
stemmed = df_lemm.withColumn("words_stemmed", stemmer_udf("lemmi"))
df_stemmed = stemmed.select('IDX', "overall", 'words_stemmed')

# --- Step 5: drop short words --------------------------------------------
def _keep_long_words(row):
    """Keep only the words that are longer than 3 characters."""
    return [word for word in row if len(word) > 3]


filter_length_udf = udf(_keep_long_words, TOKEN_LIST_TYPE)
length_filtered = df_stemmed.withColumn('words', filter_length_udf(col('words_stemmed')))
df_final_words = length_filtered.select('IDX', "overall", 'words')

## The DataFrames after each successive processing step

Only the first 5 rows of each DataFrame are shown here.

In [10]:
# Preview the first 5 rows produced by every stage, in pipeline order:
# cleaned text, tokens, stop words removed, lemmatized, stemmed, final words.
stage_frames = (
    df,
    df_words_token,
    df_words_no_stopw,
    df_lemm,
    df_stemmed,
    df_final_words,
)
for stage_frame in stage_frames:
    stage_frame.show(5)

+---+-------+--------------------+
|IDX|overall|          reviewText|
+---+-------+--------------------+
|  0|    5.0|for computer enth...|
|  1|    5.0|thank god this is...|
|  2|    3.0|antiques magazine...|
|  3|    5.0|this beautiful ma...|
|  4|    5.0|a great read ever...|
+---+-------+--------------------+
only showing top 5 rows

+---+-------+--------------------+
|IDX|overall|         words_token|
+---+-------+--------------------+
|  0|    5.0|[for, computer, e...|
|  1|    5.0|[thank, god, this...|
|  2|    3.0|[antiques, magazi...|
|  3|    5.0|[this, beautiful,...|
|  4|    5.0|[a, great, read, ...|
+---+-------+--------------------+
only showing top 5 rows

+---+-------+--------------------+
|IDX|overall|         words_clean|
+---+-------+--------------------+
|  0|    5.0|[computer, enthus...|
|  1|    5.0|[thank, god, ziff...|
|  2|    3.0|[antiques, magazi...|
|  3|    5.0|[beautiful, magaz...|
|  4|    5.0|[great, read, eve...|
+---+-------+--------------------+
only 

In [7]:
from pyspark.ml.feature import StopWordsRemover
# Build a StopWordsRemover with default settings in order to inspect its stop-word list.
# NOTE(review): this rebinds `remover`, replacing the instance that the processing
# cell configured with inputCol/outputCol — re-run that cell before reusing `remover`.
remover = StopWordsRemover()
stopwords = remover.getStopWords() # stop-word list of the freshly built remover
print(stopwords[:200])

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'no