# NLTK for performing NLP Sentiment Analysis

In [37]:
pip install nltk



# Prerequisties and Libraries Required 

In [0]:
from pyspark.sql.functions import col, lit
from functools import reduce
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
from wordcloud import WordCloud 
import pandas as pd
import re
import string

In [39]:
data_tokens.withColumn("doc_id", monotonically_increasing_id())

DataFrame[words_clean: array<string>, doc_id: bigint]

In [40]:
data_tokens.show(5)

+--------------------+
|         words_clean|
+--------------------+
|[ive, read, book,...|
|[nicely, written,...|
|              [love]|
|[good, additional...|
|[gazillion, patte...|
+--------------------+
only showing top 5 rows



# Preparing for Machine learning 

# TF - IDF Matrix

In [0]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline


In [51]:
#Data with overall scores
data_clean.show(5)

+-------+--------------------+
|overall|          reviewText|
+-------+--------------------+
|    5.0|I've read this bo...|
|    5.0|Nicely written di...|
|    5.0|             love it|
|    5.0|Good additional k...|
|    5.0|A gazillion patte...|
+-------+--------------------+
only showing top 5 rows



In [0]:
data_clean1 = data_clean 

   # SENTIMENT ANALYSIS 
   
###   Sentiment analysis or sentiment classification fall into the broad category of text classification tasks where you are supplied with a phrase, or a list of phrases and your classifier is supposed to tell if the sentiment behind that is positive, negative or neutral. Sometimes, the third attribute is not taken to keep it a binary classification problem. In recent tasks, sentiments like "somewhat positive" and "somewhat negative" are also being considered.

# Understading the reviewText based on positive, negative or neutral emotion of the sentence. Here we bifurcate the rating score based on 
# Score > 3 stars as positive, 
# Score < 3 star as negative and 
# Score = 3 star as neutral.

In [58]:
df1 = data_clean1.withColumn("sentiment",when(col("overall") > '3.0' , 'Positive Review')
    .otherwise(when(col("overall") == '3.0', 'Neutral Review')
        .otherwise('Negative Review')))

+-------+--------------------+---------------+
|overall|          reviewText|      sentiment|
+-------+--------------------+---------------+
|    5.0|I've read this bo...|Positive Review|
|    5.0|Nicely written di...|Positive Review|
|    5.0|             love it|Positive Review|
|    5.0|Good additional k...|Positive Review|
|    5.0|A gazillion patte...|Positive Review|
|    2.0|Just ok. Read bet...|Negative Review|
|    5.0|The best knitting...|Positive Review|
|    5.0|This book is a mo...|Positive Review|
|    5.0|excellent variety...|Positive Review|
|    5.0|Another winner M....|Positive Review|
|    5.0|Love all the patt...|Positive Review|
|    4.0|Good selection of...|Positive Review|
|    4.0|Contains some int...|Positive Review|
|    5.0|Super useful for ...|Positive Review|
|    5.0|Loved it. Differe...|Positive Review|
|    5.0|I love the book b...|Positive Review|
|    5.0|Awesome book!!! A...|Positive Review|
|    5.0|   Lots of stitches.|Positive Review|
|    5.0|Easy

In [0]:
dt2 = data_clean1.withColumn("sentiment",when(col("overall") > '3.0' , 'Positive Review')
    .otherwise(when(col("overall") == '3.0', 'Neutral Review')
        .otherwise('Negative Review')))

## DIVIDING THE DATA INTO TRAIN, TEST AND VALIDATION SET TO TRAIN OUR MODEL.

In [0]:
(train_set, val_set, test_set) = dt2.randomSplit([0.98, 0.01, 0.01], seed = 2000)

# TF - IDF MODEL

### Tf-idf stands for term frequency-inverse document frequency, and the tf-idf weight is a weight often used in information retrieval and text mining. This weight is a statistical measure used to evaluate how important a word is to a document in a collection or corpus. Tokenizer will tokenize the sentenses into tokens and then the model performs the TD and IDF methods storing them in a pipeline.

In [67]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

tokenizer = Tokenizer(inputCol="reviewText", outputCol="words")
hashtf = HashingTF(numFeatures=2**16, inputCol="words", outputCol='tf')
idf = IDF(inputCol='tf', outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
label_stringIdx = StringIndexer(inputCol = "overall", outputCol = "label")
pipeline = Pipeline(stages=[tokenizer, hashtf, idf, label_stringIdx])

pipelineFit = pipeline.fit(train_set)
train_df = pipelineFit.transform(train_set)
val_df = pipelineFit.transform(val_set)
train_df.show(5)

+-------+--------------------+---------------+--------------------+--------------------+--------------------+-----+
|overall|          reviewText|      sentiment|               words|                  tf|            features|label|
+-------+--------------------+---------------+--------------------+--------------------+--------------------+-----+
|    1.0|

I'm like the ot...|Negative Review|[, , i'm, like, t...|(65536,[4492,5887...|(65536,[4492,5887...|  2.0|
|    1.0|
Very disappointe...|Negative Review|[, very, disappoi...|(65536,[4917,8026...|(65536,[4917,8026...|  2.0|
|    1.0|       .
 ..
, .....|Negative Review|[, , , , , , , .,...|(65536,[1536,1438...|(65536,[1536,1438...|  2.0|
|    1.0|  from Dorian on ...|Negative Review|[, , from, dorian...|(65536,[14,158,18...|(65536,[14,158,18...|  2.0|
|    1.0| Flimsy, metal is...|Negative Review|[, flimsy,, metal...|(65536,[1053,2712...|(65536,[1053,2712...|  2.0|
+-------+--------------------+---------------+--------------------+-----

## Model Training and Evaluation
### Logistic Regression using Count Vector Features
### Our model will make predictions and score on the test set; we then look at the top 10 predictions from the highest probability.

In [0]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(maxIter=100)
lrModel = lr.fit(train_df)
predictions = lrModel.transform(val_df)
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(predictions)

### Prediction | Model Accuracy

In [75]:
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(val_set.count())
accuracy

92.87384758923

In [78]:
%%time
from pyspark.ml.feature import CountVectorizer

tokenizer = Tokenizer(inputCol="reviewText", outputCol="words")
cv = CountVectorizer(vocabSize=2**16, inputCol="words", outputCol='cv')
idf = IDF(inputCol='cv', outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
label_stringIdx = StringIndexer(inputCol = "overall", outputCol = "label")
lr = LogisticRegression(maxIter=100)
pipeline = Pipeline(stages=[tokenizer, cv, idf, label_stringIdx, lr])

pipelineFit = pipeline.fit(train_set)
predictions = pipelineFit.transform(val_set)
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(val_set.count())
roc_auc = evaluator.evaluate(predictions)

print("Accuracy Score: {0:.4f}".format(accuracy))
print("ROC-AUC: {0:.4f}".format(roc_auc))


Accuracy Score: 92.8738
ROC-AUC: 0.9678




### Let’s now try cross-validation to tune our hyper parameters, and we will only tune the count vectors Logistic Regression.


In [83]:


pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])
pipelineFit = pipeline.fit(dt2)
dataset = pipelineFit.transform(dt2)
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.1, 0.3, 0.5]) # regularization parameter
             .addGrid(lr.elasticNetParam, [0.0, 0.1, 0.2]) # Elastic Net Parameter (Ridge = 0)
#            .addGrid(model.maxIter, [10, 20, 50]) #Number of iterations
#            .addGrid(idf.numFeatures, [10, 100, 1000]) # Number of features
             .build())
# Create 5-fold CrossValidator
cv = CrossValidator(estimator=lr, \
                    estimatorParamMaps=paramGrid, \
                    evaluator=evaluator, \
                    numFolds=5)
cvModel = cv.fit(train_set)

predictions = cvModel.transform(test_set)
# Evaluate best model
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)
print(evaluator)

0.95836473837


## Accuracy Improved