**Reddit** sentiment analysis and prediction using pyspark **REGRESSION MODEL**









In [4]:
from IPython import display
import math
import pandas as pd
import numpy as np
import pyspark

from pyspark.sql import SQLContext
from pyspark import SparkContext

from pyspark.sql.types import *

In [5]:
sc =SparkContext()
sqlContext = SQLContext(sc)



In [6]:
customSchema = StructType([
    StructField("clean_text", StringType()), 
    StructField("category", StringType())])

In [11]:
%cd /content/drive/MyDrive/BDMA/
!ls

/content/drive/MyDrive/BDMA
Reddit_Uncleaned.csv  redt_dataset.csv	Tweets_Uncleaned.csv


In [16]:

!pwd
filename = '/content/drive/MyDrive/BDMA//redt_dataset.csv'
df = sqlContext.read.format("csv").option("header", "true").schema(customSchema).load(filename)
df.show()

/content/drive/My Drive/BDMA
+--------------------+--------+
|          clean_text|category|
+--------------------+--------+
| understand that ...|       1|
|welcome depressio...|       1|
| don’ want kill m...|       1|
|’ been very depre...|       1|
|everything seemed...|       1|
| just really need...|      -1|
|but instead devel...|       1|
|you just laughed ...|       1|
| really hope heav...|      -1|
| lost grandfather...|      -1|
| have zero motiva...|      -1|
| mental health ha...|      -1|
|that’ that’ the p...|       0|
| want die not rea...|      -1|
| feel like depres...|      -1|
| can afford shit ...|      -1|
| let people know ...|       1|
| fucking sick and...|      -1|
| cant cant out fo...|       1|
|nowadays feel lik...|      -1|
+--------------------+--------+
only showing top 20 rows



In [18]:
data = df.na.drop(how='any')
data.show(5)

+--------------------+--------+
|          clean_text|category|
+--------------------+--------+
| understand that ...|       1|
|welcome depressio...|       1|
| don’ want kill m...|       1|
|’ been very depre...|       1|
|everything seemed...|       1|
+--------------------+--------+
only showing top 5 rows



In [19]:
data.printSchema()


root
 |-- clean_text: string (nullable = true)
 |-- category: string (nullable = true)



In [21]:
from pyspark.sql.functions import col

data.groupBy("category").count().orderBy(col("count").desc()).show()

+--------+-----+
|category|count|
+--------+-----+
|      -1|  470|
|       1|  459|
|       0|   43|
+--------+-----+



*Model Pipeline*










In [22]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression

# regular expression tokenizer
regexTokenizer = RegexTokenizer(inputCol="clean_text", outputCol="words", pattern="\\W")

# stop words
add_stopwords = ["http","https","amp","rt","t","c","the"] 

stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(add_stopwords)

# bag of words count
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=15000, minDF=5)

In [23]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
label_stringIdx = StringIndexer(inputCol = "category", outputCol = "label")

pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])

# Fit the pipeline to training documents.
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
dataset.show(5)

+--------------------+--------+--------------------+--------------------+--------------------+-----+
|          clean_text|category|               words|            filtered|            features|label|
+--------------------+--------+--------------------+--------------------+--------------------+-----+
| understand that ...|       1|[understand, that...|[understand, that...|(1890,[0,1,2,3,4,...|  1.0|
|welcome depressio...|       1|[welcome, depress...|[welcome, depress...|(1890,[0,1,2,3,4,...|  1.0|
| don’ want kill m...|       1|[don, want, kill,...|[don, want, kill,...|(1890,[0,3,4,5,9,...|  1.0|
|’ been very depre...|       1|[been, very, depr...|[been, very, depr...|(1890,[0,1,2,3,5,...|  1.0|
|everything seemed...|       1|[everything, seem...|[everything, seem...|(1890,[0,1,2,3,4,...|  1.0|
+--------------------+--------+--------------------+--------------------+--------------------+-----+
only showing top 5 rows



*Partition Training & Test sets / Model Training and Evaluation*

In [24]:
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 684
Test Dataset Count: 288


In [25]:
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)

predictions = lrModel.transform(testData)

predictions.filter(predictions['prediction'] == 0).select("clean_text","category","probability","label","prediction")\
.orderBy("probability", ascending=False).show(n = 10, truncate = 30)

+------------------------------+--------+------------------------------+-----+----------+
|                    clean_text|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
| can live with the guilt an...|      -1|[0.9762245509655013,0.02121...|  0.0|       0.0|
|first sorry for bad english...|      -1|[0.9521293975302003,0.04348...|  0.0|       0.0|
| finally starting recover f...|      -1|[0.9472628630783014,0.04669...|  0.0|       0.0|
|writing this after another ...|       1|[0.9454040410045139,0.05121...|  1.0|       0.0|
| been poor whole life finis...|      -1|[0.9303841948252422,0.04527...|  0.0|       0.0|
|this reddit post asking for...|      -1|[0.9193142814358951,0.07917...|  0.0|       0.0|
|’ tired don’ complain becau...|       1|[0.9031143454381103,0.08881...|  1.0|       0.0|
| feel held hostage unfortun...|      -1|[0.8894822482777025,0.10991...|  0.0|       0.0|
| all long

In [26]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.6338999788762146

*Logistic Regression using TF-IDF Features*

In [27]:
from pyspark.ml.feature import HashingTF, IDF

hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=10000)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, hashingTF, idf, label_stringIdx])

pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)

(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)

predictions = lrModel.transform(testData)

predictions.filter(predictions['prediction'] == 0) \
    .select("clean_text","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+--------+------------------------------+-----+----------+
|                    clean_text|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
| can live with the guilt an...|      -1|[0.9883120311720182,0.00988...|  0.0|       0.0|
|first sorry for bad english...|      -1|[0.9522443627753939,0.04354...|  0.0|       0.0|
| all long time lurker first...|      -1|[0.9462319038715808,0.04594...|  0.0|       0.0|
| been poor whole life finis...|      -1|[0.9365868334925681,0.04368...|  0.0|       0.0|
| been stuck tearing for hou...|      -1|[0.9365348578680442,0.04847...|  0.0|       0.0|
|writing this after another ...|       1|[0.9358110650866001,0.05933...|  1.0|       0.0|
| finally starting recover f...|      -1|[0.9353617910829934,0.05799...|  0.0|       0.0|
| have been unemployed for q...|       1|[0.9092383972026635,0.08695...|  1.0|       0.0|
| swear th

In [28]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.613804959930691

*Cross-Validation*

In [29]:
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])

pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)

lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)

In [None]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.1, 0.3, 0.5]) # regularization parameter
             .addGrid(lr.elasticNetParam, [0.0, 0.1, 0.2]) # Elastic Net Parameter (Ridge = 0)
#            .addGrid(model.maxIter, [10, 20, 50]) #Number of iterations
#            .addGrid(idf.numFeatures, [10, 100, 1000]) # Number of features
             .build())

# Create 5-fold CrossValidator
cv = CrossValidator(estimator=lr, \
                    estimatorParamMaps=paramGrid, \
                    evaluator=evaluator, \
                    numFolds=5)

cvModel = cv.fit(trainingData)

predictions = cvModel.transform(testData)
# Evaluate best model
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)