In [1]:
import pandas as pd 
import re
import string
from datetime import datetime
import findspark 
findspark.init()
import pyspark as ps
from pyspark.sql.types import *
from pyspark.sql.functions import array_contains, col, udf, when, coalesce, array
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, HashingTF, IDF, CountVectorizer, IDFModel, StopWordsRemover
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, ParamGridBuilder, TrainValidationSplit

In [2]:
# Create (or reuse) a Spark session on the local master with 4 threads.
spark = (
    ps.sql.SparkSession.builder
    .master("local[4]")
    .appName('Model Training')
    .getOrCreate()
)

In [3]:
# Explicit schema for the sentiment CSV: four integer columns followed by two
# string columns, every field declared with nullable=False.
schema = StructType(
    [
        StructField(name, IntegerType(), False)
        for name in ('NewIndex', 'Index', 'ItemID', 'Sentiment')
    ]
    + [
        StructField(name, StringType(), False)
        for name in ('SentimentSource', 'SentimentText')
    ]
)

In [4]:
# Load the labelled tweets using the explicit schema (the first CSV row is a
# header), then print the schema and preview the first five rows.
filename = "../../data/nokaggle-Sentiment-Analysis-Dataset.csv"
users = spark.read.csv(
    path=filename,
    schema=schema,
    header=True,
)
users.printSchema()
users.show(n=5)

AnalysisException: 'Path does not exist: file:/home/ubuntu/capstone/data/nokaggle-Sentiment-Analysis-Dataset.csv;'

In [7]:
def preprocess(text):
    """Split raw text into lowercase, letters-only tokens.

    Every character outside ``a-z`` / ``A-Z`` is replaced by a space, the
    result is lowercased and then split on whitespace.

    Args:
        text: The raw string to tokenize, or ``None``.

    Returns:
        A list of lowercase word strings. The list is empty when ``text`` is
        ``None``, empty, or contains no ASCII letters.
    """
    if text is None:
        # re.sub raises TypeError on None; return an empty token list so a
        # null input does not abort the caller.
        return []
    words = re.sub("[^a-zA-Z]", " ", text).lower().split()
    return words

In [8]:
# Expose preprocess() as a Spark UDF that returns array<string>, and store its
# result for each SentimentText value in a new 'Words' column.
pp_udf = udf(preprocess, ArrayType(StringType()))
words = users.withColumn('Words', pp_udf(users['SentimentText']))
words.printSchema()

root
 |-- NewIndex: integer (nullable = true)
 |-- Index: integer (nullable = true)
 |-- ItemID: integer (nullable = true)
 |-- Sentiment: integer (nullable = true)
 |-- SentimentSource: string (nullable = true)
 |-- SentimentText: string (nullable = true)
 |-- Words: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [9]:
# Strip stop words from the 'Words' token lists; the surviving tokens are
# written to a new 'filtered' column.
remover = StopWordsRemover(
    inputCol="Words",
    outputCol="filtered",
)
removed = remover.transform(words)
removed.show()

+--------+-----+------+---------+---------------+--------------------+--------------------+--------------------+
|NewIndex|Index|ItemID|Sentiment|SentimentSource|       SentimentText|               Words|            filtered|
+--------+-----+------+---------+---------------+--------------------+--------------------+--------------------+
|       0|    0|     1|        0|   Sentiment140|                 ...|[is, so, sad, for...|  [sad, apl, friend]|
|       1|    1|     2|        0|   Sentiment140|                 ...|[i, missed, the, ...|[missed, new, moo...|
|       2|    2|     3|        1|   Sentiment140|              omg...|[omg, its, alread...|      [omg, already]|
|       3|    3|     4|        0|   Sentiment140|          .. Omga...|[omgaga, im, sooo...|[omgaga, im, sooo...|
|       4|    4|     5|        0|   Sentiment140|         i think ...|[i, think, mi, bf...|[think, mi, bf, c...|
|       5|    5|     6|        0|   Sentiment140|         or i jus...|[or, i, just, wor...|     

In [10]:
# Keep only the columns needed for modelling and expose 'Sentiment' under the
# name 'label'.
filtered = removed.select(
    'ItemID',
    removed['Sentiment'].alias('label'),
    'SentimentText',
    'Words',
    'filtered',
)
filtered.show()

+------+-----+--------------------+--------------------+--------------------+
|ItemID|label|       SentimentText|               Words|            filtered|
+------+-----+--------------------+--------------------+--------------------+
|     1|    0|                 ...|[is, so, sad, for...|  [sad, apl, friend]|
|     2|    0|                 ...|[i, missed, the, ...|[missed, new, moo...|
|     3|    1|              omg...|[omg, its, alread...|      [omg, already]|
|     4|    0|          .. Omga...|[omgaga, im, sooo...|[omgaga, im, sooo...|
|     5|    0|         i think ...|[i, think, mi, bf...|[think, mi, bf, c...|
|     6|    0|         or i jus...|[or, i, just, wor...|       [worry, much]|
|     7|    1|       Juuuuuuuuu...|[juuuuuuuuuuuuuuu...|[juuuuuuuuuuuuuuu...|
|     8|    0|       Sunny Agai...|[sunny, again, wo...|[sunny, work, tom...|
|     9|    1|      handed in m...|[handed, in, my, ...|[handed, uniform,...|
|    10|    1|      hmmmm.... i...|[hmmmm, i, wonder...|[hmmmm, 

In [11]:
# Replace any null 'filtered' value with an empty array<string>: coalesce()
# picks the first non-null of the existing column and the typed empty array.
fill = array().cast(ArrayType(StringType()))
filtered_filled = coalesce(col("filtered"), fill)
filled_featurized = filtered.withColumn("filtered", filtered_filled)
filled_featurized.printSchema()

root
 |-- ItemID: integer (nullable = true)
 |-- label: integer (nullable = true)
 |-- SentimentText: string (nullable = true)
 |-- Words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- filtered: array (nullable = false)
 |    |-- element: string (containsNull = true)



In [12]:
# Hash each filtered token list into a 200-dimensional term-frequency vector
# stored in 'rawFeatures', then display those vectors untruncated.
hashingTF = HashingTF(
    inputCol="filtered",
    outputCol="rawFeatures",
    numFeatures=200,
)
featurized = hashingTF.transform(filled_featurized)
featurized.select('rawFeatures').show(truncate=False)

+-----------------------------------------------------------------------------------------------------+
|rawFeatures                                                                                          |
+-----------------------------------------------------------------------------------------------------+
|(200,[38,60,193],[1.0,1.0,1.0])                                                                      |
|(200,[15,25,81,196],[1.0,1.0,1.0,1.0])                                                               |
|(200,[57,184],[1.0,1.0])                                                                             |
|(200,[18,56,73,82,102,132,150,155,159,166,169,185],[1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|(200,[72,164,165,169],[1.0,1.0,1.0,1.0])                                                             |
|(200,[124,162],[1.0,1.0])                                                                            |
|(200,[9,190],[1.0,1.0])                                        

In [14]:
# Cache the term-frequency frame: it is used twice below (IDF fit + transform).
featurized.cache()

# Fit IDF on 'rawFeatures' and write the rescaled vectors to 'features'.
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)
model = idf.fit(featurized)
result = model.transform(featurized)

# Show the rescaled vectors for ten rows, untruncated.
result.limit(10).select('features').show(truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|features                                                                                                                                                                                                                                                                          |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|(200,[38,60,193],[3.351292892234358,2.7425253655359136,3.2069298628197354])                                                                                             

In [15]:
# Persist both the unfitted IDF estimator and the fitted IDF model.
# write().overwrite() is used instead of save(path) so that re-running this
# cell replaces earlier output rather than failing because the target
# directory already exists.
idf_path = '../tmp/idf'
idf.write().overwrite().save(idf_path)
idfmodel_path = '../tmp/idfmodel'
model.write().overwrite().save(idfmodel_path)
# Reload the fitted model later with:
#loadedModel = IDFModel.load(idfmodel_path)

In [39]:
%%time
rf = RandomForestClassifier(numTrees=100, labelCol="label", seed=42)
rf_model = rf.fit(result)

CPU times: user 32 ms, sys: 0 ns, total: 32 ms
Wall time: 1min 49s


In [45]:
%%time
#Prepare Train Test Split
train, test = result.randomSplit([0.8, 0.2], seed=42)

# Configure an ML pipeline, which consists of tree stages: hashingTF, idf and RandomForestClassifier.
#remover = StopWordsRemover(inputCol="Words", outputCol="filtered")
#hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
rf = RandomForestClassifier(labelCol="label", seed=42)
pipeline = Pipeline(stages=[rf])

#grid search
paramGrid = ParamGridBuilder().addGrid(rf.numTrees, [100]).addGrid(rf.maxDepth, [5]).build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=3)  # use 3+ folds in practice

# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(train)

# Make predictions on test documents. cvModel uses the best model found (lrModel).
prediction = cvModel.transform(test)
selected = prediction.select("SentimentText", "probability", "prediction")
selected.printSchema()
selected.show(5)

root
 |-- SentimentText: string (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = true)

+--------------------+--------------------+----------+
|       SentimentText|         probability|prediction|
+--------------------+--------------------+----------+
|                 ...|[0.50464022409798...|       0.0|
|              omg...|[0.49997696970661...|       1.0|
|          .. Omga...|[0.51373803403465...|       0.0|
|     jb isnt show...|[0.49997696970661...|       1.0|
|    awhhe man.......|[0.48503937730438...|       1.0|
+--------------------+--------------------+----------+
only showing top 5 rows

CPU times: user 220 ms, sys: 48 ms, total: 268 ms
Wall time: 7min 18s


In [50]:
# Cross-validation metric averaged over the folds, one value per grid point.
print(cvModel.avgMetrics)

[0.645381227448802]
