# Assignment 3 - Steps 2 and 3: Text preprocessing and construction of predictive model


Group 11:
- Lisa Driessen - r0675727
- Laura Fernández López - r0877908
- Silvia María Goñi Mendia - r0877434
- Peter Day - r0866276

In [1]:
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from pyspark.ml.feature import Tokenizer, StopWordsRemover, StringIndexer, HashingTF, IDF, Word2Vec
from pyspark.ml.classification import LogisticRegression, NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
spark = SparkSession \
    .builder \
    .appName('A3-2') \
    .getOrCreate()

In [3]:
spark

## Import data and assign target

In [35]:
df = (spark.read
          .format("csv")
          .option('header', 'true')
          .load("messages.csv"))

In [36]:
df_test = (spark.read
          .format("csv")
          .option('header', 'true')
          .load("messages_test.csv"))

In [37]:
df.show(5, False)
df_test.show(5, False)

+----------------+----------+-------------------------------------------------------------+
|username        |channel   |message                                                      |
+----------------+----------+-------------------------------------------------------------+
|fossabot        |#loltyler1|@KuniGaru, ACCOUNTS: BIG TONKA T is currently Master I 111 LP|
|xsageone        |#jinnytty |exbcSpy                                                      |
|widedomi        |#jinnytty |3Head                                                        |
|skysage         |#jinnytty |no                                                           |
|przemkowsky_wawa|#jinnytty |HYPERS                                                       |
+----------------+----------+-------------------------------------------------------------+
only showing top 5 rows

+------------------+----------+---------------------------------------------------------+
|username          |channel   |message                   

## Step 2: Text preprocessing

In [38]:
# Create label column
df = StringIndexer(inputCol='channel', outputCol='label', handleInvalid='keep').fit(df).transform(df)
df_test = StringIndexer(inputCol='channel', outputCol='label', handleInvalid='keep').fit(df_test).transform(df_test)

In [39]:
# Create column with both the username and the message (Not used)
df = df.withColumn('message0', f.concat(f.col('username'),f.lit(' '),f.col('message')))
df_test = df_test.withColumn('message0', f.concat(f.col('username'),f.lit(' '),f.col('message')))

In [40]:
# Remove symbols
df = df.withColumn("words1", f.regexp_replace(f.col("message0"), "[\$#,<>+@=?!]", ""))
df_test = df_test.withColumn("words1", f.regexp_replace(f.col("message0"), "[\$#,<>+@=?!]", ""))

# Remove extra spaces
df = df.withColumn("words2", f.regexp_replace(f.col("words1"), "  +", " "))
df_test = df_test.withColumn("words2", f.regexp_replace(f.col("words1"), "  +", " "))

# Remove missing values
df = df.dropna()
df_test = df_test.dropna()

# Tokenize
tokenizer = Tokenizer(inputCol="words2", outputCol="words3")
df = tokenizer.transform(df)
df_test = tokenizer.transform(df_test)

# Remove stopwords
remover = StopWordsRemover(inputCol="words3", outputCol="words")
df = remover.transform(df)
df_test = remover.transform(df_test)

df = df.drop(*("username", "channel", "message", "words1", "words2", "words3"))
df_test = df_test.drop(*("username", "channel", "message", "words1", "words2", "words3"))

In [41]:
df.show(5, False)
df_test.show(5, False)

+-----+----------------------------------------------------------------------+-----------------------------------------------------------------------+
|label|message0                                                              |words                                                                  |
+-----+----------------------------------------------------------------------+-----------------------------------------------------------------------+
|1.0  |fossabot @KuniGaru, ACCOUNTS: BIG TONKA T is currently Master I 111 LP|[fossabot, kunigaru, accounts:, big, tonka, currently, master, 111, lp]|
|0.0  |xsageone exbcSpy                                                      |[xsageone, exbcspy]                                                    |
|0.0  |widedomi 3Head                                                        |[widedomi, 3head]                                                      |
|0.0  |skysage no                                                            |[skysage]       

## Step 3: Predictive model

### TF-IDF

In [42]:
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(df)
featurizedData_test = hashingTF.transform(df_test)

idf = IDF(inputCol="rawFeatures", outputCol="tfidf")
idfModel = idf.fit(featurizedData)
df = idfModel.transform(featurizedData)

df_test = idfModel.transform(featurizedData_test)

df = df.drop("rawFeatures")
df_test = df_test.drop("rawFeatures")

### Word2Vec

In [43]:
word2Vec = Word2Vec(vectorSize=10, minCount=0, inputCol="words", outputCol="w2v")
w2vmodel = word2Vec.fit(df)

df = w2vmodel.transform(df)
df_test = w2vmodel.transform(df_test)

In [44]:
df.show(5)
df_test.show(5)

+-----+--------------------+--------------------+--------------------+--------------------+
|label|            message0|               words|               tfidf|                 w2v|
+-----+--------------------+--------------------+--------------------+--------------------+
|  1.0|fossabot @KuniGar...|[fossabot, kuniga...|(20,[4,6,7,8,19],...|[1.27302299605475...|
|  0.0|    xsageone exbcSpy| [xsageone, exbcspy]|(20,[9,19],[2.033...|[-0.0458895324263...|
|  0.0|      widedomi 3Head|   [widedomi, 3head]|(20,[5,14],[2.214...|[0.11197673529386...|
|  0.0|          skysage no|           [skysage]|(20,[3],[2.001902...|[-0.7380989193916...|
|  0.0|przemkowsky_wawa ...|[przemkowsky_wawa...|(20,[4,9],[1.8839...|[-0.1244265735149...|
+-----+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows

+-----+--------------------+--------------------+--------------------+--------------------+
|label|            message0|               words|      

In [45]:
# Split data in train and test sets
df_train, df_val = df.randomSplit([0.7, 0.3], seed = 234)
print("Training Dataset Count: " + str(df_train.count()))
print("Validation Dataset Count: " + str(df_val.count()))
print("Test Dataset Count: " + str(df_test.count()))

Training Dataset Count: 52864
Validation Dataset Count: 22447
Test Dataset Count: 31702


### Logistic Regression for tf-idf

In [46]:
# Apply logistic regression model
lr_tfidf = LogisticRegression(featuresCol = 'tfidf', labelCol='label').fit(df_train)

# Train Results
train_results_tfidf = lr_tfidf.evaluate(df_train).predictions
train_results_tfidf.select(['label','prediction','probability']).show(5,False)

evaluator_acc = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator_acc.evaluate(train_results_tfidf)
print("Train set accuracy = " + str(accuracy))

# Validation Results
val_lr_tfidf = lr_tfidf.transform(df_val)
val_lr_tfidf.select(['label','prediction','probability']).show(5, False)

accuracy = evaluator_acc.evaluate(val_lr_tfidf)
print("Validation set accuracy = " + str(accuracy))

# Test Results
pred_lr_tfidf = lr_tfidf.transform(df_test)
pred_lr_tfidf.select(['label','prediction','probability']).show(5, False)

accuracy = evaluator_acc.evaluate(pred_lr_tfidf)
print("Test set accuracy = " + str(accuracy))

+-----+----------+---------------------------------------------------------------+
|label|prediction|probability                                                    |
+-----+----------+---------------------------------------------------------------+
|0.0  |0.0       |[0.7054291377886421,0.2945708622034997,7.858186408890208E-12]  |
|0.0  |1.0       |[0.49013392711900167,0.5098660728714622,9.53620387867034E-12]  |
|0.0  |1.0       |[0.4531347354374585,0.5468652645507329,1.1808612580289882E-11] |
|0.0  |0.0       |[0.6886984843661735,0.3113015156243836,9.442982621421199E-12]  |
|0.0  |0.0       |[0.6316728118202544,0.36832718816903715,1.0708555048481399E-11]|
+-----+----------+---------------------------------------------------------------+
only showing top 5 rows

Train set accuracy = 0.6367660411622276
+-----+----------+--------------------------------------------------------------+
|label|prediction|probability                                                   |
+-----+----------+------

### Logistic Regression for Word2Vec

In [47]:
# Apply logistic regression model
lr_w2v = LogisticRegression(featuresCol = 'w2v', labelCol='label').fit(df_train)

# Train Results
train_results_w2v = lr_w2v.evaluate(df_train).predictions
train_results_w2v.select(['label','prediction','probability']).show(5, False)

evaluator_acc = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator_acc.evaluate(train_results_w2v)
print("Train set accuracy = " + str(accuracy))

# Validation Results
val_lr_w2v = lr_w2v.transform(df_val)
val_lr_w2v.select(['label','prediction','probability']).show(5, False)

accuracy = evaluator_acc.evaluate(val_lr_w2v)
print("Test set accuracy = " + str(accuracy))

# Test Results
pred_lr_w2v = lr_w2v.transform(df_test)
pred_lr_w2v.select(['label','prediction','probability']).show(5, False)

accuracy = evaluator_acc.evaluate(pred_lr_w2v)
print("Test set accuracy = " + str(accuracy))

+-----+----------+---------------------------------------------------------------+
|label|prediction|probability                                                    |
+-----+----------+---------------------------------------------------------------+
|0.0  |0.0       |[0.7927476905908345,0.20725230940875863,4.0688827966079027E-13]|
|0.0  |0.0       |[0.7541796706691013,0.24582032932888845,2.010404189421244E-12] |
|0.0  |0.0       |[0.9881052292058464,0.011894770763522123,3.063144260542358E-11]|
|0.0  |0.0       |[0.7451198215158286,0.2548801784822799,1.891342121408406E-12]  |
|0.0  |0.0       |[0.9677191430591785,0.032280856939991706,8.298317777190093E-13]|
+-----+----------+---------------------------------------------------------------+
only showing top 5 rows

Train set accuracy = 0.8130864104116223
+-----+----------+---------------------------------------------------------------+
|label|prediction|probability                                                    |
+-----+----------+----

### Naive Bayes with tf-idf

In [48]:
# Create trainer and set parameters
nb_tfidf = NaiveBayes(featuresCol = 'tfidf', labelCol='label', smoothing=1.0).fit(df)

train_nb_tfidf = nb_tfidf.transform(df_train)
train_nb_tfidf.select(['label','prediction','probability']).show(5, False)
# Compute accuracy on train set
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(train_nb_tfidf)
print("Train set accuracy = " + str(accuracy))

val_nb_tfidf = nb_tfidf.transform(df_val)
val_nb_tfidf.select(['label','prediction','probability']).show(5, False)
# Compute accuracy on validation set
accuracy = evaluator.evaluate(val_nb_tfidf)
print("Validation set accuracy = " + str(accuracy))

pred_nb_tfidf = nb_tfidf.transform(df_test)
pred_nb_tfidf.select(['label','prediction','probability']).show(5, False)
# Compute accuracy on test set
accuracy = evaluator.evaluate(pred_nb_tfidf)
print("Test set accuracy = " + str(accuracy))

+-----+----------+----------------------------------------+
|label|prediction|probability                             |
+-----+----------+----------------------------------------+
|0.0  |0.0       |[0.6903376430998165,0.3096623569001834] |
|0.0  |1.0       |[0.4607557609730969,0.5392442390269031] |
|0.0  |1.0       |[0.2812955686549889,0.7187044313450112] |
|0.0  |0.0       |[0.9686056315004665,0.03139436849953342]|
|0.0  |0.0       |[0.6768611349646294,0.32313886503537065]|
+-----+----------+----------------------------------------+
only showing top 5 rows

Train set accuracy = 0.6108126513317191
+-----+----------+----------------------------------------+
|label|prediction|probability                             |
+-----+----------+----------------------------------------+
|0.0  |0.0       |[0.5286629730643598,0.47133702693564017]|
|0.0  |0.0       |[0.5688847699936724,0.4311152300063277] |
|0.0  |0.0       |[0.6719160932473748,0.32808390675262517]|
|0.0  |1.0       |[0.44839946067819

## Save model

In [49]:
lr_w2v.save("A3.model")