# Teodor Chakarov, id: 12141198


In [None]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.feature import RegexTokenizer
from pyspark.ml import Pipeline
from pyspark.ml.feature import StopWordsRemover, StringIndexer, HashingTF, IDF 
from pyspark.ml.feature import ChiSqSelector, PCA
import findspark
findspark.init()
findspark.find()
import pyspark
findspark.find()
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, explode, array_contains, array_remove
from pyspark.sql.functions import regexp_replace, trim, col, lower, concat
import re
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import os
from array import array
from pyspark.sql import DataFrame
from pyspark.ml.feature import PCA
from pyspark.rdd import RDD

from pyspark.ml.feature import Normalizer 
from pyspark.ml.classification import LinearSVC, OneVsRest, LogisticRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [4]:
conf=pyspark.SparkConf().setAppName('SparkApp1').setMaster('local')
sc=pyspark.SparkContext(conf=conf)
spark=SparkSession(sc)

# Reading the data and preprocess

Here from all the data I am selecting the needed columns **reviewText** and **category**

In [174]:
df = spark.read.json("reviews_devset.json")
df_selected = df.select('reviewText', 'category')

In [216]:
df_selected.take(2)

[Row(reviewText="This was a gift for my other husband.  He's making us things from it all the time and we love the food.  Directions are simple, easy to read and interpret, and fun to make.  We all love different kinds of cuisine and Raichlen provides recipes from everywhere along the barbecue trail as he calls it. Get it and just open a page.  Have at it.  You'll love the food and it has provided us with an insight into the culture that produced it. It's all about broadening horizons.  Yum!!", category='Patio_Lawn_and_Garde'),
 Row(reviewText='This is a very nice spreader.  It feels very solid and the pneumatic tires give it great maneuverability and handling over bumps.  The control arm is solid metal, not a cable, which gives you precise control and will last a long time.  The settings take some experimentation with your various products to get it right, but that is true of any spreader.  It has good distribution... probably flings material a little farther on the right side than th

We can see, we have two records with the review and the category

## Building the preprocess pipeline

In [175]:
regexTokenizer = RegexTokenizer(inputCol="reviewText", outputCol="words") # tokenizer
stopwords  = StopWordsRemover.loadDefaultStopWords("english")             # loading the stopwords
remover = StopWordsRemover(inputCol = "words", outputCol = "filtered", stopWords=stopwords)    # removing the stopwords from english
hashingTF = HashingTF(inputCol="filtered", outputCol="tfs")               # calculating the therm frequencies 
idf = IDF(inputCol='tfs', outputCol='idfs', minDocFreq=1)                 # calculatinf th eidfs values for all documents
stringIndexer = StringIndexer(inputCol="category", outputCol="labeled")   # Label encoder
selector = ChiSqSelector(numTopFeatures=2000, featuresCol="idfs",         # chi_value selector top 2000 values
                         outputCol="selectedFeatures", labelCol="labeled")


Initializing the pipeline

In [176]:
pipeline = Pipeline(stages=[regexTokenizer, remover, hashingTF, idf, stringIndexer, selector])
df_after = pipeline.fit(df_selected)
df_after = df_after.transform(df_selected)

In [218]:
df_after.take(1)

[Row(reviewText="This was a gift for my other husband.  He's making us things from it all the time and we love the food.  Directions are simple, easy to read and interpret, and fun to make.  We all love different kinds of cuisine and Raichlen provides recipes from everywhere along the barbecue trail as he calls it. Get it and just open a page.  Have at it.  You'll love the food and it has provided us with an insight into the culture that produced it. It's all about broadening horizons.  Yum!!", category='Patio_Lawn_and_Garde', words=['this', 'was', 'a', 'gift', 'for', 'my', 'other', 'husband.', "he's", 'making', 'us', 'things', 'from', 'it', 'all', 'the', 'time', 'and', 'we', 'love', 'the', 'food.', 'directions', 'are', 'simple,', 'easy', 'to', 'read', 'and', 'interpret,', 'and', 'fun', 'to', 'make.', 'we', 'all', 'love', 'different', 'kinds', 'of', 'cuisine', 'and', 'raichlen', 'provides', 'recipes', 'from', 'everywhere', 'along', 'the', 'barbecue', 'trail', 'as', 'he', 'calls', 'it.'

Here is the output of the first record after the preprocess. We have those steps:
- reviewText, category
- words (tokenized)
- filtered (stopwords)
- tf
- idf
- labeled - Label encoded
- chi selector

For the chi selector we have 2000 long vector with chi values of the words. Sparse vector represents not the 0 values. We have on which position we have value > 0

In [22]:
df_pd.iloc[2]

selectedFeatures    (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
labeled                                                          18.0
Name: 2, dtype: object

### Extracting as txt file

In [23]:
def extract_values_from_vector(vector):
    return vector.values.tolist()

def extract_keys_from_vector(vector):
    return vector.indices.tolist()

In [24]:
li = {}

for i in range(78829):
    curdf = df_pd.iloc[i]
    if (str(curdf.labeled) not in li.keys()):
        li[f"{curdf.labeled}"] = {}
    for m, k in enumerate(extract_keys_from_vector(curdf.selectedFeatures)):
        if (k not in li[f"{curdf.labeled}"].keys()):
            li[f"{curdf.labeled}"][k] = extract_values_from_vector(curdf.selectedFeatures)[m]
            

In [None]:
textfile = open("output_ds.txt", "w")
for key,el in li.items():
        textfile.write(f"{key, el}" + "\n")
textfile.close()

# Machine Learning


Splitting the data to train, test and val

In [74]:
train_data = df_after_1.select("selectedFeatures", "labeled")

In [75]:
(train, next_data) = train_data.randomSplit([0.7,0.3], seed=24)
(val, test) = next_data.randomSplit([0.8,0.2], seed=24)

In [77]:
print((train.count(), len(train.columns)))

(55097, 2)


In [78]:
print((val.count(), len(val.columns)))

(18993, 2)


In [79]:
print((test.count(), len(test.columns)))

(4739, 2)


## Building the ML pipeline

In [32]:
normalizer = Normalizer(inputCol="selectedFeatures", outputCol="normFeatures", p=1.0)  # normaliznig the features (chi values)
svm = LinearSVC(labelCol="labeled")                                                    # Linear Support Vector Machines
ovr = OneVsRest(classifier=svm, featuresCol='normFeatures',labelCol='labeled')         # Multiclass classification method

In [33]:
pipeline2 = Pipeline(stages = [normalizer, ovr])

### Support Vector Machines

Building Cross-validation technnique - for preventing overfitting and also hyper-paramether search 

In [35]:
paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.0, 0.1]) \
    .build()

crossval = CrossValidator(estimator=pipeline2,
                          estimatorParamMaps=paramGrid,
                          evaluator=MulticlassClassificationEvaluator(labelCol="labeled"),
                          numFolds=5)

cvModel = crossval.fit(train)

F1 - score for validation data

In [214]:
prediction = cvModel.transform(val)
evaluator = MulticlassClassificationEvaluator(labelCol="labeled",predictionCol="prediction")
evaluator.evaluate(prediction)

0.5346165509987915

F1 - score for test data

In [212]:
prediction = cvModel.transform(test)

In [213]:
evaluator = MulticlassClassificationEvaluator(labelCol="labeled",predictionCol="prediction")
evaluator.evaluate(prediction)

0.5476418235041257

### Logistic Regression

In [139]:
lr = LogisticRegression()
ovr2 = OneVsRest(classifier=lr, featuresCol='normFeatures',labelCol='labeled')

pipeline3 = Pipeline(stages = [normalizer, ovr2])

In [140]:
paramGrid1 = ParamGridBuilder() \
    .addGrid(lr.maxIter, [500]) \
    .build()

crossval1 = CrossValidator(estimator=pipeline3,
                          estimatorParamMaps=paramGrid1,
                          evaluator=MulticlassClassificationEvaluator(labelCol="labeled"),
                          numFolds=2)

cvModel1 = crossval1.fit(train)

In [207]:
prediction = cvModel1.transform(val)

F1 score for validation data

In [209]:
evaluator = MulticlassClassificationEvaluator(labelCol="labeled",predictionCol="prediction")
evaluator.evaluate(prediction)

0.5234466603538767

F1 score for test data

In [210]:
prediction = cvModel1.transform(test)

In [211]:
evaluator = MulticlassClassificationEvaluator(labelCol="labeled",predictionCol="prediction")
evaluator.evaluate(prediction)

0.5430884865303869

## Using only TF-IDF

In [147]:
df_selected.show()

+--------------------+--------------------+
|          reviewText|            category|
+--------------------+--------------------+
|This was a gift f...|Patio_Lawn_and_Garde|
|This is a very ni...|Patio_Lawn_and_Garde|
|The metal base wi...|Patio_Lawn_and_Garde|
|For the most part...|Patio_Lawn_and_Garde|
|This hose is supp...|Patio_Lawn_and_Garde|
|This tool works v...|Patio_Lawn_and_Garde|
|This product is a...|Patio_Lawn_and_Garde|
|I was excited to ...|Patio_Lawn_and_Garde|
|I purchased the L...|Patio_Lawn_and_Garde|
|Never used a manu...|Patio_Lawn_and_Garde|
|Good price. Good ...|Patio_Lawn_and_Garde|
|I have owned the ...|Patio_Lawn_and_Garde|
|I had "won" a sim...|Patio_Lawn_and_Garde|
|The birds ate all...|Patio_Lawn_and_Garde|
|Bought last summe...|Patio_Lawn_and_Garde|
|I knew I had a mo...|Patio_Lawn_and_Garde|
|I was a little wo...|Patio_Lawn_and_Garde|
|I have used this ...|Patio_Lawn_and_Garde|
|I actually do not...|Patio_Lawn_and_Garde|
|Just what I  expe...|Patio_Lawn

Again splitting the data to train, val, test

In [141]:
(train1, next_data1) = df_selected.randomSplit([0.7,0.3], seed=24)

In [142]:
(val1, test1) = next_data1.randomSplit([0.8,0.2], seed=24)

In [151]:
regexTokenizer1 = RegexTokenizer(inputCol="reviewText", outputCol="words")
stopwords1 = StopWordsRemover.loadDefaultStopWords("english")
remover1 = StopWordsRemover(inputCol = "words", outputCol = "filtered", stopWords=stopwords)
hashingTF1 = HashingTF(inputCol="filtered", outputCol="tfs")
idf1 = IDF(inputCol='tfs', outputCol='idfs', minDocFreq=1)
stringIndexer1 = StringIndexer(inputCol="category", outputCol="labeled")
normalizer1 = Normalizer(inputCol="idfs", outputCol="normFeatures", p=1.0)
#pca = PCA(k = 15, inputCol = "normFeatures", outputCol="pca")


In [157]:
svm_1 = LinearSVC()
ovr3 = OneVsRest(classifier=svm_1, featuresCol='normFeatures',labelCol='labeled')


In [158]:
pipeline4 = Pipeline(stages=[regexTokenizer1, remover1, hashingTF1, idf1, stringIndexer1, normalizer1, ovr3])

In [159]:
paramGrid2 = ParamGridBuilder() \
    .addGrid(svm_1.regParam, [0.1]) \
    .build()

crossval3 = CrossValidator(estimator=pipeline4,
                          estimatorParamMaps=paramGrid2,
                          evaluator=MulticlassClassificationEvaluator(labelCol="labeled"),
                          numFolds=2)

cvModel3 = crossval3.fit(df_selected)

**validation f1 score**

In [204]:
prediction3 = cvModel3.transform(val1)

In [205]:
evaluator = MulticlassClassificationEvaluator(labelCol="labeled",predictionCol="prediction")
evaluator.evaluate(prediction3)

0.9864421286679639

**test f1 score**

In [202]:
prediction3 = cvModel3.transform(test1)

In [203]:
evaluator = MulticlassClassificationEvaluator(labelCol="labeled",predictionCol="prediction")
evaluator.evaluate(prediction3)

0.9875127566451318

# Conslusion

We saw how the stram of the data with the help of Pipeline tool, we can specify all the needed steps from reading the data to making predictions.

Support Vector Machines are really powerful but slow for fitting big datasets. Both SVM and Logistig regression scored around 50% with chi selector. We had vector with 2000 features but a little values actually bring the knowledge.

In the last step I fitted only with TF-IDF in SVM and we scored 98% on both validation and tesing sets. We have again hudge dimension.
Although I tried to inplement PCA dimensionality reduction technique after normalizing the TF-IDF but I had some computational issues. 