In [28]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np

from pyspark.ml import *
from pyspark.ml.classification import *
from pyspark.ml.feature import *
from pyspark.ml.param import *
from pyspark.ml.tuning import *
from pyspark.ml.evaluation import *
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import rand 
from sklearn.metrics import classification_report
from time import time
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, Row
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.functions import isnan, when, count, col
spark = SparkSession.builder.getOrCreate()

In [29]:
sc = spark.sparkContext
sc.setCheckpointDir('checkpoint')

df=spark.read.option("delimiter", "\t").csv("amazon_reviews_us_Wireless_v1_00.tsv",header=True)
df.show()
df.count()

+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|         US|   16414143|R3W4P9UBGNGH1U|B00YL0EKWE|     852431543|LG G4 Case Hard T...|        Wireless|          2|            1|          3|   N|                Y|Looks good, funct...|2 issues  -  Once...| 2015-08-31|
|         US|   50800750|R15V54KBMTQWAY|B00XK95RPQ|     516894650|Selfie Stick Fibl...|        Wireless|          4|    

263652

ADDING THE LABELS

Here we are adding the labels to the code that we will be using to run the sentimental analysis. we will label the reviews of 3 stars with an indicator of 0, reviews with less than 3 stars will be labeled with an indicator of -1 and reviews with greater than 3 stars will be labeled with an indicator of 1. So we add the column with the indicators.

In [30]:
df = df.filter((df.star_rating != 3))
df=df.withColumn('label', f.when(f.col('star_rating')<3,0).when(f.col('star_rating')>3,1))


In [31]:
df.registerTempTable("df")
# convert String labels to Double type
df = df.withColumn("label", df.label.cast(DoubleType()))
df.show()
df.count()

+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+-----+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|review_date|label|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+-----+
|         US|   16414143|R3W4P9UBGNGH1U|B00YL0EKWE|     852431543|LG G4 Case Hard T...|        Wireless|          2|            1|          3|   N|                Y|Looks good, funct...|2 issues  -  Once...| 2015-08-31|  0.0|
|         US|   50800750|R15V54KBMTQWAY|B00XK95RPQ|     516894650|Selfie Stick Fibl...|        W

243024

In [32]:

df=df.select('review_body','label')
df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns]).show()



+-----------+-----+
|review_body|label|
+-----------+-----+
|         70|    0|
+-----------+-----+



In [33]:
df=df.where(col("review_body").isNotNull()).where(col("label").isNotNull())
df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns]).show()
df=df.sample(withReplacement=False, fraction=0.0025)
df.count()

+-----------+-----+
|review_body|label|
+-----------+-----+
|          0|    0|
+-----------+-----+



607

In [34]:
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(df)
# tokenizer 
tokenizer = RegexTokenizer(inputCol="review_body", outputCol="words", pattern="\W")##'\w' remove none-word letters
df_tokenized = tokenizer.transform(df)
# remove stop words
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
df_removed = remover.transform(df_tokenized)
# Convert to TF words vector
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures")
df_TF = hashingTF.transform(df_removed)
# Convert to TF*IDF words vector
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(df_TF)
df_idf = idfModel.transform(df_TF)
for features_label in df_idf.select("features", "label").take(3):
    print(features_label)


Row(features=SparseVector(262144, {181321: 1.9215, 201386: 2.7993, 261870: 1.5198}), label=1.0)
Row(features=SparseVector(262144, {3121: 2.8838, 74577: 5.0239, 89530: 4.1076, 99172: 4.3307, 101534: 4.213, 113432: 1.9443, 118907: 5.0239, 126194: 5.3116, 130636: 4.0123, 137765: 3.5198, 155284: 5.3116, 178140: 3.7711, 192310: 3.6709, 192648: 3.1143, 199295: 5.717, 216118: 4.1076, 227001: 4.213, 254292: 5.717}), label=1.0)
Row(features=SparseVector(262144, {112971: 3.8452, 122367: 4.8007, 254061: 3.5198}), label=0.0)


# Splitting the data

In [35]:
# Split data aproximately into training (80%) and test (20%)
(train, test)=df.randomSplit([0.8,0.2], seed = 0)
# Cache the train and test data in-memory 
train = train.cache()
test = test.cache()
print ('Sample number in the train set : {}'.format(train.count()))
print ('Sample number in the test set : {}'.format(test.count()))
train.groupby('label').count().toPandas()

Sample number in the train set : 483
Sample number in the test set : 124


Unnamed: 0,label,count
0,0.0,111
1,1.0,372


## Data Modeling

## Logistic Regression

In [19]:
def grid_search(p1,p2,p3,p4):
    lr = LogisticRegression()
    pipeline = Pipeline(stages=[labelIndexer,tokenizer, remover, hashingTF, idfModel, lr])
  
    #Create ParamGrid for Cross Validation
    paramGrid = (ParamGridBuilder()
                 .addGrid(hashingTF.numFeatures, [p1])
                 .addGrid(lr.regParam, [p2])
                 .addGrid(lr.elasticNetParam, [p3])
                 .addGrid(lr.maxIter, [p4])
                 .build())
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=evaluator,
                              numFolds=4)
    
    ########  Run cross-validation, and choose the best set of parameters.
    cvModel = crossval.fit(train)
    # average cross-validation accuracy metric/s on all folds
    average_score = cvModel.avgMetrics
    print ('average cross-validation accuracy = {}'.format(average_score[0]))
    return average_score[0]

In [20]:
score=0.0
for p1 in [45000,50000,55000]:
    for p2 in [0.09,0.10,0.11]:
        for p3 in [0.09,0.10,0.11]:
            for p4 in [9,10,11]:
                t0 = time()
                print ('(numFeatures,regParam,elasticNetParam,maxIter)=({},{},{},{})'.format(p1,p2,p3,p4))
                average_score=grid_search(p1,p2,p3,p4)
                tt = time() - t0
                print ("Classifier trained in {} seconds".format(round(tt,3)))
                if average_score > score:
                    print ('################ Best score ######################')
                    params=(p1,p2,p3,p4)
                    score=average_score
print ('Best score is {} at params ={}'.format(score, params))

(numFeatures,regParam,elasticNetParam,maxIter)=(45000,0.09,0.09,9)
average cross-validation accuracy = 0.19049908689822936
Classifier trained in 13.121 seconds
################ Best score ######################
(numFeatures,regParam,elasticNetParam,maxIter)=(45000,0.09,0.09,10)
average cross-validation accuracy = 0.19484691298518592
Classifier trained in 13.258 seconds
################ Best score ######################
(numFeatures,regParam,elasticNetParam,maxIter)=(45000,0.09,0.09,11)
average cross-validation accuracy = 0.1927282689173893
Classifier trained in 13.855 seconds
(numFeatures,regParam,elasticNetParam,maxIter)=(45000,0.09,0.1,9)
average cross-validation accuracy = 0.1927282689173893
Classifier trained in 11.991 seconds
(numFeatures,regParam,elasticNetParam,maxIter)=(45000,0.09,0.1,10)
average cross-validation accuracy = 0.1927282689173893
Classifier trained in 13.474 seconds
(numFeatures,regParam,elasticNetParam,maxIter)=(45000,0.09,0.1,11)
average cross-validation accuracy

In [21]:
def Data_modeling(train, test, pipeline, paramGrid):
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=evaluator,
                              numFolds=4)
    
    ########  Run cross-validation, and choose the best set of parameters.
    cvModel = crossval.fit(train)
    
    ########  Make predictions on on the test data
    prediction = cvModel.transform(test)
    average_score = cvModel.avgMetrics
    print ('average cross-validation accuracy = {}'.format(average_score[0]))
    ######## Calculate accuracy of the prediction of the test data
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
    accuracy_score=evaluator.evaluate(prediction)
    # another way to calculate accuracy 
    #correct=prediction.filter(prediction['label']== prediction['prediction']).select("label","prediction")
    #accuracy_score = correct.count() / float(test.count())  
    print ('Accuracy in the test data = {}'.format(accuracy_score))
    
    ######## calculate F1 score of the prediction of the test data
    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
    f1_score=evaluator.evaluate(prediction)
    print ('F1 score in the test data = {}'.format(f1_score))
    # Calculate area under ROC for the prediction of the test data
    #evaluator = BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderROC")
    #ROC_score=evaluator.evaluate(prediction)
    #print 'areaUnderROC in the test data = {}'.format(ROC_score)
    
    ######## Print classification_report
    prediction_and_labels=prediction.select("label","prediction")
    y_true = []
    y_pred = []
    for x in prediction_and_labels.collect():
        xx = list(x)
        try:
            tt = int(xx[1])
            pp = int(xx[0])
            y_true.append(tt)
            y_pred.append(pp)
        except:
            continue

    target_names = ['neg 0', 'pos 1']
    print (classification_report(y_true, y_pred, target_names=target_names))
    return 


In [22]:
# trained by a logistic regression 
lr = LogisticRegression()
# Build a pipeline
pipeline = Pipeline(stages=[labelIndexer,tokenizer, remover, hashingTF, idfModel, lr])

# Create ParamGrid for Cross Validation 
paramGrid = (ParamGridBuilder()
             .addGrid(hashingTF.numFeatures, [50000])
             .addGrid(lr.regParam, [0.10])
             .addGrid(lr.elasticNetParam, [0.10])
             .addGrid(lr.maxIter, [10])
             .build())
# Execute 4-folds cross validation for hyperparameter tuning, model prediction and model evaluation.
Data_modeling(train, test, pipeline, paramGrid)

average cross-validation accuracy = 0.18819957278209004
Accuracy in the test data = 0.21487603305785125
F1 score in the test data = 0.7366233766233766
              precision    recall  f1-score   support

       neg 0       0.24      0.89      0.38         9
       pos 1       0.99      0.78      0.87       112

    accuracy                           0.79       121
   macro avg       0.62      0.83      0.63       121
weighted avg       0.93      0.79      0.83       121



## Naive Bayes


In [23]:
def grid_search(p1,p2):
    nb = NaiveBayes()
    pipeline = Pipeline(stages=[labelIndexer,tokenizer, remover, hashingTF, idfModel, nb])
  
    #Create ParamGrid for Cross Validation
    paramGrid = (ParamGridBuilder()
                 .addGrid(hashingTF.numFeatures, [p1])
                 .addGrid(nb.smoothing, [p2])
                 .build())
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=evaluator,
                              numFolds=4)
    
    ########  Run cross-validation, and choose the best set of parameters.
    cvModel = crossval.fit(train)
    # average cross-validation accuracy metric/s on all folds
    average_score = cvModel.avgMetrics
    print ('average cross-validation accuracy = {}'.format(average_score[0]))
    return average_score[0]

In [24]:
score=0.0
for p1 in [35000,40000,55000]:
    for p2 in [0.8,0.9,1.0]:
      t0 = time()
      print ('(numFeatures,smoothing)=({},{})'.format(p1,p2))
      average_score=grid_search(p1,p2)
      tt = time() - t0
      print ("Classifier trained in {} seconds".format(round(tt,3)))
      if average_score > score:
        print ('################ Best score ######################')
        params=(p1,p2)
        score=average_score
print ('Best score is {} at params ={}'.format(score, params))


(numFeatures,smoothing)=(35000,0.8)
average cross-validation accuracy = 0.20321643252573782
Classifier trained in 23.545 seconds
################ Best score ######################
(numFeatures,smoothing)=(35000,0.9)
average cross-validation accuracy = 0.209575105339492
Classifier trained in 19.103 seconds
################ Best score ######################
(numFeatures,smoothing)=(35000,1.0)
average cross-validation accuracy = 0.21172847404094375
Classifier trained in 16.986 seconds
################ Best score ######################
(numFeatures,smoothing)=(40000,0.8)
average cross-validation accuracy = 0.20756151800232991
Classifier trained in 17.226 seconds
(numFeatures,smoothing)=(40000,0.9)
average cross-validation accuracy = 0.21384711810874035
Classifier trained in 17.721 seconds
################ Best score ######################
(numFeatures,smoothing)=(40000,1.0)
average cross-validation accuracy = 0.21594795844487483
Classifier trained in 17.789 seconds
################ Best sc

In [25]:
# trained by a Naïve Bayes 
nb = NaiveBayes()
# Build a pipeline
pipeline = Pipeline(stages=[labelIndexer,tokenizer, remover, hashingTF, idfModel, nb])
# Create ParamGrid for Cross Validation 
paramGrid = (ParamGridBuilder()
             .addGrid(hashingTF.numFeatures, [40000])
             .addGrid(nb.smoothing, [1.0])
             .build())
# Execute 4-folds cross validation for hyperparameter tuning, model prediction and model evaluation.
Data_modeling(train, test, pipeline, paramGrid)


average cross-validation accuracy = 0.21594795844487483
Accuracy in the test data = 0.2231404958677686
F1 score in the test data = 0.7227057616573451
              precision    recall  f1-score   support

       neg 0       0.21      0.88      0.34         8
       pos 1       0.99      0.77      0.87       113

    accuracy                           0.78       121
   macro avg       0.60      0.82      0.60       121
weighted avg       0.94      0.78      0.83       121



# Decision Tree

In [48]:
def grid_search(p1,p2,p3):
    # trained by a Decision Tree 
    dt = DecisionTreeClassifier(labelCol="indexedLabel",impurity="entropy")
    pipeline = Pipeline(stages=[labelIndexer,tokenizer, remover, hashingTF, idfModel, dt])
  
    #Create ParamGrid for Cross Validation
    paramGrid = (ParamGridBuilder()
                 .addGrid(hashingTF.numFeatures, [p1])
                 .addGrid(dt.maxDepth, [p2])
                 .addGrid(dt.minInstancesPerNode, [p3])
                 .build())
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=evaluator,
                              numFolds=4)
    
    ########  Run cross-validation, and choose the best set of parameters.
    cvModel = crossval.fit(train)
    # average cross-validation accuracy metric/s on all folds
    average_score = cvModel.avgMetrics
    print ('average cross-validation accuracy = {}'.format(average_score[0]))
    return average_score[0]

In [49]:
score=0.0
for p1 in [65000,70000,75000]:
    for p2 in [23,24,25,26,27]:
        for p3 in [3,4,5]:
          t0 = time()
          print ('(numFeatures,maxDepth,minInstancesPerNode)=({},{},{})'.format(p1,p2,p3))
          average_score=grid_search(p1,p2,p3)
          tt = time() - t0
          print ("Classifier trained in {} seconds".format(round(tt,3)))
          if average_score > score:
            print ('################ Best score ######################')
            params=(p1,p2,p3)
            score=average_score
print ('Best score is {} at params ={}'.format(score, params))

(numFeatures,maxDepth,minInstancesPerNode)=(65000,23,3)
average cross-validation accuracy = 0.7841078654887432
Classifier trained in 44.117 seconds
################ Best score ######################
(numFeatures,maxDepth,minInstancesPerNode)=(65000,23,4)
average cross-validation accuracy = 0.7777084639498433
Classifier trained in 45.186 seconds
(numFeatures,maxDepth,minInstancesPerNode)=(65000,23,5)
average cross-validation accuracy = 0.7651997720148189
Classifier trained in 43.725 seconds
(numFeatures,maxDepth,minInstancesPerNode)=(65000,24,3)
average cross-validation accuracy = 0.7861739811912225
Classifier trained in 44.937 seconds
################ Best score ######################
(numFeatures,maxDepth,minInstancesPerNode)=(65000,24,4)
average cross-validation accuracy = 0.7777084639498433
Classifier trained in 45.239 seconds
(numFeatures,maxDepth,minInstancesPerNode)=(65000,24,5)
average cross-validation accuracy = 0.7651997720148189
Classifier trained in 44.279 seconds
(numFeatur

In [50]:
# trained by a Decision Tree 
dt = DecisionTreeClassifier(labelCol="indexedLabel",impurity="entropy")
# Build a pipeline
pipeline = Pipeline(stages=[labelIndexer,tokenizer, remover, hashingTF, idfModel, dt])
# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(hashingTF.numFeatures, [70000])
             .addGrid(dt.maxDepth, [25])
             .addGrid(dt.minInstancesPerNode, [4])
             .build())
# Execute 4-folds cross validation for hyperparameter tuning, model prediction and model evaluation.
Data_modeling(train, test, pipeline, paramGrid)

average cross-validation accuracy = 0.7777084639498433
Accuracy in the test data = 0.7258064516129032
F1 score in the test data = 0.24689048437658007
              precision    recall  f1-score   support

       neg 0       0.70      0.21      0.32       102
       pos 1       0.14      0.59      0.22        22

    accuracy                           0.27       124
   macro avg       0.42      0.40      0.27       124
weighted avg       0.60      0.27      0.30       124



## Random Forest 

In [51]:
def grid_search(p1,p2,p3,p4):
    rf = RandomForestClassifier(labelCol="indexedLabel",impurity="entropy", seed=5043)
    pipeline = Pipeline(stages=[labelIndexer,tokenizer, remover, hashingTF, idfModel, rf])
  
    #Create ParamGrid for Cross Validation
    paramGrid = (ParamGridBuilder()
                 .addGrid(hashingTF.numFeatures, [p1])
                 .addGrid(rf.numTrees, [p2])
                 .addGrid(rf.maxDepth, [p3])
                 .addGrid(rf.minInstancesPerNode, [p4])
                 .build())
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=evaluator,
                              numFolds=4)
    
    ########  Run cross-validation, and choose the best set of parameters.
    cvModel = crossval.fit(train)
    # average cross-validation accuracy metric/s on all folds
    average_score = cvModel.avgMetrics
    print ('average cross-validation accuracy = {}'.format(average_score[0]))
    return average_score[0]

In [52]:
score=0.0
for p1 in [45000,50000,55000]:
    for p2 in [30,31,32]:
        for p3 in [28,29,30]:
            for p4 in [1,2]:
                t0 = time()
                print ('(numFeatures,numTrees,maxDepth,minInstancesPerNode)=({},{},{},{})'.format(p1,p2,p3,p4))
                average_score=grid_search(p1,p2,p3,p4)
                tt = time() - t0
                print ("Classifier trained in {} seconds".format(round(tt,3)))
                if average_score > score:
                  print ('################ Best score ######################')
                  params=(p1,p2,p3,p4)
                  score=average_score
print ('Best score is {} at params ={}'.format(score, params))

(numFeatures,numTrees,maxDepth,minInstancesPerNode)=(45000,30,28,1)
average cross-validation accuracy = 0.7742657452265602
Classifier trained in 43.519 seconds
################ Best score ######################
(numFeatures,numTrees,maxDepth,minInstancesPerNode)=(45000,30,28,2)
average cross-validation accuracy = 0.7701105728127671
Classifier trained in 41.207 seconds
(numFeatures,numTrees,maxDepth,minInstancesPerNode)=(45000,30,29,1)
average cross-validation accuracy = 0.7742657452265602
Classifier trained in 48.099 seconds
(numFeatures,numTrees,maxDepth,minInstancesPerNode)=(45000,30,29,2)
average cross-validation accuracy = 0.7701105728127671
Classifier trained in 43.465 seconds
(numFeatures,numTrees,maxDepth,minInstancesPerNode)=(45000,30,30,1)
average cross-validation accuracy = 0.7742657452265602
Classifier trained in 48.497 seconds
(numFeatures,numTrees,maxDepth,minInstancesPerNode)=(45000,30,30,2)
average cross-validation accuracy = 0.7701105728127671
Classifier trained in 43.7

In [53]:
rf = RandomForestClassifier(labelCol="indexedLabel",impurity="entropy", seed=5043)
# Build a pipeline
pipeline = Pipeline(stages=[labelIndexer,tokenizer, remover, hashingTF, idfModel, rf])

# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(hashingTF.numFeatures, [50000])
             .addGrid(rf.numTrees, [31])
             .addGrid(rf.maxDepth, [29])
             .addGrid(rf.minInstancesPerNode, [1])
             .build())
# Execute 4-folds cross validation for hyperparameter tuning, model prediction and model evaluation.
Data_modeling(train, test, pipeline, paramGrid)

average cross-validation accuracy = 0.7701766885152466
Accuracy in the test data = 0.7580645161290323
F1 score in the test data = 0.09426057813154588
              precision    recall  f1-score   support

       neg 0       1.00      0.24      0.39       124
       pos 1       0.00      0.00      0.00         0

    accuracy                           0.24       124
   macro avg       0.50      0.12      0.19       124
weighted avg       1.00      0.24      0.39       124



## Gradient Boosted Tree 

In [55]:
def grid_search(p1,p2,p3,p4):
    gbt = GBTClassifier(labelCol="indexedLabel")
    pipeline = Pipeline(stages=[labelIndexer,tokenizer, remover, hashingTF, idfModel, gbt])
  
    #Create ParamGrid for Cross Validation
    paramGrid = (ParamGridBuilder()
                 .addGrid(hashingTF.numFeatures, [p1])
                 .addGrid(gbt.maxIter, [p2]) #(default: 20)
                 .addGrid(gbt.maxDepth, [p3])
                 .addGrid(gbt.minInstancesPerNode, [p4])
                 .build())
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=evaluator,
                              numFolds=4)
    
    ########  Run cross-validation, and choose the best set of parameters.
    cvModel = crossval.fit(train)
    # average cross-validation accuracy metric/s on all folds
    average_score = cvModel.avgMetrics
    print ('average cross-validation accuracy = {}'.format(average_score[0]))
    return average_score[0]

In [59]:
score=0.0
for p1 in [60000]:
    for p2 in [25,26,27]:
        for p3 in [18,19,20]:
            for p4 in [2]:
                t0 = time()
                print ('(numFeatures,maxIter,maxDepth,minInstancesPerNode)=({},{},{},{})'.format(p1,p2,p3,p4))
                average_score=grid_search(p1,p2,p3,p4)
                tt = time() - t0
                print ("Classifier trained in {} seconds".format(round(tt,3)))
                if average_score > score:
                  print ('################ Best score ######################')
                  params=(p1,p2,p3,p4)
                  score=average_score
print ('Best score is {} at params ={}'.format(score, params))

(numFeatures,maxIter,maxDepth,minInstancesPerNode)=(60000,25,18,2)
average cross-validation accuracy = 0.7795735252208607
Classifier trained in 819.233 seconds
################ Best score ######################
(numFeatures,maxIter,maxDepth,minInstancesPerNode)=(60000,25,19,2)
average cross-validation accuracy = 0.7757057566258194
Classifier trained in 809.788 seconds
(numFeatures,maxIter,maxDepth,minInstancesPerNode)=(60000,25,20,2)
average cross-validation accuracy = 0.7754642348247364
Classifier trained in 834.909 seconds
(numFeatures,maxIter,maxDepth,minInstancesPerNode)=(60000,26,18,2)
average cross-validation accuracy = 0.7796625819321743
Classifier trained in 799.533 seconds
################ Best score ######################
(numFeatures,maxIter,maxDepth,minInstancesPerNode)=(60000,26,19,2)
average cross-validation accuracy = 0.7756396409233399
Classifier trained in 852.7 seconds
(numFeatures,maxIter,maxDepth,minInstancesPerNode)=(60000,26,20,2)
average cross-validation accuracy

In [60]:
# trained by a Gradient Boosted Tree 
gbt = GBTClassifier(labelCol="indexedLabel")
# Build a pipeline
pipeline = Pipeline(stages=[labelIndexer,tokenizer, remover, hashingTF, idfModel, gbt])
# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(hashingTF.numFeatures, [60000])
             .addGrid(gbt.maxIter, [25]) #(default: 20)
             .addGrid(gbt.maxDepth, [19])
             .addGrid(gbt.minInstancesPerNode, [2])
             .build())
# Execute 4-folds cross validation for hyperparameter tuning, model prediction and model evaluation.
Data_modeling(train, test, pipeline, paramGrid)


average cross-validation accuracy = 0.7757057566258194
Accuracy in the test data = 0.7661290322580645
F1 score in the test data = 0.22385001794796977
              precision    recall  f1-score   support

       neg 0       0.53      0.16      0.25        97
       pos 1       0.14      0.48      0.21        27

    accuracy                           0.23       124
   macro avg       0.34      0.32      0.23       124
weighted avg       0.45      0.23      0.24       124



## Evaluation of performance 