In [1]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .master("local[*]") \
    .appName("CW") \
    .getOrCreate()

sc = spark.sparkContext

In [2]:
sc

In [3]:
from time import time
from preprocessing.Preprocessor import Preprocessor
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from metrics.metrics import Metrics
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.classification import LogisticRegression
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

In [4]:
pp = Preprocessor(spark)

In [5]:
train = pp.to_tfidf('Cleaned Data/train.csv', min_frequency=275., remove_stop_words = False)

train_lr = train.sample(True, 0.6)
train_lr.cache()

train_nb = train.sample(True, 0.6)
train_nb.cache()

train_neurnet = train.sample(True, 0.6)
train_neurnet.cache()

DataFrame[features: vector, label: int]

In [6]:
test = pp.to_tfidf('Cleaned Data/test.csv')
test.cache()
test.count()

16393

In [7]:
"""test = test.dropDuplicates(['features'])
test.count()"""

"test = test.dropDuplicates(['features'])\ntest.count()"

In [8]:
nb = NaiveBayes(smoothing = 1.0, modelType="multinomial")
model_nb = nb.fit(train_nb)

nn = MultilayerPerceptronClassifier(layers=[1948, 64, 3], blockSize=32, seed=99)
model_nn = nn.fit(train_neurnet)

lr = LogisticRegression(maxIter=100, aggregationDepth = 2, elasticNetParam=0)
model_lr = lr.fit(train_lr)

In [31]:
test_nb = model_nb.transform(test).select('features','prediction', 'label')
test_nb_update = test_nb.withColumnRenamed("prediction","prediction_nb")
test_nb_update = test_nb_update.withColumnRenamed("label","label_nb")
#test_nb_update = test_nb_update.withColumnRenamed("features","features_nb")
test_nb_update = test_nb_update.dropDuplicates(["features"])

test_nb_update.show(5)
test_m = Metrics()
test_m.f1(test_nb)
test_nb_update.count()

+--------------------+-------------+--------+
|            features|prediction_nb|label_nb|
+--------------------+-------------+--------+
|(1948,[0,1,2,3,4,...|          2.0|       1|
|(1948,[0,1,2,3,4,...|          1.0|       2|
|(1948,[0,1,2,3,4,...|          2.0|       2|
|(1948,[5,22,40,66...|          0.0|       0|
|(1948,[0,1,2,3,4,...|          1.0|       2|
+--------------------+-------------+--------+
only showing top 5 rows

F1 score for given DataFrame: 70.0



15854

In [32]:
test_nn = model_nn.transform(test).select('features','prediction', 'label')
test_nn_update = test_nn.withColumnRenamed("prediction","prediction_nn")
test_nn_update = test_nn_update.withColumnRenamed("label","label_nn")
#test_nn_update = test_nn_update.withColumnRenamed("features","features_nn")

test_nn_update.show(5)
test_m.f1(test_nn)
test_nn_update.count()

+--------------------+-------------+--------+
|            features|prediction_nn|label_nn|
+--------------------+-------------+--------+
|(1948,[0,1,3,7,8,...|          2.0|       2|
|(1948,[0,1,2,3,4,...|          2.0|       2|
|(1948,[0,1,2,3,4,...|          2.0|       1|
|(1948,[0,2,3,4,5,...|          2.0|       2|
|(1948,[0,1,2,4,5,...|          2.0|       2|
+--------------------+-------------+--------+
only showing top 5 rows

F1 score for given DataFrame: 78.0



16393

In [34]:
test_lr = model_lr.transform(test).select('features','prediction', 'label')
test_lr_update = test_lr.withColumnRenamed("prediction","prediction_lr")
test_lr_update = test_lr_update.withColumnRenamed("label","label_lr")
#test_lr_update = test_lr_update.withColumnRenamed("features","features_lr")
test_lr_update = test_lr_update.dropDuplicates(["features"])

test_lr_update.show(5)
test_m.f1(test_lr)
test_lr_update.count()

+--------------------+-------------+--------+
|            features|prediction_lr|label_lr|
+--------------------+-------------+--------+
|(1948,[0,1,2,3,4,...|          2.0|       1|
|(1948,[0,1,2,3,4,...|          2.0|       2|
|(1948,[0,1,2,3,4,...|          2.0|       2|
|(1948,[5,22,40,66...|          0.0|       0|
|(1948,[0,1,2,3,4,...|          2.0|       2|
+--------------------+-------------+--------+
only showing top 5 rows

F1 score for given DataFrame: 75.2



15854

In [35]:
test_voting = test_nn_update.join(test_lr_update, ["features"])
test_voting.show()
test_voting.count()

+--------------------+-------------+--------+-------------+--------+
|            features|prediction_nn|label_nn|prediction_lr|label_lr|
+--------------------+-------------+--------+-------------+--------+
|(1948,[0,1,2,3,4,...|          2.0|       2|          2.0|       2|
|(1948,[0,1,2,3,4,...|          2.0|       2|          2.0|       2|
|(1948,[0,1,2,3,4,...|          0.0|       0|          0.0|       0|
|(1948,[0,1,2,3,4,...|          2.0|       2|          2.0|       2|
|(1948,[0,1,2,3,4,...|          1.0|       1|          0.0|       1|
|(1948,[0,1,2,3,4,...|          2.0|       2|          1.0|       2|
|(1948,[0,1,2,3,4,...|          0.0|       0|          2.0|       0|
|(1948,[0,1,2,3,4,...|          2.0|       2|          2.0|       2|
|(1948,[0,1,2,3,4,...|          2.0|       2|          2.0|       2|
|(1948,[0,1,2,3,4,...|          0.0|       2|          2.0|       2|
|(1948,[0,1,2,3,4,...|          0.0|       2|          0.0|       2|
|(1948,[0,1,2,3,4,...|          0.

16393

In [36]:
test_voting = test_voting.join(test_nb_update, ["features"])
test_voting.show()
test_voting.count()

+--------------------+-------------+--------+-------------+--------+-------------+--------+
|            features|prediction_nn|label_nn|prediction_lr|label_lr|prediction_nb|label_nb|
+--------------------+-------------+--------+-------------+--------+-------------+--------+
|(1948,[0,1,2,3,4,...|          2.0|       2|          2.0|       2|          2.0|       2|
|(1948,[0,1,2,3,4,...|          2.0|       2|          2.0|       2|          1.0|       2|
|(1948,[0,1,2,3,4,...|          0.0|       0|          0.0|       0|          0.0|       0|
|(1948,[0,1,2,3,4,...|          2.0|       2|          2.0|       2|          2.0|       2|
|(1948,[0,1,2,3,4,...|          1.0|       1|          0.0|       1|          1.0|       1|
|(1948,[0,1,2,3,4,...|          2.0|       2|          1.0|       2|          1.0|       2|
|(1948,[0,1,2,3,4,...|          0.0|       0|          2.0|       0|          1.0|       0|
|(1948,[0,1,2,3,4,...|          2.0|       2|          2.0|       2|          0.

16393

In [37]:
test_voting = test_voting.drop("label_nn","label_lr")

In [38]:
test_voting.show(5)

+--------------------+-------------+-------------+-------------+--------+
|            features|prediction_nn|prediction_lr|prediction_nb|label_nb|
+--------------------+-------------+-------------+-------------+--------+
|(1948,[0,1,2,3,4,...|          2.0|          2.0|          2.0|       2|
|(1948,[0,1,2,3,4,...|          2.0|          2.0|          1.0|       2|
|(1948,[0,1,2,3,4,...|          0.0|          0.0|          0.0|       0|
|(1948,[0,1,2,3,4,...|          2.0|          2.0|          2.0|       2|
|(1948,[0,1,2,3,4,...|          1.0|          0.0|          1.0|       1|
+--------------------+-------------+-------------+-------------+--------+
only showing top 5 rows



In [39]:
test_voting = test_voting.withColumnRenamed("label_nb","label")

In [41]:
test_voting.show(5)

+--------------------+-------------+-------------+-------------+-----+
|            features|prediction_nn|prediction_lr|prediction_nb|label|
+--------------------+-------------+-------------+-------------+-----+
|(1948,[0,1,2,3,4,...|          2.0|          2.0|          2.0|    2|
|(1948,[0,1,2,3,4,...|          2.0|          2.0|          1.0|    2|
|(1948,[0,1,2,3,4,...|          0.0|          0.0|          0.0|    0|
|(1948,[0,1,2,3,4,...|          2.0|          2.0|          2.0|    2|
|(1948,[0,1,2,3,4,...|          1.0|          0.0|          1.0|    1|
+--------------------+-------------+-------------+-------------+-----+
only showing top 5 rows



In [62]:
from collections import Counter
from pyspark.sql.functions import udf

@udf
def mode(*v):
    counter = Counter(x for x in v if x is not None)
    if len(counter) > 0:
        return counter.most_common(1)[0][0]
    else:
        return None

test_voting = test_voting.withColumn('prediction', mode('prediction_nn', 'prediction_lr', 'prediction_nb'))

In [43]:
test_voting.printSchema()

root
 |-- features: vector (nullable = true)
 |-- prediction_nn: double (nullable = false)
 |-- prediction_lr: double (nullable = false)
 |-- prediction_nb: double (nullable = false)
 |-- label: integer (nullable = true)
 |-- prediction: string (nullable = true)



In [44]:
test_voting = test_voting.withColumn("prediction",test_voting.prediction.cast('double'))

In [45]:
test_voting.printSchema()

root
 |-- features: vector (nullable = true)
 |-- prediction_nn: double (nullable = false)
 |-- prediction_lr: double (nullable = false)
 |-- prediction_nb: double (nullable = false)
 |-- label: integer (nullable = true)
 |-- prediction: double (nullable = true)



In [46]:
test_m.f1(test_voting)

F1 score for given DataFrame: 77.3

