In [1]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .master("local[*]") \
    .appName("CW") \
    .getOrCreate()

sc = spark.sparkContext

In [2]:
sc

In [3]:
from pyspark.sql import SparkSession
from preprocessing.Preprocessor import Preprocessor
from pyspark.ml.classification import LogisticRegression
from metrics.metrics import Metrics
from pyspark.ml.tuning import TrainValidationSplit, CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import pyspark.sql.functions as F
import matplotlib.pyplot as plt

In [4]:
pp = Preprocessor(spark)

In [5]:
train = pp.to_tfidf('Cleaned Data/train.csv', min_frequency=275., remove_stop_words = False)
test =  pp.to_tfidf('Cleaned Data/test.csv')
train.cache()
test.cache()

DataFrame[features: vector, label: int]

In [6]:
train.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(1948,[0,1,2,3,4,...|    2|
|(1948,[0,1,6,7,16...|    0|
|(1948,[0,1,2,3,4,...|    0|
|(1948,[5,27,34,74...|    2|
|(1948,[0,1,2,4,5,...|    2|
+--------------------+-----+
only showing top 5 rows



In [8]:
############MODEL 1

In [7]:
train_size = train.count()
train = train.withColumn("weight",F.lit(1))
train.show(5)

+--------------------+-----+------+
|            features|label|weight|
+--------------------+-----+------+
|(1948,[0,1,2,3,4,...|    2|     1|
|(1948,[0,1,6,7,16...|    0|     1|
|(1948,[0,1,2,3,4,...|    0|     1|
|(1948,[5,27,34,74...|    2|     1|
|(1948,[0,1,2,4,5,...|    2|     1|
+--------------------+-----+------+
only showing top 5 rows



In [9]:
train_sort = train.sort(F.desc("weight"))
train_sort.show(5)

+--------------------+-----+------+
|            features|label|weight|
+--------------------+-----+------+
|(1948,[0,1,2,3,4,...|    2|     1|
|(1948,[0,1,6,7,16...|    0|     1|
|(1948,[0,1,2,3,4,...|    0|     1|
|(1948,[5,27,34,74...|    2|     1|
|(1948,[0,1,2,4,5,...|    2|     1|
+--------------------+-----+------+
only showing top 5 rows



In [10]:
#train_sample = train_sort.limit(int(train_size*0.6))
#train_sample.count()

In [11]:
from pyspark.ml.classification import LogisticRegression

In [12]:
lr = LogisticRegression(maxIter=100, aggregationDepth = 2, elasticNetParam=0)

In [15]:
model_1 = lr.fit(train_sort)
model_1_predictions = model_1.transform(train_sort).select('features','prediction', 'label', 'weight')
model_1_predictions.show()

+--------------------+----------+-----+------+
|            features|prediction|label|weight|
+--------------------+----------+-----+------+
|(1948,[0,1,2,3,4,...|       2.0|    2|     1|
|(1948,[0,1,6,7,16...|       2.0|    0|     1|
|(1948,[0,1,2,3,4,...|       0.0|    0|     1|
|(1948,[5,27,34,74...|       2.0|    2|     1|
|(1948,[0,1,2,4,5,...|       2.0|    2|     1|
|(1948,[0,1,2,3,4,...|       0.0|    0|     1|
|(1948,[0,1,2,3,4,...|       2.0|    2|     1|
|(1948,[0,1,3,4,5,...|       2.0|    2|     1|
|(1948,[3,23,27,33...|       2.0|    2|     1|
|(1948,[0,1,3,4,5,...|       0.0|    0|     1|
|(1948,[0,1,2,3,5,...|       2.0|    2|     1|
|(1948,[0,1,2,3,7,...|       0.0|    1|     1|
|(1948,[0,1,2,3,4,...|       0.0|    0|     1|
|(1948,[0,1,4,7,9,...|       2.0|    2|     1|
|(1948,[0,1,2,5,6,...|       2.0|    2|     1|
|(1948,[2,5,10,14,...|       2.0|    2|     1|
|(1948,[0,1,2,4,5,...|       2.0|    2|     1|
|(1948,[0,1,2,3,4,...|       2.0|    1|     1|
|(1948,[0,1,2

In [18]:
model_1_predictions.printSchema()

root
 |-- features: vector (nullable = true)
 |-- prediction: double (nullable = false)
 |-- label: integer (nullable = true)
 |-- weight: integer (nullable = false)



In [20]:
model_1_predictions = model_1_predictions.withColumn("prediction",F.col("prediction").cast("Integer"))
model_1_predictions.printSchema()

root
 |-- features: vector (nullable = true)
 |-- prediction: integer (nullable = true)
 |-- label: integer (nullable = true)
 |-- weight: integer (nullable = false)



In [21]:
model_1_predictions = model_1_predictions.withColumn("weight", 
                    F.when(F.col("prediction") != F.col("Label"),2).otherwise(1))

In [24]:
model_1_predictions = model_1_predictions.withColumnRenamed("weight","weight_update")
model_1_predictions = model_1_predictions.withColumnRenamed("label_misc","label")
model_1_predictions.show(5)

+--------------------+----------+-----+-------------+
|            features|prediction|label|weight_update|
+--------------------+----------+-----+-------------+
|(1948,[0,1,2,3,4,...|         2|    2|            1|
|(1948,[0,1,6,7,16...|         2|    0|            2|
|(1948,[0,1,2,3,4,...|         0|    0|            1|
|(1948,[5,27,34,74...|         2|    2|            1|
|(1948,[0,1,2,4,5,...|         2|    2|            1|
+--------------------+----------+-----+-------------+
only showing top 5 rows



In [None]:
#train_data_update = train.join(train_model_preds, ["features"])
#train_data_update.show(5)

In [25]:
new_training_data = model_1_predictions.sort(F.desc("weight_update"))

In [26]:
new_training_data.show(5)

+--------------------+----------+-----+-------------+
|            features|prediction|label|weight_update|
+--------------------+----------+-----+-------------+
|(1948,[0,1,3,8,16...|         2|    0|            2|
|(1948,[0,1,2,3,4,...|         1|    2|            2|
|(1948,[0,1,2,3,4,...|         0|    1|            2|
|(1948,[0,1,3,7,15...|         2|    0|            2|
|(1948,[0,3,4,5,7,...|         0|    2|            2|
+--------------------+----------+-----+-------------+
only showing top 5 rows



In [None]:
############MODEL 2

In [27]:
train_data_model_2 = new_training_data.limit(int(train_size*0.6))
train_data_model_2 = train_data_model_2.drop("prediction")
train_data_model_2.show(5)

+--------------------+-----+-------------+
|            features|label|weight_update|
+--------------------+-----+-------------+
|(1948,[0,1,6,7,16...|    0|            2|
|(1948,[0,1,2,3,7,...|    1|            2|
|(1948,[0,1,2,3,4,...|    1|            2|
|(1948,[0,1,2,3,4,...|    0|            2|
|(1948,[0,1,2,4,6,...|    0|            2|
+--------------------+-----+-------------+
only showing top 5 rows



In [30]:
model_2 = lr.fit(train_data_model_2)
model_2_predictions = model_2.transform(train_data_model_2).select('features','prediction', 'label', 'weight_update')
model_2_predictions.show(5)

+--------------------+----------+-----+-------------+
|            features|prediction|label|weight_update|
+--------------------+----------+-----+-------------+
|(1948,[0,1,6,7,16...|       2.0|    0|            2|
|(1948,[0,1,2,3,7,...|       0.0|    1|            2|
|(1948,[0,1,2,3,4,...|       2.0|    1|            2|
|(1948,[0,1,2,3,4,...|       2.0|    0|            2|
|(1948,[0,1,2,4,6,...|       2.0|    0|            2|
+--------------------+----------+-----+-------------+
only showing top 5 rows



In [31]:
model_2_predictions = model_2_predictions.withColumn("weight_update", 
                    F.when(F.col("prediction") != F.col("Label"),2).otherwise(1))

In [32]:
model_2_predictions.show(5)

+--------------------+----------+-----+-------------+
|            features|prediction|label|weight_update|
+--------------------+----------+-----+-------------+
|(1948,[0,1,6,7,16...|       2.0|    0|            2|
|(1948,[0,1,2,3,7,...|       0.0|    1|            2|
|(1948,[0,1,2,3,4,...|       2.0|    1|            2|
|(1948,[0,1,2,3,4,...|       2.0|    0|            2|
|(1948,[0,1,2,4,6,...|       2.0|    0|            2|
+--------------------+----------+-----+-------------+
only showing top 5 rows



In [35]:
new_training_data_2 = model_2_predictions.sort(F.desc("weight_update"))
new_training_data_2.show(5)

+--------------------+----------+-----+-------------+
|            features|prediction|label|weight_update|
+--------------------+----------+-----+-------------+
|(1948,[0,1,6,7,16...|       2.0|    0|            2|
|(1948,[0,1,2,3,7,...|       0.0|    1|            2|
|(1948,[0,1,2,3,4,...|       2.0|    1|            2|
|(1948,[0,1,2,3,4,...|       2.0|    0|            2|
|(1948,[0,1,2,4,6,...|       2.0|    0|            2|
+--------------------+----------+-----+-------------+
only showing top 5 rows



In [33]:
############MODEL 3

In [36]:
train_data_model_3 = new_training_data_2.limit(int(train_size*0.6))
train_data_model_3 = train_data_model_3.drop("prediction")
train_data_model_3.show(5)

+--------------------+-----+-------------+
|            features|label|weight_update|
+--------------------+-----+-------------+
|(1948,[0,1,6,7,16...|    0|            2|
|(1948,[0,1,2,3,7,...|    1|            2|
|(1948,[0,1,2,3,4,...|    1|            2|
|(1948,[0,1,2,3,4,...|    0|            2|
|(1948,[0,1,2,4,6,...|    0|            2|
+--------------------+-----+-------------+
only showing top 5 rows



In [37]:
model_3 = lr.fit(train_data_model_3)
model_3_predictions = model_3.transform(train_data_model_3).select('features','prediction', 'label', 'weight_update')
model_3_predictions.show(5)

+--------------------+----------+-----+-------------+
|            features|prediction|label|weight_update|
+--------------------+----------+-----+-------------+
|(1948,[0,1,6,7,16...|       2.0|    0|            2|
|(1948,[0,1,2,3,7,...|       0.0|    1|            2|
|(1948,[0,1,2,3,4,...|       2.0|    1|            2|
|(1948,[0,1,2,3,4,...|       2.0|    0|            2|
|(1948,[0,1,2,4,6,...|       2.0|    0|            2|
+--------------------+----------+-----+-------------+
only showing top 5 rows



In [None]:
############################Test Models on Test Data

In [40]:
model_1_test = model_1.transform(test).select('features','prediction','label')
model_2_test = model_2.transform(test).select('features','prediction')
model_3_test = model_3.transform(test).select('features','prediction')

In [41]:
model_1_test.show(5)

+--------------------+----------+-----+
|            features|prediction|label|
+--------------------+----------+-----+
|(1948,[0,1,3,7,8,...|       2.0|    2|
|(1948,[0,1,2,3,4,...|       2.0|    2|
|(1948,[0,1,2,3,4,...|       2.0|    1|
|(1948,[0,2,3,4,5,...|       2.0|    2|
|(1948,[0,1,2,4,5,...|       2.0|    2|
+--------------------+----------+-----+
only showing top 5 rows



In [42]:
model_1_test = model_1_test.withColumnRenamed("prediction","prediction_1")
model_2_test = model_2_test.withColumnRenamed("prediction","prediction_2")
model_3_test = model_3_test.withColumnRenamed("prediction","prediction_3")

In [43]:
############# Voting

In [44]:
test_voting = model_1_test.join(model_2_test, ["features"])

In [45]:
test_voting.show(5)

+--------------------+------------+-----+------------+
|            features|prediction_1|label|prediction_2|
+--------------------+------------+-----+------------+
|(1948,[0,1,3,7,8,...|         2.0|    2|         2.0|
|(1948,[0,1,2,3,4,...|         2.0|    2|         2.0|
|(1948,[0,1,2,3,4,...|         2.0|    2|         2.0|
|(1948,[0,1,2,3,4,...|         2.0|    1|         2.0|
|(1948,[0,2,3,4,5,...|         2.0|    2|         2.0|
+--------------------+------------+-----+------------+
only showing top 5 rows



In [46]:
test_voting = test_voting.join(model_3_test, ["features"])
test_voting.show(5)

+--------------------+------------+-----+------------+------------+
|            features|prediction_1|label|prediction_2|prediction_3|
+--------------------+------------+-----+------------+------------+
|(1948,[0,1,2,3,4,...|         2.0|    2|         2.0|         2.0|
|(1948,[0,1,2,3,4,...|         2.0|    0|         2.0|         2.0|
|(1948,[0,1,2,3,4,...|         2.0|    2|         2.0|         2.0|
|(1948,[0,1,2,3,4,...|         2.0|    2|         2.0|         2.0|
|(1948,[0,1,2,3,4,...|         2.0|    2|         2.0|         2.0|
+--------------------+------------+-----+------------+------------+
only showing top 5 rows



In [47]:
from collections import Counter
from pyspark.sql.functions import udf

@udf
def mode(*v):
    counter = Counter(x for x in v if x is not None)
    if len(counter) > 0:
        return counter.most_common(1)[0][0]
    else:
        return None

test_voting = test_voting.withColumn('prediction', mode('prediction_1', 'prediction_2', 'prediction_3'))

In [48]:
test_voting.show(5)

+--------------------+------------+-----+------------+------------+----------+
|            features|prediction_1|label|prediction_2|prediction_3|prediction|
+--------------------+------------+-----+------------+------------+----------+
|(1948,[0,1,2,3,4,...|         2.0|    2|         2.0|         2.0|       2.0|
|(1948,[0,1,2,3,4,...|         2.0|    0|         2.0|         2.0|       2.0|
|(1948,[0,1,2,3,4,...|         2.0|    2|         2.0|         2.0|       2.0|
|(1948,[0,1,2,3,4,...|         2.0|    2|         2.0|         2.0|       2.0|
|(1948,[0,1,2,3,4,...|         2.0|    2|         2.0|         2.0|       2.0|
+--------------------+------------+-----+------------+------------+----------+
only showing top 5 rows



In [49]:
test_voting = test_voting.withColumn("prediction",test_voting.prediction.cast('double'))

In [51]:
test_m = Metrics()
test_m.f1(test_voting)

F1 score for given DataFrame: 76.2



In [None]:
######################## Ignore code below!!!

In [None]:
train_model_preds = train_model.transform(train_sample).select('features','prediction', 'label', 'weight')

In [None]:
train_model_preds = train_model_preds.withColumn("prediction",F.col("prediction").cast("Integer"))

In [None]:
train_model_preds = train_model_preds.withColumn("weight", F.when(F.col("prediction") != F.col("Label"),2)
      .otherwise(1))

In [None]:
train_model_preds = train_model_preds.withColumnRenamed("weight","weight_update")
train_model_preds = train_model_preds.withColumnRenamed("label","label_misc")

In [None]:
train_data_update = train.join(train_model_preds, ["features"])

In [None]:
train_data_update = train_data_update.sort(F.desc("weight_update"))

In [None]:
#
train_sample = train_data_update.limit(int(train_size*0.75))

In [None]:
train_sample = train_sample.drop("prediction")

In [None]:
train_model = lr.fit(train_sample)

In [None]:
##################################################################

In [None]:
test_m = Metrics()

In [None]:
test_model = train_model.transform(test).select('prediction', 'label')

In [None]:
test_m.f1(test_model)

In [None]:
test_m.confusion_matrix(test_model)