In [1]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .master("local[*]") \
    .appName("CW") \
    .getOrCreate()

sc = spark.sparkContext

In [2]:
sc

In [3]:
from pyspark.sql import SparkSession
from preprocessing.Preprocessor import Preprocessor
import pyspark.sql.functions as F

In [4]:
pp = Preprocessor(spark)

In [5]:
train = pp.to_word2vec('Cleaned Data/train.csv', vector_size=100)
#train = train.sample(False,0.6)
test =  pp.to_word2vec('Cleaned Data/test.csv')
train.cache()
test.cache()

DataFrame[features: vector, label: int]

In [6]:
train.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[0.03181895008726...|    2|
|[0.03697207064500...|    0|
|[-0.0298445346707...|    0|
|[-0.0209716792645...|    2|
|[-0.0022496435571...|    2|
+--------------------+-----+
only showing top 5 rows



In [7]:
train_rdd = train.rdd.repartition(10)

In [8]:
test_rdd = test.rdd.repartition(5)

In [9]:
train_column_names = train.columns
test_column_names = test.columns
column_names = train.columns

In [10]:
import pandas as pd

def toPandas_partition(instances):
    panda_df = pd.DataFrame(columns = column_names)
  
    for instance in instances:
        panda_df = panda_df.append(instance.asDict(), ignore_index=True)
    
    return [panda_df]

In [11]:
train_rdd_pandas = train_rdd.mapPartitions(toPandas_partition)
test_rdd_pandas = test_rdd.mapPartitions(toPandas_partition)

In [12]:
def build_model(partition):
    
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn import metrics
    import numpy as np
    
    AdaModel = AdaBoostClassifier(n_estimators=50, learning_rate = 1)
    
    X_train = partition['features'].apply(lambda x : np.array(x.toArray())).to_numpy().reshape(-1,1)
    X_train = np.apply_along_axis(lambda x : x[0], 1, X_train)
    Y_train = partition['label']
    Y_train = Y_train.astype('int')
    
    print(X_train.shape)
    
    model = AdaModel.fit(X_train, Y_train)
    
    return model

In [13]:
models = train_rdd_pandas.map(build_model).collect()



In [None]:
type(models)
len(models)

In [14]:
import numpy as np

def test_classifier(partition):
    predictions = []
    
    X_test = partition['features'].apply(lambda x : np.array(x.toArray())).to_numpy().reshape(-1,1)
    X_test = np.apply_along_axis(lambda x : x[0], 1, X_test)
     
    for m in models:
        predictions.append(m.predict(X_test).tolist())
        
    return predictions

In [None]:
"""def aggregate_predictions(preds):
    from collections import Counter
    
    agg = [0]*len(preds[0]) # list of 0s where I will aggregate the resutls
    
    for lst in preds:
        for i in range(len(lst)):
            agg[i] += lst[i]
                

    mode = [Counter(agg).most_common(1)[0][0]]
                
    return mode"""

In [18]:
def aggregate_predictions(preds):
    from collections import Counter
    
    for lst in preds:
        mode = [Counter(lst).most_common(1)[0][0]]
            
    return mode

In [19]:
predictions = test_rdd_pandas.map(test_classifier).map(aggregate_predictions)

In [20]:
y_pred = predictions.collect()

In [21]:
y_test = list(map(lambda x: int(x.label), test.select("label").collect()))

In [22]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [23]:
f1_score(y_test, y_pred)

ValueError: Found input variables with inconsistent numbers of samples: [16393, 5]

In [None]:
accuracy_score(y_test, y_pred)

In [24]:
len(y_test)

16393

In [25]:
len(y_pred)

5