# Read the Data

In [None]:
%pyspark
# Spark SQL helper aliases. The dataset-preparation cells refer to them as
# F (column functions) and T (data types), so they must be imported before
# those cells run on a fresh interpreter.
from pyspark.sql import functions as F
from pyspark.sql import types as T

# Root folder of the parquet datasets. Paths are built by plain string
# concatenation, so the trailing slash is required.
root_file_path = "/PATH/temp/"
training_parquet_path = root_file_path + "training_df"
validation_parquet_path = root_file_path + "val_df"
test_parquet_path = root_file_path + "test_df"

# Load the three splits; every later cell derives its data from these frames.
training_df = sqc.read.parquet(training_parquet_path)
validation_df = sqc.read.parquet(validation_parquet_path)
test_df = sqc.read.parquet(test_parquet_path)

# Preparing the Training dataset

In [None]:
training_engager_user_df = training_df\
.dropDuplicates(["engager_user_id"])\
.select("engager_user_id", "engager_follower_count", "engager_following_count", "engager_is_verified", "engager_account_creation_time")\
.withColumn("engager_is_verified", F.col("engager_is_verified").cast(T.IntegerType()))\
.withColumn("year_engager", F.hour(F.to_timestamp("engager_account_creation_time")))\
.withColumn("is_bot_engager", F.when(((F.col("engager_following_count") < 4700) | (F.col("engager_following_count") > 5100)), 0).otherwise(1))

training_engager_user_df.write.parquet(root_file_path+"training_engager_user_df.parquet")

training_engagee_user_df = training_df\
.dropDuplicates(["engagee_user_id"])\
.select("engagee_user_id", "engagee_follower_count", "engagee_following_count", "engagee_is_verified", "engagee_account_creation_time")\
.withColumn("engagee_is_verified", F.col("engagee_is_verified").cast(T.IntegerType()))\
.withColumn("year_engagee", F.hour(F.to_timestamp("engagee_account_creation_time")))\
.withColumn("is_bot_engagee", F.when(((F.col("engagee_following_count") < 4700) | (F.col("engagee_following_count") > 5100)), 0).otherwise(1))

training_engagee_user_df.write.parquet(root_file_path+"training_engagee_user_df.parquet")

# Preparing the Validation dataset

In [None]:
%pyspark

validation_engager_user_df = validation_df\
.dropDuplicates(["engager_user_id"])\
.select("engager_user_id", "engager_follower_count", "engager_following_count", "engager_is_verified", "engager_account_creation_time")\
.withColumn("engager_is_verified", F.col("engager_is_verified").cast(T.IntegerType()))\
.withColumn("year_engager", F.hour(F.to_timestamp("engager_account_creation_time")))\
.withColumn("is_bot_engager", F.when(((F.col("engager_following_count") < 4700) | (F.col("engager_following_count") > 5100)), 0).otherwise(1))

validation_engager_user_df.write.parquet(root_file_path+"validation_engager_user_df.parquet")

validation_engagee_user_df = validation_df\
.dropDuplicates(["engagee_user_id"])\
.select("engagee_user_id", "engagee_follower_count", "engagee_following_count", "engagee_is_verified", "engagee_account_creation_time")\
.withColumn("engagee_is_verified", F.col("engagee_is_verified").cast(T.IntegerType()))\
.withColumn("year_engagee", F.hour(F.to_timestamp("engagee_account_creation_time")))\
.withColumn("is_bot_engagee", F.when(((F.col("engagee_following_count") < 4700) | (F.col("engagee_following_count") > 5100)), 0).otherwise(1))

validation_engagee_user_df.write.parquet(root_file_path+"validation_engagee_user_df.parquet")

# Preparing the Test dataset

In [None]:
%pyspark

test_engager_user_df = test_df\
.dropDuplicates(["engager_user_id"])\
.select("engager_user_id", "engager_follower_count", "engager_following_count", "engager_is_verified", "engager_account_creation_time")\
.withColumn("engager_is_verified", F.col("engager_is_verified").cast(T.IntegerType()))\
.withColumn("year_engager", F.hour(F.to_timestamp("engager_account_creation_time")))\
.withColumn("is_bot_engager", F.when(((F.col("engager_following_count") < 4700) | (F.col("engager_following_count") > 5100)), 0).otherwise(1))

test_engager_user_df.write.parquet(root_file_path+"test_engager_user_df.parquet")

test_engagee_user_df = test_df\
.dropDuplicates(["engagee_user_id"])\
.select("engagee_user_id", "engagee_follower_count", "engagee_following_count", "engagee_is_verified", "engagee_account_creation_time")\
.withColumn("engagee_is_verified", F.col("engagee_is_verified").cast(T.IntegerType()))\
.withColumn("year_engagee", F.hour(F.to_timestamp("engagee_account_creation_time")))\
.withColumn("is_bot_engagee", F.when(((F.col("engagee_following_count") < 4700) | (F.col("engagee_following_count") > 5100)), 0).otherwise(1))

test_engagee_user_df.write.parquet(root_file_path+"test_engagee_user_df.parquet")

# Engager Features

In [None]:
%pyspark
from pyspark.ml.feature import QuantileDiscretizer, OneHotEncoderEstimator, VectorAssembler, StringIndexer, FeatureHasher
from pyspark.ml import Pipeline

# Stages are only declared here; they all run when the pipeline is fitted at
# the bottom of the cell.
stages = []

# Numerical columns: discretize into 50 quantile buckets (invalid values are
# kept in their own bucket), then one-hot encode the bucket id.
numericalColumns = ["engager_follower_count", "engager_following_count"]
for numericalCol in numericalColumns:
    qd = QuantileDiscretizer(numBuckets=50, handleInvalid="keep", inputCol=numericalCol, outputCol=numericalCol + "Bucket")
    encoder = OneHotEncoderEstimator(inputCols=[qd.getOutputCol()], outputCols=[numericalCol + "classVec"])
    stages += [qd, encoder]

# Categorical columns: index the category values, then one-hot encode the index.
categoricalColumns = ["year_engager", "engager_is_verified", "is_bot_engager"]
for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index")
    encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
    stages += [stringIndexer, encoder]

# The hasher consumes the bucket ids and category indices, all treated as
# categorical. Every categorical column goes in through its "...Index" column
# (including is_bot_engager), matching the engagee pipeline.
# NOTE(review): the "...classVec" one-hot outputs are not read by the hasher.
featInputs = [c + "Bucket" for c in numericalColumns]
featInputs += [c + "Index" for c in categoricalColumns]
stages += [FeatureHasher(numFeatures=16, inputCols=featInputs, outputCol="engager_features", categoricalCols=featInputs)]

# Fit on the training users only, then preview the hashed feature vectors.
engager_user_features = Pipeline(stages=stages).fit(training_engager_user_df)
engager_user_features.transform(training_engager_user_df).select("engager_user_id", "engager_features").show()

# Engagee Features

In [None]:
%pyspark
# StringIndexer is used below, so it is imported here rather than relying on
# the engager cell having been run first.
from pyspark.ml.feature import QuantileDiscretizer, OneHotEncoderEstimator, VectorAssembler, StringIndexer, FeatureHasher
from pyspark.ml import Pipeline

# Stages are only declared here; they all run when the pipeline is fitted at
# the bottom of the cell.
stages = []

# Numerical columns: discretize into 50 quantile buckets (invalid values are
# kept in their own bucket), then one-hot encode the bucket id.
numericalColumns = ["engagee_follower_count", "engagee_following_count"]
for numericalCol in numericalColumns:
    qd = QuantileDiscretizer(numBuckets=50, handleInvalid="keep", inputCol=numericalCol, outputCol=numericalCol + "Bucket")
    encoder = OneHotEncoderEstimator(inputCols=[qd.getOutputCol()], outputCols=[numericalCol + "classVec"])
    stages += [qd, encoder]

# Categorical columns: index the category values, then one-hot encode the index.
categoricalColumns = ["year_engagee", "engagee_is_verified", "is_bot_engagee"]
for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index")
    encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
    stages += [stringIndexer, encoder]

# The hasher consumes the bucket ids and category indices, all treated as
# categorical.
# NOTE(review): the "...classVec" one-hot outputs are not read by the hasher.
featInputs = [c + "Bucket" for c in numericalColumns]
featInputs += [c + "Index" for c in categoricalColumns]
stages += [FeatureHasher(numFeatures=16, inputCols=featInputs, outputCol="engagee_features", categoricalCols=featInputs)]

# Fit on the training users only, then preview the hashed feature vectors.
# .show() is the action that runs the transform and displays the result.
engagee_user_features = Pipeline(stages=stages).fit(training_engagee_user_df)
engagee_user_features.transform(training_engagee_user_df).select("engagee_user_id", "engagee_features").show()

In [None]:
%pyspark
# Persist both fitted feature pipelines under the shared root folder.
for fitted_model, model_dir in (
    (engager_user_features, "create_engager_user_features_model"),
    (engagee_user_features, "create_engagee_user_features_model"),
):
    fitted_model.save(root_file_path + model_dir)

In [None]:
%pyspark
from pyspark.ml import PipelineModel

# Reload the two fitted pipelines from the locations they were saved to,
# rebinding the same variable names used by the rest of the notebook.
engager_user_features, engagee_user_features = (
    PipelineModel.load(root_file_path + model_dir)
    for model_dir in (
        "create_engager_user_features_model",
        "create_engagee_user_features_model",
    )
)

# Create engager and engagee features on test data

In [None]:
%pyspark
# Apply the fitted engager pipeline to the test users and preview the result.
(
    engager_user_features
    .transform(test_engager_user_df)
    .select("engager_user_id", "engager_features")
    .show()
)

# Same for the engagee side.
(
    engagee_user_features
    .transform(test_engagee_user_df)
    .select("engagee_user_id", "engagee_features")
    .show()
)