In [None]:
%pyspark
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from pyspark.sql import SQLContext
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql import Row
from pyspark.sql.window import Window

from transformers import BertTokenizer
# Tokenizer for the "bert-base-multilingual-cased" checkpoint; input text is
# not lower-cased.
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-multilingual-cased",
    do_lower_case=False,
)

@F.udf("String")
def decode_tokens(tokens):
  """Spark UDF: decode `tokens` with the module-level `tokenizer`.

  Args:
    tokens: the column value handed over by Spark; it is forwarded unchanged
      to `tokenizer.decode`. A SQL NULL arrives here as None.

  Returns:
    The decoded string, or None when the input is None so that a null input
    yields a null output rather than being passed to the tokenizer.
  """
  if tokens is None:
    return None
  return tokenizer.decode(tokens)

# SQL entry point used by the cells below for all parquet reads.
# NOTE(review): `sc` is not defined anywhere in this notebook — presumably the
# SparkContext injected by the %pyspark interpreter; confirm.
sqc = SQLContext(sc)

In [None]:
%pyspark
# Every dataset, feature pipeline, model and prediction file in this notebook
# lives under this root directory.
root_file_path = "/PATH/temp/"

# Parquet locations of the three engagement splits.
training_parquet_path = root_file_path + "training_df"
validation_parquet_path = root_file_path + "val_df"
test_parquet_path = root_file_path + "test_df"

# Read the splits in the same order as the original cell: training, validation, test.
training_df, validation_df, test_df = [
    sqc.read.parquet(split_path)
    for split_path in (training_parquet_path, validation_parquet_path, test_parquet_path)
]

In [None]:
%pyspark
# Turn each engagement timestamp into a binary label: 0 when the timestamp is
# null, 1 otherwise, then keep only the join keys and the four labels.
# NOTE(review): the next cell starts by rebuilding this exact frame, so this
# cell is redundant when the notebook is run top to bottom.
training_data = (
    training_df
    .withColumn("reply", F.when(F.col("reply_engagement_timestamp").isNull(), 0).otherwise(1))
    .withColumn("retweet", F.when(F.col("retweet_engagement_timestamp").isNull(), 0).otherwise(1))
    .withColumn("rtWithCmt", F.when(F.col("retweet_with_comment_engagement_timestamp").isNull(), 0).otherwise(1))
    .withColumn("like", F.when(F.col("like_engagement_timestamp").isNull(), 0).otherwise(1))
    .select("tweet_id", "engager_user_id", "engagee_user_id", "reply", "retweet", "rtWithCmt", "like")
)

In [None]:
%pyspark

# Assemble the training set: binary engagement labels joined with the tweet,
# engager and engagee feature vectors, merged into one "featuresAssembled"
# column and persisted as parquet.
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import VectorAssembler

# Each label is 0 when the matching engagement timestamp is null, 1 otherwise.
training_data = (
    training_df
    .withColumn("reply", F.when(F.col("reply_engagement_timestamp").isNull(), 0).otherwise(1))
    .withColumn("retweet", F.when(F.col("retweet_engagement_timestamp").isNull(), 0).otherwise(1))
    .withColumn("rtWithCmt", F.when(F.col("retweet_with_comment_engagement_timestamp").isNull(), 0).otherwise(1))
    .withColumn("like", F.when(F.col("like_engagement_timestamp").isNull(), 0).otherwise(1))
    .select("tweet_id", "engager_user_id", "engagee_user_id", "reply", "retweet", "rtWithCmt", "like")
)

# Feature pipelines that were fitted and saved elsewhere.
create_tweet_features = PipelineModel.load(root_file_path+"create_tweet_features_model")
create_engager_user_features = PipelineModel.load(root_file_path+"create_engager_user_features_model")
create_engagee_user_features = PipelineModel.load(root_file_path+"create_engagee_user_features_model")

tweet_features = sqc.read.parquet(root_file_path+"training_tweets")
tweet_features = create_tweet_features.transform(tweet_features).select("tweet_id", "tweet_features")

engager_features = sqc.read.parquet(root_file_path+"training_engager_user_df.parquet")
engager_features = create_engager_user_features.transform(engager_features).select("engager_user_id", "engager_features")

engagee_features = sqc.read.parquet(root_file_path+"training_engagee_user_df.parquet")
engagee_features = create_engagee_user_features.transform(engagee_features).select("engagee_user_id", "engagee_features")

# join() on a column name is an inner join: rows with no match in a feature
# table are dropped from the training set.
training_data = training_data.join(tweet_features, "tweet_id")
training_data = training_data.join(engager_features, "engager_user_id")
training_data = training_data.join(engagee_features, "engagee_user_id")

assemblerInputs = ["tweet_features", "engager_features", "engagee_features"]
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="featuresAssembled")

training_data = assembler.transform(training_data).drop(*assemblerInputs)
# Overwrite so the cell can be re-run; the default save mode raises when the
# target path already exists.
training_data.write.mode("overwrite").parquet(root_file_path+"training_data.parquet")

In [None]:
# Re-read the assembled training set from the parquet written by the previous
# cell, replacing the in-memory frame.
# NOTE(review): unlike its neighbours this cell has no %pyspark directive —
# confirm the note's default interpreter is PySpark.
training_data = sqc.read.parquet(root_file_path+"training_data.parquet")

In [None]:
# Print the first rows of the assembled training set as a sanity check.
# NOTE(review): this cell has no %pyspark directive — confirm the default
# interpreter is PySpark.
training_data.show()

# Train for Likes

In [None]:
%pyspark
# Fit a gradient-boosted tree classifier on the "like" label and persist it.
from pyspark.ml.classification import GBTClassifier

gbt_like_model = GBTClassifier(labelCol="like", featuresCol="featuresAssembled").fit(training_data)
# write().overwrite() keeps the cell re-runnable: a plain save() raises when
# the path already exists, which would discard the fit on a second run.
gbt_like_model.write().overwrite().save(root_file_path+"models/gbt_like2")

In [None]:
%pyspark
# Load a previously saved "like" model from disk.
# NOTE(review): this reads "models/gbt_like" while the training cell saves to
# "models/gbt_like2" — confirm which artifact is intended. The prediction
# cells visible in this notebook use gbt_like_model, not like_model_loaded.
from pyspark.ml.classification import GBTClassificationModel
like_model_loaded = GBTClassificationModel.load(root_file_path+"models/gbt_like")

# Train for Reply

In [None]:
%pyspark
# Fit a gradient-boosted tree classifier on the "reply" label and persist it.
from pyspark.ml.classification import GBTClassifier

gbt_reply_model = GBTClassifier(labelCol="reply", featuresCol="featuresAssembled").fit(training_data)
# write().overwrite() keeps the cell re-runnable: a plain save() raises when
# the path already exists, which would discard the fit on a second run.
gbt_reply_model.write().overwrite().save(root_file_path+"models/gbt_reply2")

In [None]:
# Load a previously saved "reply" model; relies on GBTClassificationModel
# having been imported by an earlier cell.
# NOTE(review): this reads "models/gbt_reply" while the training cell saves to
# "models/gbt_reply2", and the cell has no %pyspark directive — confirm both.
reply_model_loaded = GBTClassificationModel.load(root_file_path+"models/gbt_reply")

# Train for Retweet

In [None]:
%pyspark
# Fit a gradient-boosted tree classifier on the "retweet" label and persist it.
from pyspark.ml.classification import GBTClassifier

gbt_retweet_model = GBTClassifier(labelCol="retweet", featuresCol="featuresAssembled").fit(training_data)
# write().overwrite() keeps the cell re-runnable: a plain save() raises when
# the path already exists, which would discard the fit on a second run.
gbt_retweet_model.write().overwrite().save(root_file_path+"models/gbt_retweet2")

In [None]:
%pyspark
# Load a previously saved "retweet" model; relies on GBTClassificationModel
# having been imported by an earlier cell.
# NOTE(review): this reads "models/gbt_retweet" while the training cell saves
# to "models/gbt_retweet2" — confirm which artifact is intended.
retweet_model_loaded = GBTClassificationModel.load(root_file_path+"models/gbt_retweet")

# Train for Retweet with Comment (rtWithCmt)

In [None]:
%pyspark
# Fit a gradient-boosted tree classifier on the "rtWithCmt" label and persist it.
from pyspark.ml.classification import GBTClassifier

gbt_rtWithCmt_model = GBTClassifier(labelCol="rtWithCmt", featuresCol="featuresAssembled").fit(training_data)
# write().overwrite() keeps the cell re-runnable: a plain save() raises when
# the path already exists, which would discard the fit on a second run.
gbt_rtWithCmt_model.write().overwrite().save(root_file_path+"models/gbt_rtWithCmt2")


In [None]:
%pyspark
# Load a previously saved "rtWithCmt" model; relies on GBTClassificationModel
# having been imported by an earlier cell.
# NOTE(review): this reads "models/gbt_rtWithCmt" while the training cell
# saves to "models/gbt_rtWithCmt2" — confirm which artifact is intended.
retwithCmt_model_loaded = GBTClassificationModel.load(root_file_path+"models/gbt_rtWithCmt")

In [None]:
%pyspark
# Assemble the validation set: join keys joined with the tweet, engager and
# engagee feature vectors, merged into one "featuresAssembled" column and
# persisted as parquet. No label columns are derived for this split.
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import VectorAssembler

validation_data = validation_df.select("tweet_id", "engager_user_id", "engagee_user_id")

# Feature pipelines that were fitted and saved elsewhere.
create_tweet_features = PipelineModel.load(root_file_path+"create_tweet_features_model")
create_engager_user_features = PipelineModel.load(root_file_path+"create_engager_user_features_model")
create_engagee_user_features = PipelineModel.load(root_file_path+"create_engagee_user_features_model")

validation_features = sqc.read.parquet(root_file_path+"validation_tweets")
validation_features = create_tweet_features.transform(validation_features).select("tweet_id", "tweet_features")

engager_features = sqc.read.parquet(root_file_path+"validation_engager_user_df.parquet")
engager_features = create_engager_user_features.transform(engager_features).select("engager_user_id", "engager_features")

engagee_features = sqc.read.parquet(root_file_path+"validation_engagee_user_df.parquet")
engagee_features = create_engagee_user_features.transform(engagee_features).select("engagee_user_id", "engagee_features")

# join() on a column name is an inner join: rows with no match in a feature
# table are dropped and therefore receive no prediction.
validation_data = validation_data.join(validation_features, "tweet_id")
validation_data = validation_data.join(engager_features, "engager_user_id")
validation_data = validation_data.join(engagee_features, "engagee_user_id")

assemblerInputs = ["tweet_features", "engager_features", "engagee_features"]
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="featuresAssembled")

validation_data = assembler.transform(validation_data).drop(*assemblerInputs)
# Overwrite so the cell can be re-run; the default save mode raises when the
# target path already exists.
validation_data.write.mode("overwrite").parquet(root_file_path+"validation_data.parquet")

In [None]:
# Re-read the assembled validation set from parquet, replacing the in-memory frame.
# NOTE(review): this cell has no %pyspark directive, and the first prediction
# cell below performs the same read again — confirm this cell is still needed.
validation_data = sqc.read.parquet(root_file_path+"validation_data.parquet")

# Predict for like on validation data

In [None]:
%pyspark

# Score the validation set with the in-session "like" model and export the
# score for label 1 as CSV.
validation_data = sqc.read.parquet(root_file_path+"validation_data.parquet")

print("making predictions ...")
predictions = gbt_like_model.transform(validation_data)

print("writing predictions to file")
# Take element 1 of the "probability" vector. float() accepts both numpy
# scalars and plain Python floats, whereas .item() exists only on numpy scalars.
split1_udf = F.udf(lambda value: float(value[1]), T.DoubleType())
# NOTE(review): rows are keyed by engagee_user_id, not engager_user_id — confirm
# this is the user column the consumer of this file expects.
# mode("overwrite") keeps the cell re-runnable (the default mode raises when the
# path exists). The directory spelling "validaiton_predictions" is left
# untouched so the output location does not move.
predictions.select("tweet_id", "engagee_user_id", split1_udf("probability").alias("probability"))\
.coalesce(1).write.mode("overwrite").csv(root_file_path+"validaiton_predictions/like2.csv")

# Predict for Reply on validation data

In [None]:
%pyspark

# Score the validation set with the in-session "reply" model and export the
# score for label 1 as CSV.
print("making predictions ...")
predictions = gbt_reply_model.transform(validation_data)

print("writing predictions to file")
# Take element 1 of the "probability" vector. float() accepts both numpy
# scalars and plain Python floats, whereas .item() exists only on numpy scalars.
split1_udf = F.udf(lambda value: float(value[1]), T.DoubleType())
# NOTE(review): rows are keyed by engagee_user_id, not engager_user_id — confirm
# this is the user column the consumer of this file expects.
# mode("overwrite") keeps the cell re-runnable (the default mode raises when the
# path exists). The directory spelling "validaiton_predictions" is left
# untouched so the output location does not move.
predictions.select("tweet_id", "engagee_user_id", split1_udf("probability").alias("probability"))\
.coalesce(1).write.mode("overwrite").csv(root_file_path+"validaiton_predictions/reply2.csv")


# Predict for retweet on validation data

In [None]:
%pyspark
# Score the validation set with the in-session "retweet" model and export the
# score for label 1 as CSV.
print("making predictions ...")
predictions = gbt_retweet_model.transform(validation_data)

print("writing predictions to file")
# Take element 1 of the "probability" vector. float() accepts both numpy
# scalars and plain Python floats, whereas .item() exists only on numpy scalars.
split1_udf = F.udf(lambda value: float(value[1]), T.DoubleType())
# NOTE(review): rows are keyed by engagee_user_id, not engager_user_id — confirm
# this is the user column the consumer of this file expects.
# mode("overwrite") keeps the cell re-runnable (the default mode raises when the
# path exists). The directory spelling "validaiton_predictions" is left
# untouched so the output location does not move.
predictions.select("tweet_id", "engagee_user_id", split1_udf("probability").alias("probability"))\
.coalesce(1).write.mode("overwrite").csv(root_file_path+"validaiton_predictions/retweet2.csv")

# Predict for rtWithCmt on validation data

In [None]:
%pyspark
# Score the validation set with the in-session "rtWithCmt" model and export
# the score for label 1 as CSV.
print("making predictions ...")
predictions = gbt_rtWithCmt_model.transform(validation_data)

print("writing predictions to file")
# Take element 1 of the "probability" vector. float() accepts both numpy
# scalars and plain Python floats, whereas .item() exists only on numpy scalars.
split1_udf = F.udf(lambda value: float(value[1]), T.DoubleType())
# NOTE(review): rows are keyed by engagee_user_id, not engager_user_id — confirm
# this is the user column the consumer of this file expects.
# mode("overwrite") keeps the cell re-runnable (the default mode raises when the
# path exists). The directory spelling "validaiton_predictions" is left
# untouched so the output location does not move.
predictions.select("tweet_id", "engagee_user_id", split1_udf("probability").alias("probability"))\
.coalesce(1).write.mode("overwrite").csv(root_file_path+"validaiton_predictions/rtWithCmt2.csv")

In [None]:
%pyspark
# Assemble the test set: join keys joined with the tweet, engager and engagee
# feature vectors, merged into one "featuresAssembled" column and persisted
# as parquet. No label columns are derived for this split.
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import VectorAssembler

test_data = test_df.select("tweet_id", "engager_user_id", "engagee_user_id")

# Feature pipelines that were fitted and saved elsewhere.
create_tweet_features = PipelineModel.load(root_file_path+"create_tweet_features_model")
create_engager_user_features = PipelineModel.load(root_file_path+"create_engager_user_features_model")
create_engagee_user_features = PipelineModel.load(root_file_path+"create_engagee_user_features_model")

test_features = sqc.read.parquet(root_file_path+"test_tweets")
test_features = create_tweet_features.transform(test_features).select("tweet_id", "tweet_features")

engager_features = sqc.read.parquet(root_file_path+"test_engager_user_df.parquet")
engager_features = create_engager_user_features.transform(engager_features).select("engager_user_id", "engager_features")

engagee_features = sqc.read.parquet(root_file_path+"test_engagee_user_df.parquet")
engagee_features = create_engagee_user_features.transform(engagee_features).select("engagee_user_id", "engagee_features")

# join() on a column name is an inner join: rows with no match in a feature
# table are dropped and therefore receive no prediction.
test_data = test_data.join(test_features, "tweet_id")
test_data = test_data.join(engager_features, "engager_user_id")
test_data = test_data.join(engagee_features, "engagee_user_id")

assemblerInputs = ["tweet_features", "engager_features", "engagee_features"]
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="featuresAssembled")

test_data = assembler.transform(test_data).drop(*assemblerInputs)
# Overwrite so the cell can be re-run; the default save mode raises when the
# target path already exists.
test_data.write.mode("overwrite").parquet(root_file_path+"test_data.parquet")

In [None]:
%pyspark
# Re-read the assembled test set from the parquet written by the previous
# cell, replacing the in-memory frame.
test_data = sqc.read.parquet(root_file_path+"test_data.parquet")

# Predict for like on test data

In [None]:
%pyspark

# Score the test set with the in-session "like" model and export the score
# for label 1 as CSV.
print("making predictions ...")
predictions = gbt_like_model.transform(test_data)

print("writing predictions to file")
# Take element 1 of the "probability" vector. float() accepts both numpy
# scalars and plain Python floats, whereas .item() exists only on numpy scalars.
split1_udf = F.udf(lambda value: float(value[1]), T.DoubleType())
# NOTE(review): rows are keyed by engagee_user_id, not engager_user_id — confirm
# this is the user column the consumer of this file expects.
# mode("overwrite") keeps the cell re-runnable; the default mode raises when
# the path already exists.
predictions.select("tweet_id", "engagee_user_id", split1_udf("probability").alias("probability"))\
.coalesce(1).write.mode("overwrite").csv(root_file_path+"predictions/like2.csv")

# Predict for Reply on test data

In [None]:
%pyspark
# Score the test set with the in-session "reply" model and export the score
# for label 1 as CSV.
print("making predictions ...")
predictions = gbt_reply_model.transform(test_data)

print("writing predictions to file")
# Take element 1 of the "probability" vector. float() accepts both numpy
# scalars and plain Python floats, whereas .item() exists only on numpy scalars.
split1_udf = F.udf(lambda value: float(value[1]), T.DoubleType())
# NOTE(review): rows are keyed by engagee_user_id, not engager_user_id — confirm
# this is the user column the consumer of this file expects.
# mode("overwrite") keeps the cell re-runnable; the default mode raises when
# the path already exists.
predictions.select("tweet_id", "engagee_user_id", split1_udf("probability").alias("probability"))\
.coalesce(1).write.mode("overwrite").csv(root_file_path+"predictions/reply2.csv")

# Predict for rtWithCmt on test data

In [None]:
%pyspark
# Score the test set with the in-session "rtWithCmt" model and export the
# score for label 1 as CSV.
print("making predictions ...")
predictions = gbt_rtWithCmt_model.transform(test_data)

print("writing predictions to file")
# Take element 1 of the "probability" vector. float() accepts both numpy
# scalars and plain Python floats, whereas .item() exists only on numpy scalars.
split1_udf = F.udf(lambda value: float(value[1]), T.DoubleType())
# NOTE(review): rows are keyed by engagee_user_id, not engager_user_id — confirm
# this is the user column the consumer of this file expects.
# mode("overwrite") keeps the cell re-runnable; the default mode raises when
# the path already exists.
predictions.select("tweet_id", "engagee_user_id", split1_udf("probability").alias("probability"))\
.coalesce(1).write.mode("overwrite").csv(root_file_path+"predictions/rtWithCmt2.csv")

# Predict for retweet on test data

In [None]:
%pyspark
# Score the test set with the in-session "retweet" model and export the score
# for label 1 as CSV.
print("making predictions ...")
predictions = gbt_retweet_model.transform(test_data)

print("writing predictions to file")
# Take element 1 of the "probability" vector. float() accepts both numpy
# scalars and plain Python floats, whereas .item() exists only on numpy scalars.
split1_udf = F.udf(lambda value: float(value[1]), T.DoubleType())
# NOTE(review): rows are keyed by engagee_user_id, not engager_user_id — confirm
# this is the user column the consumer of this file expects.
# mode("overwrite") keeps the cell re-runnable; the default mode raises when
# the path already exists.
predictions.select("tweet_id", "engagee_user_id", split1_udf("probability").alias("probability"))\
.coalesce(1).write.mode("overwrite").csv(root_file_path+"predictions/retweet2.csv")