# Import Libraries

In [None]:
%pyspark
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from pyspark.sql import SQLContext
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql import Row
from pyspark.sql.window import Window

from transformers import BertTokenizer
# Tokenizer for the multilingual cased BERT vocabulary, loaded with
# do_lower_case=False; used by decode_tokens below.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)


@F.udf("String")
def decode_tokens(tokens):
    """Decode one row's BERT token ids back into text.

    Registered as a Spark UDF that returns a string column.

    Args:
        tokens: Token ids for a single row, or ``None`` when the column
            value is null.

    Returns:
        The string produced by ``tokenizer.decode(tokens)``, or ``None`` for
        a null input so the output column is null for that row.
    """
    # Spark hands null column values to Python UDFs as None; decoding None
    # would raise and fail the whole task, so propagate the null instead.
    if tokens is None:
        return None
    return tokenizer.decode(tokens)

# SQL entry point used by the later cells to read the CSV and parquet data.
# NOTE(review): `sc` is not defined in this file — presumably the SparkContext
# provided by the notebook's %pyspark interpreter; confirm.
sqc = SQLContext(sc)

# Schema

In [3]:
%pyspark
# Column layout of the input files as (column name, Spark type class) pairs,
# listed in the order the columns are declared. Every column is nullable.
_column_specs = (
    # Tweet-level columns
    ("text_tokens", T.StringType),
    ("hashtags", T.StringType),
    ("tweet_id", T.StringType),
    ("present_media", T.StringType),
    ("present_links", T.StringType),
    ("present_domains", T.StringType),
    ("tweet_type", T.StringType),
    ("language", T.StringType),
    ("timestamp", T.LongType),
    # engager_* columns
    ("engager_user_id", T.StringType),
    ("engager_follower_count", T.LongType),
    ("engager_following_count", T.LongType),
    ("engager_is_verified", T.BooleanType),
    ("engager_account_creation_time", T.LongType),
    # engagee_* columns
    ("engagee_user_id", T.StringType),
    ("engagee_follower_count", T.LongType),
    ("engagee_following_count", T.LongType),
    ("engagee_is_verified", T.BooleanType),
    ("engagee_account_creation_time", T.LongType),
    # Follow relation and per-engagement timestamps
    ("engagee_follows_engager", T.BooleanType),
    ("reply_engagement_timestamp", T.LongType),
    ("retweet_engagement_timestamp", T.LongType),
    ("retweet_with_comment_engagement_timestamp", T.LongType),
    ("like_engagement_timestamp", T.LongType),
)

# One StructField per spec; the type class is instantiated per field.
fields = [
    T.StructField(_column_name, _spark_type(), True)
    for _column_name, _spark_type in _column_specs
]
schema = T.StructType(fields)

UsageError: Line magic function `%pyspark` not found.


# File Path

In [None]:
%pyspark
# Locations of the input .tsv files for the training, validation and test
# splits; each is read by a later cell.
# NOTE(review): "/PATH" looks like a placeholder directory — confirm and
# replace with the real location before running.
training_file_path = "/PATH/training.tsv"
validation_file_path = "/PATH/val.tsv"
test_file_path = "/PATH/competition_test.tsv"

# Reading Training Data

In [None]:
%pyspark
# Load the training split with the explicit schema; fields in the file are
# delimited by chr(1).
training_df = sqc.read.csv(training_file_path, schema=schema, sep=chr(1))

# These columns hold tab-separated lists; turn each into an array column.
for _list_column in (
    "text_tokens",
    "hashtags",
    "present_media",
    "present_links",
    "present_domains",
):
    training_df = training_df.withColumn(_list_column, F.split(_list_column, "\t"))
# training_df.printSchema()
# training_df.show()

# Reading Validation Data

In [None]:
%pyspark
# Load the validation split with the explicit schema; fields in the file are
# delimited by chr(1).
validation_df = sqc.read.csv(validation_file_path, schema=schema, sep=chr(1))

# These columns hold tab-separated lists; turn each into an array column.
for _list_column in (
    "text_tokens",
    "hashtags",
    "present_media",
    "present_links",
    "present_domains",
):
    validation_df = validation_df.withColumn(_list_column, F.split(_list_column, "\t"))

# validation_df.show()

# Reading Test Data

In [None]:
%pyspark
# Load the test split with the explicit schema; fields in the file are
# delimited by chr(1).
test_df = sqc.read.csv(test_file_path, schema=schema, sep=chr(1))

# These columns hold tab-separated lists; turn each into an array column.
for _list_column in (
    "text_tokens",
    "hashtags",
    "present_media",
    "present_links",
    "present_domains",
):
    test_df = test_df.withColumn(_list_column, F.split(_list_column, "\t"))
# test_df.printSchema()

# Save data as parquet

In [None]:
%pyspark
# Output directories for the parquet copy of each split.
training_parquet_path = "/PATH/training_df"
validation_parquet_path = "/PATH/val_df"
test_parquet_path = "/PATH/test_df"

# Write the splits in order: training, validation, test. No save mode is
# passed explicitly to the writer.
for _split_df, _target_path in (
    (training_df, training_parquet_path),
    (validation_df, validation_parquet_path),
    (test_df, test_parquet_path),
):
    _split_df.write.parquet(_target_path)

# Read parquet data

In [None]:
# Directories holding the parquet copy of each split.
training_parquet_path = "/PATH/training_df"
validation_parquet_path = "/PATH/val_df"
test_parquet_path = "/PATH/test_df"

# Reload the three splits from parquet, in order: training, validation, test.
training_df, validation_df, test_df = (
    sqc.read.parquet(_parquet_path)
    for _parquet_path in (
        training_parquet_path,
        validation_parquet_path,
        test_parquet_path,
    )
)