In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
# import pandas as pd
spark = SparkSession.builder.appName("PySpark SQL Practice").getOrCreate()
sc = spark.sparkContext

posts_dist = sc.textFile("./data/posts_dist.csv")
users_train = sc.textFile("./data/users_train_dist.csv")
users_val_dist = sc.textFile("./data/users_val_dist.csv")



In [3]:
# Get Dataframes from RDDs 
def convertToDataFrameFromRDD(rdd):
    header = rdd.first()
    rdd.filter(lambda line: line != header)
    tmp = rdd.map(lambda k: k.split(","))
    return tmp.toDF(header.split(","))

posts_dist_df = convertToDataFrameFromRDD(posts_dist)
users_train_df = convertToDataFrameFromRDD(users_train)
users_val_dist_df = convertToDataFrameFromRDD(users_val_dist)


In [11]:

from pyspark.sql import functions as F
from pyspark.sql.functions import when, sum, avg, col, count
from pyspark.sql.types import LongType, IntegerType
from pyspark.sql.functions import udf
import pyspark.sql.functions as f

# Binning DaysDiff into Early, Mid, Late.
def categorizer(DaysDiff):
    if DaysDiff <= 30:
        return "early"
    elif DaysDiff < 210:
        return "mid"
    else:
        return "late"

def ExtractFeatures(df, dfu):
    # Cast some numeric columns to IntegerType()
    df = df.withColumn("PostId", df['PostId'].cast(IntegerType()))
    df = df.withColumn("AcceptedAnswerId", df['AcceptedAnswerId'].cast(IntegerType()))
    df = df.withColumn("CommentCount", df['CommentCount'].cast(IntegerType()))
    df = df.withColumn("BodyWordNum", df['BodyWordNum'].cast(IntegerType()))
    df = df.withColumn("Score", df['Score'].cast(IntegerType()))
    df = df.withColumn("OwnerUserId", df['OwnerUserId'].cast(IntegerType()))

    # Calculate date difference between CreationDate and CreationDateOfOwner
    df = df.withColumn("DaysDiff", F.datediff(df['CreationDate'], df['CreationDateOfOwner']))
    df = df.withColumn("DaysDiff", df["DaysDiff"].cast(IntegerType()))

    df = df.filter(df.DaysDiff >= 0)
    df = df.withColumn("DaysDiff", df["DaysDiff"].cast(IntegerType()))

    df = df.select("OwnerUserId","CreationDateOfOwner", "PostId", "CreationDate", "DaysDiff", "Score","AnswerCount", "CommentCount", "BodyWordNum", "AcceptedAnswerId")
    bucket_udf = udf(categorizer, StringType())
    df = df.withColumn("When", bucket_udf("DaysDiff"))

    dp = df.groupBy("OwnerUserId", "When").agg(when(col("When") == "early", count("*")).alias("EarlyPosts"), when(col("When") == "mid", count("*")).alias("MidPost"), when(col("When") == "late", count("*")).alias("LatePost"))
    dp = dp.drop("When")
    dp = dp.na.fill(0)
    dp = dp.groupBy("OwnerUserId").sum()
    # Construct per-user statistics
    users_df = df.groupBy("OwnerUserId").agg(F.count("PostId").alias("TotalPost"), F.sum("Score").alias("TotalScore"), F.sum("AnswerCount").alias("TotalAnswers"), F.sum("CommentCount").alias("TotalCmt"), F.sum("BodyWordNum").alias("TotalWords"))

    users_df = users_df.join(dfu, users_df.OwnerUserId == dfu.Id)
    users_df = users_df.drop("Id")
    users_df = users_df.drop("CreationDate")
    users_df = users_df.join(dp, ['OwnerUserId'])
    users_df = users_df.drop("sum(OwnerUserId)")
    # users_df = users_df.drop("OwnerUserId")
    users_df = users_df.withColumnRenamed("sum(EarlyPosts)", "#Early"). \
                                            withColumnRenamed("sum(MidPost)", "#Mid"). \
                                            withColumnRenamed("sum(LatePost)", "#Late")

# users_df = users_df.select('isChurn').rdd.map(lambda x: 0 if x == "False" else x)
# users_df = users_df['isChurn'].replace(True, 1, inplace=True)
    users_df = users_df.withColumn( "isChurn" , F.when( F.col("isChurn")=="True" , F.lit(1) ).otherwise(0) )
    return users_df 




In [14]:
#   df = posts_dist_df
#     dfu = users_train_df
train_df = ExtractFeatures(posts_dist_df, users_train_df)
train_df.show()

+-----------+---------+----------+------------+--------+----------+-------+------+----+-----+
|OwnerUserId|TotalPost|TotalScore|TotalAnswers|TotalCmt|TotalWords|isChurn|#Early|#Mid|#Late|
+-----------+---------+----------+------------+--------+----------+-------+------+----+-----+
|        148|        3|        98|        19.0|       3|       277|      0|     0|   3|    0|
|       1088|        5|       212|        29.0|      13|      1010|      0|     1|   2|    2|
|       1580|       13|       143|        54.0|       7|      1560|      0|     5|   2|    6|
|       1645|        1|        46|        12.0|       0|       129|      0|     0|   0|    1|
|       1959|        6|       208|        33.0|       2|       548|      0|     0|   5|    1|
|       2122|       14|       467|        50.0|      20|      3977|      0|     2|  10|    2|
|       3175|       12|       311|       111.0|      18|      1791|      0|     3|   8|    1|
|       3749|        1|         3|         4.0|       0|    

In [15]:
val_df = ExtractFeatures(posts_dist_df, users_val_dist_df)
val_df.show()

+-----------+---------+----------+------------+--------+----------+-------+------+----+-----+
|OwnerUserId|TotalPost|TotalScore|TotalAnswers|TotalCmt|TotalWords|isChurn|#Early|#Mid|#Late|
+-----------+---------+----------+------------+--------+----------+-------+------+----+-----+
|       7754|        3|       190|        58.0|       2|       315|      0|     3|   0|    0|
|       8086|        1|         2|        15.0|       0|        33|      1|     1|   0|    0|
|      15846|        5|        30|        32.0|       0|       192|      0|     5|   0|    0|
|      16861|        1|         3|         1.0|       2|        49|      1|     1|   0|    0|
|      25462|        4|        81|        28.0|       2|       285|      0|     4|   0|    0|
|      28577|        1|         2|         1.0|       2|        24|      1|     1|   0|    0|
|      42834|        1|         5|         2.0|       0|        47|      1|     1|   0|    0|
|      48254|        1|         2|         6.0|       0|    

In [17]:
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.sql.functions import col
from pyspark.mllib.regression import LabeledPoint
from pyspark.ml.classification import LinearSVC

assembler = VectorAssembler(
       inputCols=["TotalPost", "TotalScore", "TotalAnswers", "TotalCmt", "TotalWords", "#Early", "#Mid", "#Late"],
       outputCol="features")

train_df_transformed = assembler.transform(train_df)
final_df = train_df_transformed.select(col("isChurn").alias("label"), col("features"))
# transformed_x.select('features').show()
# transformed_x.show()
# final.show()

# model = SVMWithSGD.train(final, iterations=100)
lsvc = LinearSVC(maxIter=10, regParam=0.1)
lsvcModel = lsvc.fit(final_df)


In [18]:
val_df_transformed = assembler.transform(val_df)
final_df = train_df_transformed.select(col("isChurn").alias("label"), col("features"))
lsvcModel.predict()

TypeError: predict() missing 1 required positional argument: 'value'

In [None]:
# Construct validation x,y
val_x = users_val_dist_df.join(posts_dist_df, posts_dist_df.OwnerUserId == users_val_dist_df.Id)
val_x.show()