In [305]:
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, explode, unix_timestamp, col, to_date, to_timestamp
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, TimestampType, DoubleType, ByteType, ShortType, LongType, FloatType, BooleanType



In [324]:
def getType(raw):
    """Translate a schema type name into the matching Spark SQL data type.

    Args:
        raw: Type name as a string, e.g. ``"integer"``, ``"number"`` or
            ``"datetime"``.

    Returns:
        An instance of the corresponding ``pyspark.sql.types`` class. Any
        name that has no entry in the lookup table (including ``"string"``)
        falls back to ``StringType()``.
    """
    spark_type_by_name = {
        "byte": ByteType,
        "short": ShortType,
        "integer": IntegerType,
        "long": LongType,
        "float": FloatType,
        "number": DoubleType,
        "boolean": BooleanType,
        "datetime": TimestampType,
    }
    # Resolve the class first, then instantiate only the one that was selected.
    spark_type = spark_type_by_name.get(raw, StringType)
    return spark_type()
    

In [317]:
# Start (or reuse) the Spark session for this notebook and keep a handle to
# its underlying SparkContext.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)
sc = spark.sparkContext

In [318]:
# Read the input as multi-line JSON (a document spanning several lines)
# rather than one JSON object per line.
df = spark.read.json("./DM-classification.json", multiLine=True)

In [319]:
# Inspect the schema Spark inferred for the loaded JSON.
df.printSchema()

root
 |-- data: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- content: string (nullable = true)
 |    |    |-- index: long (nullable = true)
 |    |    |-- label: long (nullable = true)
 |    |    |-- label_1: string (nullable = true)
 |    |    |-- label_2: string (nullable = true)
 |    |    |-- label_3: double (nullable = true)
 |    |    |-- label_4: string (nullable = true)
 |-- schema: struct (nullable = true)
 |    |-- fields: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- type: string (nullable = true)
 |    |-- pandas_version: string (nullable = true)
 |    |-- primaryKey: array (nullable = true)
 |    |    |-- element: string (containsNull = true)



In [320]:
# Preview the raw DataFrame as loaded.
df.show()

+--------------------+--------------------+
|                data|              schema|
+--------------------+--------------------+
|[[The battery is ...|[[[index, integer...|
+--------------------+--------------------+



In [321]:
# Flatten the `data` array into one row per record, parse the `label_4`
# string into a proper timestamp column named `datetime`, and drop the raw
# string column.
#
# to_timestamp(column, format) parses the string directly. The previous
# unix_timestamp(...).cast('timestamp') round-trip went through whole epoch
# seconds, which discards the fractional seconds the pattern declares (.SSS),
# and wrapping that in to_timestamp() added nothing.
tf = (
    df.select(explode(df["data"]).alias("temp"))
    .select(
        "temp.content",
        "temp.label",
        "temp.label_1",
        "temp.label_2",
        "temp.label_3",
        "temp.label_4",
    )
    .withColumn("datetime", to_timestamp(col("label_4"), "yyyy-MM-dd'T'HH:mm:ss.SSSXXX"))
    .drop("label_4")
)

tf.show()

+--------------------+-----+-------+--------+------------+-------------------+
|             content|label|label_1| label_2|     label_3|           datetime|
+--------------------+-----+-------+--------+------------+-------------------+
|The battery is co...|    0|  small|separate|0.7155163569|2015-06-05 18:41:08|
|What a big waste ...|    0| medium|conected| 0.858630808|2016-10-29 12:12:46|
|Don't waste your ...|    0|  large|conected|0.2040485858|2016-04-29 14:44:31|
|Great sound and s...|    1|  large|separate| 0.332641236|2017-12-26 13:25:48|
|Really pleased wi...|    1| medium|conected| 0.887390017|2016-04-30 00:01:08|
|One of my favorit...|    1|  large|conected|0.2305351126|2016-04-30 17:29:03|
|best bluetooth on...|    1| medium|conected|0.4549175852|2017-04-24 04:26:54|
|Authentic leather...|    1|  large|conected|0.3198441525|2015-12-16 22:03:11|
|I was very excite...|    1| medium|conected| 0.835863266|2015-05-19 01:34:19|
|Do not make the s...|    0|  small|conected|0.14423

In [322]:
# Bring every row of `tf` to the driver; collect() already returns a list of
# Row objects, so no extra list() wrapper is needed.
simpleData = tf.collect()

In [357]:
# Extract the declared type of every field listed under `schema.fields`,
# one row per field.
col_list = (
    df.select(explode(df["schema.fields"]).alias("level1"))
    .select("level1.type")
)
datatype_list = [record["type"] for record in col_list.collect()]

print(datatype_list)

# Spot-check the mapping for the entry at position 5.
getType(datatype_list[5])

['integer', 'string', 'integer', 'string', 'string', 'number', 'datetime']


DoubleType

In [361]:
# Output column names, in order. Each one takes its Spark type from the entry
# at the matching position in `datatype_list`, starting at position 1
# (position 0 is not used here).
field_names = ["content", "label", "size", "usage", "effect", "date"]
simpleSchema = StructType([
    StructField(field_name, getType(datatype_list[position]), True)
    for position, field_name in enumerate(field_names, start=1)
])

In [362]:
# Rebuild a DataFrame from the collected rows, applying the explicit schema
# (which also renames the columns positionally).
new_df = spark.createDataFrame(simpleData, simpleSchema)

In [363]:
# Confirm the explicit schema was applied to the rebuilt DataFrame.
new_df.printSchema()

root
 |-- content: string (nullable = true)
 |-- label: integer (nullable = true)
 |-- size: string (nullable = true)
 |-- usage: string (nullable = true)
 |-- effect: double (nullable = true)
 |-- date: timestamp (nullable = true)



In [364]:
# Preview the rebuilt DataFrame with its renamed columns.
new_df.show()

+--------------------+-----+------+--------+------------+-------------------+
|             content|label|  size|   usage|      effect|               date|
+--------------------+-----+------+--------+------------+-------------------+
|The battery is co...|    0| small|separate|0.7155163569|2015-06-05 18:41:08|
|What a big waste ...|    0|medium|conected| 0.858630808|2016-10-29 12:12:46|
|Don't waste your ...|    0| large|conected|0.2040485858|2016-04-29 14:44:31|
|Great sound and s...|    1| large|separate| 0.332641236|2017-12-26 13:25:48|
|Really pleased wi...|    1|medium|conected| 0.887390017|2016-04-30 00:01:08|
|One of my favorit...|    1| large|conected|0.2305351126|2016-04-30 17:29:03|
|best bluetooth on...|    1|medium|conected|0.4549175852|2017-04-24 04:26:54|
|Authentic leather...|    1| large|conected|0.3198441525|2015-12-16 22:03:11|
|I was very excite...|    1|medium|conected| 0.835863266|2015-05-19 01:34:19|
|Do not make the s...|    0| small|conected|0.1442302304|2015-02

In [365]:
# Register the DataFrame under the name "DM" so it can be queried with spark.sql.
new_df.createOrReplaceTempView("DM")

In [366]:
# Rows whose label equals 1 (not used further in this cell).
selectdf = spark.sql("Select * from DM where label=1")

# For every row, the gap between its `date` and the previous `date` within
# the same `size` partition (ordered by date).
diff_query = "SELECT *, date - lag(date,1) OVER(PARTITION BY size order by date) as diff_date FROM DM"
groupedby = spark.sql(diff_query)
groupedby.printSchema()

# Convert the interval column to its string form -- presumably so it can be
# written out as CSV later; confirm against the writer's supported types.
diff_as_text = col("diff_date").cast('String')
groupedby = groupedby.withColumn("diff_date", diff_as_text)
groupedby.printSchema()



root
 |-- content: string (nullable = true)
 |-- label: integer (nullable = true)
 |-- size: string (nullable = true)
 |-- usage: string (nullable = true)
 |-- effect: double (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- diff_date: interval (nullable = true)

root
 |-- content: string (nullable = true)
 |-- label: integer (nullable = true)
 |-- size: string (nullable = true)
 |-- usage: string (nullable = true)
 |-- effect: double (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- diff_date: string (nullable = true)



In [368]:
# Write the result as CSV with a header row. Spark creates a *directory* at
# this path; repartition(1) makes it hold a single part file.
# The default save mode raises if the path already exists, which makes the
# notebook fail on any re-run, so overwrite the previous output explicitly.
(
    groupedby.repartition(1)
    .write
    .csv("./output/myfile1.csv", header=True, mode="overwrite")
)