In [1]:
from pyspark.sql import SparkSession
import getpass
username = getpass.getuser()
spark = SparkSession. \
builder. \
config('spark.ui.port','0'). \
config("spark.sql.warehouse.dir", f"/user/{username}/warehouse"). \
enableHiveSupport(). \
master('yarn'). \
getOrCreate()

In [2]:
spark

In [3]:
df = spark.read.\
format("csv").\
option("header","true").\
option("inferSchema", "True").\
load("/public/yelp-dataset/yelp_user.csv")

In [4]:
df.show(4)

+--------------------+------+------------+-------------+--------------------+------+-----+----+----+-----+-------------+--------------+---------------+------------------+---------------+---------------+---------------+----------------+---------------+----------------+-----------------+-----------------+
|             user_id|  name|review_count|yelping_since|             friends|useful|funny|cool|fans|elite|average_stars|compliment_hot|compliment_more|compliment_profile|compliment_cute|compliment_list|compliment_note|compliment_plain|compliment_cool|compliment_funny|compliment_writer|compliment_photos|
+--------------------+------+------------+-------------+--------------------+------+-----+----+----+-----+-------------+--------------+---------------+------------------+---------------+---------------+---------------+----------------+---------------+----------------+-----------------+-----------------+
|JJ-aSuM4pCFPdkfoZ...| Chris|          10|   2013-09-24|0njfJmB-7n84DlIgU...|     0| 

In [5]:
df.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- review_count: integer (nullable = true)
 |-- yelping_since: string (nullable = true)
 |-- friends: string (nullable = true)
 |-- useful: integer (nullable = true)
 |-- funny: integer (nullable = true)
 |-- cool: integer (nullable = true)
 |-- fans: integer (nullable = true)
 |-- elite: string (nullable = true)
 |-- average_stars: double (nullable = true)
 |-- compliment_hot: integer (nullable = true)
 |-- compliment_more: integer (nullable = true)
 |-- compliment_profile: integer (nullable = true)
 |-- compliment_cute: integer (nullable = true)
 |-- compliment_list: integer (nullable = true)
 |-- compliment_note: integer (nullable = true)
 |-- compliment_plain: integer (nullable = true)
 |-- compliment_cool: integer (nullable = true)
 |-- compliment_funny: integer (nullable = true)
 |-- compliment_writer: integer (nullable = true)
 |-- compliment_photos: integer (nullable = true)



In [6]:
df = spark.read.\
format("csv").\
option("header","true").\
option("inferSchema", "True").\
option("samplingRatio", .1).\
load("/public/yelp-dataset/yelp_user.csv")

In [7]:
df.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- review_count: integer (nullable = true)
 |-- yelping_since: string (nullable = true)
 |-- friends: string (nullable = true)
 |-- useful: integer (nullable = true)
 |-- funny: integer (nullable = true)
 |-- cool: integer (nullable = true)
 |-- fans: integer (nullable = true)
 |-- elite: string (nullable = true)
 |-- average_stars: double (nullable = true)
 |-- compliment_hot: integer (nullable = true)
 |-- compliment_more: integer (nullable = true)
 |-- compliment_profile: integer (nullable = true)
 |-- compliment_cute: integer (nullable = true)
 |-- compliment_list: integer (nullable = true)
 |-- compliment_note: integer (nullable = true)
 |-- compliment_plain: integer (nullable = true)
 |-- compliment_cool: integer (nullable = true)
 |-- compliment_funny: integer (nullable = true)
 |-- compliment_writer: integer (nullable = true)
 |-- compliment_photos: integer (nullable = true)



In [8]:
df = spark.read.\
format("csv").\
load("/public/trendytech/datasets/orders_sample1.csv")

In [9]:
df.show(4)

+---+----------+-----+---------------+
|_c0|       _c1|  _c2|            _c3|
+---+----------+-----+---------------+
|  1|2013-07-25|11599|         CLOSED|
|  2|2013-07-25|  256|PENDING_PAYMENT|
|  3|2013-07-25|12111|       COMPLETE|
|  4|2013-07-25| 8827|         CLOSED|
+---+----------+-----+---------------+
only showing top 4 rows



In [10]:
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)



In [11]:
orders_schema = 'order_id long, order_date date, cust_id long, order_status string'

In [12]:
df = spark.read.\
format("csv").\
schema(orders_schema).\
load("/public/trendytech/datasets/orders_sample1.csv")

In [13]:
df.show()

+--------+----------+-------+---------------+
|order_id|order_date|cust_id|   order_status|
+--------+----------+-------+---------------+
|       1|2013-07-25|  11599|         CLOSED|
|       2|2013-07-25|    256|PENDING_PAYMENT|
|       3|2013-07-25|  12111|       COMPLETE|
|       4|2013-07-25|   8827|         CLOSED|
|       5|2013-07-25|  11318|       COMPLETE|
|       6|2013-07-25|   7130|       COMPLETE|
|       7|2013-07-25|   4530|       COMPLETE|
|       8|2013-07-25|   2911|     PROCESSING|
|       9|2013-07-25|   5657|PENDING_PAYMENT|
|      10|2013-07-25|   5648|PENDING_PAYMENT|
+--------+----------+-------+---------------+



In [14]:
df.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: date (nullable = true)
 |-- cust_id: long (nullable = true)
 |-- order_status: string (nullable = true)



In [15]:
orders_schema = 'order_id long, order_date date, cust_id long, order_status long'

In [16]:
df = spark.read.\
format("csv").\
schema(orders_schema).\
load("/public/trendytech/datasets/orders_sample1.csv")

In [17]:
df.show()

+--------+----------+-------+------------+
|order_id|order_date|cust_id|order_status|
+--------+----------+-------+------------+
|       1|2013-07-25|  11599|        null|
|       2|2013-07-25|    256|        null|
|       3|2013-07-25|  12111|        null|
|       4|2013-07-25|   8827|        null|
|       5|2013-07-25|  11318|        null|
|       6|2013-07-25|   7130|        null|
|       7|2013-07-25|   4530|        null|
|       8|2013-07-25|   2911|        null|
|       9|2013-07-25|   5657|        null|
|      10|2013-07-25|   5648|        null|
+--------+----------+-------+------------+



In [18]:
from pyspark.sql.types import *

In [19]:
order_schema_struct =StructType([
                                StructField('orderid', LongType()),
                                StructField('orderdate',DateType()),
                                StructField('customerid',IntegerType()),
                                StructField('orderStatus',StringType())
                                ])

In [20]:
df = spark.read.format("csv").schema(order_schema_struct).load("/public/trendytech/datasets/orders_sample1.csv")

In [21]:
df.show()

+-------+----------+----------+---------------+
|orderid| orderdate|customerid|    orderStatus|
+-------+----------+----------+---------------+
|      1|2013-07-25|     11599|         CLOSED|
|      2|2013-07-25|       256|PENDING_PAYMENT|
|      3|2013-07-25|     12111|       COMPLETE|
|      4|2013-07-25|      8827|         CLOSED|
|      5|2013-07-25|     11318|       COMPLETE|
|      6|2013-07-25|      7130|       COMPLETE|
|      7|2013-07-25|      4530|       COMPLETE|
|      8|2013-07-25|      2911|     PROCESSING|
|      9|2013-07-25|      5657|PENDING_PAYMENT|
|     10|2013-07-25|      5648|PENDING_PAYMENT|
+-------+----------+----------+---------------+



In [22]:
df.printSchema()

root
 |-- orderid: long (nullable = true)
 |-- orderdate: date (nullable = true)
 |-- customerid: integer (nullable = true)
 |-- orderStatus: string (nullable = true)



In [23]:
! hadoop fs -cat /public/trendytech/datasets/orders_sample2.csv

1,07-25-2013,11599,CLOSED
2,07-25-2013,256,PENDING_PAYMENT
3,07-25-2013,12111,COMPLETE
4,07-25-2013,8827,CLOSED
5,07-25-2013,11318,COMPLETE
6,07-25-2013,7130,COMPLETE
7,07-25-2013,4530,COMPLETE
8,07-25-2013,2911,PROCESSING
9,07-25-2013,5657,PENDING_PAYMENT
10,07-25-2013,5648,PENDING_PAYMENT


In [24]:
orders_schema = 'order_id long, order_date date, cust_id long, order_status string'

In [25]:
df = spark.read.\
format("csv").\
schema(orders_schema).\
option("dateformat","mm-dd-yyyy").\
load("/public/trendytech/datasets/orders_sample2.csv")

In [26]:
df.show()

+--------+----------+-------+---------------+
|order_id|order_date|cust_id|   order_status|
+--------+----------+-------+---------------+
|       1|2013-01-25|  11599|         CLOSED|
|       2|2013-01-25|    256|PENDING_PAYMENT|
|       3|2013-01-25|  12111|       COMPLETE|
|       4|2013-01-25|   8827|         CLOSED|
|       5|2013-01-25|  11318|       COMPLETE|
|       6|2013-01-25|   7130|       COMPLETE|
|       7|2013-01-25|   4530|       COMPLETE|
|       8|2013-01-25|   2911|     PROCESSING|
|       9|2013-01-25|   5657|PENDING_PAYMENT|
|      10|2013-01-25|   5648|PENDING_PAYMENT|
+--------+----------+-------+---------------+



In [27]:
orders_schema = 'order_id long, order_date string, cust_id long, order_status string'

In [28]:
df = spark.read.\
format("csv").\
schema(orders_schema).\
load("/public/trendytech/datasets/orders_sample2.csv")

In [29]:
df.show()

+--------+----------+-------+---------------+
|order_id|order_date|cust_id|   order_status|
+--------+----------+-------+---------------+
|       1|07-25-2013|  11599|         CLOSED|
|       2|07-25-2013|    256|PENDING_PAYMENT|
|       3|07-25-2013|  12111|       COMPLETE|
|       4|07-25-2013|   8827|         CLOSED|
|       5|07-25-2013|  11318|       COMPLETE|
|       6|07-25-2013|   7130|       COMPLETE|
|       7|07-25-2013|   4530|       COMPLETE|
|       8|07-25-2013|   2911|     PROCESSING|
|       9|07-25-2013|   5657|PENDING_PAYMENT|
|      10|07-25-2013|   5648|PENDING_PAYMENT|
+--------+----------+-------+---------------+



In [30]:
df.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- cust_id: long (nullable = true)
 |-- order_status: string (nullable = true)



In [31]:
from pyspark.sql.functions import to_date

In [32]:
df_new = df.withColumn("order_date",to_date("order_date","mm-dd-yyyy"))

In [33]:
df_new.show()

+--------+----------+-------+---------------+
|order_id|order_date|cust_id|   order_status|
+--------+----------+-------+---------------+
|       1|2013-01-25|  11599|         CLOSED|
|       2|2013-01-25|    256|PENDING_PAYMENT|
|       3|2013-01-25|  12111|       COMPLETE|
|       4|2013-01-25|   8827|         CLOSED|
|       5|2013-01-25|  11318|       COMPLETE|
|       6|2013-01-25|   7130|       COMPLETE|
|       7|2013-01-25|   4530|       COMPLETE|
|       8|2013-01-25|   2911|     PROCESSING|
|       9|2013-01-25|   5657|PENDING_PAYMENT|
|      10|2013-01-25|   5648|PENDING_PAYMENT|
+--------+----------+-------+---------------+



In [34]:
df_new.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: date (nullable = true)
 |-- cust_id: long (nullable = true)
 |-- order_status: string (nullable = true)



In [35]:
df_new_extra = df.withColumn("order_date_new",to_date("order_date","mm-dd-yyyy"))

In [36]:
df_new_extra.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- cust_id: long (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_date_new: date (nullable = true)



In [37]:
! hadoop fs -cat /public/trendytech/datasets/orders_sample3.csv

1,2013-07-25,11599,CLOSED
2,2013-07-25,256,PENDING_PAYMENT
3,2013-07-25,12111,COMPLETE
4,2013-07-25,8827,CLOSED
5,2013-07-25,11318,COMPLETE
6,2013-07-25,7130,COMPLETE
7,2013-07-25,error,COMPLETE
8,2013-07-25,2911,PROCESSING
9,2013-07-25,unknown,PENDING_PAYMENT
10,2013-07-25,5648,PENDING_PAYMENT


In [38]:
orders_schema = 'order_id long, order_date string, cust_id long, order_status string'

In [39]:
df = spark.read.\
format("csv").\
schema(orders_schema).\
load("/public/trendytech/datasets/orders_sample3.csv")

In [40]:
df.show()

+--------+----------+-------+---------------+
|order_id|order_date|cust_id|   order_status|
+--------+----------+-------+---------------+
|       1|2013-07-25|  11599|         CLOSED|
|       2|2013-07-25|    256|PENDING_PAYMENT|
|       3|2013-07-25|  12111|       COMPLETE|
|       4|2013-07-25|   8827|         CLOSED|
|       5|2013-07-25|  11318|       COMPLETE|
|       6|2013-07-25|   7130|       COMPLETE|
|       7|2013-07-25|   null|       COMPLETE|
|       8|2013-07-25|   2911|     PROCESSING|
|       9|2013-07-25|   null|PENDING_PAYMENT|
|      10|2013-07-25|   5648|PENDING_PAYMENT|
+--------+----------+-------+---------------+



In [41]:
orders_schema = 'order_id long, order_date string, cust_id long, order_status string'

In [42]:
df = spark.read.\
format("csv").\
schema(orders_schema).\
option("mode", "failfast").\
load("/public/trendytech/datasets/orders_sample3.csv")

In [43]:
df = spark.read.\
format("csv").\
schema(orders_schema).\
option("mode", "dropmalformed").\
load("/public/trendytech/datasets/orders_sample3.csv")

In [44]:
df.show()

+--------+----------+-------+---------------+
|order_id|order_date|cust_id|   order_status|
+--------+----------+-------+---------------+
|       1|2013-07-25|  11599|         CLOSED|
|       2|2013-07-25|    256|PENDING_PAYMENT|
|       3|2013-07-25|  12111|       COMPLETE|
|       4|2013-07-25|   8827|         CLOSED|
|       5|2013-07-25|  11318|       COMPLETE|
|       6|2013-07-25|   7130|       COMPLETE|
|       8|2013-07-25|   2911|     PROCESSING|
|      10|2013-07-25|   5648|PENDING_PAYMENT|
+--------+----------+-------+---------------+



In [45]:
spark.stop()

In [46]:
spark.sql("show databases").filter("namespace like '%itv017244%'")

namespace
itv017244_assignm...
itv017244_retail


In [47]:
spark.sql("use itv017244_retail")

In [48]:
spark.sql("show tables").show()

+----------------+----------+-----------+
|        database| tableName|isTemporary|
+----------------+----------+-----------+
|itv017244_retail|    orders|      false|
|itv017244_retail|orders_ext|      false|
+----------------+----------+-----------+



In [49]:
orders_df = spark.sql("select * from itv017244_retail.orders_ext")

In [50]:
orders_df.show()

Py4JJavaError: An error occurred while calling o218.showString.
: java.lang.IllegalStateException: Cannot call methods on a stopped SparkContext.
This stopped SparkContext was created at:

org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
java.lang.reflect.Constructor.newInstance(Constructor.java:423)
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
py4j.Gateway.invoke(Gateway.java:238)
py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
py4j.GatewayConnection.run(GatewayConnection.java:238)
java.lang.Thread.run(Thread.java:750)

The currently active SparkContext was created at:

(No active SparkContext.)
         
	at org.apache.spark.SparkContext.assertNotStopped(SparkContext.scala:118)
	at org.apache.spark.SparkContext.broadcast(SparkContext.scala:1506)
	at org.apache.spark.sql.execution.datasources.csv.CSVFileFormat.buildReader(CSVFileFormat.scala:102)
	at org.apache.spark.sql.execution.datasources.FileFormat.buildReaderWithPartitionValues(FileFormat.scala:130)
	at org.apache.spark.sql.execution.datasources.FileFormat.buildReaderWithPartitionValues$(FileFormat.scala:121)
	at org.apache.spark.sql.execution.datasources.TextBasedFileFormat.buildReaderWithPartitionValues(FileFormat.scala:170)
	at org.apache.spark.sql.execution.FileSourceScanExec.inputRDD$lzycompute(DataSourceScanExec.scala:407)
	at org.apache.spark.sql.execution.FileSourceScanExec.inputRDD(DataSourceScanExec.scala:398)
	at org.apache.spark.sql.execution.FileSourceScanExec.doExecute(DataSourceScanExec.scala:485)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
	at org.apache.spark.sql.execution.InputAdapter.inputRDD(WholeStageCodegenExec.scala:525)
	at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs(WholeStageCodegenExec.scala:453)
	at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs$(WholeStageCodegenExec.scala:452)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:496)
	at org.apache.spark.sql.execution.ProjectExec.inputRDDs(basicPhysicalOperators.scala:50)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:746)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
	at org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:321)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:439)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:425)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:47)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3696)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2722)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3687)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3685)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2722)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2929)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:301)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:338)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:750)


In [None]:
df = spark.table("itv017244_retail.orders_ext")

In [None]:
df.show()

In [None]:
spark.range(5)

In [None]:
! hadoop fs  -cat /public/trendytech/retail_db/orders/part-00000 |head

In [None]:
orders_list = [(1,'2013-07-25 00:00:00.0',11599,'CLOSED'),
                 (2,'2013-07-25 00:00:00.0',256,'PENDING_PAYMENT'),
                 (3,'2013-07-25 00:00:00.0',12111,'COMPLETE')]

In [None]:
orders_raw_df = spark.createDataFrame(orders_list)

In [None]:
orders_raw_df.show()

In [None]:
orders_raw_df.printSchema()

In [None]:
orders_raw_df = spark.createDataFrame(orders_list).toDF('order_id','order_date','customer_id','order_status')

In [None]:
orders_raw_df.show()

In [None]:
orders_schema = ["order_id","order_date","cust_id","order_status"]

In [None]:
df = spark.createDataFrame(orders_list,orders_schema)

In [None]:
df.show()

In [None]:
orders_schema = 'order_id long, order_date string, cust_id int, order_status string'

In [None]:
df = spark.createDataFrame(orders_list,orders_schema)

In [None]:
df.show()

In [None]:
df.printSchema()

In [None]:
from pyspark.sql.functions import to_timestamp

In [None]:
new_df = df.withColumn("order_date",to_timestamp("order_date"))

In [None]:
new_df.show()

In [None]:
new_df.printSchema()

In [None]:
orders_rdd = spark.sparkContext.textFile("/public/trendytech/retail_db/orders/part-00000")

In [None]:
orders_rdd.take(4)

In [None]:
new_orders_rdd = orders_rdd.map(lambda x : (int(x.split(",")[0]),x.split(",")[1],int(x.split(",")[2]),x.split(",")[3]))

In [None]:
new_orders_rdd.take(4)

In [None]:
orders_schema = 'order_id long, order_date string, cust_id int, order_status string'

In [None]:
df = spark.createDataFrame(new_orders_rdd,orders_schema)

In [None]:
df.show()

In [None]:
df.printSchema()

In [None]:
new_df = spark.createDataFrame(new_orders_rdd).toDF('order_id','order_date','cust_id','order_status')

In [None]:
new_df.show()

In [None]:
orders_schema = 'order_id long, order_date string, cust_id int, order_status string'

In [None]:
df = new_orders_rdd.toDF(orders_schema)

In [None]:
df.show()

In [None]:
df.printSchema()

In [None]:
spark

In [None]:
! hadoop fs -ls /public/trendytech/datasets/customer/nested/

In [None]:
schemaddl = "customer_id long, fullname struct<firstname: string,lastname: string>,city string"

In [None]:
df = spark.read.format("json").schema(schemaddl).load("/public/trendytech/datasets/customer_nested/")

In [None]:
df.show()

In [None]:
df.printSchema()

In [None]:
from pyspark.sql.types import *

In [None]:
customer_schema = StructType([
                            StructField("customer_id",LongType()),
                            StructField("fullname",
                                        StructType([StructField("firstname",StringType()),
                                                    StructField("lastname", StringType())
                                                   ])
                                       ),
                            StructField("city", StringType())
])

In [None]:
df = spark.read.format("json").schema(customer_schema).load("/public/trendytech/datasets/customer_nested/")

In [None]:
df.show()