In [1]:
from pyspark.sql import SparkSession
import getpass
username = getpass.getuser()
spark = SparkSession. \
builder. \
config('spark.ui.port','0'). \
config("spark.sql.warehouse.dir", f"/user/itv020649/warehouse"). \
enableHiveSupport(). \
master('yarn'). \
getOrCreate()

In [2]:
orderDf = spark.read \
.format("csv") \
.option("inferSchema","true") \
.option("header","true") \
.load("/public/yelp-dataset/yelp_user.csv")

In [3]:
orderDf.show()

+--------------------+-------+------------+-------------+--------------------+------+-----+----+----+-----+-------------+--------------+---------------+------------------+---------------+---------------+---------------+----------------+---------------+----------------+-----------------+-----------------+
|             user_id|   name|review_count|yelping_since|             friends|useful|funny|cool|fans|elite|average_stars|compliment_hot|compliment_more|compliment_profile|compliment_cute|compliment_list|compliment_note|compliment_plain|compliment_cool|compliment_funny|compliment_writer|compliment_photos|
+--------------------+-------+------------+-------------+--------------------+------+-----+----+----+-----+-------------+--------------+---------------+------------------+---------------+---------------+---------------+----------------+---------------+----------------+-----------------+-----------------+
|JJ-aSuM4pCFPdkfoZ...|  Chris|          10|   2013-09-24|0njfJmB-7n84DlIgU...|    

In [4]:
orderDf.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- review_count: integer (nullable = true)
 |-- yelping_since: string (nullable = true)
 |-- friends: string (nullable = true)
 |-- useful: integer (nullable = true)
 |-- funny: integer (nullable = true)
 |-- cool: integer (nullable = true)
 |-- fans: integer (nullable = true)
 |-- elite: string (nullable = true)
 |-- average_stars: double (nullable = true)
 |-- compliment_hot: integer (nullable = true)
 |-- compliment_more: integer (nullable = true)
 |-- compliment_profile: integer (nullable = true)
 |-- compliment_cute: integer (nullable = true)
 |-- compliment_list: integer (nullable = true)
 |-- compliment_note: integer (nullable = true)
 |-- compliment_plain: integer (nullable = true)
 |-- compliment_cool: integer (nullable = true)
 |-- compliment_funny: integer (nullable = true)
 |-- compliment_writer: integer (nullable = true)
 |-- compliment_photos: integer (nullable = true)



In [5]:
#using sampling ratio
orderDf = spark.read \
.format("csv") \
.option("inferSchema","true") \
.option("samplingRatio",.001) \
.option("header","true") \
.load("/public/yelp-dataset/yelp_user.csv")

In [6]:
orderDf.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- review_count: integer (nullable = true)
 |-- yelping_since: string (nullable = true)
 |-- friends: string (nullable = true)
 |-- useful: integer (nullable = true)
 |-- funny: integer (nullable = true)
 |-- cool: integer (nullable = true)
 |-- fans: integer (nullable = true)
 |-- elite: string (nullable = true)
 |-- average_stars: double (nullable = true)
 |-- compliment_hot: integer (nullable = true)
 |-- compliment_more: integer (nullable = true)
 |-- compliment_profile: integer (nullable = true)
 |-- compliment_cute: integer (nullable = true)
 |-- compliment_list: integer (nullable = true)
 |-- compliment_note: integer (nullable = true)
 |-- compliment_plain: integer (nullable = true)
 |-- compliment_cool: integer (nullable = true)
 |-- compliment_funny: integer (nullable = true)
 |-- compliment_writer: integer (nullable = true)
 |-- compliment_photos: integer (nullable = true)



#### enforcing schema

In [7]:
order_schema = 'order_id long,order_date date,cust_id long,order_status string'

In [8]:
orderDf = spark.read \
.format("csv") \
.schema(order_schema) \
.load("/public/trendytech/datasets/orders_sample1.csv")

#### 2 ways to infer schema
1. order_schema = 'order_id long,order_date date,cust_id long,order_status string'  (for data type mismatch it will show NULL)
2. StructType 
StructType([
structField("orderid",LongType()),
structField("orderdate",date()),
structField("custid",IntegerType()),
structField("orderid",StringType())
])

In [9]:
orderDf.show()

+--------+----------+-------+---------------+
|order_id|order_date|cust_id|   order_status|
+--------+----------+-------+---------------+
|       1|2013-07-25|  11599|         CLOSED|
|       2|2013-07-25|    256|PENDING_PAYMENT|
|       3|2013-07-25|  12111|       COMPLETE|
|       4|2013-07-25|   8827|         CLOSED|
|       5|2013-07-25|  11318|       COMPLETE|
|       6|2013-07-25|   7130|       COMPLETE|
|       7|2013-07-25|   4530|       COMPLETE|
|       8|2013-07-25|   2911|     PROCESSING|
|       9|2013-07-25|   5657|PENDING_PAYMENT|
|      10|2013-07-25|   5648|PENDING_PAYMENT|
+--------+----------+-------+---------------+



In [10]:
orderDf.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: date (nullable = true)
 |-- cust_id: long (nullable = true)
 |-- order_status: string (nullable = true)



In [11]:
from pyspark.sql.types import *

In [12]:
orderSchemaStruct = StructType([
StructField("orderid",LongType()),
StructField("orderdate",DateType()),
StructField("custid",IntegerType()),
StructField("orderstatus",StringType()),
])

In [13]:
orderDf = spark.read \
.format("csv") \
.schema(orderSchemaStruct) \
.load("/public/trendytech/datasets/orders_sample1.csv")
#date format yyyy-mm-dd

### Deal with Date type

In [14]:
orderDf.printSchema()

root
 |-- orderid: long (nullable = true)
 |-- orderdate: date (nullable = true)
 |-- custid: integer (nullable = true)
 |-- orderstatus: string (nullable = true)



In [15]:
orderDf = spark.read \
.format("csv") \
.schema(orderSchemaStruct) \
.load("/public/trendytech/datasets/orders_sample2.csv")
#date format dd-mm-yyyy

In [16]:
! hadoop fs -cat /public/trendytech/datasets/orders_sample2.csv|head

1,07-25-2013,11599,CLOSED
2,07-25-2013,256,PENDING_PAYMENT
3,07-25-2013,12111,COMPLETE
4,07-25-2013,8827,CLOSED
5,07-25-2013,11318,COMPLETE
6,07-25-2013,7130,COMPLETE
7,07-25-2013,4530,COMPLETE
8,07-25-2013,2911,PROCESSING
9,07-25-2013,5657,PENDING_PAYMENT
10,07-25-2013,5648,PENDING_PAYMENT


In [17]:
order_schema = 'order_id long,order_date date,cust_id long,order_status string'

In [18]:
orderDf = spark.read \
.format("csv") \
.schema(orderSchemaStruct) \
.option("dateFormat","mm-dd-yyyy") \
.load("/public/trendytech/datasets/orders_sample2.csv")
#date format dd-mm-yyyy

#### date can have different and confusing behabior so parse as string and later

In [19]:
orderDf.show()

+-------+----------+------+---------------+
|orderid| orderdate|custid|    orderstatus|
+-------+----------+------+---------------+
|      1|2013-01-25| 11599|         CLOSED|
|      2|2013-01-25|   256|PENDING_PAYMENT|
|      3|2013-01-25| 12111|       COMPLETE|
|      4|2013-01-25|  8827|         CLOSED|
|      5|2013-01-25| 11318|       COMPLETE|
|      6|2013-01-25|  7130|       COMPLETE|
|      7|2013-01-25|  4530|       COMPLETE|
|      8|2013-01-25|  2911|     PROCESSING|
|      9|2013-01-25|  5657|PENDING_PAYMENT|
|     10|2013-01-25|  5648|PENDING_PAYMENT|
+-------+----------+------+---------------+



In [20]:
order_schema = 'order_id long,order_date string,cust_id long,order_status string'

In [21]:
orderDf = spark.read \
.format("csv") \
.schema(order_schema) \
.load("/public/trendytech/datasets/orders_sample2.csv")

In [22]:
orderDf.show()

+--------+----------+-------+---------------+
|order_id|order_date|cust_id|   order_status|
+--------+----------+-------+---------------+
|       1|07-25-2013|  11599|         CLOSED|
|       2|07-25-2013|    256|PENDING_PAYMENT|
|       3|07-25-2013|  12111|       COMPLETE|
|       4|07-25-2013|   8827|         CLOSED|
|       5|07-25-2013|  11318|       COMPLETE|
|       6|07-25-2013|   7130|       COMPLETE|
|       7|07-25-2013|   4530|       COMPLETE|
|       8|07-25-2013|   2911|     PROCESSING|
|       9|07-25-2013|   5657|PENDING_PAYMENT|
|      10|07-25-2013|   5648|PENDING_PAYMENT|
+--------+----------+-------+---------------+



In [23]:
orderDf.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- cust_id: long (nullable = true)
 |-- order_status: string (nullable = true)



In [24]:
from pyspark.sql.functions import to_date

#### WithColumn used to change current column or change existing column

In [25]:
new_df = orderDf.withColumn("order_date_new",to_date("order_date","mm-dd-yyyy"))

In [26]:
new_df.show()

+--------+----------+-------+---------------+--------------+
|order_id|order_date|cust_id|   order_status|order_date_new|
+--------+----------+-------+---------------+--------------+
|       1|07-25-2013|  11599|         CLOSED|    2013-01-25|
|       2|07-25-2013|    256|PENDING_PAYMENT|    2013-01-25|
|       3|07-25-2013|  12111|       COMPLETE|    2013-01-25|
|       4|07-25-2013|   8827|         CLOSED|    2013-01-25|
|       5|07-25-2013|  11318|       COMPLETE|    2013-01-25|
|       6|07-25-2013|   7130|       COMPLETE|    2013-01-25|
|       7|07-25-2013|   4530|       COMPLETE|    2013-01-25|
|       8|07-25-2013|   2911|     PROCESSING|    2013-01-25|
|       9|07-25-2013|   5657|PENDING_PAYMENT|    2013-01-25|
|      10|07-25-2013|   5648|PENDING_PAYMENT|    2013-01-25|
+--------+----------+-------+---------------+--------------+



In [27]:
new_df.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- cust_id: long (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_date_new: date (nullable = true)



#### in case of data mis match it will give null

In [28]:
order_schema = 'order_id string,order_date string,cust_id int,order_status string'

In [29]:
orderDf = spark.read \
.format("csv") \
.schema(order_schema) \
.load("/public/trendytech/datasets/orders_sample3.csv")

In [30]:
orderDf.show()

+--------+----------+-------+---------------+
|order_id|order_date|cust_id|   order_status|
+--------+----------+-------+---------------+
|       1|2013-07-25|  11599|         CLOSED|
|       2|2013-07-25|    256|PENDING_PAYMENT|
|       3|2013-07-25|  12111|       COMPLETE|
|       4|2013-07-25|   8827|         CLOSED|
|       5|2013-07-25|  11318|       COMPLETE|
|       6|2013-07-25|   7130|       COMPLETE|
|       7|2013-07-25|   null|       COMPLETE|
|       8|2013-07-25|   2911|     PROCESSING|
|       9|2013-07-25|   null|PENDING_PAYMENT|
|      10|2013-07-25|   5648|PENDING_PAYMENT|
+--------+----------+-------+---------------+



## Read Modes

Failfast->it fail when parssing issue
, permissive(default)->it will parse but for bad data it will give null
, dropmalformed-> drop mal formed record

In [31]:
orderDf = spark.read \
.format("csv") \
.schema(order_schema) \
.option("mode","dropmalformed") \
.load("/public/trendytech/datasets/orders_sample3.csv")

In [32]:
orderDf.show()

+--------+----------+-------+---------------+
|order_id|order_date|cust_id|   order_status|
+--------+----------+-------+---------------+
|       1|2013-07-25|  11599|         CLOSED|
|       2|2013-07-25|    256|PENDING_PAYMENT|
|       3|2013-07-25|  12111|       COMPLETE|
|       4|2013-07-25|   8827|         CLOSED|
|       5|2013-07-25|  11318|       COMPLETE|
|       6|2013-07-25|   7130|       COMPLETE|
|       8|2013-07-25|   2911|     PROCESSING|
|      10|2013-07-25|   5648|PENDING_PAYMENT|
+--------+----------+-------+---------------+



In [33]:
orderDf = spark.read \
.format("csv") \
.schema(order_schema) \
.option("mode","failfast") \
.load("/public/trendytech/datasets/orders_sample3.csv")

In [34]:
orderDf.show()

Py4JJavaError: An error occurred while calling o176.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 11.0 failed 4 times, most recent failure: Lost task 0.3 in stage 11.0 (TID 34) (w01.itversity.com executor 2): org.apache.spark.SparkException: Malformed records are detected in record parsing. Parse Mode: FAILFAST. To process malformed records as null result, try setting the option 'mode' as 'PERMISSIVE'.
	at org.apache.spark.sql.catalyst.util.FailureSafeParser.parse(FailureSafeParser.scala:70)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser$.$anonfun$parseIterator$2(UnivocityParser.scala:400)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:484)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:490)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:93)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:755)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:345)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:898)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:898)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.sql.catalyst.util.BadRecordException: java.lang.NumberFormatException: For input string: "error"
	at org.apache.spark.sql.catalyst.csv.UnivocityParser.org$apache$spark$sql$catalyst$csv$UnivocityParser$$convert(UnivocityParser.scala:309)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser.$anonfun$parse$2(UnivocityParser.scala:254)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser$.$anonfun$parseIterator$1(UnivocityParser.scala:396)
	at org.apache.spark.sql.catalyst.util.FailureSafeParser.parse(FailureSafeParser.scala:60)
	... 23 more
Caused by: java.lang.NumberFormatException: For input string: "error"
	at java.lang.NumberFormatException.forInputString(NumberFormatException.java:65)
	at java.lang.Integer.parseInt(Integer.java:580)
	at java.lang.Integer.parseInt(Integer.java:615)
	at scala.collection.immutable.StringLike.toInt(StringLike.scala:304)
	at scala.collection.immutable.StringLike.toInt$(StringLike.scala:304)
	at scala.collection.immutable.StringOps.toInt(StringOps.scala:33)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser.$anonfun$makeConverter$6(UnivocityParser.scala:157)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser.$anonfun$makeConverter$6$adapted(UnivocityParser.scala:157)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser.nullSafeDatum(UnivocityParser.scala:238)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser.$anonfun$makeConverter$5(UnivocityParser.scala:157)
	at org.apache.spark.sql.catalyst.csv.UnivocityParser.org$apache$spark$sql$catalyst$csv$UnivocityParser$$convert(UnivocityParser.scala:291)
	... 26 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2258)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2207)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2206)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2206)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1079)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1079)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1079)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2445)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2387)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2376)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:868)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2196)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2217)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2236)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:472)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:425)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:47)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3696)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2722)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3687)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3685)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2722)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2929)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:301)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:338)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:750)
