In [1]:
from pyspark.sql import SparkSession
import os

# Point Spark at a local Hadoop/winutils directory (presumably required for Spark on Windows — confirm).
# setdefault() respects a HADOOP_HOME that is already configured in the environment and only
# falls back to this machine-specific path when the variable is unset, so the notebook does not
# silently clobber a working setup on another machine.
os.environ.setdefault(
    "HADOOP_HOME",
    "C:\\Users\\SkJain\\Downloads\\Compressed\\winutils-master\\hadoop-3.2.2",
)

## Creating SparkSession object

In [2]:
# Build the SparkSession used by every cell below (getOrCreate() reuses an existing session if there is one).
spark = (
    SparkSession.builder
    .appName('Reading Data From text files')
    .master('local')
    .getOrCreate()
)

### Reading file using read.csv

In [3]:
# Relative glob pattern passed to every reader below; it matches all files under datasets/orders.
filepath = 'datasets/orders/*'

In [4]:
# No schema is supplied here, so the columns come back auto-named (_c0, _c1, ...)
# and every value is read as a string.
orders_df = spark.read.csv(path=filepath)
orders_df.show()

+---+--------------------+-----+---------------+
|_c0|                 _c1|  _c2|            _c3|
+---+--------------------+-----+---------------+
|  1|2013-07-25 00:00:...|11599|         CLOSED|
|  2|2013-07-25 00:00:...|  256|PENDING_PAYMENT|
|  3|2013-07-25 00:00:...|12111|       COMPLETE|
|  4|2013-07-25 00:00:...| 8827|         CLOSED|
|  5|2013-07-25 00:00:...|11318|       COMPLETE|
|  6|2013-07-25 00:00:...| 7130|       COMPLETE|
|  7|2013-07-25 00:00:...| 4530|       COMPLETE|
|  8|2013-07-25 00:00:...| 2911|     PROCESSING|
|  9|2013-07-25 00:00:...| 5657|PENDING_PAYMENT|
| 10|2013-07-25 00:00:...| 5648|PENDING_PAYMENT|
| 11|2013-07-25 00:00:...|  918| PAYMENT_REVIEW|
| 12|2013-07-25 00:00:...| 1837|         CLOSED|
| 13|2013-07-25 00:00:...| 9149|PENDING_PAYMENT|
| 14|2013-07-25 00:00:...| 9842|     PROCESSING|
| 15|2013-07-25 00:00:...| 2568|       COMPLETE|
| 16|2013-07-25 00:00:...| 7276|PENDING_PAYMENT|
| 17|2013-07-25 00:00:...| 2667|       COMPLETE|
| 18|2013-07-25 00:0

In [5]:
orders_df.printSchema()  # no schema was supplied: columns are auto-named _c0.._c3 and all typed as string

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)



### Reading file using read.csv with an explicit schema (column names and data types)

In [7]:
# DDL-style schema string: each entry is "<column name> <type>".
orders_schema = ', '.join([
    'order_id int',
    'order_date string',
    'order_customer_id int',
    'order_status string',
])

# Passing the schema at read time yields named, typed columns without a separate cast step.
orders_formatted_df = spark.read.csv(path=filepath, schema=orders_schema, sep=',')
orders_formatted_df.show()

+--------+--------------------+-----------------+---------------+
|order_id|          order_date|order_customer_id|   order_status|
+--------+--------------------+-----------------+---------------+
|       1|2013-07-25 00:00:...|            11599|         CLOSED|
|       2|2013-07-25 00:00:...|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|            12111|       COMPLETE|
|       4|2013-07-25 00:00:...|             8827|         CLOSED|
|       5|2013-07-25 00:00:...|            11318|       COMPLETE|
|       6|2013-07-25 00:00:...|             7130|       COMPLETE|
|       7|2013-07-25 00:00:...|             4530|       COMPLETE|
|       8|2013-07-25 00:00:...|             2911|     PROCESSING|
|       9|2013-07-25 00:00:...|             5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:...|             5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:...|              918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:...|             1837|         CLOSED|
|      13|

In [8]:
orders_formatted_df.printSchema()  # column names and types now follow orders_schema

root
 |-- order_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



### Providing only column names (no data types)

In [11]:
# toDF() renames the auto-generated columns positionally; it does not change their data types.
orders_cols_df = (
    spark.read
    .csv(path=filepath, sep=',')
    .toDF('order_id', 'order_Date', 'order_cust_id', 'order_status')
)
orders_cols_df.show()

+--------+--------------------+-------------+---------------+
|order_id|          order_Date|order_cust_id|   order_status|
+--------+--------------------+-------------+---------------+
|       1|2013-07-25 00:00:...|        11599|         CLOSED|
|       2|2013-07-25 00:00:...|          256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|        12111|       COMPLETE|
|       4|2013-07-25 00:00:...|         8827|         CLOSED|
|       5|2013-07-25 00:00:...|        11318|       COMPLETE|
|       6|2013-07-25 00:00:...|         7130|       COMPLETE|
|       7|2013-07-25 00:00:...|         4530|       COMPLETE|
|       8|2013-07-25 00:00:...|         2911|     PROCESSING|
|       9|2013-07-25 00:00:...|         5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:...|         5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:...|          918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:...|         1837|         CLOSED|
|      13|2013-07-25 00:00:...|         9149|PENDING_PAYMENT|
|      1

In [12]:
orders_cols_df.printSchema()  # toDF() only renamed the columns; every data type is still string

root
 |-- order_id: string (nullable = true)
 |-- order_Date: string (nullable = true)
 |-- order_cust_id: string (nullable = true)
 |-- order_status: string (nullable = true)



### Cast to correct datatypes after reading the file using select statement

In [23]:
from pyspark.sql.types import IntegerType

# select() re-projects the frame, so every column must be listed, including the
# ones passed through unchanged. The two id columns are cast from string to integer;
# cast() accepts either a DataType instance or a type-name string, and both are shown here.
order_cols_formatted = orders_cols_df.select(
    orders_cols_df['order_id'].cast(IntegerType()),
    orders_cols_df['order_Date'],
    orders_cols_df['order_cust_id'].cast("int"),
    orders_cols_df['order_status'],
)

In [25]:
# Preview the first five rows of the select()-based cast result.
order_cols_formatted.show(n=5)

+--------+--------------------+-------------+---------------+
|order_id|          order_Date|order_cust_id|   order_status|
+--------+--------------------+-------------+---------------+
|       1|2013-07-25 00:00:...|        11599|         CLOSED|
|       2|2013-07-25 00:00:...|          256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|        12111|       COMPLETE|
|       4|2013-07-25 00:00:...|         8827|         CLOSED|
|       5|2013-07-25 00:00:...|        11318|       COMPLETE|
+--------+--------------------+-------------+---------------+
only showing top 5 rows



In [26]:
order_cols_formatted.printSchema()  # order_id and order_cust_id are now integer after the casts

root
 |-- order_id: integer (nullable = true)
 |-- order_Date: string (nullable = true)
 |-- order_cust_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



### Cast to correct datatypes after reading the file using withColumn statement
select can be tedious because every column must be listed, including the ones that do not need typecasting.

In [31]:
# withColumn() called with an existing column name replaces that column, so only
# the columns that actually need a cast have to be mentioned.
order_with_dataTypes = (
    orders_cols_df
    .withColumn('order_id', orders_cols_df['order_id'].cast(IntegerType()))
    .withColumn('order_cust_id', orders_cols_df['order_cust_id'].cast(IntegerType()))
)

In [32]:
# Preview the first five rows of the withColumn()-based cast result.
order_with_dataTypes.show(n=5)

+--------+--------------------+-------------+---------------+
|order_id|          order_Date|order_cust_id|   order_status|
+--------+--------------------+-------------+---------------+
|       1|2013-07-25 00:00:...|        11599|         CLOSED|
|       2|2013-07-25 00:00:...|          256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|        12111|       COMPLETE|
|       4|2013-07-25 00:00:...|         8827|         CLOSED|
|       5|2013-07-25 00:00:...|        11318|       COMPLETE|
+--------+--------------------+-------------+---------------+
only showing top 5 rows



In [33]:
order_with_dataTypes.printSchema()  # same resulting schema as the select()-based cast

root
 |-- order_id: integer (nullable = true)
 |-- order_Date: string (nullable = true)
 |-- order_cust_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



### using spark.read.format

In [17]:
# Generic reader API: choose the format, set options, attach the schema, then load.
# Reuses the orders_schema string defined in the read.csv section.
orders_df2 = (
    spark.read
    .format('csv')
    .option('sep', ',')
    .schema(orders_schema)
    .load(filepath)
)

In [18]:
# Preview the first five rows loaded through the generic reader.
orders_df2.show(n=5)

+--------+--------------------+-----------------+---------------+
|order_id|          order_date|order_customer_id|   order_status|
+--------+--------------------+-----------------+---------------+
|       1|2013-07-25 00:00:...|            11599|         CLOSED|
|       2|2013-07-25 00:00:...|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|            12111|       COMPLETE|
|       4|2013-07-25 00:00:...|             8827|         CLOSED|
|       5|2013-07-25 00:00:...|            11318|       COMPLETE|
+--------+--------------------+-----------------+---------------+
only showing top 5 rows



### Using spark.read.text

In [35]:
#all 4 columns will come as a single record in string format
order_text_df = spark.read.text(filepath)
order_text_df.show(5, truncate=False)

+-------------------------------------------+
|value                                      |
+-------------------------------------------+
|1,2013-07-25 00:00:00.0,11599,CLOSED       |
|2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT|
|3,2013-07-25 00:00:00.0,12111,COMPLETE     |
|4,2013-07-25 00:00:00.0,8827,CLOSED        |
|5,2013-07-25 00:00:00.0,11318,COMPLETE     |
+-------------------------------------------+
only showing top 5 rows



### Reading hive tables from spark
Hive should be up and running, and configured with Spark.

In [None]:
# Read a table through the DataFrameReader API.
# 'dbname.tablename' looks like a placeholder — substitute a real database and table name.
# NOTE(review): the SparkSession in this notebook is built without enableHiveSupport();
# reading Hive tables presumably requires it — confirm before running this cell.
hiveTable_df = spark.read.table('dbname.tablename')

# Alternative: query the same table with Spark SQL.

hive_table_sql = spark.sql('SELECT * FROM dbname.tablename')
