In [1]:
import os

In [2]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark

In [3]:
import os
# Export the Java and Spark install locations for this process.
# NOTE(review): SPARK_HOME assumes the tarball was unpacked under /content
# (the Colab working directory) - confirm when running elsewhere.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.1.1-bin-hadoop3.2",
    }
)

In [4]:
!pip install pyspark==3.1.1

Collecting pyspark==3.1.1
  Downloading pyspark-3.1.1.tar.gz (212.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.3/212.3 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9 (from pyspark==3.1.1)
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m198.6/198.6 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.1-py2.py3-none-any.whl size=212767582 sha256=b73691cfa66e10d91f6db6f8221edc7b7c25956a0d0df6bffdc56e7b625f3482
  Stored in directory: /root/.cache/pip/wheels/a0/3f/72/8efd988f9ae041f051c75e6834cd92dd6d13a726e206e8b6f3
Successfully built pyspark
Installing collected packages: py4j, pyspark
  Attempting uninstall: py4j
    Found existing installation: py4j 0.10

In [5]:
import pyspark

In [6]:
import findspark

In [7]:
# Initialise findspark so the pyspark package can be located in this session.
findspark.init()

In [8]:
from pyspark.sql import SparkSession

In [9]:
# Create (or reuse) the SparkSession that every later cell relies on.
# It is built unconditionally: a notebook cell already runs as __main__, and
# guarding the assignment would only risk leaving `spark` undefined for the
# cells below, which all use it without a guard.
spark = (
    SparkSession.builder
    .appName("myapplication")
    .master("local[*]")
    .getOrCreate()
)

In [10]:
# Bare expression: the notebook renders the session object as the cell result.
spark

In [12]:
from pyspark.sql import SparkSession

In [13]:
from pyspark.sql.types import StructType,StructField,StringType,IntegerType,DecimalType,DateType,TimestampType

In [14]:
# Explicit schema for the transactions data: (column name, Spark type) pairs in
# column order. No nullability argument is passed, so StructField's default
# applies to every field.
tnx_schema = StructType(
    [
        StructField(col_name, col_type)
        for col_name, col_type in (
            ("tnx_id", IntegerType()),
            ("tnx_dt", DateType()),
            ("cid", IntegerType()),
            ("amount", DecimalType(10, 2)),
            ("prod_cat", StringType()),
            ("prod", StringType()),
            ("city", StringType()),
            ("state", StringType()),
            ("mode", StringType()),
        )
    ]
)

In [16]:
# Load the transactions CSV with the explicit tnx_schema instead of inferring
# types. The first line is treated as a header, and date columns are parsed
# with the month-day-year pattern given in dateFormat.
tnx_df = (
    spark.read
    .schema(tnx_schema)
    .options(header=True, dateFormat="MM-dd-yyyy")
    .csv("/content/txns_with_header.csv")
)

In [17]:
# Preview the first five rows to check that the schema and date format applied.
tnx_df.show(5)

+------+----------+-------+------+------------------+--------------------+-----------+----------+------+
|tnx_id|    tnx_dt|    cid|amount|          prod_cat|                prod|       city|     state|  mode|
+------+----------+-------+------+------------------+--------------------+-----------+----------+------+
|     0|2011-06-26|4007024| 40.33|Exercise & Fitness|Cardio Machine Ac...|Clarksville| Tennessee|credit|
|     1|2011-05-26|4006742|198.44|Exercise & Fitness|Weightlifting Gloves| Long Beach|California|credit|
|     2|2011-06-01|4009775|  5.58|Exercise & Fitness|Weightlifting Mac...|    Anaheim|California|credit|
|     3|2011-06-05|4002199|198.19|        Gymnastics|    Gymnastics Rings|  Milwaukee| Wisconsin|credit|
|     4|2011-12-17|4002613| 98.81|       Team Sports|        Field Hockey|Nashville  | Tennessee|credit|
+------+----------+-------+------+------------------+--------------------+-----------+----------+------+
only showing top 5 rows



In [20]:
# Persist the transactions as JSON; "overwrite" replaces any existing output
# at the target path, so the cell can be re-run safely.
tnx_df.write.save("/content/data_json", format="json", mode="overwrite")

In [26]:
# Persist the same transactions as Parquet; "overwrite" replaces any existing
# output at the target path, so the cell can be re-run safely.
tnx_df.write.save("/content/data_parquet", format="parquet", mode="overwrite")

In [27]:
# Read the Parquet output back and print it. No format is given, so Spark uses
# its default data source (presumably parquet here - it is unless
# spark.sql.sources.default was changed). show() with no argument prints at
# most 20 rows.
spark.read.load("/content/data_parquet").show()

+------+----------+-------+------+--------------------+--------------------+--------------+--------------+------+
|tnx_id|    tnx_dt|    cid|amount|            prod_cat|                prod|          city|         state|  mode|
+------+----------+-------+------+--------------------+--------------------+--------------+--------------+------+
|     0|2011-06-26|4007024| 40.33|  Exercise & Fitness|Cardio Machine Ac...|   Clarksville|     Tennessee|credit|
|     1|2011-05-26|4006742|198.44|  Exercise & Fitness|Weightlifting Gloves|    Long Beach|    California|credit|
|     2|2011-06-01|4009775|  5.58|  Exercise & Fitness|Weightlifting Mac...|       Anaheim|    California|credit|
|     3|2011-06-05|4002199|198.19|          Gymnastics|    Gymnastics Rings|     Milwaukee|     Wisconsin|credit|
|     4|2011-12-17|4002613| 98.81|         Team Sports|        Field Hockey|   Nashville  |     Tennessee|credit|
|     5|2011-02-14|4007591|193.63|  Outdoor Recreation|Camping & Backpac...|       Chica

In [28]:
# Keep the Parquet-backed DataFrame under its own name so its schema and rows
# can be inspected in the following cells.
tnx_parquet_df = spark.read.load("/content/data_parquet")

In [29]:
# Print the schema read back from Parquet, to compare against tnx_schema.
tnx_parquet_df.printSchema()

root
 |-- tnx_id: integer (nullable = true)
 |-- tnx_dt: date (nullable = true)
 |-- cid: integer (nullable = true)
 |-- amount: decimal(10,2) (nullable = true)
 |-- prod_cat: string (nullable = true)
 |-- prod: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- mode: string (nullable = true)



In [30]:
# Spot-check the first two rows read back from Parquet.
tnx_parquet_df.show(2)

+------+----------+-------+------+------------------+--------------------+-----------+----------+------+
|tnx_id|    tnx_dt|    cid|amount|          prod_cat|                prod|       city|     state|  mode|
+------+----------+-------+------+------------------+--------------------+-----------+----------+------+
|     0|2011-06-26|4007024| 40.33|Exercise & Fitness|Cardio Machine Ac...|Clarksville| Tennessee|credit|
|     1|2011-05-26|4006742|198.44|Exercise & Fitness|Weightlifting Gloves| Long Beach|California|credit|
+------+----------+-------+------+------------------+--------------------+-----------+----------+------+
only showing top 2 rows



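When reading JSON, the rows are not read back properly (the columns come back in alphabetical order), and the schema is not read properly either: the types are re-inferred, so `amount` becomes double, `cid` and `tnx_id` become long, and `tnx_dt` becomes string.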

```
# This is formatted as code
```



In [31]:
# Read the JSON export back. "header" and "inferSchema" are CSV reader options
# that the JSON source does not use, so they are not passed here. No schema is
# supplied, so Spark infers one from the data: the columns come back in
# alphabetical order with amount as double, the id columns as long, and tnx_dt
# as string. Add .schema(tnx_schema) before .load() to get the original types
# and column order instead.
json_format_read = spark.read.format("json").load("/content/data_json")

In [32]:
# Print the schema Spark inferred from the JSON files, to compare with the
# Parquet schema.
json_format_read.printSchema()

root
 |-- amount: double (nullable = true)
 |-- cid: long (nullable = true)
 |-- city: string (nullable = true)
 |-- mode: string (nullable = true)
 |-- prod: string (nullable = true)
 |-- prod_cat: string (nullable = true)
 |-- state: string (nullable = true)
 |-- tnx_dt: string (nullable = true)
 |-- tnx_id: long (nullable = true)



In [34]:
# Print the rows read back from JSON (show() with no argument prints at most
# 20 rows).
json_format_read.show()

+------+-------+--------------+------+--------------------+--------------------+--------------+----------+------+
|amount|    cid|          city|  mode|                prod|            prod_cat|         state|    tnx_dt|tnx_id|
+------+-------+--------------+------+--------------------+--------------------+--------------+----------+------+
| 40.33|4007024|   Clarksville|credit|Cardio Machine Ac...|  Exercise & Fitness|     Tennessee|2011-06-26|     0|
|198.44|4006742|    Long Beach|credit|Weightlifting Gloves|  Exercise & Fitness|    California|2011-05-26|     1|
|  5.58|4009775|       Anaheim|credit|Weightlifting Mac...|  Exercise & Fitness|    California|2011-06-01|     2|
|198.19|4002199|     Milwaukee|credit|    Gymnastics Rings|          Gymnastics|     Wisconsin|2011-06-05|     3|
| 98.81|4002613|   Nashville  |credit|        Field Hockey|         Team Sports|     Tennessee|2011-12-17|     4|
|193.63|4007591|       Chicago|credit|Camping & Backpac...|  Outdoor Recreation|      Il