In [1]:
import os
import sys
# Point this kernel at the cluster's Python interpreter, JVM and Spark parcel.
os.environ["PYSPARK_PYTHON"] = "/opt/cloudera/parcels/Anaconda/bin/python"
os.environ["JAVA_HOME"] = "/usr/java/jdk1.8.0_161/jre"
os.environ["SPARK_HOME"] = "/opt/cloudera/parcels/SPARK2-2.3.0.cloudera2-1.cdh5.13.3.p0.316101/lib/spark2/"

# SPARK_HOME ends with a slash, so plain string concatenation produced
# ".../spark2//python/lib"; os.path.join yields a clean path instead.
os.environ["PYLIB"] = os.path.join(os.environ["SPARK_HOME"], "python", "lib")

# Put the bundled py4j and pyspark archives at the front of sys.path.
# pyspark.zip is handled last so that it ends up ahead of py4j, as before.
# The membership check keeps sys.path from growing when the cell is re-run.
for archive in ("py4j-0.10.6-src.zip", "pyspark.zip"):
    archive_path = os.path.join(os.environ["PYLIB"], archive)
    if archive_path not in sys.path:
        sys.path.insert(0, archive_path)

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Build (or fetch the already-running) Hive-enabled session named 'demo',
# using the "local" master.
spark = (
    SparkSession.builder
    .appName('demo')
    .master("local")
    .enableHiveSupport()
    .getOrCreate()
)
# Leave the session as the last expression so the notebook displays it.
spark

In [None]:
# Load users.csv into a DataFrame: comma-separated, first row is the header,
# and column types are inferred from the data instead of being declared.
df = (
    spark.read
    .format("csv")
    .options(sep=",", inferSchema="true", header="true")
    .load("users.csv")
)


In [None]:
# Print the column names, types and nullability of the DataFrame.
df.printSchema()

In [None]:
# Display the rows of the DataFrame as a text table.
df.show()

In [None]:
#Specifying the schema instead of inferring it 
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, BooleanType, DoubleType, LongType

# Declare the three columns explicitly; every field is nullable.
fileSchema = (
    StructType()
    .add('name', StringType(), True)
    .add('age', LongType(), True)
    .add('job_title', StringType(), True)
)

# Read the same CSV again, this time applying the declared schema
# rather than asking Spark to infer one.
df2 = (
    spark.read
    .format("csv")
    .schema(fileSchema)
    .options(sep=",", header="true")
    .load("users.csv")
)


In [None]:
# Print the schema of df2 to check the explicitly declared column types.
df2.printSchema()

In [None]:
# Display the rows of df2 as a text table.
df2.show()

In [None]:
# Store this DataFrame in Parquet format.
# The writer's default save mode raises an error when the target path already
# exists, which made this cell fail on every run after the first; overwrite
# explicitly so the notebook survives Restart & Run All.
df.write.mode("overwrite").parquet("users_2.parquet")

In [None]:
# Store this DataFrame in JSON format.
# The writer's default save mode raises an error when the target path already
# exists, which made this cell fail on every run after the first; overwrite
# explicitly so the notebook survives Restart & Run All.
df.write.mode("overwrite").json("users_2.json")

In [None]:
# Store this DataFrame in ORC format.
# The writer's default save mode raises an error when the target path already
# exists, which made this cell fail on every run after the first; overwrite
# explicitly so the notebook survives Restart & Run All.
df.write.mode("overwrite").orc("users_2.orc")

In [None]:
# Build a DataFrame from the JSON output, letting Spark infer the schema.
df = spark.read.json(path="users_2.json")

In [None]:
# Print the schema of the JSON-backed DataFrame.
df.printSchema()

In [None]:
# Display the rows read from the JSON output.
df.show()

In [None]:
# Specify the schema explicitly when reading JSON; every field is nullable.
fileSchema = (
    StructType()
    .add('name', StringType(), True)
    .add('age', IntegerType(), True)
    .add('job', StringType(), True)
)

# Attach the declared schema to the reader before loading the JSON output.
df2 = spark.read.schema(fileSchema).json("users_2.json")


In [None]:
# Print the schema of df2 to check the explicitly declared column types.
df2.printSchema()

In [None]:
# Display the rows of df2 as a text table.
df2.show()

In [None]:
# Read the Parquet output (users_2.parquet) back into a DataFrame.
df = spark.read.parquet("users_2.parquet")

In [None]:
# Print the schema of the Parquet-backed DataFrame.
df.printSchema()

In [None]:
# Display the rows read from the Parquet output.
df.show()

In [None]:
# Read the ORC output (users_2.orc) back into a DataFrame.
df = spark.read.orc("users_2.orc")

In [None]:
# Print the schema of the ORC-backed DataFrame.
df.printSchema()

In [None]:
# Display the rows read from the ORC output.
df.show()

In [24]:
# Set up NumPy and pandas
import numpy as np
import pandas as pd

# Optional: uncomment the next line to enable Arrow-based columnar data transfers (requires pyarrow)
#spark.conf.set("spark.sql.execution.arrow.enabled", "true")

In [25]:
# Generate a 10x3 pandas DataFrame of uniform random numbers in [0, 1).
# A dedicated, seeded RandomState makes the values identical on every
# Restart & Run All without touching NumPy's global random state.
pdf = pd.DataFrame(np.random.RandomState(42).rand(10, 3))

In [26]:
# Leave the frame as the cell's last expression so Jupyter renders it as a
# table, rather than print()'s plain-text dump.
pdf

          0         1         2
0  0.941967  0.788411  0.647896
1  0.301035  0.863868  0.765032
2  0.824416  0.829005  0.118239
3  0.938813  0.576990  0.257522
4  0.000067  0.824416  0.633032
5  0.452181  0.331384  0.361250
6  0.297885  0.623081  0.734220
7  0.380458  0.763270  0.699136
8  0.659933  0.210418  0.696564
9  0.892335  0.111163  0.936980


In [27]:
# Build a Spark DataFrame from the pandas DataFrame `pdf`.
# Optional dependency: pip install pyarrow
# NOTE(review): pyarrow is presumably only needed when Arrow transfers are enabled — confirm.
df = spark.createDataFrame(pdf)

In [29]:
# Print the schema, then the rows, of the Spark DataFrame built from pandas.
df.printSchema()
df.show()

root
 |-- 0: double (nullable = true)
 |-- 1: double (nullable = true)
 |-- 2: double (nullable = true)

+--------------------+-------------------+------------------+
|                   0|                  1|                 2|
+--------------------+-------------------+------------------+
|   0.941967017531824| 0.7884106710442891|0.6478957837686703|
| 0.30103509302473985| 0.8638681761162179|0.7650321588194177|
|  0.8244161506430402| 0.8290051903795865|0.1182390449199483|
|  0.9388129227481385| 0.5769901140456242|0.2575220496960423|
|6.708929811183317E-5| 0.8244157959822621|0.6330319515096822|
|  0.4521805781549154|0.33138388195168833|0.3612499875455727|
|  0.2978852939161081| 0.6230806977080089|0.7342204127915506|
| 0.38045792434330783| 0.7632701370534474|0.6991361824284357|
|  0.6599326479266845|0.21041825867361974| 0.696564021682488|
|  0.8923346720942692|0.11116324499733143|0.9369804834622025|
+--------------------+-------------------+------------------+



In [30]:
# Convert the Spark DataFrame back into a pandas DataFrame.
# PyArrow is only needed when spark.sql.execution.arrow.enabled is switched on
# (the line that would enable it earlier in this notebook is commented out);
# if that option is enabled, install it first with `pip install pyarrow`.
# select("*") was a no-op projection, so toPandas() is called on df directly.
result_pdf = df.toPandas()
# Last expression of the cell: rendered by Jupyter as a table.
result_pdf

          0         1         2
0  0.941967  0.788411  0.647896
1  0.301035  0.863868  0.765032
2  0.824416  0.829005  0.118239
3  0.938813  0.576990  0.257522
4  0.000067  0.824416  0.633032
5  0.452181  0.331384  0.361250
6  0.297885  0.623081  0.734220
7  0.380458  0.763270  0.699136
8  0.659933  0.210418  0.696564
9  0.892335  0.111163  0.936980


In [None]:
###### Operations on DataFrames

In [31]:
### Reload the users data from the ORC output, then select all columns and display them
df = spark.read.orc("users_2.orc")
df.select("*").show()

+-------+---+----------+
|   name|age|       job|
+-------+---+----------+
| Vishwa| 61|  Engineer|
|  Mohan| 79|    Doctor|
|Rishavv| 21|   Student|
|Shivani| 69|Consultant|
| Sachin| 35| Cricketer|
|  Rohit| 31|   Captain|
|  Virat| 32|   Blogger|
| Akshay| 45|     Actor|
|Amitabh| 70| Superstar|
+-------+---+----------+



In [32]:
### Select a single column ("name") from the DataFrame and display it
df.select("name").show()

+-------+
|   name|
+-------+
| Vishwa|
|  Mohan|
|Rishavv|
|Shivani|
| Sachin|
|  Rohit|
|  Virat|
| Akshay|
|Amitabh|
+-------+



In [33]:
# Filter: keep only the rows whose age is above 50 and collect them
# into a list of Row objects.
df.where(df.age > 50).collect()

[Row(name=u'Vishwa', age=61, job=u'Engineer'),
 Row(name=u'Mohan', age=79, job=u'Doctor'),
 Row(name=u'Shivani', age=69, job=u'Consultant'),
 Row(name=u'Amitabh', age=70, job=u'Superstar')]

In [34]:
# Group by: count how many rows share each distinct value of "age".
df.groupBy("age").count().show()

+---+-----+
|age|count|
+---+-----+
| 31|    1|
| 61|    1|
| 35|    1|
| 69|    1|
| 45|    1|
| 70|    1|
| 21|    1|
| 32|    1|
| 79|    1|
+---+-----+



In [35]:
# Order by: display the rows sorted by age, highest first.
df.sort(df['age'].desc()).show()

+-------+---+----------+
|   name|age|       job|
+-------+---+----------+
|  Mohan| 79|    Doctor|
|Amitabh| 70| Superstar|
|Shivani| 69|Consultant|
| Vishwa| 61|  Engineer|
| Akshay| 45|     Actor|
| Sachin| 35| Cricketer|
|  Virat| 32|   Blogger|
|  Rohit| 31|   Captain|
|Rishavv| 21|   Student|
+-------+---+----------+



In [None]:
################################  Spark SQL   ##########################