In [62]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.context import SparkContext
from pyspark.sql.types import StructType, StructField, StringType
import os
import time

In [22]:
# Spark session in local mode: driver and executors run in this single
# process, using as many worker threads as there are cores ("local[*]").
# The web UI port is set through the "spark.ui.port" property; Spark only
# reads UI settings from keys in the "spark." namespace.
# getOrCreate() returns an already-running session if there is one, in which
# case the UI port of that existing session is not changed.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Pyspark SQL")
    .config("spark.ui.port", "50032")
    .getOrCreate()
)

In [27]:
# Handle to the SparkContext behind the session; the RDD-based word count
# cells in this notebook read their input through it.
sc = spark.sparkContext
# Set the Spark log level to WARN to keep cell output readable.
sc.setLogLevel("WARN")

In [69]:
# Word count, variant 1: count with countByValue().
# countByValue() is an action: it returns the per-word counts to the driver
# as a dict, so this variant only suits inputs with a modest number of
# distinct words.
# The dict is turned back into an RDD and coalesced to a single partition so
# that saveAsTextFile() writes one part file instead of one per partition.
# The author reports a 40-45% shorter run time with coalesce(1) on this data
# set; that figure has not been re-measured here.
# NOTE: saveAsTextFile() raises if "output/word_cnt" already exists, so the
# directory has to be removed before this cell is re-run.
start_time = time.time()
lines = sc.textFile("data/file.txt")
# split() without an argument splits on any run of whitespace and never yields
# empty strings, so repeated spaces and blank lines are not counted as a ""
# word (split(" ") would count them).
result = lines.flatMap(lambda line: line.split()).countByValue()
sc.parallelize(list(result.items())).coalesce(1).saveAsTextFile("output/word_cnt")
print("File completed:", time.time() - start_time)

File completed: 0.7937862873077393


In [70]:
# Word count, variant 2: map each word to (word, 1) and sum with reduceByKey().
# The counting stays distributed; nothing is returned to the driver.
# coalesce(1) merges the result into a single partition so that
# saveAsTextFile() writes one part file. The author reports roughly 50% less
# run time with it on this data set; that figure has not been re-measured here.
# NOTE: saveAsTextFile() raises if "output/word_cnt2" already exists, so the
# directory has to be removed before this cell is re-run.
start_time = time.time()
lines = sc.textFile("data/file.txt")
# split() without an argument splits on any run of whitespace and never yields
# empty strings, so repeated spaces and blank lines are not counted as a ""
# word (split(" ") would count them).
result = (
    lines.flatMap(lambda line: line.split())
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
)
result.coalesce(1).saveAsTextFile("output/word_cnt2")
print("File completed:", time.time() - start_time)

File completed: 0.9056434631347656


In [24]:
# Load the raw employee data from JSON. The flattening helper in this notebook
# indexes every field by position (row.empno[i], row.ename[i], ...), so each
# field is presumably an array with one entry per employee -- verify against
# the file.
df = spark.read.json("data/emp.json")

In [25]:
def convert_empdf_collection(df):
    """Flatten a column-oriented employee frame into per-employee tuples.

    Every row returned by ``df.collect()`` is expected to expose the
    attributes ``empno``, ``ename``, ``job``, ``mgr``, ``hiredate``, ``sal``,
    ``comm`` and ``deptno`` as index-aligned sequences. For each position in
    ``row.empno`` one tuple is produced, holding the values at that position
    in the field order listed above.

    Args:
        df: Object with a ``collect()`` method returning such rows. All rows
            are brought to the driver by ``collect()``.

    Returns:
        list[tuple]: One 8-tuple per employee, in row order and then position
        order.

    Raises:
        IndexError: If any field sequence is shorter than ``row.empno``.
    """
    return [
        (
            record.empno[pos],
            record.ename[pos],
            record.job[pos],
            record.mgr[pos],
            record.hiredate[pos],
            record.sal[pos],
            record.comm[pos],
            record.deptno[pos],
        )
        for record in df.collect()
        # Positions are driven by empno; the other fields are indexed to match.
        for pos in range(len(record.empno))
    ]

In [26]:
# Flatten the raw JSON frame into per-employee tuples, then build a DataFrame
# with one row per employee and display it.
emp_columns = ("empno", "ename", "job", "mgr", "hiredate", "sal", "comm", "deptno")
emp_t = convert_empdf_collection(df)
emp = spark.createDataFrame(emp_t, emp_columns)
emp.show()

+-----+------+---------+------+----------+------+------+------+
|empno| ename|      job|   mgr|  hiredate|   sal|  comm|deptno|
+-----+------+---------+------+----------+------+------+------+
| 7369| SMITH|    CLERK|7902.0|1980-12-17| 800.0|  null|    20|
| 7499| ALLEN| SALESMAN|7698.0|1981-02-20|1600.0| 300.0|    30|
| 7876| ADAMS|    CLERK|7788.0|1983-01-12|1100.0|  null|    20|
| 7900| JAMES|    CLERK|7698.0|1981-12-03| 950.0|  null|    30|
| 7902|  FORD|  ANALYST|7566.0|1981-12-03|3000.0|  null|    20|
| 7934|MILLER|    CLERK|7782.0|1982-01-23|1300.0|  null|    10|
| 7521|  WARD| SALESMAN|7698.0|1981-02-22|1250.0| 500.0|    30|
| 7566| JONES|  MANAGER|7839.0|1981-04-02|2975.0|  null|    20|
| 7654|MARTIN| SALESMAN|7698.0|1981-09-28|1250.0|1400.0|    30|
| 7698| BLAKE|  MANAGER|7839.0|1981-05-01|2850.0|  null|    30|
| 7782| CLARK|  MANAGER|7839.0|1981-06-09|2450.0|  null|    10|
| 7788| SCOTT|  ANALYST|7566.0|1982-12-09|3000.0|  null|    20|
| 7839|  KING|PRESIDENT|  null|1981-11-1

In [17]:
# Show only the employees of department 20.
emp.filter(emp.deptno == 20).show()

+-----+-----+-------+------+----------+------+----+------+
|empno|ename|    job|   mgr|  hiredate|   sal|comm|deptno|
+-----+-----+-------+------+----------+------+----+------+
| 7369|SMITH|  CLERK|7902.0|1980-12-17| 800.0|null|    20|
| 7876|ADAMS|  CLERK|7788.0|1983-01-12|1100.0|null|    20|
| 7902| FORD|ANALYST|7566.0|1981-12-03|3000.0|null|    20|
| 7566|JONES|MANAGER|7839.0|1981-04-02|2975.0|null|    20|
| 7788|SCOTT|ANALYST|7566.0|1982-12-09|3000.0|null|    20|
+-----+-----+-------+------+----------+------+----+------+



In [18]:
# Read the employee data from a Parquet file.
emp_p = spark.read.parquet("data/emp.parquet")
# Bare expression: the notebook displays the DataFrame's repr, i.e. its column
# names and types.
emp_p

DataFrame[empno: bigint, ename: string, job: string, mgr: double, hiredate: string, sal: double, comm: double, deptno: bigint]

In [19]:
# Show only the employees of department 30 from the Parquet-backed frame.
emp_p.filter(emp_p.deptno == 30).show()

+-----+------+--------+------+----------+------+------+------+
|empno| ename|     job|   mgr|  hiredate|   sal|  comm|deptno|
+-----+------+--------+------+----------+------+------+------+
| 7499| ALLEN|SALESMAN|7698.0|1981-02-20|1600.0| 300.0|    30|
| 7521|  WARD|SALESMAN|7698.0|1981-02-22|1250.0| 500.0|    30|
| 7654|MARTIN|SALESMAN|7698.0|1981-09-28|1250.0|1400.0|    30|
| 7698| BLAKE| MANAGER|7839.0|1981-05-01|2850.0|  null|    30|
| 7844|TURNER|SALESMAN|7698.0|1981-09-08|1500.0|   0.0|    30|
| 7900| JAMES|   CLERK|7698.0|1981-12-03| 950.0|  null|    30|
+-----+------+--------+------+----------+------+------+------+



In [20]:
# Stop the Spark session created at the top of the notebook.
spark.stop()