In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.sql.functions import window,column,desc,col,instr,expr, pow,translate,lit
from pyspark.sql import Row
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.functions import unix_timestamp, from_unixtime
import pandas as pd

spark = SparkSession \
    .builder \
    .appName("Foo") \
    .config("spark.executor.memory", "1g") \
    .config("spark.driver.memory", "1g") \
    .getOrCreate()

## Creating a dataframe


### From row object

In [2]:
user_age=Row("Name","Age")

data=[user_age("sab",21),user_age("Ran",34)]
user_df = spark.createDataFrame(data)
user_df.show()

+----+---+
|Name|Age|
+----+---+
| sab| 21|
| Ran| 34|
+----+---+



### From list of tuples

In [3]:
mylist = [('Alice', 1), ('Bob', 2), ('Mary', 3),("Dab",9)]
df2 = spark.createDataFrame(mylist, ["Name" ,"Age"])
df2.show()

+-----+---+
| Name|Age|
+-----+---+
|Alice|  1|
|  Bob|  2|
| Mary|  3|
|  Dab|  9|
+-----+---+



### From rdd 

In [4]:
rdd = spark.sparkContext.parallelize([(1,2,3),(4,5,6),(7,8,9)])
df = rdd.toDF(["a","b","c"])
df.show()

+---+---+---+
|  a|  b|  c|
+---+---+---+
|  1|  2|  3|
|  4|  5|  6|
|  7|  8|  9|
+---+---+---+



## Working on a dataframe

In [5]:
pdf = spark.read.format("csv").option("header", "true").option("inferSchema", "true").option("delimiter", ",")\
.option("dateFormat", "MM/dd/YYYY HH:mm")\
.load("C:/Users/ramya/Desktop/Santa Clara University/Q3/PySpark/retail.csv")

pdf.head()

Row(InvoiceNo='536365', StockCode='85123A', Description='WHITE HANGING HEART T-LIGHT HOLDER', Quantity=6, InvoiceDate='12/1/2010 8:26', UnitPrice=2.55, CustomerID=17850, Country='United Kingdom')

In [6]:
pdf.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)



In [4]:
pdf.select(
    'InvoiceDate', 
    from_unixtime(unix_timestamp('InvoiceDate', 'mm/dd/YYYY HH:mm')).alias('date')
).show(4)

+--------------+-------------------+
|   InvoiceDate|               date|
+--------------+-------------------+
|12/1/2010 8:26|2009-12-27 08:26:00|
|12/1/2010 8:26|2009-12-27 08:26:00|
|12/1/2010 8:26|2009-12-27 08:26:00|
|12/1/2010 8:26|2009-12-27 08:26:00|
+--------------+-------------------+
only showing top 4 rows



In [5]:
pdf.show(3)

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/2010 8:26|     2.75|     17850|United Kingdom|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
only showing top 3 rows



In [6]:
from pyspark.sql.functions import countDistinct
pdf.groupBy(pdf.Quantity).agg(countDistinct("Country"),min("UnitPrice")).withColumnRenamed("min(UnitPrice)","miu").show(5)

+--------+-----------------------+----+
|Quantity|count(DISTINCT Country)| miu|
+--------+-----------------------+----+
|      34|                      1|1.66|
|      -1|                      2|1.45|
|      28|                      1|0.21|
|      27|                      1|1.45|
|     384|                      1|2.95|
+--------+-----------------------+----+
only showing top 5 rows



In [27]:
pdf.groupBy(pdf.Quantity).agg(countDistinct("Country"),min("UnitPrice")).\
withColumnRenamed("min(UnitPrice)","miu").filter(col("Quantity")>0).count()

46

In [8]:
pdf.select([count(when( col(c).isNull(), c)).alias(c) for c in pdf.columns]).show()

+---------+---------+-----------+--------+-----------+---------+----------+-------+
|InvoiceNo|StockCode|Description|Quantity|InvoiceDate|UnitPrice|CustomerID|Country|
+---------+---------+-----------+--------+-----------+---------+----------+-------+
|        0|        0|         10|       0|          0|        0|      1140|      0|
+---------+---------+-----------+--------+-----------+---------+----------+-------+



In [9]:
pdf.count()

3108

In [10]:
pdf.na.drop("any",subset=["Description","CustomerID"]).count()

1968

In [11]:
pdf=pdf.na.drop("any",subset=["Description","CustomerID"])

In [12]:
pdf.select([countDistinct(c).alias(c) for c in pdf.columns]).show()

+---------+---------+-----------+--------+-----------+---------+----------+-------+
|InvoiceNo|StockCode|Description|Quantity|InvoiceDate|UnitPrice|CustomerID|Country|
+---------+---------+-----------+--------+-----------+---------+----------+-------+
|      127|      946|        948|      55|        113|       69|        98|      7|
+---------+---------+-----------+--------+-----------+---------+----------+-------+



In [13]:
pdf.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)



In [14]:
pdf=pdf.withColumn("hu",round(pdf.Quantity+pdf.UnitPrice/3,3))

In [18]:
pdf.printSchema()

root
 |-- InvoiceNo: integer (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)
 |-- hu: double (nullable = true)



In [16]:
pdf.show(2)

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+----+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|  hu|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+----+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|6.85|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|7.13|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+----+
only showing top 2 rows



In [17]:
from pyspark.sql.types import IntegerType
pdf = pdf.withColumn("InvoiceNo", pdf["InvoiceNo"].cast(IntegerType()))

In [22]:
pdf.groupBy("Country").agg(max("InvoiceNo").alias("123"),\
                           countDistinct("Description"),count("InvoiceDate").alias("total_count")).show()

+--------------+------+---------------------------+-----------+
|       Country|   123|count(DISTINCT Description)|total_count|
+--------------+------+---------------------------+-----------+
|       Germany|536527|                         27|         29|
|        France|536370|                         20|         20|
|          EIRE|536541|                         21|         21|
|        Norway|536532|                         73|         73|
|     Australia|536389|                         14|         14|
|United Kingdom|536597|                        899|       1809|
|   Netherlands|536403|                          2|          2|
+--------------+------+---------------------------+-----------+



In [23]:
pdf.createOrReplaceTempView("temp_view")

In [26]:
spark.sql("select * from temp_view").show(2)

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+----+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|  hu|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+----+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|6.85|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|7.13|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+----+
only showing top 2 rows



In [29]:
pdf.orderBy(pdf.UnitPrice.desc()).show(10)

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|    hu|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+------+
|   536392|    22827|RUSTIC  SEVENTEEN...|       1|12/1/2010 10:29|    165.0|     13705|United Kingdom|  56.0|
|   536540|       C2|            CARRIAGE|       1|12/1/2010 14:05|     50.0|     14911|          EIRE|17.667|
|   536396|    22803|IVORY EMBROIDERED...|       2|12/1/2010 10:51|    35.75|     17850|United Kingdom|13.917|
|   536406|    22803|IVORY EMBROIDERED...|       2|12/1/2010 11:33|    35.75|     17850|United Kingdom|13.917|
|   536569|    21761|WOOD AND GLASS ME...|       1|12/1/2010 15:35|    29.95|     16274|United Kingdom|10.983|
|     null|        D|            Discount|      -1| 12/1/2010 9:41|     27.5|     14527|United Kingdom| 8.167|
|

In [33]:
pdf.sort(pdf.UnitPrice.desc()).show(5)

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|    hu|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+------+
|   536392|    22827|RUSTIC  SEVENTEEN...|       1|12/1/2010 10:29|    165.0|     13705|United Kingdom|  56.0|
|   536540|       C2|            CARRIAGE|       1|12/1/2010 14:05|     50.0|     14911|          EIRE|17.667|
|   536406|    22803|IVORY EMBROIDERED...|       2|12/1/2010 11:33|    35.75|     17850|United Kingdom|13.917|
|   536396|    22803|IVORY EMBROIDERED...|       2|12/1/2010 10:51|    35.75|     17850|United Kingdom|13.917|
|   536569|    21761|WOOD AND GLASS ME...|       1|12/1/2010 15:35|    29.95|     16274|United Kingdom|10.983|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+------+
o

In [34]:
pdf.groupBy("StockCode").agg(max("Quantity")).show(2)

+---------+-------------+
|StockCode|max(Quantity)|
+---------+-------------+
|    22728|           24|
|    21889|           24|
+---------+-------------+
only showing top 2 rows

