In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285398 sha256=abbd8625562b96e796914495e005754722a75e965e3bbfb80de19aa817613cc4
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


In [2]:
from pyspark.sql import SparkSession


In [3]:
from pyspark.sql.types import StructField, StructType, StringType, LongType

In [4]:
spark=SparkSession.builder.getOrCreate()

Create DataFrame from JSON file and look at the Schema

In [5]:
df = spark.read.format("json").load("/content/details/data/2015-summary.json")
df.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)



In [6]:
df.show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|   15|
|       United States|            Croatia|    1|
|       United States|            Ireland|  344|
|               Egypt|      United States|   15|
|       United States|              India|   62|
|       United States|          Singapore|    1|
|       United States|            Grenada|   62|
|          Costa Rica|      United States|  588|
|             Senegal|      United States|   40|
|             Moldova|      United States|    1|
|       United States|       Sint Maarten|  325|
|       United States|   Marshall Islands|   39|
|              Guyana|      United States|   64|
|               Malta|      United States|    1|
|            Anguilla|      United States|   41|
|             Bolivia|      United States|   30|
|       United States|           Paraguay|    6|
|             Algeri

In [7]:
spark.read.format("json").load("/content/details/data/2015-summary.json").schema

StructType([StructField('DEST_COUNTRY_NAME', StringType(), True), StructField('ORIGIN_COUNTRY_NAME', StringType(), True), StructField('count', LongType(), True)])

In [8]:
from pyspark.sql.types import StructField, StructType, StringType, LongType

In [9]:
myManualSchema = StructType([
  StructField("DEST_COUNTRY_NAME", StringType(), True),
  StructField("ORIGIN_COUNTRY_NAME", StringType(), True),
  StructField("count", LongType(), False)
])

In [10]:
df1 = spark.read.format("json").schema(myManualSchema).load("/content/details/data/2015-summary.json")

In [11]:
df1.show(truncate=False)

+------------------------+-------------------+-----+
|DEST_COUNTRY_NAME       |ORIGIN_COUNTRY_NAME|count|
+------------------------+-------------------+-----+
|United States           |Romania            |15   |
|United States           |Croatia            |1    |
|United States           |Ireland            |344  |
|Egypt                   |United States      |15   |
|United States           |India              |62   |
|United States           |Singapore          |1    |
|United States           |Grenada            |62   |
|Costa Rica              |United States      |588  |
|Senegal                 |United States      |40   |
|Moldova                 |United States      |1    |
|United States           |Sint Maarten       |325  |
|United States           |Marshall Islands   |39   |
|Guyana                  |United States      |64   |
|Malta                   |United States      |1    |
|Anguilla                |United States      |41   |
|Bolivia                 |United States      |

In [12]:
df.createOrReplaceTempView("data")

In [13]:
sqlWay = spark.sql("""
SELECT DEST_COUNTRY_NAME, count(1)
FROM data group by DEST_COUNTRY_NAME
""")

In [14]:
sqlWay.count()

132

The above query will give you the distint DEST_COUNTRY_NAME

In [15]:
sqlWay.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[DEST_COUNTRY_NAME#8], functions=[count(1)])
   +- Exchange hashpartitioning(DEST_COUNTRY_NAME#8, 200), ENSURE_REQUIREMENTS, [plan_id=129]
      +- HashAggregate(keys=[DEST_COUNTRY_NAME#8], functions=[partial_count(1)])
         +- FileScan json [DEST_COUNTRY_NAME#8] Batched: false, DataFilters: [], Format: JSON, Location: InMemoryFileIndex(1 paths)[file:/content/details/data/2015-summary.json], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string>




In [16]:
dataFrameWay = df.groupBy("DEST_COUNTRY_NAME").count()

In [17]:
dataFrameWay.show()

+--------------------+-----+
|   DEST_COUNTRY_NAME|count|
+--------------------+-----+
|            Anguilla|    1|
|              Russia|    1|
|            Paraguay|    1|
|             Senegal|    1|
|              Sweden|    1|
|            Kiribati|    1|
|              Guyana|    1|
|         Philippines|    1|
|            Djibouti|    1|
|            Malaysia|    1|
|           Singapore|    1|
|                Fiji|    1|
|              Turkey|    1|
|                Iraq|    1|
|             Germany|    1|
|              Jordan|    1|
|               Palau|    1|
|Turks and Caicos ...|    1|
|              France|    1|
|              Greece|    1|
+--------------------+-----+
only showing top 20 rows



In [18]:
dataFrameWay.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[DEST_COUNTRY_NAME#8], functions=[count(1)])
   +- Exchange hashpartitioning(DEST_COUNTRY_NAME#8, 200), ENSURE_REQUIREMENTS, [plan_id=187]
      +- HashAggregate(keys=[DEST_COUNTRY_NAME#8], functions=[partial_count(1)])
         +- FileScan json [DEST_COUNTRY_NAME#8] Batched: false, DataFilters: [], Format: JSON, Location: InMemoryFileIndex(1 paths)[file:/content/details/data/2015-summary.json], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string>




Here the both commands will give the unique DIST_COUNTRY_NAME

In [19]:
spark.sql("SELECT max(count) from data").show()

+----------+
|max(count)|
+----------+
|    370002|
+----------+



In [20]:
from pyspark.sql.functions import max

In [21]:
df.select(max("count")).show()

+----------+
|max(count)|
+----------+
|    370002|
+----------+



Here both the functions will give the maximum number of flights to and from any given location

In [22]:
max_dist=spark.sql("""
SELECT DEST_COUNTRY_NAME, sum(count) as destination_total
FROM data
GROUP BY DEST_COUNTRY_NAME
ORDER BY sum(count) DESC
LIMIT 5
""")

In [23]:
max_dist.show()

+-----------------+-----------------+
|DEST_COUNTRY_NAME|destination_total|
+-----------------+-----------------+
|    United States|           411352|
|           Canada|             8399|
|           Mexico|             7140|
|   United Kingdom|             2025|
|            Japan|             1548|
+-----------------+-----------------+



Manipulate DataFrame columns

In [24]:
df.select("DEST_COUNTRY_NAME").show(10)

+-----------------+
|DEST_COUNTRY_NAME|
+-----------------+
|    United States|
|    United States|
|    United States|
|            Egypt|
|    United States|
|    United States|
|    United States|
|       Costa Rica|
|          Senegal|
|          Moldova|
+-----------------+
only showing top 10 rows



In [25]:
df.select("DEST_COUNTRY_NAME","ORIGIN_COUNTRY_NAME").show(5)

+-----------------+-------------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|
+-----------------+-------------------+
|    United States|            Romania|
|    United States|            Croatia|
|    United States|            Ireland|
|            Egypt|      United States|
|    United States|              India|
+-----------------+-------------------+
only showing top 5 rows



In [26]:
from pyspark.sql.functions import expr, col

In [27]:
df.select(
    expr("DEST_COUNTRY_NAME"),
    col("DEST_COUNTRY_NAME"))\
  .show(2)

+-----------------+-----------------+
|DEST_COUNTRY_NAME|DEST_COUNTRY_NAME|
+-----------------+-----------------+
|    United States|    United States|
|    United States|    United States|
+-----------------+-----------------+
only showing top 2 rows



In [28]:
df.select(expr("DEST_COUNTRY_NAME AS destination")).show(2)

+-------------+
|  destination|
+-------------+
|United States|
|United States|
+-------------+
only showing top 2 rows



In [29]:
df.selectExpr("DEST_COUNTRY_NAME as newColumnName", "DEST_COUNTRY_NAME").show(2)

+-------------+-----------------+
|newColumnName|DEST_COUNTRY_NAME|
+-------------+-----------------+
|United States|    United States|
|United States|    United States|
+-------------+-----------------+
only showing top 2 rows



In [30]:
df.select(expr("avg(count)")).show(2)

+-----------+
| avg(count)|
+-----------+
|1770.765625|
+-----------+



In [31]:
data1 = spark.read.format("csv")\
    .option("header", "true")\
    .option("inferSchema", "true")\
    .load("/content/details/data/2010-12-01.csv")

In [32]:
data1.show(truncate=False)

+---------+---------+-----------------------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description                        |Quantity|InvoiceDate        |UnitPrice|CustomerID|Country       |
+---------+---------+-----------------------------------+--------+-------------------+---------+----------+--------------+
|536365   |85123A   |WHITE HANGING HEART T-LIGHT HOLDER |6       |2010-12-01 08:26:00|2.55     |17850.0   |United Kingdom|
|536365   |71053    |WHITE METAL LANTERN                |6       |2010-12-01 08:26:00|3.39     |17850.0   |United Kingdom|
|536365   |84406B   |CREAM CUPID HEARTS COAT HANGER     |8       |2010-12-01 08:26:00|2.75     |17850.0   |United Kingdom|
|536365   |84029G   |KNITTED UNION FLAG HOT WATER BOTTLE|6       |2010-12-01 08:26:00|3.39     |17850.0   |United Kingdom|
|536365   |84029E   |RED WOOLLY HOTTIE WHITE HEART.     |6       |2010-12-01 08:26:00|3.39     |17850.0   |United Kingdom|
|536365   |22752

In [33]:
data1.select(expr("max(Quantity)")).show()

+-------------+
|max(Quantity)|
+-------------+
|          600|
+-------------+



In [34]:
data1.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



In [35]:
from pyspark.sql.functions import struct
complexDF = data1.select(struct("Description", "InvoiceNo").alias("complex"))
complexDF.show(truncate=False)

+---------------------------------------------+
|complex                                      |
+---------------------------------------------+
|{WHITE HANGING HEART T-LIGHT HOLDER, 536365} |
|{WHITE METAL LANTERN, 536365}                |
|{CREAM CUPID HEARTS COAT HANGER, 536365}     |
|{KNITTED UNION FLAG HOT WATER BOTTLE, 536365}|
|{RED WOOLLY HOTTIE WHITE HEART., 536365}     |
|{SET 7 BABUSHKA NESTING BOXES, 536365}       |
|{GLASS STAR FROSTED T-LIGHT HOLDER, 536365}  |
|{HAND WARMER UNION JACK, 536366}             |
|{HAND WARMER RED POLKA DOT, 536366}          |
|{ASSORTED COLOUR BIRD ORNAMENT, 536367}      |
|{POPPY'S PLAYHOUSE BEDROOM , 536367}         |
|{POPPY'S PLAYHOUSE KITCHEN, 536367}          |
|{FELTCRAFT PRINCESS CHARLOTTE DOLL, 536367}  |
|{IVORY KNITTED MUG COSY , 536367}            |
|{BOX OF 6 ASSORTED COLOUR TEASPOONS, 536367} |
|{BOX OF VINTAGE JIGSAW BLOCKS , 536367}      |
|{BOX OF VINTAGE ALPHABET BLOCKS, 536367}     |
|{HOME BUILDING BLOCK WORD, 536367}     

In [36]:
complexDF.createOrReplaceTempView("complexDF")

In [37]:
data1.show(truncate=False)

+---------+---------+-----------------------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description                        |Quantity|InvoiceDate        |UnitPrice|CustomerID|Country       |
+---------+---------+-----------------------------------+--------+-------------------+---------+----------+--------------+
|536365   |85123A   |WHITE HANGING HEART T-LIGHT HOLDER |6       |2010-12-01 08:26:00|2.55     |17850.0   |United Kingdom|
|536365   |71053    |WHITE METAL LANTERN                |6       |2010-12-01 08:26:00|3.39     |17850.0   |United Kingdom|
|536365   |84406B   |CREAM CUPID HEARTS COAT HANGER     |8       |2010-12-01 08:26:00|2.75     |17850.0   |United Kingdom|
|536365   |84029G   |KNITTED UNION FLAG HOT WATER BOTTLE|6       |2010-12-01 08:26:00|3.39     |17850.0   |United Kingdom|
|536365   |84029E   |RED WOOLLY HOTTIE WHITE HEART.     |6       |2010-12-01 08:26:00|3.39     |17850.0   |United Kingdom|
|536365   |22752

In [38]:
data1.select('StockCode','Quantity').show(5)

+---------+--------+
|StockCode|Quantity|
+---------+--------+
|   85123A|       6|
|    71053|       6|
|   84406B|       8|
|   84029G|       6|
|   84029E|       6|
+---------+--------+
only showing top 5 rows



In [61]:
dups=data1.groupBy(data1.columns).count().filter(col("count") > 1)
dups.count()



42

In [62]:
# there are  42 duplicate record in the data1 dataframe. Romoving the duplicates from the data1 and creating
# a new dataframe with name data1 again
data1=data1.dropDuplicates()

In [63]:
data1.count()

3064

In [64]:
data1.groupBy(data1.columns).count().filter(col("count") > 1).count()

0

In [71]:
data1.filter(data1.Description.isNull()).show()

+---------+---------+-----------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+-----------+--------+-------------------+---------+----------+--------------+
|   536549|   85226A|       null|       1|2010-12-01 14:34:00|      0.0|      null|United Kingdom|
|   536545|    21134|       null|       1|2010-12-01 14:32:00|      0.0|      null|United Kingdom|
|   536550|    85044|       null|       1|2010-12-01 14:34:00|      0.0|      null|United Kingdom|
|   536552|    20950|       null|       1|2010-12-01 14:34:00|      0.0|      null|United Kingdom|
|   536553|    37461|       null|       3|2010-12-01 14:35:00|      0.0|      null|United Kingdom|
|   536547|    37509|       null|       1|2010-12-01 14:33:00|      0.0|      null|United Kingdom|
|   536554|    84670|       null|      23|2010-12-01 14:35:00|      0.0|      null|United Kingdom|
|   536589

In [40]:
from os import truncate
data1.withColumn('Complex',struct("Description", "InvoiceNo")).show(truncate=False)

+---------+---------+-----------------------------------+--------+-------------------+---------+----------+--------------+---------------------------------------------+
|InvoiceNo|StockCode|Description                        |Quantity|InvoiceDate        |UnitPrice|CustomerID|Country       |Complex                                      |
+---------+---------+-----------------------------------+--------+-------------------+---------+----------+--------------+---------------------------------------------+
|536365   |85123A   |WHITE HANGING HEART T-LIGHT HOLDER |6       |2010-12-01 08:26:00|2.55     |17850.0   |United Kingdom|{WHITE HANGING HEART T-LIGHT HOLDER, 536365} |
|536365   |71053    |WHITE METAL LANTERN                |6       |2010-12-01 08:26:00|3.39     |17850.0   |United Kingdom|{WHITE METAL LANTERN, 536365}                |
|536365   |84406B   |CREAM CUPID HEARTS COAT HANGER     |8       |2010-12-01 08:26:00|2.75     |17850.0   |United Kingdom|{CREAM CUPID HEARTS COAT HANGER, 

In [41]:
from pyspark.sql.functions import lit,explode,date_format
df1=data1.withColumn('list',lit([2,3]))

In [42]:
from pyspark.sql.functions import rand
data1.withColumn('Random',rand()).show()

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+--------------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|              Random|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+--------------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|  0.9556594561909058|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom| 0.05908408150789035|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|   17850.0|United Kingdom| 0.32918436434383813|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|   0.837095469754971|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     3.39|   17850.0|Unite

In [43]:
df1.show()

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|  list|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|[2, 3]|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|[2, 3]|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|   17850.0|United Kingdom|[2, 3]|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|[2, 3]|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|[2, 3]|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|2010-12-01 08:26:00|     7.65

In [44]:
df1.withColumn('list',explode(col('list'))).show()

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+----+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|list|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+----+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|   2|
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|   3|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|   2|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|   3|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|   17850.0|United Kingdom|   2|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|   17850.0|Unit

In [45]:
df1.withColumn('InvoiceDate',date_format('InvoiceDate', 'yyyy-MM-dd')).withColumn("time", date_format(col("InvoiceDate"), "HH:mm:ss")).show()

+---------+---------+--------------------+--------+-----------+---------+----------+--------------+------+--------+
|InvoiceNo|StockCode|         Description|Quantity|InvoiceDate|UnitPrice|CustomerID|       Country|  list|    time|
+---------+---------+--------------------+--------+-----------+---------+----------+--------------+------+--------+
|   536365|   85123A|WHITE HANGING HEA...|       6| 2010-12-01|     2.55|   17850.0|United Kingdom|[2, 3]|00:00:00|
|   536365|    71053| WHITE METAL LANTERN|       6| 2010-12-01|     3.39|   17850.0|United Kingdom|[2, 3]|00:00:00|
|   536365|   84406B|CREAM CUPID HEART...|       8| 2010-12-01|     2.75|   17850.0|United Kingdom|[2, 3]|00:00:00|
|   536365|   84029G|KNITTED UNION FLA...|       6| 2010-12-01|     3.39|   17850.0|United Kingdom|[2, 3]|00:00:00|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6| 2010-12-01|     3.39|   17850.0|United Kingdom|[2, 3]|00:00:00|
|   536365|    22752|SET 7 BABUSHKA NE...|       2| 2010-12-01|     7.65

In [46]:
data1.groupBy("InvoiceNo").avg("UnitPrice").show(5)
data1.groupBy("InvoiceNo").sum("UnitPrice").show(5)
data1.groupBy("InvoiceNo").max("UnitPrice").show(5)


+---------+-----------------+
|InvoiceNo|   avg(UnitPrice)|
+---------+-----------------+
|   536596|5.723333333333334|
|   536597|2.361071428571428|
|   536414|              0.0|
|   536550|              0.0|
|   536460|3.548571428571429|
+---------+-----------------+
only showing top 5 rows

+---------+-----------------+
|InvoiceNo|   sum(UnitPrice)|
+---------+-----------------+
|   536596|            34.34|
|   536597|66.10999999999999|
|   536414|              0.0|
|   536550|              0.0|
|   536460|49.68000000000001|
+---------+-----------------+
only showing top 5 rows

+---------+--------------+
|InvoiceNo|max(UnitPrice)|
+---------+--------------+
|   536596|         19.95|
|   536597|          9.95|
|   536414|           0.0|
|   536550|           0.0|
|   536460|          7.95|
+---------+--------------+
only showing top 5 rows



In [47]:
data1.select("InvoiceNo").distinct().count()

143

In [48]:
data1.select("InvoiceNo").count()

3108

In [49]:
data1.select("CustomerId").distinct().count()

99

In [50]:
from pyspark.sql.functions import avg

In [51]:
data1.groupBy("Country","CustomerID").avg("Quantity","UnitPrice").show()

+--------------+----------+------------------+------------------+
|       Country|CustomerID|     avg(Quantity)|    avg(UnitPrice)|
+--------------+----------+------------------+------------------+
|United Kingdom|   17420.0| 7.571428571428571|  5.57142857142857|
|United Kingdom|   15922.0|12.545454545454545| 4.409090909090909|
|United Kingdom|   16250.0| 7.714285714285714| 3.376428571428572|
|United Kingdom|   13065.0| 5.285714285714286| 5.222142857142858|
|United Kingdom|   18074.0|14.615384615384615|4.7807692307692315|
|United Kingdom|   16048.0|              18.0|1.6212499999999999|
|       Germany|   12472.0|-2.857142857142857|3.5321428571428575|
|United Kingdom|   18085.0| 8.666666666666666|3.8444444444444446|
|United Kingdom|   17905.0| 3.130434782608696|4.7782608695652184|
|United Kingdom|   17841.0| 2.925373134328358|3.8005970149253705|
|United Kingdom|   15291.0|              56.0|               3.0|
|United Kingdom|   17951.0|              23.6|               4.4|
|United Ki

In [52]:
data1.select("Country").distinct().count()

7

In [53]:
from pyspark.sql.functions import avg, col
data1.filter(col('UnitPrice') > data1.select(avg('UnitPrice')).first()[0]).show()

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|    22752|SET 7 BABUSHKA NE...|       2|2010-12-01 08:26:00|     7.65|   17850.0|United Kingdom|
|   536365|    21730|GLASS STAR FROSTE...|       6|2010-12-01 08:26:00|     4.25|   17850.0|United Kingdom|
|   536367|    84969|BOX OF 6 ASSORTED...|       6|2010-12-01 08:34:00|     4.25|   13047.0|United Kingdom|
|   536367|    22623|BOX OF VINTAGE JI...|       3|2010-12-01 08:34:00|     4.95|   13047.0|United Kingdom|
|   536367|    22622|BOX OF VINTAGE AL...|       2|2010-12-01 08:34:00|     9.95|   13047.0|United Kingdom|
|   536367|    21754|HOME BUILDING BLO...|       3|2010-12-01 08:34:00|     5.95|   13047.0|United Kingdom|
|   536367|    21755|LOVE BU

In [54]:
data1.withColumn('Quantity',col("Quantity")*2).show()

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|      12|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|      12|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|      16|2010-12-01 08:26:00|     2.75|   17850.0|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|      12|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|      12|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       4|2010-12-01 08:26:00|     7.65|   17850.0|United Kingdom|
|   536365|    21730|GLASS S

In [55]:
data1.withColumnRenamed("country","Region").show()

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|        Region|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|   17850.0|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|2010-12-01 08:26:00|     7.65|   17850.0|United Kingdom|
|   536365|    21730|GLASS S

In [56]:
data1.select("CustomerID","UnitPrice").show()

+----------+---------+
|CustomerID|UnitPrice|
+----------+---------+
|   17850.0|     2.55|
|   17850.0|     3.39|
|   17850.0|     2.75|
|   17850.0|     3.39|
|   17850.0|     3.39|
|   17850.0|     7.65|
|   17850.0|     4.25|
|   17850.0|     1.85|
|   17850.0|     1.85|
|   13047.0|     1.69|
|   13047.0|      2.1|
|   13047.0|      2.1|
|   13047.0|     3.75|
|   13047.0|     1.65|
|   13047.0|     4.25|
|   13047.0|     4.95|
|   13047.0|     9.95|
|   13047.0|     5.95|
|   13047.0|     5.95|
|   13047.0|     7.95|
+----------+---------+
only showing top 20 rows



In [57]:
data1.columns

['InvoiceNo',
 'StockCode',
 'Description',
 'Quantity',
 'InvoiceDate',
 'UnitPrice',
 'CustomerID',
 'Country']