In [None]:
!pip install pyspark

In [None]:
from pyspark.sql import SparkSession

In [None]:
# Create the SparkSession (or reuse one that is already running) with the
# master set to 'local'.
spark = (
    SparkSession.builder
    .master('local')
    .getOrCreate()
)

### Before PySpark 2.0 (PySpark 1.x)

In [None]:
from pyspark import SparkConf, SparkContext

In [None]:
# Old-style entry point: start from an empty SparkConf and point it at the
# 'local' master. The bare setMaster() call is the cell's last expression, so
# its return value is what the notebook displays.
conf = SparkConf()
conf.setMaster('local')

In [None]:
# Fetch the SparkContext for `conf`, creating one only if none exists yet.
# NOTE(review): a SparkSession is already started earlier in this notebook, so
# this presumably returns that session's existing context -- confirm.
sc = SparkContext.getOrCreate(conf=conf)

In [None]:
# Print the application name reported by the active SparkContext.
print(sc.appName)

### After PySpark 2.0

In [None]:
from pyspark.sql import SparkSession

In [None]:
# Build the session through the SparkSession builder, naming the application.
# NOTE(review): a session was already created earlier in this notebook, so
# getOrCreate() presumably hands back that one and the app name may not reach
# the running SparkContext -- confirm with spark.sparkContext.appName.
spark = (
    SparkSession.builder
    .master('local')
    .appName('Spark_Practice')
    .getOrCreate()
)

In [None]:
# Build a small two-column DataFrame from in-memory tuples; the column names
# are supplied up front rather than renamed afterwards.
df = spark.createDataFrame(
    data=[('Akshay', 20), ('Bhushan', 24), ('Tushar', 25)],
    schema=['Name', 'Age'],
)

In [None]:
# Display the Python class of the object returned by createDataFrame().
type(df)

In [None]:
# Print the rows of df as a text table without truncating long cell values.
df.show(truncate=False)

### Basic PySpark Operations:

Reading/writing files, filtering data, adding new columns, removing duplicate records, sorting a DataFrame, and selecting specific columns.

In [None]:
# Load the e-commerce CSV, treating its first line as the column header.
# NOTE(review): no schema or inferSchema is given, so the columns are
# presumably all read as strings -- confirm with ecom_df.printSchema().
ecom_df = spark.read.option('header', True).csv('/content/ecommerce_data.csv')

In [None]:
# Display the Python class of the object returned by spark.read.csv().
type(ecom_df)

In [None]:
# Preview the first five rows with full (untruncated) cell contents.
ecom_df.show(n=5, truncate=False)

In [None]:
# Show up to five distinct values of the country column.
(
    ecom_df
    .select("country")
    .distinct()
    .show(n=5, truncate=False)
)

In [None]:
# Keep only the rows whose country is Sweden, using a SQL-style predicate
# string. The result is neither assigned nor shown; the notebook only displays
# the returned DataFrame object.
ecom_df.filter("country == 'Sweden'")

In [None]:
# Same kind of predicate expressed through where(), this time excluding
# Sweden. The result is neither assigned nor shown here.
ecom_df.where("country != 'Sweden'")

In [None]:
# Keep every row except those for Sweden; this DataFrame is reused by the
# cells that follow.
filtered_ecom_df = ecom_df.filter("country != 'Sweden'")

In [None]:
# Print the filtered rows using show()'s default row limit and truncation.
filtered_ecom_df.show()

In [None]:
# Count the rows that remain after the Sweden filter.
filtered_ecom_df.count()

### Spark SQL

In [None]:
# Expose the filtered DataFrame to Spark SQL under a temporary view name,
# replacing any existing view with the same name.
filtered_ecom_df.createOrReplaceTempView(name="filtered_ecom_df_view")

In [None]:
# Find fully duplicated records: group by every column and keep only the
# groups that occur more than once (HAVING form).
spark.sql(
    "select *, count(*) from filtered_ecom_df_view "
    "group by InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country "
    "having count(*) > 1"
).show()

In [None]:
# Same duplicate search, written with a subquery: compute the per-group row
# count first, then filter on it in the outer query.
spark.sql(
    "select * from ("
    "select *, count(*) as count from filtered_ecom_df_view "
    "group by InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country"
    ")foo where count > 1"
).show(truncate=False)

#### Performing the same operation as above with PySpark DataFrame code

In [None]:
from pyspark.sql.functions import col, floor, trunc

In [None]:
# List the column names of the filtered DataFrame.
filtered_ecom_df.columns

In [None]:
# DataFrame-API version of the duplicate search: group by every column, keep
# groups seen more than once, and retain only the invoice and stock-code
# columns of those duplicated records.
duplicate_rec_df = (
    filtered_ecom_df
    .groupBy(filtered_ecom_df.columns)
    .count()
    .where(col('count') > 1)
    .select("InvoiceNo", "StockCode")
)

#### Adding a new column to an existing DataFrame: withColumn()

In [None]:
# Derive total_amount = Quantity * UnitPrice for each row and display it.
# The CSV is loaded with only header=True, so both inputs are cast to double
# explicitly instead of relying on implicit string-to-number coercion.
# The result is only displayed; it is not assigned back to filtered_ecom_df.
filtered_ecom_df.withColumn(
    "total_amount",
    col('Quantity').cast('double') * col('UnitPrice').cast('double'),
).show(truncate=False)