## PySpark Code Practice

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import *
# NOTE: this star import brings in functions such as sum/min/max that share
# names with Python builtins, so the relative order of these import lines matters.
from pyspark.sql.functions import *  # column functions (col, to_date, row_number, ...)
# NOTE(review): a session is built again (with an app name) in the next cell;
# getOrCreate() there is expected to return this already-created session.
spark = SparkSession.builder.getOrCreate()
# Explicit re-import of two names already provided by the star import above.
from pyspark.sql.functions import regexp_replace, col
# NOTE(review): `drive` is not used in the visible cells -- confirm it is still needed.
from google.colab import drive

In [3]:
# Start (or reuse) the Spark session used by the practice cells below.
spark = (
    SparkSession.builder
    .appName("PySparkPractice")
    .getOrCreate()
)


In [4]:
# Column names for the sample transactions (spelling kept as-is because
# the later steps in this cell reference these exact names).
column = ['Date_fld', 'Custome_name', 'Amount']

# One tuple per transaction: (date as M/d/yyyy text, customer id, amount).
data = [
    ("1/1/2023", "C1", 20),
    ("1/1/2023", "C2", 20),
    ("1/2/2023", "C2", 50),
    ("1/2/2023", "C3", 12),
    ("1/3/2023", "C4", 20),
    ("1/3/2023", "C5", 100),
    ("1/3/2023", "C1", 123),
]

# Only names are supplied as the schema, so Spark infers the column types
# from the Python values.
df = spark.createDataFrame(data, schema=column)
df.show()

# Parse the M/d/yyyy text into a real date column so that ordering below is
# chronological rather than lexicographic.
df = df.withColumn("Date_fld", to_date(col("Date_fld"), "M/d/yyyy"))
df.show()

# Window over each customer's transactions, earliest date first.
window_space = Window.partitionBy('Custome_name').orderBy('Date_fld')

# Number each customer's transactions; Row_Number == 1 marks the first purchase.
df = df.withColumn('Row_Number', row_number().over(window_space))
df.show()

# A customer is "new" on the date of their first transaction.
df_new_custome = df.filter(col('Row_Number') == 1)
df_new_custome.show()

# Count distinct new customers per date.
# The alias names what is actually counted (customers, not prices), and the
# explicit orderBy makes the displayed result deterministic, since groupBy
# output carries no ordering guarantee.
result_df = (
    df_new_custome
    .groupBy('Date_fld')
    .agg(countDistinct('Custome_name').alias('New_Customer_Count'))
    .orderBy('Date_fld')
)
result_df.show()




+--------+------------+------+
|Date_fld|Custome_name|Amount|
+--------+------------+------+
|1/1/2023|          C1|    20|
|1/1/2023|          C2|    20|
|1/2/2023|          C2|    50|
|1/2/2023|          C3|    12|
|1/3/2023|          C4|    20|
|1/3/2023|          C5|   100|
|1/3/2023|          C1|   123|
+--------+------------+------+

+----------+------------+------+
|  Date_fld|Custome_name|Amount|
+----------+------------+------+
|2023-01-01|          C1|    20|
|2023-01-01|          C2|    20|
|2023-01-02|          C2|    50|
|2023-01-02|          C3|    12|
|2023-01-03|          C4|    20|
|2023-01-03|          C5|   100|
|2023-01-03|          C1|   123|
+----------+------------+------+

+----------+------------+------+----------+
|  Date_fld|Custome_name|Amount|Row_Number|
+----------+------------+------+----------+
|2023-01-01|          C1|    20|         1|
|2023-01-03|          C1|   123|         2|
|2023-01-01|          C2|    20|         1|
|2023-01-02|          C2|    5

## PySpark code to solve

In [6]:
# Create a Hive-enabled Spark session configured for a YARN cluster.
# NOTE(review): getOrCreate() returns an already-active session when one
# exists (earlier cells build `spark`), in which case the master / Hive
# settings below may not take effect -- confirm the intended environment.
# NOTE(review): the warehouse directory looks like a user-specific path --
# confirm it before running elsewhere.
spark2 = (
    SparkSession
    .builder
    .config('spark.shuffle.useOldFetchers', 'true')
    .config('spark.ui.port', '0')
    .config('spark.sql.warehouse.dir', '/user/itv008042/warehouse')
    .enableHiveSupport()
    .master('yarn')
    .appName('PySparkPractice')
    .getOrCreate()
)

# Explicit schema: three nullable integer columns.
schema = StructType([
    StructField(field_name, IntegerType(), True)
    for field_name in ("ActorId", "DirectorId", "TimeStamp")
])

# Sample rows in (ActorId, DirectorId, TimeStamp) order.
data = [
    (1, 1, 0),
    (1, 1, 1),
    (1, 1, 2),
    (1, 2, 3),
    (1, 2, 4),
    (1, 1, 5),
    (1, 1, 6),
]

# Build the DataFrame with the explicit schema and display it.
df = spark2.createDataFrame(data, schema=schema)
df.show()


+-------+----------+---------+
|ActorId|DirectorId|TimeStamp|
+-------+----------+---------+
|      1|         1|        0|
|      1|         1|        1|
|      1|         1|        2|
|      1|         2|        3|
|      1|         2|        4|
|      1|         1|        5|
|      1|         1|        6|
+-------+----------+---------+



### Group by `ActorId` and `DirectorId`

In [9]:
# Count the rows for every (ActorId, DirectorId) pair; count() adds a
# `count` column holding the number of rows in each group.
df_group = (
    df
    .groupBy('ActorId', 'DirectorId')
    .count()
)
df_group.show()

+-------+----------+-----+
|ActorId|DirectorId|count|
+-------+----------+-----+
|      1|         1|    5|
|      1|         2|    2|
+-------+----------+-----+



In [10]:
# Show only the pairs that occur more than 3 times.
# NOTE(review): if the goal is "at least three times", the comparison
# should be >= 3 -- confirm against the problem statement.
(
    df_group
    .where(col('count') > 3)
    .show()
)


+-------+----------+-----+
|ActorId|DirectorId|count|
+-------+----------+-----+
|      1|         1|    5|
+-------+----------+-----+

