<a href="https://colab.research.google.com/github/sandeepgundeboina/LearningSpark/blob/main/SparkMaxOfColDuplicate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pyspark
from pyspark.sql import SparkSession
# Build the Spark session used by every later cell; getOrCreate() hands back
# an already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName('SparkMaxOfColDuplicate')
    .getOrCreate()
)

In [2]:
# Bare expression: the notebook renders the session object as this cell's output.
spark

In [3]:
from pyspark.sql.types import *

# Sample rows in the order (Product_id, Product_name, Price, Discount).
# Product_id 100 and 200 each occur twice with different Price/Discount values.
simpleData = (
    (100, 'Mobile', 5000, 10),
    (100, 'Mobile', 7000, 7),
    (300, 'laptop', 20000, 5),
    (200, 'laptop', 30000, 4),
    (200, 'laptop', 24000, 12),
)

# Explicit schema: Product_id is the only column declared non-nullable.
defSchema = StructType([
    StructField('Product_id', IntegerType(), False),
    StructField('Product_name', StringType(), True),
    StructField('Price', IntegerType(), True),
    StructField('Discount', IntegerType(), True),
])

df = spark.createDataFrame(simpleData, defSchema)
df.show()

+----------+------------+-----+--------+
|Product_id|Product_name|Price|Discount|
+----------+------------+-----+--------+
|       100|      Mobile| 5000|      10|
|       100|      Mobile| 7000|       7|
|       300|      laptop|20000|       5|
|       200|      laptop|30000|       4|
|       200|      laptop|24000|      12|
+----------+------------+-----+--------+



In [4]:
from pyspark.sql import Window
from pyspark.sql.functions import max,col

# One window per product: each row is evaluated against all rows that share
# its Product_id. Note that `max` here is the Spark column function imported
# above, not the Python builtin.
windowSpec = Window.partitionBy('Product_id')

# Attach the per-product maxima next to the original Price/Discount values.
dfMax = (
    df
    .withColumn('maxPrice', max('Price').over(windowSpec))
    .withColumn('maxDiscount', max('Discount').over(windowSpec))
)
dfMax.show()

+----------+------------+-----+--------+--------+-----------+
|Product_id|Product_name|Price|Discount|maxPrice|maxDiscount|
+----------+------------+-----+--------+--------+-----------+
|       100|      Mobile| 5000|      10|    7000|         10|
|       100|      Mobile| 7000|       7|    7000|         10|
|       200|      laptop|30000|       4|   30000|         12|
|       200|      laptop|24000|      12|   30000|         12|
|       300|      laptop|20000|       5|   20000|          5|
+----------+------------+-----+--------+--------+-----------+



In [5]:
# Replace each row's Price/Discount with the maximum for its Product_id.
#
# The projection is built from `df` and `windowSpec` instead of reassigning
# `dfMax` from itself: the self-referential form dropped the maxPrice /
# maxDiscount columns, so running the cell a second time failed to resolve
# them. This form produces the same four columns and can be re-run freely.
dfMax = df.select(
    col('Product_id'),
    col('Product_name'),
    max('Price').over(windowSpec).alias('Price'),
    max('Discount').over(windowSpec).alias('Discount'),
)
dfMax.show()

+----------+------------+-----+--------+
|Product_id|Product_name|Price|Discount|
+----------+------------+-----+--------+
|       100|      Mobile| 7000|      10|
|       100|      Mobile| 7000|      10|
|       200|      laptop|30000|      12|
|       200|      laptop|30000|      12|
|       300|      laptop|20000|       5|
+----------+------------+-----+--------+



In [6]:
# Drop rows that are identical in every column. Rows of the same product now
# carry the same Price/Discount, so one row per product remains.
dfMax1=dfMax.drop_duplicates()
dfMax1.show()

+----------+------------+-----+--------+
|Product_id|Product_name|Price|Discount|
+----------+------------+-----+--------+
|       100|      Mobile| 7000|      10|
|       200|      laptop|30000|      12|
|       300|      laptop|20000|       5|
+----------+------------+-----+--------+



In [7]:
# Same de-duplication using the camelCase spelling `dropDuplicates`; the
# displayed result matches the `drop_duplicates` call on the same input.
df2Max=dfMax.dropDuplicates()
df2Max.show()

+----------+------------+-----+--------+
|Product_id|Product_name|Price|Discount|
+----------+------------+-----+--------+
|       100|      Mobile| 7000|      10|
|       200|      laptop|30000|      12|
|       300|      laptop|20000|       5|
+----------+------------+-----+--------+



#### CREATING MAP

In [8]:
# Re-display the original input rows as the starting point for the map examples.
df.show()

+----------+------------+-----+--------+
|Product_id|Product_name|Price|Discount|
+----------+------------+-----+--------+
|       100|      Mobile| 5000|      10|
|       100|      Mobile| 7000|       7|
|       300|      laptop|20000|       5|
|       200|      laptop|30000|       4|
|       200|      laptop|24000|      12|
+----------+------------+-----+--------+



In [9]:
# NOTE: this wildcard import is what brings create_map into scope here, and
# lit, first and size for later cells. It also rebinds Python builtins such as
# sum, min and round to the Spark column functions of the same name.
from pyspark.sql.functions import *

# Keep the four original columns and append a one-entry map per row:
# {Product_name -> Price}.
dfDict = df.select(
    col('Product_id'),
    col('Product_name'),
    col('Price'),
    col('Discount'),
    create_map(col('Product_name'), col('Price')).alias('Pdct_dic'),
)
dfDict.show()

+----------+------------+-----+--------+-----------------+
|Product_id|Product_name|Price|Discount|         Pdct_dic|
+----------+------------+-----+--------+-----------------+
|       100|      Mobile| 5000|      10| {Mobile -> 5000}|
|       100|      Mobile| 7000|       7| {Mobile -> 7000}|
|       300|      laptop|20000|       5|{laptop -> 20000}|
|       200|      laptop|30000|       4|{laptop -> 30000}|
|       200|      laptop|24000|      12|{laptop -> 24000}|
+----------+------------+-----+--------+-----------------+



In [10]:
# Build a map whose keys are literal field names and whose values come from
# the row itself; create_map takes its arguments as alternating key, value.
dfDict1 = df.select(
    col('Product_id'),
    col('Product_name'),
    col('Price'),
    col('Discount'),
    create_map(
        lit('Product_name'), col('Product_name'),
        lit('Price'), col('Price'),
        lit('Discount'), col('Discount'),
    ).alias('Pdct_dic'),
)
dfDict1.show(truncate=False)

+----------+------------+-----+--------+--------------------------------------------------------+
|Product_id|Product_name|Price|Discount|Pdct_dic                                                |
+----------+------------+-----+--------+--------------------------------------------------------+
|100       |Mobile      |5000 |10      |{Product_name -> Mobile, Price -> 5000, Discount -> 10} |
|100       |Mobile      |7000 |7       |{Product_name -> Mobile, Price -> 7000, Discount -> 7}  |
|300       |laptop      |20000|5       |{Product_name -> laptop, Price -> 20000, Discount -> 5} |
|200       |laptop      |30000|4       |{Product_name -> laptop, Price -> 30000, Discount -> 4} |
|200       |laptop      |24000|12      |{Product_name -> laptop, Price -> 24000, Discount -> 12}|
+----------+------------+-----+--------+--------------------------------------------------------+



In [11]:
# Print the column names, types and nullability of the original DataFrame.
df.printSchema()

root
 |-- Product_id: integer (nullable = false)
 |-- Product_name: string (nullable = true)
 |-- Price: integer (nullable = true)
 |-- Discount: integer (nullable = true)



In [12]:
# Print the schema including the map column; the printed schema reports the
# map's value type as string even though Price and Discount are integers in df.
dfDict1.printSchema()

root
 |-- Product_id: integer (nullable = false)
 |-- Product_name: string (nullable = true)
 |-- Price: integer (nullable = true)
 |-- Discount: integer (nullable = true)
 |-- Pdct_dic: map (nullable = false)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



In [13]:
# `first` is imported explicitly alongside `explode` so this cell no longer
# depends on the earlier wildcard import for one of its two functions.
from pyspark.sql.functions import explode, first

# Explode the map column: one output row per (key, value) entry of Pdct_dic.
exploded_df = dfDict1.select('Product_id', explode('Pdct_dic'))
exploded_df.show()

# Pivot the exploded data back into one column per map key.
# NOTE(review): Product_id is not unique in this data, so each pivot cell
# receives several candidate values and first() keeps only one of them. Spark
# documents first() as non-deterministic after a shuffle, so which source row
# wins is not guaranteed. Group by a unique row id if a faithful round trip
# of every input row is required.
pivoted_df = exploded_df.groupBy('Product_id').pivot('key').agg(first('value'))

pivoted_df.show(truncate=False)

+----------+------------+------+
|Product_id|         key| value|
+----------+------------+------+
|       100|Product_name|Mobile|
|       100|       Price|  5000|
|       100|    Discount|    10|
|       100|Product_name|Mobile|
|       100|       Price|  7000|
|       100|    Discount|     7|
|       300|Product_name|laptop|
|       300|       Price| 20000|
|       300|    Discount|     5|
|       200|Product_name|laptop|
|       200|       Price| 30000|
|       200|    Discount|     4|
|       200|Product_name|laptop|
|       200|       Price| 24000|
|       200|    Discount|    12|
+----------+------------+------+

+----------+--------+-----+------------+
|Product_id|Discount|Price|Product_name|
+----------+--------+-----+------------+
|100       |10      |5000 |Mobile      |
|200       |4       |30000|laptop      |
|300       |5       |20000|laptop      |
+----------+--------+-----+------------+



#### Array Split into Columns

In [14]:
from pyspark.sql import Row
from pyspark.sql.types import ArrayType, StringType

# Three sample rows, each pairing a string key with a three-element string array.
data = [
    Row(key="key1", values=["value1a", "value1b", "value1c"]),
    Row(key="key2", values=["value2a", "value2b", "value2c"]),
    Row(key="key3", values=["value3a", "value3b", "value3c"]),
]

# Schema: a nullable string key plus a nullable array-of-strings column.
schema = StructType(
    [
        StructField("key", StringType(), True),
        StructField("values", ArrayType(StringType()), True),
    ]
)

# Materialise the rows as a DataFrame and print them without truncating the arrays.
df_with_list = spark.createDataFrame(data, schema)
df_with_list.show(truncate=False)

+----+---------------------------+
|key |values                     |
+----+---------------------------+
|key1|[value1a, value1b, value1c]|
|key2|[value2a, value2b, value2c]|
|key3|[value3a, value3b, value3c]|
+----+---------------------------+



In [15]:
# Select the key plus the array elements at positions 0, 1 and 2 as separate columns.
df_der = df_with_list.select(
    'key',
    *[df_with_list.values[position] for position in range(3)]
)

In [16]:
# Display the result of splitting the array by fixed positions.
df_der.show()

+----+---------+---------+---------+
| key|values[0]|values[1]|values[2]|
+----+---------+---------+---------+
|key1|  value1a|  value1b|  value1c|
|key2|  value2a|  value2b|  value2c|
|key3|  value3a|  value3b|  value3c|
+----+---------+---------+---------+



In [17]:
# Second name bound to the same DataFrame object (plain assignment, no copy is made).
dfLis=df_with_list

In [19]:
# Append each row's array length as `Size`; size() is in scope through the
# earlier wildcard import of pyspark.sql.functions.
dfsize = dfLis.select(
    'key',
    'values',
    size('values').alias('Size'),
)
dfsize.show()

+----+--------------------+----+
| key|              values|Size|
+----+--------------------+----+
|key1|[value1a, value1b...|   3|
|key2|[value2a, value2b...|   3|
|key3|[value3a, value3b...|   3|
+----+--------------------+----+



In [20]:
# Largest array length across all rows: the aggregate yields a single row
# with a single column, so [0][0] unwraps it to a plain value.
max_value = (
    dfsize
    .agg({'Size': 'max'})
    .collect()[0][0]
)
max_value

3

In [23]:
def arraySplitIntoCol(df, max_size, array_col="values", prefix="new_col_"):
    """Spread the leading elements of an array column into separate columns.

    Args:
        df: DataFrame that contains the array column.
        max_size: Number of leading array positions to extract. ``None`` is
            treated as 0, so a maximum computed over an empty DataFrame
            leaves ``df`` unchanged instead of raising ``TypeError``.
        array_col: Name of the array column to read. Defaults to ``"values"``,
            the column the function was previously hard-wired to.
        prefix: Prefix of the generated column names; position ``i`` is
            written to ``f"{prefix}{i}"``.

    Returns:
        ``df`` with one column added per extracted position, in index order.
        When nothing is extracted the input object itself is returned.
    """
    for i in range(max_size or 0):
        # Look the column up by name with [] rather than attribute access, so
        # the caller-supplied name is used as-is and cannot be confused with
        # a DataFrame attribute.
        df = df.withColumn(f"{prefix}{i}", df[array_col][i])
    return df

In [24]:
# Expand dfLis into one new column per array position, up to the largest
# array size computed earlier (max_value).
dfOut=arraySplitIntoCol(dfLis,max_value)
dfOut.show()

+----+--------------------+---------+---------+---------+
| key|              values|new_col_0|new_col_1|new_col_2|
+----+--------------------+---------+---------+---------+
|key1|[value1a, value1b...|  value1a|  value1b|  value1c|
|key2|[value2a, value2b...|  value2a|  value2b|  value2c|
|key3|[value3a, value3b...|  value3a|  value3b|  value3c|
+----+--------------------+---------+---------+---------+



##### **END OF CODE**