In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("CountryLogic").getOrCreate()

# Sample rows: (Name, Age, comma-separated list of visited countries).
data = [
    ("Ram", 30, "Russia"),
    ("Radha", 31, "Russia,Norway"),
    ("Kannu", 35, "Norway,Belgium")
]

df = spark.createDataFrame(data, ["Name", "Age", "Country"])

# 1️⃣ Get all unique countries
# collect_set already removes duplicates, so a separate distinct() pass is
# not needed. Its element order is not guaranteed, so the array is sorted to
# keep the values shown below reproducible from run to run.
all_countries = (
    df.select(F.explode(F.split("Country", ",")).alias("country"))
      .agg(F.sort_array(F.collect_set("country")).alias("AllCountries"))
)

all_countries.show(truncate=False)

# 2️⃣ Split visited countries
df2 = df.withColumn("VisitedCountryArr", F.split("Country", ","))
df2.show()

# 3️⃣ Cross join to get all countries for comparison
# all_countries holds a single row, so every person row gains the same
# AllCountries array.
df3 = df2.crossJoin(all_countries)
df3.show(truncate=False)

# 4️⃣ Find non visited countries
# array_except keeps the entries of AllCountries that are missing from the
# person's own array; array_join turns them back into a comma-separated string.
result = df3.withColumn(
    "NonVisitedCountry",
    F.array_join(
        F.array_except(F.col("AllCountries"), F.col("VisitedCountryArr")),
        ","
    )
).select(
    "Name",
    F.col("Country").alias("VisitedCountry"),
    "NonVisitedCountry"
)

result.show(truncate=False)


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("NoCrossJoin").getOrCreate()

# Sample rows: (Name, Age, comma-separated list of visited countries).
data = [
    ("Ram", 30, "Russia"),
    ("Radha", 31, "Russia,Norway"),
    ("Kannu", 35, "Norway,Belgium")
]

df = spark.createDataFrame(data, ["Name", "Age", "Country"])

# 1️⃣ Get all unique countries as a Python list
# Rows are collected through the DataFrame API (no .rdd), so this also runs
# where the RDD API is unavailable. The list is sorted because the order in
# which distinct rows come back is not guaranteed.
country_rows = (
    df.select(F.explode(F.split("Country", ",")).alias("country"))
      .distinct()
      .collect()
)
all_countries = sorted(row["country"] for row in country_rows)

# 2️⃣ Create visited & non-visited columns
# The Python list is embedded as an array of literals, so no join is needed.
result = (
    df.withColumn("VisitedArr", F.split("Country", ","))
      .withColumn(
          "NonVisitedCountry",
          F.array_join(
              F.array_except(
                  F.array(*[F.lit(c) for c in all_countries]),
                  F.col("VisitedArr")
              ),
              ","
          )
      )
      .select(
          "Name",
          F.col("Country").alias("VisitedCountry"),
          "NonVisitedCountry"
      )
)

result.show(truncate=False)


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, split, explode, collect_set,
    array_except, concat_ws
)
from pyspark.sql.window import Window
spark = SparkSession.builder.getOrCreate()

# Sample rows: (Name, Age, comma-separated list of visited countries).
data = [
    ("Ram", 30, "Russia"),
    ("Radha", 31, "Russia,Norway"),
    ("Kannu", 35, "Norway,Belgium"),
]
df = spark.createDataFrame(data, schema=["Name", "Age", "CountryName"])

# Array form of the visited countries, kept next to the original string.
df1 = df.withColumn("VisitedCountryArr", split("CountryName", ","))

# One output row per element of VisitedCountryArr.
df_exploded = df1.withColumn("country", explode("VisitedCountryArr"))

# Window spec with no partition columns.
w = Window.partitionBy()



In [None]:
# Attach the set of countries collected over window `w` to every exploded row.
all_countries_over_w = collect_set(col("country")).over(w)
df_with_all = df_exploded.withColumn("AllCountries", all_countries_over_w)
df_with_all.show(truncate=False)

In [None]:
# Keep a single row per Name.
df2 = df_with_all.drop_duplicates(subset=["Name"])
df2.show(truncate=False)

In [None]:
# Entries of AllCountries that do not appear in the row's VisitedCountryArr.
non_visited_arr = array_except(col("AllCountries"), col("VisitedCountryArr"))
df2 = df2.withColumn("NonVisitedCountryArr", non_visited_arr)

df2.select("Name", "VisitedCountryArr", "NonVisitedCountryArr").show()

In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.window import Window
 
spark = SparkSession.builder.appName("test").getOrCreate()

# Sample rows: (Name, age, comma-separated list of visited countries).
data = [
    ("Ram", 30, "Russia"),
    ("Radha", 31, "Russia,Norway"),
    ("Kannu", 35, "Norway,Belgium"),
]
df = spark.createDataFrame(data, ["Name", "age", "Country"])

df = df.withColumnRenamed("Country", "visiting_country")

# Build the list of distinct countries straight from the renamed column.
# After the rename there is no "Country" column any more, so it is referenced
# by its real name instead of depending on case-insensitive resolution against
# a helper column. Rows are collected via the DataFrame API (no .rdd) and
# sorted, because the order of distinct rows is not guaranteed.
country_rows = (
    df.select(F.explode(F.split("visiting_country", ",")).alias("country"))
      .distinct()
      .collect()
)
all_countries = sorted(row["country"] for row in country_rows)

# An array of literals works on PySpark versions where F.lit() does not
# accept a Python list.
df = df.withColumn("all_country", F.array(*[F.lit(c) for c in all_countries]))

# One row per Name. The frame is no longer exploded first, so this only
# collapses names that are duplicated in the input itself.
df = df.dropDuplicates(subset=["Name"])

# Entries of all_country that are missing from the row's visited list.
df = df.withColumn(
    "NonVisitedCountryArr",
    F.array_except(F.col("all_country"), F.split(F.col("visiting_country"), ","))
)
df = df.drop("all_country")



In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.appName("test").getOrCreate()

# Sample rows: (Name, age, comma-separated list of visited countries).
data = [
    ("Ram", 30, "Russia"),
    ("Radha", 31, "Russia,Norway"),
    ("Kannu", 35, "Norway,Belgium")
]

df = spark.createDataFrame(data, ["Name", "age", "Country"])
df = df.withColumnRenamed("Country", "visiting_country")

# Distinct countries as a sorted Python list. Rows are collected through the
# DataFrame API (no .rdd); sorting keeps the joined string below stable,
# because the order of distinct rows is not guaranteed.
country_rows = (
    df.select(F.explode(F.split("visiting_country", ",")).alias("country"))
      .distinct()
      .collect()
)
all_countries = sorted(row["country"] for row in country_rows)

# An array of literals works on PySpark versions where F.lit() does not
# accept a Python list.
df = df.withColumn("all_country", F.array(*[F.lit(c) for c in all_countries]))

# Entries of all_country missing from the row's visited list, joined back
# into a comma-separated string.
df = df.withColumn(
    "NonVisitedCountryArr",
    F.array_join(
        F.array_except(
            F.col("all_country"),
            F.split(F.col("visiting_country"), ",")
        ),
        ","
    )
)

df = df.drop("age", "all_country")
df.show(truncate=False)


In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()

# Player rows: ("<name>-<code>", runs, "<int>/<int>" string for the 50s/100s column).
players_data = [
    ("Sachin-IND", 18694, "93/49"),
    ("Ricky-AUS", 11274, "66/31"),
    ("Lara-WI", 10222, "45/21"),
    ("Rahul-IND", 10355, "95/11"),
    ("Jhonty-SA", 7051, "43/5"),
    ("Hayden-AUS", 8722, "67/19"),
]
players_columns = ["player", "runs", "50s/100s"]
players_df = spark.createDataFrame(players_data, players_columns)

# Lookup rows: (short code, country name).
countries_data = [
    ("IND", "India"),
    ("AUS", "Australia"),
    ("WI", "WestIndies"),
    ("SA", "SouthAfrica"),
]
countries_columns = ["SRT", "country"]
countries_df = spark.createDataFrame(countries_data, countries_columns)


In [None]:
# Split "player" on '-' once: element 1 becomes SRT, element 0 becomes player_name.
player_parts = F.split(F.col('player'), '-')
players_df = (
    players_df
    .withColumn('SRT', player_parts.getItem(1))
    .withColumn('player_name', player_parts.getItem(0))
)
players_df.show()

In [None]:
# 'Sum' = the two integers on either side of '/' in the "50s/100s" column, added together.
tally_parts = F.split(F.col('50s/100s'), '/')
players_df = players_df.withColumn(
    'Sum',
    tally_parts.getItem(0).cast('int') + tally_parts.getItem(1).cast('int'),
)

In [None]:
# Inner join on the shared SRT column adds the matching countries_df columns
# to each player row.
join_df = players_df.join(countries_df, on='SRT', how='inner')

# Keep rows whose Sum exceeds 90. The column is created as 'Sum', so it is
# referenced with that exact spelling; the lowercase 'sum' only resolved when
# Spark's column resolution is case-insensitive.
join_df.filter(F.col('Sum') > 90)\
    .select('player_name', 'country', 'runs', 'Sum').show()

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()

# Explicit schema; every field is nullable and "date" is kept as a string.
schema = StructType([
    StructField("empid", IntegerType(), True),
    StructField("dept", StringType(), True),
    StructField("salary", IntegerType(), True),
    StructField("date", StringType(), True),
])

# Rows follow the schema order: (empid, dept, salary, date).
data = [
    (100, "IT", 100, "2024-05-12"),
    (200, "IT", 100, "2024-06-12"),
    (100, "FIN", 400, "2024-07-12"),
    (300, "FIN", 500, "2024-07-12"),
    (300, "FIN", 1543, "2024-07-12"),
    (300, "FIN", 1500, "2024-07-12"),
]

df = spark.createDataFrame(data, schema)

# empid values that occur in exactly one row.
rows_per_empid = df.groupBy('empid').count()
df1 = rows_per_empid.where(F.col("count") == 1)

In [16]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.appName("test").getOrCreate()

# Three ids, each paired with the numbers 1..3.
data = [('A', 1), ('A', 2), ('A', 3),
        ('B', 1), ('B', 2), ('B', 3),
        ('C', 1), ('C', 2), ('C', 3)]
columns = ['id', 'number']

df = spark.createDataFrame(data=data, schema=columns)

# Per id:
#   numbers      - the collected "number" values
#   values       - those values joined into a comma-separated string
#   new_nums     - boolean flag, true when id is 'A'
#   condtion     - "yes"/"no" derived from the flag (column name kept as-is)
#   convert_list - "values" split back on ',' (elements are strings)
df1 = (
    df.groupBy('id')
      .agg(F.collect_list(F.col('number')).alias("numbers"))
      .withColumn("values", F.array_join(F.col('numbers'), ','))
      # The comparison already yields a Column, so no F.lit() wrapper is
      # needed, and the grouped frame's own "id" column is used.
      .withColumn("new_nums", F.col('id') == 'A')
      # new_nums is boolean: test it directly instead of comparing it with
      # the string 'True', which relied on an implicit cast.
      .withColumn("condtion", F.when(F.col("new_nums"), "yes").otherwise("no"))
      .withColumn("convert_list", F.split(F.col('values'), ','))
)
df1.show()

+---+---------+------+--------+--------+------------+
| id|  numbers|values|new_nums|condtion|convert_list|
+---+---------+------+--------+--------+------------+
|  A|[1, 2, 3]| 1,2,3|    true|     yes|   [1, 2, 3]|
|  B|[1, 2, 3]| 1,2,3|   false|      no|   [1, 2, 3]|
|  C|[1, 2, 3]| 1,2,3|   false|      no|   [1, 2, 3]|
+---+---------+------+--------+--------+------------+

