In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_date
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("MySparkApp").config("spark.some.config.option", "config-value").getOrCreate()
print(spark)

<pyspark.sql.session.SparkSession object at 0x7b1481b918d0>


CREATING A DATAFRAME


In [None]:
columns=["ROLLNO","NAME","DOB"]
data=[("101","AJAY","2004-10-30"),("102","ASWIN","2005-06-05"),("103","ANAS","2004-07-17"),("105","ANIRUDH","2004-05-18"),("106","ARAVIND","2004-08-07"),("107","ARUN","2002-12-17"),("108","DHARUN","2005-04-07"),("109","KABILAN","2003-09-05"),("110","BABU","2004-02-06")]
df=spark.createDataFrame(data,columns)
df.show()

+------+-------+----------+
|ROLLNO|   NAME|       DOB|
+------+-------+----------+
|   101|   AJAY|2004-10-30|
|   102|  ASWIN|2005-06-05|
|   103|   ANAS|2004-07-17|
|   105|ANIRUDH|2004-05-18|
|   106|ARAVIND|2004-08-07|
|   107|   ARUN|2002-12-17|
|   108| DHARUN|2005-04-07|
|   109|KABILAN|2003-09-05|
|   110|   BABU|2004-02-06|
+------+-------+----------+



In [None]:
df_with_date = df.withColumn("date_column", to_date(df["DOB"], "yyyy-MM-dd"))
df_with_date.show()

+------+-------+----------+-----------+
|ROLLNO|   NAME|       DOB|date_column|
+------+-------+----------+-----------+
|   101|   AJAY|2004-10-30| 2004-10-30|
|   102|  ASWIN|2005-06-05| 2005-06-05|
|   103|   ANAS|2004-07-17| 2004-07-17|
|   105|ANIRUDH|2004-05-18| 2004-05-18|
|   106|ARAVIND|2004-08-07| 2004-08-07|
|   107|   ARUN|2002-12-17| 2002-12-17|
|   108| DHARUN|2005-04-07| 2005-04-07|
|   109|KABILAN|2003-09-05| 2003-09-05|
|   110|   BABU|2004-02-06| 2004-02-06|
+------+-------+----------+-----------+



In [None]:
from pyspark.sql.functions import current_date, date_add,col,date_format
df_with_date = df.select(col("DOB"),date_format(col("DOB"), "yyyy-MMM-dd"))
df_with_date.show()

+----------+-----------------------------+
|       DOB|date_format(DOB, yyyy-MMM-dd)|
+----------+-----------------------------+
|2004-10-30|                  2004-Oct-30|
|2005-06-05|                  2005-Jun-05|
|2004-07-17|                  2004-Jul-17|
|2004-05-18|                  2004-May-18|
|2004-08-07|                  2004-Aug-07|
|2002-12-17|                  2002-Dec-17|
|2005-04-07|                  2005-Apr-07|
|2003-09-05|                  2003-Sep-05|
|2004-02-06|                  2004-Feb-06|
+----------+-----------------------------+



1.DISPLAY CURRENT TIME


In [None]:
from pyspark.sql.functions import current_date, date_add,current_timestamp
df = spark.range(1).select(current_date())
df.show(truncate=False)
df=spark.range(1).select(current_timestamp())
df.show()

+--------------+
|current_date()|
+--------------+
|2024-10-10    |
+--------------+

+--------------------+
| current_timestamp()|
+--------------------+
|2024-10-10 04:57:...|
+--------------------+



2.PRINT THE AGE OF ALL STUDENTS (in Years)


In [None]:
from pyspark.sql.functions import current_date, datediff, floor, col
df_with_age = df_with_date.withColumn("age", (datediff(current_date(), col("DOB")) / 365.25).cast("int"))
df_with_age.show(truncate=False)

+----------+-----------------------------+---+
|DOB       |date_format(DOB, yyyy-MMM-dd)|age|
+----------+-----------------------------+---+
|2004-10-30|2004-Oct-30                  |19 |
|2005-06-05|2005-Jun-05                  |19 |
|2004-07-17|2004-Jul-17                  |20 |
|2004-05-18|2004-May-18                  |20 |
|2004-08-07|2004-Aug-07                  |20 |
|2002-12-17|2002-Dec-17                  |21 |
|2005-04-07|2005-Apr-07                  |19 |
|2003-09-05|2003-Sep-05                  |21 |
|2004-02-06|2004-Feb-06                  |20 |
+----------+-----------------------------+---+



3.PRINT THE AGE OF ALL STUDENTS (in days)

In [None]:
df_with_age = df_with_date.withColumn("age", (datediff(current_date(), col("DOB"))).cast("int"))
df_with_age.show(truncate=False)

+----------+-----------------------------+----+
|DOB       |date_format(DOB, yyyy-MMM-dd)|age |
+----------+-----------------------------+----+
|2004-10-30|2004-Oct-30                  |7282|
|2005-06-05|2005-Jun-05                  |7064|
|2004-07-17|2004-Jul-17                  |7387|
|2004-05-18|2004-May-18                  |7447|
|2004-08-07|2004-Aug-07                  |7366|
|2002-12-17|2002-Dec-17                  |7965|
|2005-04-07|2005-Apr-07                  |7123|
|2003-09-05|2003-Sep-05                  |7703|
|2004-02-06|2004-Feb-06                  |7549|
+----------+-----------------------------+----+



4.FIND THE AVERAGE AGE IN THE CLASS

In [None]:
from pyspark.sql.functions import avg
avg_age = df_with_age.agg(avg("age").alias("average_age"))
avg_age.show()

+-----------------+
|      average_age|
+-----------------+
|7431.777777777777|
+-----------------+



5.FIND THE YOUNGER STUDENT IN THE CLASS


In [None]:
youngest_person = df_with_age.orderBy(col("age").asc()).limit(1)
youngest_person.show(truncate=False)

+------+----+----------+-----------+---+
|ROLLNO|NAME|DOB       |date_column|age|
+------+----+----------+-----------+---+
|101   |AJAY|2004-10-30|2004-10-30 |19 |
+------+----+----------+-----------+---+



6.FIND THE ELDER STUDENT IN THE CLASS

In [None]:
oldest_person = df_with_age.orderBy(col("age").desc()).limit(1)
oldest_person.show(truncate=False)

+------+----+----------+-----------+---+
|ROLLNO|NAME|DOB       |date_column|age|
+------+----+----------+-----------+---+
|107   |ARUN|2002-12-17|2002-12-17 |21 |
+------+----+----------+-----------+---+



7.FIND THE STUDENT WHOSE AGE IS ABOVE AVERAGE

In [None]:
avg_age = df_with_age.select(avg("age")).collect()[0][0]
students_above_avg_age = df_with_age.filter(col("age") > avg_age)
students_above_avg_age.show(truncate=False)

+------+-------+----------+-----------+---+
|ROLLNO|NAME   |DOB       |date_column|age|
+------+-------+----------+-----------+---+
|103   |ANAS   |2004-07-17|2004-07-17 |20 |
|105   |ANIRUDH|2004-05-18|2004-05-18 |20 |
|106   |ARAVIND|2004-08-07|2004-08-07 |20 |
|107   |ARUN   |2002-12-17|2002-12-17 |21 |
|109   |KABILAN|2003-09-05|2003-09-05 |21 |
|110   |BABU   |2004-02-06|2004-02-06 |20 |
+------+-------+----------+-----------+---+



8.FIND THE STUDENT WHOSE AGE IS BELOW AVERAGE

In [None]:
avg_age = df_with_age.select(avg("age")).collect()[0][0]
students_below_avg_age = df_with_age.filter(col("age") < avg_age)
students_below_avg_age.show(truncate=False)

+------+------+----------+-----------+---+
|ROLLNO|NAME  |DOB       |date_column|age|
+------+------+----------+-----------+---+
|101   |AJAY  |2004-10-30|2004-10-30 |19 |
|102   |ASWIN |2005-06-05|2005-06-05 |19 |
|108   |DHARUN|2005-04-07|2005-04-07 |19 |
+------+------+----------+-----------+---+



9.FIND THE STUDENTS BORN IN THE MONTH IN BETWEEN JANUARY AND MAY

In [None]:
from pyspark.sql.functions import col, month
df_with_month = df_with_date.withColumn("month", month(col("dob")))
df_with_month.show(truncate=False)
students = df_with_month.filter((col("month") >=1)&(col("month")<=5))
students.show(truncate=False)

+----------+-----------------------------+-----+
|DOB       |date_format(DOB, yyyy-MMM-dd)|month|
+----------+-----------------------------+-----+
|2004-10-30|2004-Oct-30                  |10   |
|2005-06-05|2005-Jun-05                  |6    |
|2004-07-17|2004-Jul-17                  |7    |
|2004-05-18|2004-May-18                  |5    |
|2004-08-07|2004-Aug-07                  |8    |
|2002-12-17|2002-Dec-17                  |12   |
|2005-04-07|2005-Apr-07                  |4    |
|2003-09-05|2003-Sep-05                  |9    |
|2004-02-06|2004-Feb-06                  |2    |
+----------+-----------------------------+-----+

+----------+-----------------------------+-----+
|DOB       |date_format(DOB, yyyy-MMM-dd)|month|
+----------+-----------------------------+-----+
|2004-05-18|2004-May-18                  |5    |
|2005-04-07|2005-Apr-07                  |4    |
|2004-02-06|2004-Feb-06                  |2    |
+----------+-----------------------------+-----+

