In [23]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType
from pyspark.sql.functions import lit


spark = SparkSession.builder.appName("example").getOrCreate()
sc = spark.sparkContext

# Load the data
rdd = sc.textFile('./StudentData.csv')
headers = rdd.first()
rdd = rdd.filter(lambda x: x!=headers)
rdd = rdd.map(lambda x: x.split(','))

In [13]:
headers

'age,gender,name,course,roll,marks,email'

In [48]:
# Showing the number of students in the file
rdd.count()

1000

In [50]:
# Show the total marks archived by male and female student
rdd1 = rdd
rdd1 = rdd1.map(lambda x: (x[1], int(x[5])))
rdd1 = rdd1.reduceByKey(lambda x,y: x + y)
rdd1.collect()

[('Female', 29636), ('Male', 30461)]

In [51]:
# Show the total number student that have passed and failed. 50+ marks are required to pass
rdd2 = rdd
rdd2 = rdd2.map(lambda x: (("pass", 1) if int(x[5]) > 50 else ("fail", 1)))
rdd2 = rdd2.reduceByKey(lambda x, y: x + y)
rdd2.collect()

[('fail', 370), ('pass', 630)]

In [52]:
# Show the total number of students enrolled by per courses
rdd3 = rdd
rdd3 = rdd3.map(lambda x: (x[3], 1))
rdd3 = rdd3.reduceByKey(lambda x, y: x + y)
rdd3.collect()

[('PF', 166),
 ('DSA', 176),
 ('DB', 157),
 ('Cloud', 192),
 ('MVC', 157),
 ('OOP', 152)]

In [54]:
# Show the total marks that students have achieved per course
rdd4 = rdd
rdd4 = rdd4.map(lambda x: (x[3], int(x[5])))
rdd4 = rdd4.reduceByKey(lambda x, y: x + y)
rdd4.collect()

[('PF', 9933),
 ('DSA', 10950),
 ('DB', 9270),
 ('Cloud', 11443),
 ('MVC', 9585),
 ('OOP', 8916)]

In [55]:
# Show the minimum and maximum achieved per courses
rdd5 = rdd
rdd5 = rdd5.map(lambda x: (x[3], int(x[5])))
rdd5 = rdd5.reduceByKey(lambda x, y: x if x > y else y)
rdd5.collect()

[('PF', 99), ('DSA', 99), ('DB', 98), ('Cloud', 99), ('MVC', 99), ('OOP', 99)]

In [None]:
# Show the average age of male and female students
rdd6 = rdd
rdd6 = rdd6.map(lambda x: (x[1], (int(x[0]), 1))) # this format is (gender, (sum, count))
rdd6 = rdd6.reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))
rdd6.mapValues(lambda x: x[0]/x[1]).collect()

[('Female', 28.489021956087825), ('Male', 28.52304609218437)]