Set Up Environment

In [1]:
!pip install pyspark



In [3]:
from pyspark.sql import SparkSession

# Build the session for this notebook; getOrCreate() hands back an already
# running session when one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("LargeDatasetAnalysis")
    .getOrCreate()
)


In [4]:
# Bare expression: the notebook renders the SparkSession object as this cell's output.
spark


In [5]:
# Load the marks CSV. The first line of the file supplies the column names,
# and the column types are inferred from the data rather than read as strings.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/content/marks.csv")
)

# Print the resulting schema to confirm the inferred types.
df.printSchema()


root
 |-- Name: string (nullable = true)
 |-- Marks: integer (nullable = true)
 |-- Grades: string (nullable = true)



**1. Display Top 3 Rows of the Dataset**

In [6]:
# Print only the first three rows as a text table.
df.show(n=3)

+-------+-----+------+
|   Name|Marks|Grades|
+-------+-----+------+
|Priyang|   98|    AA|
| Aadhya|   89|    AB|
| Krisha|   99|    AA|
+-------+-----+------+
only showing top 3 rows



**2. Display Datatypes of Each Column**

In [7]:
# printSchema() lists every column together with its data type and nullability.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Marks: integer (nullable = true)
 |-- Grades: string (nullable = true)



**3. Display Column Names**

In [9]:
# DataFrame.columns is a plain Python list of the column names.
df.columns

['Name', 'Marks', 'Grades']

**4. Count Number of Rows and Columns of the Dataset**

In [10]:
# Number of rows in the DataFrame.
df.count()

7

In [13]:
# Number of columns: the length of the column-name list.
len(df.columns)

3

**5. Get Overall Statistics of the Dataset**

In [14]:
# describe() reports count, mean, stddev, min and max for each column.
# For string columns mean/stddev come back as NULL and min/max compare the
# values as text.
df.describe().show()

+-------+------+------------------+------+
|summary|  Name|             Marks|Grades|
+-------+------+------------------+------+
|  count|     7|                 7|     7|
|   mean|  NULL| 89.71428571428571|  NULL|
| stddev|  NULL|6.6761836831702395|  NULL|
|    min|Aadhya|                82|    AA|
|    max|Vedant|                99|    BB|
+-------+------+------------------+------+



**6. Find Unique Values Available in the Grades Column**

In [15]:
# Column names listed again -- presumably to confirm the exact spelling of
# 'Grades' before it is used in the next cell.
df.columns

['Name', 'Marks', 'Grades']

In [16]:
# Deduplicate inside Spark first so that only the distinct grade values are
# sent to the driver; calling toPandas() on the whole DataFrame would pull
# every row into driver memory just to inspect one column.
# orderBy makes the order of the returned values deterministic.
# The result is a NumPy object array of the distinct grades.
df.select('Grades').distinct().orderBy('Grades').toPandas()['Grades'].to_numpy()

array(['AA', 'AB', 'AC', 'BA', 'BB'], dtype=object)

**7. Find the Total Number of Unique Values Available in the Grades Column**

In [17]:
# Count the distinct grades inside Spark. This avoids converting the whole
# DataFrame to pandas on the driver only to take the length of a unique array.
df.select('Grades').distinct().count()

5

**8. How to Select a Single Column**

In [18]:
# Print the full DataFrame (show() prints at most 20 rows by default, which
# covers this dataset) as a reference point before selecting columns from it.
df.show()

+-------+-----+------+
|   Name|Marks|Grades|
+-------+-----+------+
|Priyang|   98|    AA|
| Aadhya|   89|    AB|
| Krisha|   99|    AA|
| Vedant|   87|    AB|
| Parshv|   90|    AC|
| Mittal|   83|    BA|
|Archana|   82|    BB|
+-------+-----+------+



In [20]:
# Project the DataFrame down to the Grades column and print it.
df.select(df['Grades']).show()

+------+
|Grades|
+------+
|    AA|
|    AB|
|    AA|
|    AB|
|    AC|
|    BA|
|    BB|
+------+



**9. How to Select Multiple Columns**

In [21]:
# Keep just the Name and Marks columns, in that order, and print them.
df.select('Name', 'Marks').show()

+-------+-----+
|   Name|Marks|
+-------+-----+
|Priyang|   98|
| Aadhya|   89|
| Krisha|   99|
| Vedant|   87|
| Parshv|   90|
| Mittal|   83|
|Archana|   82|
+-------+-----+



**10. Create New Column with Marks+1**

In [28]:
# Add a derived column holding Marks + 1. withColumn returns a new DataFrame,
# so df itself is left unchanged and New_Marks exists only in what is shown here.
df.withColumn('New_Marks', df['Marks'] + 1).show()

+-------+-----+------+---------+
|   Name|Marks|Grades|New_Marks|
+-------+-----+------+---------+
|Priyang|   98|    AA|       99|
| Aadhya|   89|    AB|       90|
| Krisha|   99|    AA|      100|
| Vedant|   87|    AB|       88|
| Parshv|   90|    AC|       91|
| Mittal|   83|    BA|       84|
|Archana|   82|    BB|       83|
+-------+-----+------+---------+



**11. Rename the Name Column to "Student_Name"**

In [29]:
# Show the data with the Name column relabelled as Student_Name.
# The result is not assigned, so df keeps its original column name.
df.withColumnRenamed(existing='Name', new='Student_Name').show()

+------------+-----+------+
|Student_Name|Marks|Grades|
+------------+-----+------+
|     Priyang|   98|    AA|
|      Aadhya|   89|    AB|
|      Krisha|   99|    AA|
|      Vedant|   87|    AB|
|      Parshv|   90|    AC|
|      Mittal|   83|    BA|
|     Archana|   82|    BB|
+------------+-----+------+



**12. Display the Students Having Marks Greater Than 90**

In [30]:
# Keep only the rows whose Marks are strictly greater than 90
# (a score of exactly 90 is excluded) and print them.
df.where(df.Marks > 90).show()

+-------+-----+------+
|   Name|Marks|Grades|
+-------+-----+------+
|Priyang|   98|    AA|
| Krisha|   99|    AA|
+-------+-----+------+



**13. Display Average Marks of Students per Grade**

In [31]:
# Group the rows by grade and print the mean of Marks within each group.
# Marks is the only numeric column, so naming it gives the same avg(Marks) result.
df.groupBy('Grades').avg('Marks').show()

+------+----------+
|Grades|avg(Marks)|
+------+----------+
|    AA|      98.5|
|    BA|      83.0|
|    AB|      88.0|
|    AC|      90.0|
|    BB|      82.0|
+------+----------+



**14. Sort Every Row of the Dataset by Marks in Descending Order**

In [33]:
# Print all rows ordered from the highest Marks to the lowest.
df.orderBy('Marks', ascending=False).show()

+-------+-----+------+
|   Name|Marks|Grades|
+-------+-----+------+
| Krisha|   99|    AA|
|Priyang|   98|    AA|
| Parshv|   90|    AC|
| Aadhya|   89|    AB|
| Vedant|   87|    AB|
| Mittal|   83|    BA|
|Archana|   82|    BB|
+-------+-----+------+

