In [1]:
# imports
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, min, max, col

In [2]:
# Shut down a session left over from an earlier run of this cell, so the
# builder below does not hand back the old one via getOrCreate().
if "spark" in locals():
    spark.stop()

# Start a new session in local mode; "local[*]" is the master URL passed
# to the builder.
spark = (
    SparkSession.builder
    .appName("InstitutionsAnalysis")
    .master("local[*]")
    .getOrCreate()
)

# Read the rankings CSV: the first line is treated as the header and the
# column types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("qs-world-rankings-2025.csv")
)

# Make the DataFrame queryable from Spark SQL under the name "institutions".
df.createOrReplaceTempView("institutions")

# Preview the first five rows.
df.show(5)

+---------+---------+--------------------+--------+--------------+----+-------------------+-------------------+---------------+---------------------+---------------------+----------------------+------------------------------+-------------------+--------------+----------------+
|2025 Rank|2024 Rank|    Institution Name|Location| Location Full|Size|Academic Reputation|Employer Reputation|Faculty Student|Citations per Faculty|International Faculty|International Students|International Research Network|Employment Outcomes|Sustainability|QS Overall Score|
+---------+---------+--------------------+--------+--------------+----+-------------------+-------------------+---------------+---------------------+---------------------+----------------------+------------------------------+-------------------+--------------+----------------+
|        1|        1|Massachusetts Ins...|      US| United States|   M|              100.0|              100.0|          100.0|                100.0|                 

#### i. How many Institutions are included in the dataset? 

In [3]:
# Number of rows in the loaded DataFrame. This equals the number of
# institutions only if the CSV holds one row per institution — TODO confirm
# there are no duplicate or blank rows.
df.count()

1503

#### ii. How many Institutions from 'India' are included in the dataset?

In [12]:
# "Location" carries the short country code (the preview shows 'US' for
# United States); 'IN' is the code used here for India.
india_rows = df.where(df["Location"] == "IN")
india_rows.count()

46

#### iii. Print the average "Citations per Faculty" for universities located in 'India'.

In [14]:
# Keep only the rows coded 'IN', then reduce the whole subset to a single
# mean of "Citations per Faculty" and print it.
(
    df.where(df["Location"] == "IN")
    .agg(avg("Citations per Faculty"))
    .show()
)

+--------------------------+
|avg(Citations per Faculty)|
+--------------------------+
|         37.79130434782609|
+--------------------------+



#### iv. List Institutions where the "International Students" percentage is 100%, along with their location ("Location Full").

In [18]:
# Select the rows whose "International Students" value equals the numeric
# literal 100, keeping only the name and the full location.
is_full_score = df["International Students"] == 100
result = df.where(is_full_score).select("Institution Name", "Location Full")

# show() prints a limited number of rows by default, so pass the exact
# match count to print every row; truncate=False keeps long names intact.
match_count = result.count()
result.show(n=match_count, truncate=False)

+----------------------------------------------------------+--------------------+
|Institution Name                                          |Location Full       |
+----------------------------------------------------------+--------------------+
|UCL                                                       |United Kingdom      |
|The University of Sydney                                  |Australia           |
|EPFL                                                      |Switzerland         |
|Monash University                                         |Australia           |
|The University of Queensland                              |Australia           |
|The London School of Economics and Political Science (LSE)|United Kingdom      |
|City University of Hong Kong                              |Hong Kong SAR       |
|University of St Andrews                                  |United Kingdom      |
|Hamad Bin Khalifa University                              |Qatar               |
|Maastricht Univ