In [1]:
# imports
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, min, max, col

In [2]:
# 1. Stop any zombie sessions first
if 'spark' in locals():
    spark.stop()

# 2. Re-initialize a fresh session
spark = SparkSession.builder \
    .appName("InstitutionsAnalysis") \
    .master("local[*]") \
    .getOrCreate()

# Load the dataset
df = spark.read.csv("qs-world-rankings-2025.csv", header=True, inferSchema=True)

# Register as a Temp View for Spark-SQL
df.createOrReplaceTempView("institutions")

df.show(5)

+---------+---------+--------------------+--------+--------------+----+-------------------+-------------------+---------------+---------------------+---------------------+----------------------+------------------------------+-------------------+--------------+----------------+
|2025 Rank|2024 Rank|    Institution Name|Location| Location Full|Size|Academic Reputation|Employer Reputation|Faculty Student|Citations per Faculty|International Faculty|International Students|International Research Network|Employment Outcomes|Sustainability|QS Overall Score|
+---------+---------+--------------------+--------+--------------+----+-------------------+-------------------+---------------+---------------------+---------------------+----------------------+------------------------------+-------------------+--------------+----------------+
|        1|        1|Massachusetts Ins...|      US| United States|   M|              100.0|              100.0|          100.0|                100.0|                 

#### i. How many Institutions are included in the dataset? 

In [3]:
df.count()

1503

#### ii. How many Institutions from ‘India' are included in dataset?

In [12]:
df.filter(df.Location == 'IN').count()

46

#### iii.Print the average "Citations per Faculty" for universities located in 'India'?

In [14]:
df.filter(df.Location == 'IN').select(avg("Citations per Faculty")).show()

+--------------------------+
|avg(Citations per Faculty)|
+--------------------------+
|         37.79130434782609|
+--------------------------+



#### iv. List Institutions where "International Students" percentage is 100 % along with their location ( "Location Full"). 

In [18]:
# Filtering for numeric 100
result = df.filter(df["International Students"] == 100) \
           .select("Institution Name", "Location Full")

# Displays all rows and shows the full text in every column
result.show(n=result.count(), truncate=False)

+----------------------------------------------------------+--------------------+
|Institution Name                                          |Location Full       |
+----------------------------------------------------------+--------------------+
|UCL                                                       |United Kingdom      |
|The University of Sydney                                  |Australia           |
|EPFL                                                      |Switzerland         |
|Monash University                                         |Australia           |
|The University of Queensland                              |Australia           |
|The London School of Economics and Political Science (LSE)|United Kingdom      |
|City University of Hong Kong                              |Hong Kong SAR       |
|University of St Andrews                                  |United Kingdom      |
|Hamad Bin Khalifa University                              |Qatar               |
|Maastricht Univ

#### i. Recreate the Dataframe by Dropping all the rows where 'QS Overall Score' is mention as '-' and also convert it as float type.

In [20]:
from pyspark.sql.functions import col

# i. Drop rows where 'QS Overall Score' is '-' and convert to float
df_clean = df.filter(col("QS Overall Score") != "-") \
             .withColumn("QS Overall Score", col("QS Overall Score").cast("float"))

In [21]:
df_clean.show()

+---------+---------+--------------------+--------+----------------+----+-------------------+-------------------+---------------+---------------------+---------------------+----------------------+------------------------------+-------------------+--------------+----------------+
|2025 Rank|2024 Rank|    Institution Name|Location|   Location Full|Size|Academic Reputation|Employer Reputation|Faculty Student|Citations per Faculty|International Faculty|International Students|International Research Network|Employment Outcomes|Sustainability|QS Overall Score|
+---------+---------+--------------------+--------+----------------+----+-------------------+-------------------+---------------+---------------------+---------------------+----------------------+------------------------------+-------------------+--------------+----------------+
|        1|        1|Massachusetts Ins...|      US|   United States|   M|              100.0|              100.0|          100.0|                100.0|         

#### ii. Remove all the rows with any missing entry.

In [31]:
df_no_missing = df_clean.dropna()
df_no_missing.show()

+---------+---------+--------------------+--------+----------------+----+-------------------+-------------------+---------------+---------------------+---------------------+----------------------+------------------------------+-------------------+--------------+----------------+
|2025 Rank|2024 Rank|    Institution Name|Location|   Location Full|Size|Academic Reputation|Employer Reputation|Faculty Student|Citations per Faculty|International Faculty|International Students|International Research Network|Employment Outcomes|Sustainability|QS Overall Score|
+---------+---------+--------------------+--------+----------------+----+-------------------+-------------------+---------------+---------------------+---------------------+----------------------+------------------------------+-------------------+--------------+----------------+
|        1|        1|Massachusetts Ins...|      US|   United States|   M|              100.0|              100.0|          100.0|                100.0|         

#### iii. Convert all string columns into numeric values using StringIndexer transformer and make sure now DataFrame does not have any string columns anymore.

In [48]:
from pyspark.ml.feature import StringIndexer
from pyspark.sql.types import StringType

# Identify string columns for indexing
string_cols = [field.name for field in df_no_missing.schema.fields 
               if isinstance(field.dataType, StringType) and field.name != "QS Overall Score"]

# Apply StringIndexer to each and drop original string columns
df_step3 = df_no_missing
for c in string_columns:
    indexer = StringIndexer(inputCol=c, outputCol=c + "_idx")
    df_step3 = indexer.fit(df_step3).transform(df_step3)

# Remove all old string columns
df_step3 = df_step3.drop(*string_cols)


In [49]:
# display the result
df_step3.show()

+-------------------+-------------------+---------------+---------------------+---------------------+----------------------+------------------------------+-------------------+--------------+----------------+
|Academic Reputation|Employer Reputation|Faculty Student|Citations per Faculty|International Faculty|International Students|International Research Network|Employment Outcomes|Sustainability|QS Overall Score|
+-------------------+-------------------+---------------+---------------------+---------------------+----------------------+------------------------------+-------------------+--------------+----------------+
|              100.0|              100.0|          100.0|                100.0|                 99.3|                  86.8|                          96.0|              100.0|          99.0|           100.0|
|               98.5|               99.5|           98.2|                 93.9|                100.0|                  99.6|                          97.4|             

#### iv. Using vectorAssembler combines all columns, except target column i.e. 'QS OverallScore', of spark DataFrame into single column (name it as features). Make sure DataFrame now contains only two columns, 'features' and 'QS Overall Score'.

In [50]:
from pyspark.ml.feature import VectorAssembler

# Get all columns except the target
feature_cols = [c for c in df_step3.columns if c != "QS Overall Score"]

assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
df_features = assembler.transform(df_step3).select("features", "QS Overall Score")

# Display proof of two columns
df_features.show(5)

+--------------------+----------------+
|            features|QS Overall Score|
+--------------------+----------------+
|[100.0,100.0,100....|           100.0|
|[98.5,99.5,98.2,9...|            98.5|
|[100.0,100.0,100....|            96.9|
|[100.0,100.0,96.3...|            96.8|
|[100.0,100.0,100....|            96.7|
+--------------------+----------------+
only showing top 5 rows



#### v. Split the vectorised Dataframe into training and test sets with one fifth records being held for testing.

In [51]:
train_set, test_set = df_features.randomSplit([0.8, 0.2], seed=42)

#### vi. Train default LinearRegression model with features as 'featuresCol' and ‘QS Overall Score’ as label on training set.


In [52]:
from pyspark.ml.regression import LinearRegression

# Initialize and train the model
lr = LinearRegression(featuresCol="features", labelCol="QS Overall Score")
lr_model = lr.fit(train_set)

#### vii. Perform prediction on the testing data and Print RMSE value.

In [53]:
from pyspark.ml.evaluation import RegressionEvaluator

# Make predictions
predictions = lr_model.transform(test_set)

# Evaluate the results
evaluator = RegressionEvaluator(labelCol="QS Overall Score", 
                                predictionCol="prediction", 
                                metricName="rmse")

rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse}")

Root Mean Squared Error (RMSE): 0.029073879251616835
