In [306]:
# Import Numpy library
import numpy as np

# DATA INITIALIZATION

In [307]:
# Labels for the score brackets as released by JAMB, listed from the highest
# bracket down to the lowest. Each label names one score category.
score_ranges = [
    '320 and above',
    '300–319',
    '250–299',
    '200–249',
    '160–199',
    '140–159',
    '120–139',
    '100–119',
    'Below 100',
]

In [308]:
# Candidates per score bracket, in the same order as the bracket labels.
# Held in a NumPy array so totals, ratios and weighted statistics can be
# computed with vectorized operations.
candidate_counts = np.array([
    4756,    # 320 and above
    7658,    # 300–319
    73441,   # 250–299
    334560,  # 200–249
    983187,  # 160–199
    488197,  # 140–159
    57419,   # 120–139
    3820,    # 100–119
    2031,    # Below 100
])

In [309]:
# Representative (approximately central) score for each bracket, in the same
# order as the bracket labels. The open-ended brackets ('320 and above' and
# 'Below 100') have no true midpoint, so 330 and 90 are stand-in values.
# These numbers feed the weighted calculations: mean, variance, etc.
midpoints = np.array([
    330,  # 320 and above
    310,  # 300–319
    275,  # 250–299
    225,  # 200–249
    180,  # 160–199
    150,  # 140–159
    130,  # 120–139
    110,  # 100–119
    90,   # Below 100
])

# BASIC PERCENTAGE CALCULATION

In [310]:
# Total number of UTME candidates: the sum of the counts over every bracket.
total_candidates = np.sum(candidate_counts)
print(f"Total Candidates: {total_candidates}\n")

Total Candidates: 1955069



In [311]:
# Convert the raw counts into percentages of the total, so the share of
# candidates in each score range can be compared directly.
percentages = (candidate_counts / total_candidates) * 100

# Print the table heading and a divider line.
print("Score Range\t\tCandidates\tPercentage")
print("-" * 50)

# Print one row per score range: label, candidate count, and percentage.
# Format specs: :<16 left-aligns the label in 16 columns, :>9 right-aligns the
# count in 9 columns, and :>8.2f right-aligns the percentage in 8 columns with
# 2 decimal places.
for i in range(len(score_ranges)):
    print(f"{score_ranges[i]:<16}\t{candidate_counts[i]:>9}\t{percentages[i]:>8.2f}%")

Score Range		Candidates	Percentage
--------------------------------------------------
320 and above   	     4756	    0.24%
300–319         	     7658	    0.39%
250–299         	    73441	    3.76%
200–249         	   334560	   17.11%
160–199         	   983187	   50.29%
140–159         	   488197	   24.97%
120–139         	    57419	    2.94%
100–119         	     3820	    0.20%
Below 100       	     2031	    0.10%


### 1. for i in range(len(score_ranges)):

len(score_ranges):
- This gets the total number of score ranges (which is 9 in this case).

range(len(score_ranges)):
- This creates a sequence of numbers: 0, 1, 2, ..., 8.

for i in ...:
- This means: “Repeat the next line of code 9 times — once for each index of score_ranges.”

i:
- Is just a number that keeps increasing each time through the loop.
- It starts at 0 and ends at 8.

### 2. print(f"...")
This is a formatted print statement. The f before the quotes lets us insert variables directly into the string using {}.

score_ranges[i]:<16

score_ranges[i]:
* Gets the score label, like "320 and above" for i = 0.

:<16:
- Means: Left-align the text and give it 16 spaces width.
- Keeps all labels nicely lined up.

candidate_counts[i]:>9

candidate_counts[i]:
- Gets the number of candidates in that score range.

:>9:
- Means: Right-align the number and give it 9 spaces width.
- Helps the numbers line up in a neat column.

percentages[i]:>8.2f

percentages[i]:
- Gets the percentage of candidates in that range.

:>8.2f:

* `>`: Right-align the percentage.

* 8: Give it 8 total spaces.
* .2f: Format it to 2 decimal places (like 23.45).

% (after the closing }):
* This is just a literal percent sign added to the output.

# WEIGHTED MEAN CALCULATION

In [None]:
# Weighted mean of the bracket midpoints, using the number of candidates in
# each bracket as the weight. The midpoints are not equally represented, so a
# plain mean would be misleading: weights=candidate_counts tells NumPy to let
# each midpoint count in proportion to how many students scored in its range.
weighted_mean = np.average(a=midpoints, weights=candidate_counts)
print(f"Weighted Mean Score: {weighted_mean:.2f}")

Weighted Mean Score: 182.95


# VARIANCE AND STANDARD DEVIATION

In [313]:
# Squared distance of every bracket midpoint from the weighted mean.
deviations = midpoints - weighted_mean
mean_diffs_squared = deviations ** 2

# Variance: the candidate-weighted average of those squared distances.
weighted_variance = np.average(mean_diffs_squared, weights=candidate_counts)

# Standard deviation: the square root of the variance, which brings the
# spread back to the same units as the scores.
weighted_std_dev = np.sqrt(weighted_variance)

print(f"Variance: {weighted_variance:.2f}")
print(f"Standard Deviation: {weighted_std_dev:.2f}")

Variance: 1113.90
Standard Deviation: 33.38


# WEIGHTED MEDIAN FUNCTION

In [314]:
def weighted_median(values, weights):
    """Return the weighted median of ``values``.

    The values are sorted in ascending order and their weights accumulated in
    that order. The result is the first value whose cumulative weight reaches
    at least half of the total weight (i.e. the lower weighted median).

    Parameters
    ----------
    values : array-like
        One-dimensional collection of observations.
    weights : array-like
        Non-negative weight of each entry in ``values``; must have the same
        length as ``values``.

    Returns
    -------
    scalar
        The element of ``values`` located at the weighted median position.

    Raises
    ------
    ValueError
        If the inputs are not one-dimensional, differ in length, are empty,
        contain a negative weight, or the weights do not sum to a positive
        number.
    """
    values = np.asarray(values)
    weights = np.asarray(weights)

    # Fail early with a clear message instead of an opaque IndexError or a
    # silently meaningless result.
    if values.ndim != 1 or values.shape != weights.shape:
        raise ValueError("values and weights must be 1-D and of equal length")
    if values.size == 0:
        raise ValueError("values and weights must not be empty")
    if np.any(weights < 0):
        raise ValueError("weights must be non-negative")
    total_weight = weights.sum()
    if total_weight <= 0:
        raise ValueError("weights must sum to a positive number")

    # Sort the values and carry the weights along in the same order.
    order = np.argsort(values)
    sorted_values = values[order]
    cum_weights = np.cumsum(weights[order])

    # cum_weights is non-decreasing, so a binary search finds the first
    # position whose cumulative weight is >= half of the total.
    cutoff = total_weight / 2
    median_pos = np.searchsorted(cum_weights, cutoff, side="left")
    return sorted_values[median_pos]

In [315]:
# Median score based on the candidate distribution: each bracket midpoint is
# weighted by the number of candidates in that bracket.
weighted_median_value = weighted_median(values=midpoints, weights=candidate_counts)
print(f"Weighted Median Score: {weighted_median_value}")

Weighted Median Score: 180


# MODE (MOST FREQUENT SCORE RANGE)

In [316]:
# Modal bracket: the score range that holds the most candidates.
# argmax() gives the position of the largest count; that same position then
# selects the matching label and midpoint.
mode_index = candidate_counts.argmax()
mode_range, mode_midpoint = score_ranges[mode_index], midpoints[mode_index]

print(f"Weighted Mode: {mode_range} (Midpoint: {mode_midpoint})")


Weighted Mode: 160–199 (Midpoint: 180)


In [317]:
# Running totals of candidates and of percentages, accumulated in the order
# the brackets are listed. Useful for percentile-rank estimation and for
# seeing how the totals build up.
cumulative_counts = candidate_counts.cumsum()
cumulative_percentages = percentages.cumsum()

# Title, column headers and divider for the cumulative table.
print("------ Cumulative Distribution ------")
print("Score Range\tCumulative Count\tCumulative %")
print("-" * 70)

# One row per score range.
# NOTE(review): the label field is 10 wide but the longest label has 13
# characters, so the columns line up only thanks to the tab that follows.
for label, running_count, running_pct in zip(score_ranges, cumulative_counts, cumulative_percentages):
    print(f"{label:<10}\t{running_count:>14}\t{running_pct:>14.2f}%")

------ Cumulative Distribution ------
Score Range	Cumulative Count	Cumulative %
----------------------------------------------------------------------
320 and above	          4756	          0.24%
300–319   	         12414	          0.63%
250–299   	         85855	          4.39%
200–249   	        420415	         21.50%
160–199   	       1403602	         71.79%
140–159   	       1891799	         96.76%
120–139   	       1949218	         99.70%
100–119   	       1953038	         99.90%
Below 100 	       1955069	        100.00%


# SKEWNESS AND KURTOSIS (APPROXIMATED)

In [318]:
# Pearson's second coefficient of skewness: 3 * (mean - median) / std dev.
# Interpretation:
#   Skew > 0 → right-skewed (more low scores)
#   Skew < 0 → left-skewed (more high scores)
mean_median_gap = weighted_mean - weighted_median_value
skewness = 3 * mean_median_gap / weighted_std_dev
print(f"Skewness (Pearson): {skewness:.2f}")

Skewness (Pearson): 0.27


In [319]:
# Kurtosis measures the "tailedness" of the distribution. It is computed here
# as the candidate-weighted fourth central moment divided by the squared
# variance. This is the plain (non-excess) kurtosis, so:
#   kurtosis ≈ 3 → normal (bell-shaped)
#   kurtosis > 3 → heavy-tailed
#   kurtosis < 3 → light-tailed
# (Excess kurtosis would be this value minus 3, with 0 as the normal reference.)
mean_central_4th = np.average((midpoints - weighted_mean)**4, weights=candidate_counts)
kurtosis = mean_central_4th / (weighted_variance**2)
# Label fixed: the old text called this "excess" while quoting 3 as the normal
# reference, which mixes up the two definitions.
print(f"Kurtosis (~3 is normal): {kurtosis:.2f}")

Kurtosis (excess ~3 is normal): 4.87


# Z-SCORE FOR TOP SCORE BRACKET

In [320]:
# Standardize 330, the midpoint used for the top bracket ('320 and above').
# The z-score expresses how far 330 lies from the weighted mean, measured in
# standard deviations.
distance_from_mean = 330 - weighted_mean
z_score_top = distance_from_mean / weighted_std_dev
print(f"Z-score for 330 score: {z_score_top:.2f}")

Z-score for 330 score: 4.41


# HIGHEST & LOWEST REPRESENTATION

In [321]:
# Score ranges with the most and the fewest candidates: argmax()/argmin()
# give the position of the largest/smallest count, which selects the label.
max_range = score_ranges[candidate_counts.argmax()]
min_range = score_ranges[candidate_counts.argmin()]

print(f"Score Range with Most Candidates: {max_range}")
print(f"Score Range with Fewest Candidates: {min_range}\n")

Score Range with Most Candidates: 160–199
Score Range with Fewest Candidates: Below 100



# COLLATING RESULTS OF ALL THE STATISTICAL ANALYSIS

In [322]:
# Collate every statistic computed above into a single report.
print("\n\n------ Statistical Summary ------")
print(f"Weighted Mean Score: {weighted_mean:.2f}")
print(f"Weighted Median Score: {weighted_median_value}")
print(f"Weighted Mode: {mode_range} (Midpoint: {mode_midpoint})")
print(f"Standard Deviation: {weighted_std_dev:.2f}")
print(f"Variance: {weighted_variance:.2f}")
print(f"Skewness (Pearson): {skewness:.2f}")
# The value is the plain (non-excess) kurtosis, for which 3 is the normal
# reference, so the label no longer calls it "excess".
print(f"Kurtosis (~3 is normal): {kurtosis:.2f}")
print(f"Z-score for 330 score: {z_score_top:.2f}")
print(f"Score Range with Most Candidates: {max_range}")
print(f"Score Range with Fewest Candidates: {min_range}\n")

# Cumulative distribution table.
print("------ Cumulative Distribution ------")
# Two tabs after the first title so the headers sit over their columns: each
# row pads the label to 16 characters before its own tab.
print("Score Range\t\tCumulative Count\tCumulative %")
print("-" * 60)
for i in range(len(score_ranges)):
    # Width 14 right-aligns the counts; the previous width of 2 was smaller
    # than every count, so it never padded and the column was left-ragged.
    print(f"{score_ranges[i]:<16}\t{cumulative_counts[i]:>14}\t{cumulative_percentages[i]:>14.2f}%")



------ Statistical Summary ------
Weighted Mean Score: 182.95
Weighted Median Score: 180
Weighted Mode: 160–199 (Midpoint: 180)
Standard Deviation: 33.38
Variance: 1113.90
Skewness (Pearson): 0.27
Kurtosis (excess ~3 is normal): 4.87
Z-score for 330 score: 4.41
Score Range with Most Candidates: 160–199
Score Range with Fewest Candidates: Below 100

------ Cumulative Distribution ------
Score Range	Cumulative Count	Cumulative %
------------------------------------------------------------
320 and above   	4756	          0.24%
300–319         	12414	          0.63%
250–299         	85855	          4.39%
200–249         	420415	         21.50%
160–199         	1403602	         71.79%
140–159         	1891799	         96.76%
120–139         	1949218	         99.70%
100–119         	1953038	         99.90%
Below 100       	1955069	        100.00%
