# SAT Curve and Difficulty Prediction

Uses a K Nearest Neighbors (```KNN```) classifier to generate a predicted curve and a test difficulty prediction from all available data.

Taking ```k = 3```, we find the closest SAT curves among past curves and fill in the unknown raw-to-scaled score conversions. Based on the difficulty of the compared curves, it also makes a prediction about the difficulty of the test, i.e. whether it was "easy", "normal" or "hard".

It works by comparing the available data with past curves, selecting the ```3``` closest curves, and using their scaled scores to fill in the unknown conversions, i.e. the original input remains intact and only the unknown raw-to-scaled score conversions are predicted using the ```KNN``` classifier.

In [1]:
import numpy as np
from datascience import *
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
from statistics import mode,median
%matplotlib inline

In [2]:
# training data
# One table of past curves per section; each is later passed to nearest(),
# which reads the first column as raw scores and the remaining columns as curves.
# NOTE(review): the name `math` would shadow the standard-library `math`
# module if that module were ever imported in this notebook.
math = Table().read_table("./data/mathtrain.csv")
reading = Table().read_table("./data/readingtrain.csv")
writing = Table().read_table("./data/writingtrain.csv")

In [3]:
# funcs

# we will use a k nearest neighbors classifier to predict a curve and test difficulty

def most_common_score(scores):
    """Returns most common score, or the rounded median when no score repeats.

    Parameters
    ----------
    scores : iterable of numbers
        Candidate scores. Any iterable is accepted; it is read only once.

    Returns
    -------
    The score that occurs most often. When several repeated scores tie, the
    one seen first wins. When every score occurs exactly once, the median of
    the scores rounded with ``round`` is returned instead.

    Raises
    ------
    statistics.StatisticsError
        If ``scores`` is empty (raised by ``median``).
    """
    # Local import so the function does not depend on the notebook's import cell.
    from collections import Counter

    # Materialise once: the values are needed for counting and for the median.
    scores = list(scores)

    # number of occurrences of each score
    occurrences = Counter(scores)

    # if all have the same number of occurrences (one each), return the median.
    if all(count == 1 for count in occurrences.values()):
        return round(median(scores))

    # Counter keeps first-seen order, so ties resolve to the earliest score.
    # statistics.mode is avoided because it raises on ties before Python 3.8.
    return occurrences.most_common(1)[0][0]

def distance(array1, array2):
    """Calculates the Euclidean distance between two sets of data.

    Both arguments must support element-wise subtraction with each other
    (e.g. numpy arrays of equal length).
    """
    difference = array1 - array2
    squared = difference ** 2
    total = sum(squared)
    return np.sqrt(total)

def predicted_curve(table):
    """Calculates most common score from nearest neighbors.

    Parameters
    ----------
    table : Table
        One column per neighboring curve. Every row except the last holds
        scaled scores; the last row holds each curve's difficulty class
        (0, 0.5 or 1).

    Returns
    -------
    (curve, prediction)
        ``curve`` is a list with the most common scaled score of each score
        row. ``prediction`` is "Easy", "Normal" or "Hard" according to the most
        common class, or "Unknown" when that class is not 0, 0.5 or 1.
    """
    # length of all rows except last (class row)
    score_rows = len(table.rows) - 1
    neighbors = table.columns

    # For each score row, take the most common value across the neighbors.
    curve = [
        mode([column[row] for column in neighbors])
        for row in range(score_rows)
    ]

    # Prediction whether test was easy, hard or normal
    labels = {0: "Easy", 0.5: "Normal", 1: "Hard"}
    pred = mode(list(table.row(-1)))

    # An unexpected class value used to leave `prediction` unbound and crash
    # with UnboundLocalError; report it explicitly instead.
    prediction = labels.get(pred, "Unknown")

    return curve, prediction

# k = 3 since the training set is small and taking many curves into consideration will give inaccurate results
def nearest(training, data, k=3):
    """k nearest neighbors classifier.

    Parameters
    ----------
    training : Table
        First column holds the raw scores; every other column is one past
        curve. The last row is dropped from the raw scores here and is read by
        ``predicted_curve`` as the difficulty class row.
    data : sequence of numbers
        Scaled scores of the curve to complete, with NaN marking the unknown
        conversions.
    k : int
        Number of closest past curves to combine.

    Returns
    -------
    (Table, str)
        A table with "Raw Score" and "Scaled Score" columns, where known
        inputs are kept and unknown ones are filled from the neighbors, plus
        the difficulty label returned by ``predicted_curve``.
    """

    # extracting the raw scores column and then removing it
    raw_scores = training.column(0)[:-1]
    training = training.drop(0)

    # Positions of the known conversions. They are the same for every past
    # curve, so they (and the filtered input) are computed once up front.
    known = [i for i in range(len(data)) if not np.isnan(data[i])]
    input_data = np.array([data[i] for i in known])

    # Comparing distances between scaled scores, on the known positions only
    distances = [
        distance(np.array([t[i] for i in known]), input_data)
        for t in training.columns
    ]

    # column indices
    indices = np.arange(len(training.columns))

    # creating table of indices and distances, sorted by distance
    table = Table().with_columns(
        "Index", indices,
        "Distance", distances
    ).sort("Distance")

    # indices of columns to compare (the k closest curves)
    to_search = list(table[0][:k])

    curve, difficulty = predicted_curve(training.select(to_search))

    # substituting known values; NaN is detected with np.isnan, as above
    substituted = [
        curve[i] if np.isnan(data[i]) else data[i]
        for i in range(len(curve))
    ]

    return Table().with_columns("Raw Score", raw_scores, "Scaled Score", substituted), difficulty

In [4]:
# May 2019 International Reading Curve and Difficulty Prediction from Partial Scoring
# Input is the last column of the test file with its final entry dropped.
# NOTE(review): the dropped entry is presumably the difficulty class row — confirm against the CSV.
readingtest = Table().read_table("./data/readingtest.csv").columns[-1][:-1]
curve, difficulty = nearest(reading,readingtest)
# Save the completed curve, then report the predicted difficulty.
curve.to_csv("./output/May_2019_Int_Predicted_Reading.csv")
print("Test Difficulty:", difficulty)
# Bare expression so the notebook displays the predicted table.
curve

Test Difficulty: Normal


Raw Score,Scaled Score
52,40
51,38
50,37
49,36
48,35
47,34
46,33
45,33
44,32
43,32


In [5]:
# May 2019 International Writing Curve and Difficulty Prediction from Partial Scoring
# Input is the last column of the test file with its final entry dropped.
# NOTE(review): the dropped entry is presumably the difficulty class row — confirm against the CSV.
writingtest = Table().read_table("./data/writingtest.csv").columns[-1][:-1]
curve, difficulty = nearest(writing,writingtest)
# Save the completed curve, then report the predicted difficulty.
curve.to_csv("./output/May_2019_Int_Predicted_Writing.csv")
print("Test Difficulty:", difficulty)
# Bare expression so the notebook displays the predicted table.
curve

Test Difficulty: Easy


Raw Score,Scaled Score
44,40
43,36
42,35
41,34
40,33
39,32
38,31
37,30
36,29
35,29


In [6]:
# May 2019 International Math Curve and Difficulty Prediction from Partial Scoring
# Input is the last column of the test file with its final entry dropped.
# NOTE(review): the dropped entry is presumably the difficulty class row — confirm against the CSV.
mathtest = Table().read_table("./data/mathtest.csv").columns[-1][:-1]
curve, difficulty = nearest(math,mathtest)
# Save the completed curve, then report the predicted difficulty.
curve.to_csv("./output/May_2019_Int_Predicted_Math.csv")
print("Test Difficulty:", difficulty)
# Bare expression so the notebook displays the predicted table.
curve

Test Difficulty: Easy


Raw Score,Scaled Score
58,800
57,770
56,750
55,730
54,710
53,700
52,680
51,670
50,670
49,670


# April 13 School Day Predictions

In [7]:
# Partial April 13 curves: columns 1-3 are used below as the math, reading
# and writing inputs to nearest().
# Unlike the May 2019 cells, each column is used whole (no trailing [:-1]).
curves = Table().read_table("./data/ap13.csv")
mathtest= curves.column(1)
readingtest = curves.column(2)
writingtest = curves.column(3)

In [8]:
# April 13 math: complete the partial curve from the math training curves,
# save it, and report the predicted difficulty.
curve, difficulty = nearest(math, mathtest)
curve.to_csv("./output/april_13_2021/math_pred.csv")
print("Test Difficulty:", difficulty)
# Bare expression so the notebook displays the predicted table.
curve

Test Difficulty: Normal


Raw Score,Scaled Score
58,800
57,790
56,780
55,770
54,750
53,730
52,720
51,700
50,690
49,690


In [9]:
# April 13 reading: complete the partial curve from the reading training
# curves, save it, and report the predicted difficulty.
curve, difficulty = nearest(reading, readingtest)
curve.to_csv("./output/april_13_2021/reading_pred.csv")
print("Test Difficulty:", difficulty)
# Bare expression so the notebook displays the predicted table.
curve

Test Difficulty: Normal


Raw Score,Scaled Score
52,40
51,39
50,39
49,38
48,37
47,36
46,36
45,35
44,34
43,34


In [10]:
# April 13 writing: complete the partial curve from the writing training
# curves, save it, and report the predicted difficulty.
curve, difficulty = nearest(writing, writingtest)
curve.to_csv("./output/april_13_2021/writing_pred.csv")
print("Test Difficulty:", difficulty)
# Bare expression so the notebook displays the predicted table.
curve

Test Difficulty: Easy


Raw Score,Scaled Score
44,40
43,39
42,37
41,36
40,35
39,35
38,34
37,33
36,33
35,32
