<a href="https://colab.research.google.com/github/rosehelfrich/Ky_School_data/blob/main/Predict_Classification%2C_and_Highly_Impacted_Schools.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Column / Row options
pd.options.display.max_columns = None
pd.options.display.max_rows = 20

# Two decimal places and the thousands separator
pd.options.display.float_format = '{:,.2f}'.format

# KY df - load and polish

In [4]:
# Load the data frames
df_scores = pd.read_csv('/content/drive/MyDrive/Colab Data/KY EPSB/All Years/df_scores.csv')
ky_spending_df = pd.read_csv('/content/drive/MyDrive/Colab Data/KY EPSB/All Years/preprocessed_df.csv')

# Select only the necessary columns from ky_spending_df
ky_spending_df = ky_spending_df.loc[:,['End Year', 'School Code', 'Level',
                                       'Reported Spending per student', 'Money Difference per school',
                                       ]]


In [5]:
# Merge into one df
df = pd.merge(df_scores, ky_spending_df, on=['End Year', 'School Code', 'Level'], how='left')

# Sort df & Reset index
df.sort_values(by=['End Year', 'District', 'School Code'], inplace=True)
df.reset_index(drop=True, inplace=True)

# Create columns with the codes
df['Classification Code'] = df['Classification']
df['Classification Code'].replace(['Needs Improvement', 'Proficient', 'Distinguished'], [0, 1, 2], inplace=True)

df['Level Code'] = df['Level']
df['Level Code'].replace(['ES', 'MS', 'HS'], [0, 1, 2], inplace=True)

df['End Year Code'] = df['End Year'] - 2012

df['Rating'] = df['Rating Code']
df['Rating'].replace([0,1,2,3,4], ['Very Low', 'Low', 'Medium', 'High', 'Very High'], inplace=True)



In [6]:
# Reorder the columns of df_scores
reordered_columns = ['End Year', 'End Year Code',
                     'District', 'District Code',
                     'School', 'School Code',
                     'Level', 'Level Code',
                     'Reported Spending per student', 'Money Difference per school',
                     'Proficiency Rate',
                     'Classification','Classification Code',
                     'Rating', 'Rating Code',]

df = df[reordered_columns]

In [7]:
# Round
df = df.round({'Proficiency Rate': 0, 'Reported Spending per student': -1, 'Money Difference per school': -2})

In [8]:
# This df will hold the predicted values

predict_df = df.copy()

# Predict Classification: a NN Model

## Shuffle, Balance, Split, Scale

In [9]:
unscaled_df = predict_df.loc[:,['End Year Code', 'District Code', 'Level Code',
                                'Reported Spending per student', 'Money Difference per school',
                                'Proficiency Rate', 'Classification Code', ]]

unscaled_df.dropna(axis =0, inplace=True)
unscaled_df.reset_index(drop=True, inplace=True)

#Shuffle df
unscaled_df = unscaled_df.sample(frac=1, random_state=15)

In [10]:
#Separate into input and targets
unscaled_inputs = unscaled_df.values[:,:-1]
targets = unscaled_df.values[:,-1]

# Split into training, validation, and test sets
unscaled_X_train, unscaled_X_set, y_train, y_set = train_test_split(unscaled_inputs, targets,
                                                                    test_size=0.2, random_state=15,
                                                                    stratify = targets)

unscaled_X_valid, unscaled_X_test, y_valid, y_test = train_test_split(unscaled_X_set, y_set,
                                                                      test_size=0.5, random_state=13,
                                                                      stratify = y_set)

In [11]:
# Scale data
epsb_scaler = StandardScaler()

# Calculate and store the mean and sd
epsb_scaler.fit(unscaled_X_train)

# Apply the scaler
scaled_X_train = epsb_scaler.transform(unscaled_X_train)
scaled_X_valid = epsb_scaler.transform(unscaled_X_valid)
scaled_X_test = epsb_scaler.transform(unscaled_X_test)

## Model

In [12]:
# Create the model
# Ran different rounds of the below model.  Some of the options I put in comments to the right.

input_size = 6
output_size = 3
hidden_layer_size = 15 # for this problem, optimal results were between 10-20 layers

model = tf.keras.Sequential([
    tf.keras.layers.Dense(hidden_layer_size, activation='tanh'),  # attempted to make this linear, and it only came to 77% valid accuracy or less
    tf.keras.layers.Dense(hidden_layer_size, activation='tanh'),
    tf.keras.layers.Dense(output_size, activation='softmax')
    ])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [13]:
# Fit the model

batch_size = 50  # started with batch size 25
max_epochs = 100 # started with 20 epochs

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

#early_stopping = tf.keras.callbacks.EarlyStopping(patience=5)

model.fit(scaled_X_train,
          y_train,
          batch_size=batch_size,
          epochs = max_epochs,
          callbacks = [early_stopping],
          validation_data = (scaled_X_valid, y_valid),
          verbose =2)

# Ran rounds using a multi-index of School Code & End Year.  But didn't get above a 78% validation accuracy

Epoch 1/100
100/100 - 3s - loss: 0.7957 - accuracy: 0.6630 - val_loss: 0.6092 - val_accuracy: 0.7452 - 3s/epoch - 28ms/step
Epoch 2/100
100/100 - 1s - loss: 0.5366 - accuracy: 0.7602 - val_loss: 0.4753 - val_accuracy: 0.8016 - 728ms/epoch - 7ms/step
Epoch 3/100
100/100 - 1s - loss: 0.4432 - accuracy: 0.8048 - val_loss: 0.4187 - val_accuracy: 0.8323 - 539ms/epoch - 5ms/step
Epoch 4/100
100/100 - 1s - loss: 0.4013 - accuracy: 0.8209 - val_loss: 0.3903 - val_accuracy: 0.8516 - 576ms/epoch - 6ms/step
Epoch 5/100
100/100 - 1s - loss: 0.3784 - accuracy: 0.8367 - val_loss: 0.3741 - val_accuracy: 0.8468 - 581ms/epoch - 6ms/step
Epoch 6/100
100/100 - 0s - loss: 0.3641 - accuracy: 0.8361 - val_loss: 0.3628 - val_accuracy: 0.8500 - 494ms/epoch - 5ms/step
Epoch 7/100
100/100 - 1s - loss: 0.3527 - accuracy: 0.8425 - val_loss: 0.3548 - val_accuracy: 0.8548 - 545ms/epoch - 5ms/step
Epoch 8/100
100/100 - 1s - loss: 0.3441 - accuracy: 0.8480 - val_loss: 0.3448 - val_accuracy: 0.8581 - 837ms/epoch - 8ms

<keras.callbacks.History at 0x7e2aeff8b9d0>

In [14]:
# test the model

test_loss, test_accuracy = model.evaluate(scaled_X_test, y_test)



In [15]:
# Print Pretty

print('\nTest loss: {0:.2f}. Test accuracy: {1:.2f}%'.format(test_loss, test_accuracy*100.))


Test loss: 0.08. Test accuracy: 96.61%


In [16]:
# Save model
# Test accuracy for the saved model is 96%
#tf.keras.models.save_model(model, '/content/drive/MyDrive/Colab Data/KY EPSB/model', include_optimizer=True)

## Predict Classification using model

In [17]:
# Grab the data that we want to predict
inputs = predict_df.loc[:,['End Year Code', 'District Code', 'Level Code',
                                'Reported Spending per student', 'Money Difference per school',
                                'Proficiency Rate', ]]

# Convert to np array
# scale data
inputs = epsb_scaler.transform(inputs.values)

In [18]:
# Use the model to predict the values
pred_multi_col = model.predict(inputs)

#Convert to one column
pred_classification = pred_multi_col.argmax(axis=1)

# Save predictions to predict_df
predict_df['Classification Code'] = pred_classification



# Predict Rating: a NN Model

## Shuffle, Split, Scale

In [19]:
nested_df = predict_df.loc[:,['End Year Code', #'District Code',
                              'Level Code',
                              'Reported Spending per student', 'Money Difference per school',
                              'Proficiency Rate', 'Classification Code', 'Rating Code']]

nested_df.dropna(inplace=True)

#Shuffle data
nested_df = nested_df.sample(frac=1, random_state=7)
nested_df.reset_index(drop=True, inplace=True)

In [20]:
nested_inputs = nested_df.values[:,:-1]
nested_targets = nested_df.values[:,-1]

In [21]:
x_train, x_set, y_train, y_set = train_test_split(nested_inputs, nested_targets,
                                                  test_size=0.2, random_state=3, stratify = nested_targets)

x_valid, x_test, y_valid, y_test = train_test_split(x_set, y_set,
                                                    test_size=0.5, random_state=16, stratify = y_set)

In [22]:
# Scale data
nested_model_scaler = StandardScaler()

# Calculate and store the mean and sd
nested_model_scaler.fit(x_train)

# Apply the scaler
x_train_scaled = nested_model_scaler.transform(x_train)
x_valid_scaled = nested_model_scaler.transform(x_valid)
x_test_scaled = nested_model_scaler.transform(x_test)

## Model

In [23]:
# Create the model
# Ran different rounds of the below model.  Some of the options I put in comments to the right.

input_size = 6
output_size = 5
hidden_layer_size = 10 # also tried 20, 5, 15, 30

nested_model = tf.keras.Sequential([
    tf.keras.layers.Dense(hidden_layer_size, activation='PReLU'), # relu, elu, PReLU, LeakyReLU, swish     runnerup: swish, PReLU
    tf.keras.layers.Dense(hidden_layer_size, activation='elu'), # relu, elu, swish, PReLU, gelu, tanh      runnerup: elu, PReLU
    tf.keras.layers.Dense(output_size, activation='softmax')
    ])

# Create a custom learning rate
custom_learning_rate = 0.005 # 0.01, 0.005, 0.001

# Compile the model
nested_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=custom_learning_rate),
                     loss='sparse_categorical_crossentropy',
                     metrics=['accuracy'])

# Fit the model
batch_size = 20  # 20, 50, 100, 30
max_epochs = 100

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

nested_model.fit(x_train_scaled, y_train,
          batch_size=batch_size,
          epochs = max_epochs,
          callbacks = [early_stopping],
          validation_data = (x_valid_scaled, y_valid),
          verbose = 2)


Epoch 1/100
102/102 - 1s - loss: 1.3238 - accuracy: 0.4392 - val_loss: 0.9031 - val_accuracy: 0.6680 - 1s/epoch - 13ms/step
Epoch 2/100
102/102 - 0s - loss: 0.6929 - accuracy: 0.7453 - val_loss: 0.5322 - val_accuracy: 0.7708 - 221ms/epoch - 2ms/step
Epoch 3/100
102/102 - 0s - loss: 0.5029 - accuracy: 0.8086 - val_loss: 0.4345 - val_accuracy: 0.8221 - 218ms/epoch - 2ms/step
Epoch 4/100
102/102 - 0s - loss: 0.4358 - accuracy: 0.8131 - val_loss: 0.3834 - val_accuracy: 0.8261 - 218ms/epoch - 2ms/step
Epoch 5/100
102/102 - 0s - loss: 0.3943 - accuracy: 0.8383 - val_loss: 0.3562 - val_accuracy: 0.8617 - 246ms/epoch - 2ms/step
Epoch 6/100
102/102 - 0s - loss: 0.3590 - accuracy: 0.8457 - val_loss: 0.3280 - val_accuracy: 0.8498 - 209ms/epoch - 2ms/step
Epoch 7/100
102/102 - 0s - loss: 0.3372 - accuracy: 0.8561 - val_loss: 0.3171 - val_accuracy: 0.8735 - 214ms/epoch - 2ms/step
Epoch 8/100
102/102 - 0s - loss: 0.3121 - accuracy: 0.8739 - val_loss: 0.2815 - val_accuracy: 0.8854 - 261ms/epoch - 3ms

<keras.callbacks.History at 0x7e2add5ed7e0>

In [24]:
# test the model
nested_test_loss, nested_test_accuracy = nested_model.evaluate(x_test_scaled, y_test)

# Print Pretty
print('\nNested Test loss: {0:.2f}. Nested Test accuracy: {1:.2f}%'.format(nested_test_loss, nested_test_accuracy*100.))


Nested Test loss: 0.11. Nested Test accuracy: 95.65%


In [25]:
# Save model
# Test accuracy for the saved model is 95%
# tf.keras.models.save_model(nested_model, '/content/drive/MyDrive/Colab Data/KY EPSB/nested_model', include_optimizer=True)

## Nested Predict

In [26]:
nested_inputs = predict_df.loc[:,['End Year Code', #'District Code',
                              'Level Code',
                              'Reported Spending per student', 'Money Difference per school',
                              'Proficiency Rate', 'Classification Code', ]]

# Apply the scaler
nested_inputs = nested_model_scaler.transform(nested_inputs.values)

# Predict Ky Rating
predict_rate_multi = nested_model.predict(nested_inputs)

# Convert to one column
predict_rating = predict_rate_multi.argmax(axis=1)

# Save predictions to predict_df
predict_df['Rating Code'] = predict_rating



# Prep & export df

In [27]:
#Fill in the missing values, and convert to words

predict_df.loc[predict_df['Rating'].isna(), 'Rating'] = predict_df['Rating Code']
predict_df.loc[predict_df['Classification'].isna(), 'Classification'] = predict_df['Classification Code']

predict_df['Classification'].replace([0, 1, 2], ['Needs Improvement', 'Proficient', 'Distinguished'], inplace=True)
predict_df['Rating'].replace([0,1,2,3,4], ['Very Low', 'Low', 'Medium', 'High', 'Very High'], inplace=True)

In [28]:
# Convert to categories for data visualization
cat_columns = ['End Year', 'End Year Code',
               'District Code', 'District',
               'Level', 'Level Code',
               'Classification', 'Classification Code',
               'Rating', 'Rating Code']

predict_df[cat_columns] = predict_df[cat_columns].astype('category')

In [29]:
#predict_df.to_csv('/content/drive/MyDrive/Colab Data/KY EPSB/All Years/predict_df.csv', index = False)

# Most Impacted schools

In [30]:
def major_impacts(df, percentile):
    # Making sure the upper and lower is higher and lower, respectively
    upper, lower = (percentile, round(1 - percentile, 2)) if percentile >= (1 - percentile) else (round(1 - percentile, 2), percentile)
    cols = ['Money Difference per school', 'Proficiency Rate']

    # For each column per year, get the literal values of the upper / lower by using the percents
    quantiles = df.groupby('End Year')[cols].apply(lambda x: x.quantile([upper, lower]))

    impacted_schools = pd.DataFrame()

    # Selects the rows with the corresponding year, and the literal high/ low values
    for year in df['End Year'].unique():
        year_df = df[df['End Year'] == year]
        high = quantiles.loc[(year, upper)]
        low = quantiles.loc[(year, lower)]

        # Uses the high / low values to select the rows that are on the outside of those values
        # Does this for each column x.name is the column name.  The output is a T/F column
        not_between_mask = ~year_df[cols].apply(lambda x: x.between(low[x.name], high[x.name], inclusive='neither'))

        # Takes the T/F column and selects the row for any row that has T.
        selected_rows = not_between_mask.any(axis=1)
        impacted_schools = pd.concat([impacted_schools, year_df[selected_rows]])

    return impacted_schools.reset_index(drop=True)


In [31]:
major_impacts(predict_df, 0.95)

Unnamed: 0,End Year,End Year Code,District,District Code,School,School Code,Level,Level Code,Reported Spending per student,Money Difference per school,Proficiency Rate,Classification,Classification Code,Rating,Rating Code
0,2012,0,Allen County,5,Allen County Primary Center,5010,ES,0,7930.00,190500.00,50.00,Needs Improvement,0,Very Low,0
1,2012,0,Allen County,5,Allen County-Scottsville High School,5020,HS,2,8500.00,157000.00,51.00,Needs Improvement,0,Low,1
2,2012,0,Anchorage Independent,6,Anchorage Independent Public School,6010,MS,1,19850.00,50100.00,92.00,Distinguished,2,High,3
3,2012,0,Ashland Independent,12,Hager Elementary School,12080,ES,0,6450.00,31900.00,76.00,Distinguished,2,Medium,2
4,2012,0,Barren County,21,North Jackson Elementary,21014,ES,0,7390.00,31400.00,72.00,Distinguished,2,Low,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2270,2022,10,Warren County,571,Warren Central High School,571210,HS,2,15500.00,-201800.00,49.00,Needs Improvement,0,Low,1
2271,2022,10,Warren County,571,Greenwood High School,571230,HS,2,14070.00,222800.00,74.00,Proficient,1,High,3
2272,2022,10,Whitley County,591,Whitley County East Elementary School,591062,ES,0,17400.00,-116100.00,85.00,Distinguished,2,Very High,4
2273,2022,10,Wolfe County,595,Red River Valley Elementary School,595205,ES,0,19690.00,-4400.00,33.00,Needs Improvement,0,Very Low,0
