In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf

from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


In [None]:
# Mount Google Drive at /content/drive; every file path used below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Display settings applied to every DataFrame rendered in this notebook.
pd.set_option('display.max_columns', None)  # show every column
pd.set_option('display.max_rows', 20)       # abbreviate frames longer than 20 rows

# Floats are shown with a thousands separator and two decimal places.
pd.set_option('display.float_format', '{:,.2f}'.format)

# Import files

In [None]:
# Read the scores table, and from the preprocessed table keep only the merge keys
# plus the two spending columns.
df_scores = pd.read_csv('/content/drive/MyDrive/Colab Data/KY EPSB/All Years/Unpublished/df_scores.csv')

spending_columns = ['End Year Code', 'School Code', 'Level Code',
                    'Reported Spending per student', 'Money Difference per school']
ky_spending_df = pd.read_csv('/content/drive/MyDrive/Colab Data/KY EPSB/All Years/Unpublished/preprocessed_df.csv').loc[:, spending_columns]

# Left join keeps every score row (spending stays NaN where no match exists),
# then the result is ordered by year / district / school and renumbered from 0.
df = (
    pd.merge(df_scores, ky_spending_df, on=['End Year Code', 'School Code', 'Level Code'], how='left')
    .sort_values(by=['End Year', 'District Code', 'School Code'])
    .reset_index(drop=True)
)

# predict_df is the working copy that later receives the predicted values; df stays untouched.
predict_df = df.copy()


# Predict Classification: a NN Model

## Shuffle, Balance, Split, Scale

In [None]:
# Feature columns followed by the target ('Classification Code' must stay last:
# the next cell slices the target off the end).
classification_columns = ['End Year Code', 'District Code', 'Level Code',
                          'Reported Spending per student', 'Money Difference per school',
                          'Proficiency Score', 'Classification Code']

# Keep complete rows only, renumber them, then shuffle reproducibly.
unscaled_df = (
    predict_df.loc[:, classification_columns]
    .dropna(axis=0)
    .reset_index(drop=True)
    .sample(frac=1, random_state=15)
)

In [None]:
# The last column is the target; everything before it is an input feature.
unscaled_values = unscaled_df.values
unscaled_inputs = unscaled_values[:, :-1]
targets = unscaled_values[:, -1]

# First cut: 80% training, 20% held out (stratified on the target).
unscaled_X_train, unscaled_X_set, y_train, y_set = train_test_split(
    unscaled_inputs,
    targets,
    test_size=0.2,
    random_state=15,
    stratify=targets,
)

# Second cut: the held-out 20% is halved into validation and test sets (again stratified).
unscaled_X_valid, unscaled_X_test, y_valid, y_test = train_test_split(
    unscaled_X_set,
    y_set,
    test_size=0.5,
    random_state=13,
    stratify=y_set,
)

In [None]:
# Learn the mean and standard deviation from the training split only,
# so nothing about the validation/test rows leaks into the scaling.
epsb_scaler = StandardScaler().fit(unscaled_X_train)

# Apply that same scaling to all three splits.
scaled_X_train, scaled_X_valid, scaled_X_test = (
    epsb_scaler.transform(split)
    for split in (unscaled_X_train, unscaled_X_valid, unscaled_X_test)
)

## Model

In [None]:
# Create the model
# Ran different rounds of the below model.  Some of the options I put in comments to the right.

input_size = 6          # number of feature columns fed to the network
output_size = 3         # one softmax output per 'Classification Code' value (0, 1, 2)
hidden_layer_size = 15  # units per hidden layer; for this problem, optimal results were between 10-20 units

model = tf.keras.Sequential([
    # Declaring the input shape puts input_size to use: the model is built immediately
    # and feature arrays with a different number of columns are rejected.
    tf.keras.Input(shape=(input_size,)),
    tf.keras.layers.Dense(hidden_layer_size, activation='tanh'),  # attempted to make this linear, and it only came to 77% valid accuracy or less
    tf.keras.layers.Dense(hidden_layer_size, activation='tanh'),
    tf.keras.layers.Dense(output_size, activation='softmax')
    ])

# sparse_categorical_crossentropy takes the integer class codes directly (no one-hot encoding).
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [None]:
# Fit the model

batch_size = 50   # started with batch size 25
max_epochs = 100  # started with 20 epochs

# Watch val_loss with a patience of 5 epochs and roll back to the best weights seen.
# (An earlier variant was a plain EarlyStopping(patience=5) without restoring the best weights.)
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

model.fit(
    x=scaled_X_train,
    y=y_train,
    validation_data=(scaled_X_valid, y_valid),
    batch_size=batch_size,
    epochs=max_epochs,
    callbacks=[early_stopping],
    verbose=2,
)

# Ran rounds using a multi-index of School Code & End Year.  But didn't get above a 78% validation accuracy

In [None]:
# Score the model on the test split, which was used for neither fitting nor early stopping.
test_loss, test_accuracy = model.evaluate(x=scaled_X_test, y=y_test)



In [None]:
# Report the test metrics: loss to two decimals, accuracy as a percentage.
test_summary = '\nTest loss: {0:.2f}. Test accuracy: {1:.2f}%'
print(test_summary.format(test_loss, test_accuracy*100.))


Test loss: 0.07. Test accuracy: 97.10%


In [None]:
# Persist the trained classification model to Drive, optimizer state included.
# Test accuracy for the saved model is 97%
tf.keras.models.save_model(
    model,
    '/content/drive/MyDrive/Colab Data/KY EPSB/model',
    include_optimizer=True,
)

## Predict Classification using model

In [None]:
# Feature columns for every row of predict_df, in the same order the scaler was fitted on.
classification_features = ['End Year Code', 'District Code', 'Level Code',
                           'Reported Spending per student', 'Money Difference per school',
                           'Proficiency Score']

# Convert to a NumPy array and apply the scaling learned from the training split.
inputs = epsb_scaler.transform(predict_df.loc[:, classification_features].values)

In [None]:
# Predict a class for every row; argmax collapses the softmax output to one code per row.
pred_classification = model.predict(inputs).argmax(axis=1)

# A row with a missing feature is NaN after scaling, so its softmax output is all NaN and
# argmax reports class 0 for it. Blank those predictions out so a fabricated
# 'Needs Improvement' is never written into the data; such rows simply stay missing.
has_all_features = ~np.isnan(inputs).any(axis=1)
predicted_codes = pd.Series(pred_classification, index=predict_df.index, dtype='float64').where(has_all_features)

# Text label for each class code.
class_labels = {0.0: 'Needs Improvement', 1.0: 'Proficient', 2.0: 'Distinguished'}

# Fill only the values that are missing; classifications already present are kept as they are.
predict_df['Classification Code'] = predict_df['Classification Code'].fillna(predicted_codes)
predict_df['Classification'] = predict_df['Classification'].fillna(predicted_codes.map(class_labels))




# Predict Rating: a NN Model

## Shuffle, Split, Scale

In [None]:
# Features for the rating network followed by its target ('Rating Code' must stay last).
rating_columns = ['End Year Code', 'Level Code',
                  'Reported Spending per student', 'Money Difference per school',
                  'Proficiency Score', 'Classification Code', 'Rating Code']

# Keep complete rows only, shuffle reproducibly, then renumber from 0.
nested_df = (
    predict_df.loc[:, rating_columns]
    .dropna()
    .sample(frac=1, random_state=7)
    .reset_index(drop=True)
)

In [None]:
# The last column ('Rating Code') is the target; the columns before it are the inputs.
nested_values = nested_df.values
nested_inputs, nested_targets = nested_values[:, :-1], nested_values[:, -1]

In [None]:
# 80% training / 20% held out, stratified on the rating code.
# Note: y_train, y_set, y_valid and y_test are re-bound here and from now on hold rating targets.
x_train, x_set, y_train, y_set = train_test_split(
    nested_inputs,
    nested_targets,
    test_size=0.2,
    random_state=3,
    stratify=nested_targets,
)

# The held-out 20% is halved into validation and test sets (again stratified).
x_valid, x_test, y_valid, y_test = train_test_split(
    x_set,
    y_set,
    test_size=0.5,
    random_state=16,
    stratify=y_set,
)

In [None]:
# Learn the mean and standard deviation from the rating training split only.
nested_model_scaler = StandardScaler().fit(x_train)

# Apply that same scaling to all three splits.
x_train_scaled, x_valid_scaled, x_test_scaled = (
    nested_model_scaler.transform(split)
    for split in (x_train, x_valid, x_test)
)

In [None]:
# NOTE(review): this rebuilds nested_df with the same columns, seed and steps as the cell
# at the start of this section, and nested_df is not read again in the cells visible below -
# confirm whether this cell is still needed.
# 'District Code' is intentionally not part of the rating features.
rating_frame_columns = ['End Year Code',
                        'Level Code',
                        'Reported Spending per student', 'Money Difference per school',
                        'Proficiency Score', 'Classification Code', 'Rating Code']

# Keep complete rows only, shuffle reproducibly, then renumber from 0.
nested_df = (
    predict_df.loc[:, rating_frame_columns]
    .dropna()
    .sample(frac=1, random_state=7)
    .reset_index(drop=True)
)

## Model

In [None]:
# Create the model
# Ran different rounds of the below model.  Some of the options I put in comments to the right.

input_size = 6          # number of feature columns fed to the network
output_size = 5         # one softmax output per 'Rating Code' value (0-4)
hidden_layer_size = 10  # units per hidden layer; also tried 20, 5, 15, 30

nested_model = tf.keras.Sequential([
    # Declaring the input shape puts input_size to use: the model is built immediately
    # and feature arrays with a different number of columns are rejected.
    tf.keras.Input(shape=(input_size,)),
    # PReLU is documented as a layer (it has its own trainable slope), not as a named
    # activation function, so it is applied as an explicit layer after a linear Dense
    # rather than through the activation='PReLU' string.
    tf.keras.layers.Dense(hidden_layer_size),
    tf.keras.layers.PReLU(), # relu, elu, PReLU, LeakyReLU, swish     runnerup: swish, PReLU
    tf.keras.layers.Dense(hidden_layer_size, activation='elu'), # relu, elu, swish, PReLU, gelu, tanh      runnerup: elu, PReLU
    tf.keras.layers.Dense(output_size, activation='softmax')
    ])

# Create a custom learning rate
custom_learning_rate = 0.005 # 0.01, 0.005, 0.001

# Compile the model
# sparse_categorical_crossentropy takes the integer rating codes directly (no one-hot encoding).
nested_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=custom_learning_rate),
                     loss='sparse_categorical_crossentropy',
                     metrics=['accuracy'])

# Fit the model
batch_size = 20  # 20, 50, 100, 30
max_epochs = 100

# Watch val_loss with a patience of 5 epochs and roll back to the best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

nested_model.fit(x_train_scaled, y_train,
                 batch_size=batch_size,
                 epochs=max_epochs,
                 callbacks=[early_stopping],
                 validation_data=(x_valid_scaled, y_valid),
                 verbose=2)


In [None]:
# Score the rating model on its test split, which was used for neither fitting nor early stopping.
nested_test_loss, nested_test_accuracy = nested_model.evaluate(x=x_test_scaled, y=y_test)

# Report the test metrics: loss to two decimals, accuracy as a percentage.
nested_summary = '\nNested Test loss: {0:.2f}. Nested Test accuracy: {1:.2f}%'
print(nested_summary.format(nested_test_loss, nested_test_accuracy*100.))


Nested Test loss: 0.11. Nested Test accuracy: 97.22%


In [None]:
# Persist the trained rating model to Drive, optimizer state included.
# Test accuracy for the saved model is 96%
tf.keras.models.save_model(
    nested_model,
    '/content/drive/MyDrive/Colab Data/KY EPSB/nested_model',
    include_optimizer=True,
)

## Nested Predict

In [None]:
# Feature columns for every row of predict_df, in the same order the rating scaler was fitted on.
rating_features = ['End Year Code', 'Level Code',
                   'Reported Spending per student', 'Money Difference per school',
                   'Proficiency Score', 'Classification Code']

# Convert to a NumPy array and apply the scaling learned from the rating training split.
nested_inputs = nested_model_scaler.transform(predict_df.loc[:, rating_features].values)

In [None]:
# Predict a KY rating for every row; argmax collapses the softmax output to one code per row.
predict_rating = nested_model.predict(nested_inputs).argmax(axis=1)

# A row with a missing feature is NaN after scaling, so its softmax output is all NaN and
# argmax reports rating 0 for it. Blank those predictions out so a fabricated
# 'Very Low' is never written into the data; such rows simply stay missing.
has_all_features = ~np.isnan(nested_inputs).any(axis=1)
predicted_rating_codes = pd.Series(predict_rating, index=predict_df.index, dtype='float64').where(has_all_features)

# Text label for each rating code.
rating_labels = {0.0: 'Very Low', 1.0: 'Low', 2.0: 'Medium', 3.0: 'High', 4.0: 'Very High'}

# Fill only the missing values in the Rating columns; ratings already present are kept as they are.
predict_df['Rating Code'] = predict_df['Rating Code'].fillna(predicted_rating_codes)
predict_df['Rating'] = predict_df['Rating'].fillna(predicted_rating_codes.map(rating_labels))




# Export

In [None]:
# Preview predict_df after the fills; long frames are abbreviated by the display.max_rows option.
predict_df

Unnamed: 0,End Year,End Year Code,District,District Code,School,School Code,Level,Level Code,Proficiency Score,Classification,Classification Code,Rating,Rating Code,Reported Spending per student,Money Difference per school
0,2012,0,Adair County,1,Adair County High School,1010,HS,2,55,Needs Improvement,0.00,Low,1.00,6460.00,-36400.00
1,2012,0,Adair County,1,John Adair Intermediate School,1013,ES,0,69,Proficient,1.00,Low,1.00,7200.00,-19900.00
2,2012,0,Adair County,1,Adair County Middle School,1014,MS,1,48,Needs Improvement,0.00,Very Low,0.00,6720.00,-55700.00
3,2012,0,Adair County,1,Adair County Elementary School,1016,ES,0,63,Proficient,1.00,Low,1.00,7260.00,-33800.00
4,2012,0,Allen County,5,Allen County Primary Center,5010,ES,0,50,Needs Improvement,0.00,Low,1.00,7930.00,190500.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12718,2023,11,Woodford County,601,Simmons Elementary School,601075,ES,0,60,Needs Improvement,0.00,Medium,2.00,,
12719,2023,11,Woodford County,601,Woodford County High School,601084,HS,2,62,Needs Improvement,0.00,Medium,2.00,,
12720,2023,11,Woodford County,601,Woodford County Middle School,601085,MS,1,64,Needs Improvement,0.00,Medium,2.00,,
12721,2023,11,Woodford County,601,Huntertown Elementary School,601090,ES,0,70,Needs Improvement,0.00,Medium,2.00,,


In [None]:
# Column dtypes and non-null counts: shows which columns still contain missing values.
predict_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12723 entries, 0 to 12722
Data columns (total 15 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   End Year                       12723 non-null  int64  
 1   End Year Code                  12723 non-null  int64  
 2   District                       12723 non-null  object 
 3   District Code                  12723 non-null  int64  
 4   School                         12723 non-null  object 
 5   School Code                    12723 non-null  int64  
 6   Level                          12723 non-null  object 
 7   Level Code                     12723 non-null  int64  
 8   Proficiency Score              12723 non-null  int64  
 9   Classification                 12723 non-null  object 
 10  Classification Code            12723 non-null  float64
 11  Rating                         12723 non-null  object 
 12  Rating Code                    12723 non-null 

In [None]:
# Number of distinct values in each column.
predict_df.nunique()

End Year                           10
End Year Code                      10
District                          174
District Code                     174
School                           1223
School Code                      1249
Level                               3
Level Code                          3
Proficiency Score                 101
Classification                      3
Classification Code                 3
Rating                              5
Rating Code                         5
Reported Spending per student    1665
Money Difference per school      3491
dtype: int64

In [None]:
# Write predict_df to Drive as CSV; index=False leaves the row index out of the file.
predict_df.to_csv('/content/drive/MyDrive/Colab Data/KY EPSB/All Years/predict_df.csv', index = False)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
predict_df.describe()

Unnamed: 0,End Year,End Year Code,District Code,School Code,Level Code,Proficiency Score,Classification Code,Rating Code,Reported Spending per student,Money Difference per school
count,12723.0,12723.0,12723.0,12723.0,12723.0,12723.0,12723.0,12723.0,11288.0,11386.0
mean,2016.87,4.87,287.92,288040.45,0.61,65.13,0.68,1.49,11271.71,21504.24
std,3.48,3.48,163.92,163945.55,0.77,12.91,0.85,0.99,4313.99,117681.25
min,2012.0,0.0,1.0,1010.0,0.0,13.0,0.0,0.0,2340.0,-666200.0
25%,2014.0,2.0,165.0,165012.0,0.0,57.0,0.0,1.0,8200.0,-19500.0
50%,2016.0,4.0,275.0,275119.0,0.0,65.0,0.0,1.0,10280.0,22800.0
75%,2019.0,7.0,435.0,435040.0,1.0,74.0,2.0,2.0,13290.0,58900.0
max,2023.0,11.0,601.0,601120.0,2.0,123.0,2.0,4.0,69010.0,1403000.0
